Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cpu-o3: refactor decoupled FTB #231

Merged
merged 9 commits into from
Dec 23, 2024
2 changes: 1 addition & 1 deletion configs/example/xiangshan.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ def setKmhV3IdealParams(args, system):

# ideal decoupled frontend
if args.bp_type is None or args.bp_type == 'DecoupledBPUWithFTB':
# cpu.branchPred.enableTwoTaken = True
cpu.branchPred.enableTwoTaken = True
cpu.branchPred.numBr = 6
cpu.branchPred.predictWidth = 64
cpu.branchPred.tage.enableSC = False # TODO(bug): When numBr changes, enabling SC will trigger an assert
Expand Down
7 changes: 6 additions & 1 deletion src/cpu/o3/fetch.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1315,7 +1315,12 @@ Fetch::tick()
usedUpFetchTargets = !dbsp->trySupplyFetchWithTarget(pc[0]->instAddr());
} else if (isFTBPred()) {
assert(dbpftb);
dbpftb->tick();
// TODO: remove ideal_tick()
if (dbpftb->enableTwoTaken){
dbpftb->ideal_tick();
} else {
dbpftb->tick();
}
usedUpFetchTargets = !dbpftb->trySupplyFetchWithTarget(pc[0]->instAddr(), currentFetchTargetInLoop);
}
}
Expand Down
1 change: 1 addition & 0 deletions src/cpu/pred/BranchPredictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -957,3 +957,4 @@ class DecoupledBPUWithFTB(BranchPredictor):
enableLoopBuffer = Param.Bool(False, "Enable loop buffer to supply inst for loops")
enableLoopPredictor = Param.Bool(False, "Use loop predictor to predict loop exit")
enableJumpAheadPredictor = Param.Bool(False, "Use jump ahead predictor to skip no-need-to-predict blocks")
enableTwoTaken = Param.Bool(False, "Enable predicting two taken blocks per cycle")
149 changes: 130 additions & 19 deletions src/cpu/pred/ftb/decoupled_bpred.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ DecoupledBPUWithFTB::DecoupledBPUWithFTB(const DecoupledBPUWithFTBParams &p)
enableLoopBuffer(p.enableLoopBuffer),
enableLoopPredictor(p.enableLoopPredictor),
enableJumpAheadPredictor(p.enableJumpAheadPredictor),
enableTwoTaken(p.enableTwoTaken),
fetchTargetQueue(p.ftq_size),
fetchStreamQueueSize(p.fsq_size),
numBr(p.numBr),
Expand Down Expand Up @@ -606,8 +607,9 @@ DecoupledBPUWithFTB::tick()
}

if (!receivedPred && numOverrideBubbles == 0 && sentPCHist) {
generateFinalPredAndCreateBubbles();
numOverrideBubbles = generateFinalPredAndCreateBubbles();
}

if (!squashing) {
DPRINTF(DecoupleBP, "DecoupledBPUWithFTB::tick()\n");
DPRINTF(Override, "DecoupledBPUWithFTB::tick()\n");
Expand Down Expand Up @@ -647,7 +649,7 @@ DecoupledBPUWithFTB::tick()

sentPCHist = true;
}


// query loop buffer with start pc
if (enableLoopBuffer && !lb.isActive() &&
Expand All @@ -660,19 +662,118 @@ DecoupledBPUWithFTB::tick()
for (int i = 0; i < numStages; i++) {
printFullFTBPrediction(predsOfEachStage[i]);
}

if (streamQueueFull()) {
DPRINTF(DecoupleBP, "Stream queue is full, don't request prediction\n");
DPRINTF(Override, "Stream queue is full, don't request prediction\n");
}
squashing = false;
}

// ideal_tick() is copied from commit: e7294f1813c331dbce8bcfa4d5eb981f7c8440c5
// TODO: Fix bug in ideal_tick(): Bubbles created by generateFinalPredAndCreateBubbles() are lost in the next tick,
// resulting in almost NO override bubbles. To resolve this, move tryEnqFetchTarget() and tryEnqFetchStream()
// outside the while loop (before decrementing numOverrideBubbles). Additionally, ensure that TWO FTB entries
// generated by generateAndSetNewFetchStream() are saved in the current tick, allowing both entries to enqueue
// to FTQ/FSQ in the next tick.
void
DecoupledBPUWithFTB::ideal_tick()
{
dbpFtbStats.fsqEntryDist.sample(fetchStreamQueue.size(), 1);
if (streamQueueFull()) {
dbpFtbStats.fsqFullCannotEnq++;
}


int predsRemainsToBeMade = enableTwoTaken ? 2 : 1;
// in two taken roofline model, we create max(bubblesOfPreds1, bubblesOfPreds2) bubbles
int tempNumOverrideBubbles = 0;

if (numOverrideBubbles > 0) {
numOverrideBubbles--;
}

while (predsRemainsToBeMade > 0) {
// make one prediction
if (!squashing) {
DPRINTF(DecoupleBP, "DecoupledBPUWithFTB::tick()\n");
DPRINTF(Override, "DecoupledBPUWithFTB::tick()\n");
tryEnqFetchTarget();
tryEnqFetchStream();
} else {
receivedPred = false;
DPRINTF(DecoupleBP, "Squashing, skip this cycle, receivedPred is %d.\n", receivedPred);
DPRINTF(Override, "Squashing, skip this cycle, receivedPred is %d.\n", receivedPred);
}


sentPCHist = false;

if (!receivedPred && !streamQueueFull()) {
if (!enableLoopBuffer || (enableLoopBuffer && !lb.isActive())) {
if (s0PC == ObservingPC) {
DPRINTFV(true, "Predicting block %#lx, id: %lu\n", s0PC, fsqId);
}
DPRINTF(DecoupleBP, "Requesting prediction for stream start=%#lx\n", s0PC);
DPRINTF(Override, "Requesting prediction for stream start=%#lx\n", s0PC);
// put startAddr in preds
for (int i = 0; i < numStages; i++) {
predsOfEachStage[i].bbStart = s0PC;
}
for (int i = 0; i < numComponents; i++) {
components[i]->putPCHistory(s0PC, s0History, predsOfEachStage);
}
} else {
DPRINTF(LoopBuffer, "Do not query bpu when loop buffer is active\n");
DPRINTF(DecoupleBP, "Do not query bpu when loop buffer is active\n");
}


sentPCHist = true;
}


// query loop buffer with start pc
if (enableLoopBuffer && !lb.isActive() &&
lb.streamBeforeLoop.getTakenTarget() == lb.streamBeforeLoop.startPC &&
!lb.streamBeforeLoop.resolved) { // do not activate loop buffer right after squash
lb.tryActivateLoop(s0PC);
}

DPRINTF(Override, "after putPCHistory\n");
for (int i = 0; i < numStages; i++) {
printFullFTBPrediction(predsOfEachStage[i]);
}

if (streamQueueFull()) {
DPRINTF(DecoupleBP, "Stream queue is full, don't request prediction\n");
DPRINTF(Override, "Stream queue is full, don't request prediction\n");
}
squashing = false;


if (!receivedPred && numOverrideBubbles == 0 && sentPCHist) {
tempNumOverrideBubbles = std::max(generateFinalPredAndCreateBubbles(), tempNumOverrideBubbles);
generateAndSetNewFetchStream();
if (!enableTwoTaken) {
numOverrideBubbles = tempNumOverrideBubbles;
} else {
if (predsRemainsToBeMade == 1) {
numOverrideBubbles = tempNumOverrideBubbles;
}
}
}

predsRemainsToBeMade--;
}
}

// this function collects predictions from all stages and generate bubbles
// when loop buffer is active, predictions are from saved stream
void
int
DecoupledBPUWithFTB::generateFinalPredAndCreateBubbles()
{
int bubblesToCreate = 0;
DPRINTF(Override, "In generateFinalPredAndCreateBubbles().\n");

if (!enableLoopBuffer || (enableLoopBuffer && !lb.isActive())) {
Expand Down Expand Up @@ -700,7 +801,7 @@ DecoupledBPUWithFTB::generateFinalPredAndCreateBubbles()
first_hit_stage++;
}
// generate bubbles
numOverrideBubbles = first_hit_stage;
bubblesToCreate = first_hit_stage;
// assign pred source
finalPred.predSource = first_hit_stage;
receivedPred = true;
Expand Down Expand Up @@ -731,14 +832,16 @@ DecoupledBPUWithFTB::generateFinalPredAndCreateBubbles()
printFullFTBPrediction(*chosen);
dbpFtbStats.predsOfEachStage[first_hit_stage]++;
} else {
numOverrideBubbles = 0;
bubblesToCreate = 0;
receivedPred = true;
DPRINTF(LoopBuffer, "Do not generate final pred when loop buffer is active\n");
DPRINTF(DecoupleBP, "Do not generate final pred when loop buffer is active\n");
}

DPRINTF(Override, "Ends generateFinalPredAndCreateBubbles(), numOverrideBubbles is %d, receivedPred is set true.\n", numOverrideBubbles);
DPRINTF(Override, "Ends generateFinalPredAndCreateBubbles(), numOverrideBubbles is %d,"
"receivedPred is set true.\n", bubblesToCreate);

return bubblesToCreate;
}

bool
Expand Down Expand Up @@ -2016,13 +2119,17 @@ DecoupledBPUWithFTB::tryEnqFetchStream()
return;
}
assert(!streamQueueFull());
if (true) {
bool should_create_new_stream = true;
makeNewPrediction(should_create_new_stream);

if (!enableTwoTaken) {
// Make new prediction here and enqueue the fetch stream.
generateAndSetNewFetchStream();
enqueueFetchStream();
} else {
DPRINTF(DecoupleBP || debugFlagOn, "FSQ is full: %lu\n",
fetchStreamQueue.size());
// The stream entry has been set at the previous ideal_tick(),
// so at this point, we only need to enqueue the fetch stream.
enqueueFetchStream();
}

for (int i = 0; i < numStages; i++) {
predsOfEachStage[i].valid = false;
}
Expand Down Expand Up @@ -2235,7 +2342,7 @@ DecoupledBPUWithFTB::makeLoopPredictions(FetchStream &entry, bool &endLoop, bool
// this function enqueues fsq and update s0PC and s0History
// use loop predictor and loop buffer here
void
DecoupledBPUWithFTB::makeNewPrediction(bool create_new_stream)
DecoupledBPUWithFTB::generateAndSetNewFetchStream()
{
DPRINTF(DecoupleBP, "Try to make new prediction\n");
FetchStream entry_new;
Expand All @@ -2249,9 +2356,6 @@ DecoupledBPUWithFTB::makeNewPrediction(bool create_new_stream)
if (finalPred.controlAddr() == ObservingPC || finalPred.controlAddr() == ObservingPC2) {
debugFlagOn = true;
}
DPRINTF(DecoupleBP || debugFlagOn, "Make pred with %s, pred valid: %i, taken: %i\n",
create_new_stream ? "new stream" : "last missing stream",
finalPred.valid, finalPred.isTaken());

// if loop buffer is not activated, use normal prediction from branch predictors
bool endLoop, isDouble, loopConf;
Expand Down Expand Up @@ -2429,14 +2533,21 @@ DecoupledBPUWithFTB::makeNewPrediction(bool create_new_stream)
DPRINTF(LoopBuffer, "now stream before loop:\n");
printStream(lb.streamBeforeLoop);

auto [insert_it, inserted] = fetchStreamQueue.emplace(fsqId, entry);
streamToEnqueue = entry;
}


/**
 * Insert the pending stream (streamToEnqueue) into the fetch stream queue
 * under the current fsqId, then advance fsqId and bump the enqueue counter.
 *
 * The insertion is asserted to succeed, i.e. no entry with the current
 * fsqId may already exist in the queue.
 */
void
DecoupledBPUWithFTB::enqueueFetchStream()
{
    auto [insert_it, inserted] = fetchStreamQueue.emplace(fsqId, streamToEnqueue);
    assert(inserted);

    dumpFsq("after insert new stream");
    DPRINTF(DecoupleBP || debugFlagOn, "Insert fetch stream %lu\n", fsqId);

    fsqId++;
    // Print the stream that was just enqueued. There is no local `entry`
    // in this function; the stream lives in streamToEnqueue.
    printStream(streamToEnqueue);

    dbpFtbStats.fsqEntryEnqueued++;
}
Expand Down Expand Up @@ -2515,4 +2626,4 @@ DecoupledBPUWithFTB::getPreservedReturnAddr(const DynInstPtr &dynInst)

} // namespace branch_prediction

} // namespace gem5
} // namespace gem5
12 changes: 10 additions & 2 deletions src/cpu/pred/ftb/decoupled_bpred.hh
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,8 @@ class DecoupledBPUWithFTB : public BPredUnit
JumpAheadPredictor jap;
bool enableJumpAheadPredictor{true};

bool enableTwoTaken{false};

private:
std::string _name;

Expand All @@ -204,6 +206,7 @@ class DecoupledBPUWithFTB : public BPredUnit
unsigned fetchStreamQueueSize;
FetchStreamId fsqId{1};
FetchStream lastCommittedStream;
FetchStream streamToEnqueue;

CPU *cpu;

Expand Down Expand Up @@ -285,7 +288,7 @@ class DecoupledBPUWithFTB : public BPredUnit

void tryEnqFetchTarget();

void makeNewPrediction(bool create_new_stream);
void enqueueFetchStream();

void makeLoopPredictions(FetchStream &entry, bool &endLoop, bool &isDouble, bool &loopConf,
std::vector<LoopRedirectInfo> &lpRedirectInfos, std::vector<bool> &fixNotExits,
Expand Down Expand Up @@ -353,7 +356,11 @@ class DecoupledBPUWithFTB : public BPredUnit
return fetchStreamQueue.size() >= fetchStreamQueueSize;
}

void generateFinalPredAndCreateBubbles();
// return number of bubbles created
int generateFinalPredAndCreateBubbles();

// set new fetch stream from final pred
void generateAndSetNewFetchStream();

// const bool dumpLoopPred;

Expand Down Expand Up @@ -532,6 +539,7 @@ class DecoupledBPUWithFTB : public BPredUnit

public:
void tick();
void ideal_tick();

bool trySupplyFetchWithTarget(Addr fetch_demand_pc, bool &fetchTargetInLoop);

Expand Down
Loading