diff --git a/configs/example/xiangshan.py b/configs/example/xiangshan.py index b4400b88b2..cebd264dc5 100644 --- a/configs/example/xiangshan.py +++ b/configs/example/xiangshan.py @@ -343,7 +343,7 @@ def setKmhV3IdealParams(args, system): # ideal decoupled frontend if args.bp_type is None or args.bp_type == 'DecoupledBPUWithFTB': - # cpu.branchPred.enableTwoTaken = True + cpu.branchPred.enableTwoTaken = True cpu.branchPred.numBr = 6 cpu.branchPred.predictWidth = 64 cpu.branchPred.tage.enableSC = False # TODO(bug): When numBr changes, enabling SC will trigger an assert diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index d8eeb74e06..7611a39106 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -1315,7 +1315,12 @@ Fetch::tick() usedUpFetchTargets = !dbsp->trySupplyFetchWithTarget(pc[0]->instAddr()); } else if (isFTBPred()) { assert(dbpftb); - dbpftb->tick(); + // TODO: remove ideal_tick() + if (dbpftb->enableTwoTaken){ + dbpftb->ideal_tick(); + } else { + dbpftb->tick(); + } usedUpFetchTargets = !dbpftb->trySupplyFetchWithTarget(pc[0]->instAddr(), currentFetchTargetInLoop); } } diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py index b4f661a11e..7a887aa654 100644 --- a/src/cpu/pred/BranchPredictor.py +++ b/src/cpu/pred/BranchPredictor.py @@ -957,3 +957,4 @@ class DecoupledBPUWithFTB(BranchPredictor): enableLoopBuffer = Param.Bool(False, "Enable loop buffer to supply inst for loops") enableLoopPredictor = Param.Bool(False, "Use loop predictor to predict loop exit") enableJumpAheadPredictor = Param.Bool(False, "Use jump ahead predictor to skip no-need-to-predict blocks") + enableTwoTaken = Param.Bool(False, "Enable predicting two taken blocks per cycle") diff --git a/src/cpu/pred/ftb/decoupled_bpred.cc b/src/cpu/pred/ftb/decoupled_bpred.cc index 50a11791dd..795135991b 100644 --- a/src/cpu/pred/ftb/decoupled_bpred.cc +++ b/src/cpu/pred/ftb/decoupled_bpred.cc @@ -27,6 +27,7 @@ DecoupledBPUWithFTB::DecoupledBPUWithFTB(const DecoupledBPUWithFTBParams &p) enableLoopBuffer(p.enableLoopBuffer), enableLoopPredictor(p.enableLoopPredictor), enableJumpAheadPredictor(p.enableJumpAheadPredictor), + enableTwoTaken(p.enableTwoTaken), fetchTargetQueue(p.ftq_size), fetchStreamQueueSize(p.fsq_size), numBr(p.numBr), @@ -606,8 +607,9 @@ DecoupledBPUWithFTB::tick() } if (!receivedPred && numOverrideBubbles == 0 && sentPCHist) { - generateFinalPredAndCreateBubbles(); + numOverrideBubbles = generateFinalPredAndCreateBubbles(); } + if (!squashing) { DPRINTF(DecoupleBP, "DecoupledBPUWithFTB::tick()\n"); DPRINTF(Override, "DecoupledBPUWithFTB::tick()\n"); @@ -647,7 +649,7 @@ DecoupledBPUWithFTB::tick() sentPCHist = true; } - + // query loop buffer with start pc if (enableLoopBuffer && !lb.isActive() && @@ -660,7 +662,7 @@ DecoupledBPUWithFTB::tick() for (int i = 0; i < numStages; i++) { printFullFTBPrediction(predsOfEachStage[i]); } - + if (streamQueueFull()) { DPRINTF(DecoupleBP, "Stream queue is full, don't request prediction\n"); DPRINTF(Override, "Stream queue is full, don't request prediction\n"); @@ -668,11 +670,110 @@ DecoupledBPUWithFTB::tick() squashing = false; } +// ideal_tick() is copied from commit: e7294f1813c331dbce8bcfa4d5eb981f7c8440c5 +// TODO: Fix bug in ideal_tick(): Bubbles created by generateFinalPredAndCreateBubbles() are lost in the next tick, +// resulting in almost NO override bubbles. To resolve this, move tryEnqFetchTarget() and tryEnqFetchStream() +// outside the while loop (before decrementing numOverrideBubbles). Additionally, ensure that TWO FTB entries +// generated by generateAndSetNewFetchStream() are saved in the current tick, allowing both entries to enqueue +// to FTQ/FSQ in the next tick. +void +DecoupledBPUWithFTB::ideal_tick() +{ + dbpFtbStats.fsqEntryDist.sample(fetchStreamQueue.size(), 1); + if (streamQueueFull()) { + dbpFtbStats.fsqFullCannotEnq++; + } + + + int predsRemainsToBeMade = enableTwoTaken ? 2 : 1; + // in two taken roofline model, we create max(bubblesOfPreds1, bubblesOfPreds2) bubbles + int tempNumOverrideBubbles = 0; + + if (numOverrideBubbles > 0) { + numOverrideBubbles--; + } + + while (predsRemainsToBeMade > 0) { + // make one prediction + if (!squashing) { + DPRINTF(DecoupleBP, "DecoupledBPUWithFTB::tick()\n"); + DPRINTF(Override, "DecoupledBPUWithFTB::tick()\n"); + tryEnqFetchTarget(); + tryEnqFetchStream(); + } else { + receivedPred = false; + DPRINTF(DecoupleBP, "Squashing, skip this cycle, receivedPred is %d.\n", receivedPred); + DPRINTF(Override, "Squashing, skip this cycle, receivedPred is %d.\n", receivedPred); + } + + + sentPCHist = false; + + if (!receivedPred && !streamQueueFull()) { + if (!enableLoopBuffer || (enableLoopBuffer && !lb.isActive())) { + if (s0PC == ObservingPC) { + DPRINTFV(true, "Predicting block %#lx, id: %lu\n", s0PC, fsqId); + } + DPRINTF(DecoupleBP, "Requesting prediction for stream start=%#lx\n", s0PC); + DPRINTF(Override, "Requesting prediction for stream start=%#lx\n", s0PC); + // put startAddr in preds + for (int i = 0; i < numStages; i++) { + predsOfEachStage[i].bbStart = s0PC; + } + for (int i = 0; i < numComponents; i++) { + components[i]->putPCHistory(s0PC, s0History, predsOfEachStage); + } + } else { + DPRINTF(LoopBuffer, "Do not query bpu when loop buffer is active\n"); + DPRINTF(DecoupleBP, "Do not query bpu when loop buffer is active\n"); + } + + + sentPCHist = true; + } + + + // query loop buffer with start pc + if (enableLoopBuffer && !lb.isActive() && + lb.streamBeforeLoop.getTakenTarget() == lb.streamBeforeLoop.startPC && + !lb.streamBeforeLoop.resolved) { // do not activate loop buffer right after squash + lb.tryActivateLoop(s0PC); + } + + DPRINTF(Override, "after putPCHistory\n"); + for (int i = 0; i < numStages; i++) { + printFullFTBPrediction(predsOfEachStage[i]); + } + + if (streamQueueFull()) { + DPRINTF(DecoupleBP, "Stream queue is full, don't request prediction\n"); + DPRINTF(Override, "Stream queue is full, don't request prediction\n"); + } + squashing = false; + + + if (!receivedPred && numOverrideBubbles == 0 && sentPCHist) { + tempNumOverrideBubbles = std::max(generateFinalPredAndCreateBubbles(), tempNumOverrideBubbles); + generateAndSetNewFetchStream(); + if (!enableTwoTaken) { + numOverrideBubbles = tempNumOverrideBubbles; + } else { + if (predsRemainsToBeMade == 1) { + numOverrideBubbles = tempNumOverrideBubbles; + } + } + } + + predsRemainsToBeMade--; + } +} + // this function collects predictions from all stages and generate bubbles // when loop buffer is active, predictions are from saved stream -void +int DecoupledBPUWithFTB::generateFinalPredAndCreateBubbles() { + int bubblesToCreate = 0; DPRINTF(Override, "In generateFinalPredAndCreateBubbles().\n"); if (!enableLoopBuffer || (enableLoopBuffer && !lb.isActive())) { @@ -700,7 +801,7 @@ DecoupledBPUWithFTB::generateFinalPredAndCreateBubbles() first_hit_stage++; } // generate bubbles - numOverrideBubbles = first_hit_stage; + bubblesToCreate = first_hit_stage; // assign pred source finalPred.predSource = first_hit_stage; receivedPred = true; @@ -731,14 +832,16 @@ DecoupledBPUWithFTB::generateFinalPredAndCreateBubbles() printFullFTBPrediction(*chosen); dbpFtbStats.predsOfEachStage[first_hit_stage]++; } else { - numOverrideBubbles = 0; + bubblesToCreate = 0; receivedPred = true; DPRINTF(LoopBuffer, "Do not generate final pred when loop buffer is active\n"); DPRINTF(DecoupleBP, "Do not generate final pred when loop buffer is active\n"); } - DPRINTF(Override, "Ends generateFinalPredAndCreateBubbles(), numOverrideBubbles is %d, receivedPred is set true.\n", numOverrideBubbles); + DPRINTF(Override, "Ends generateFinalPredAndCreateBubbles(), numOverrideBubbles is %d," + "receivedPred is set true.\n", bubblesToCreate); + return bubblesToCreate; } bool @@ -2016,13 +2119,17 @@ DecoupledBPUWithFTB::tryEnqFetchStream() return; } assert(!streamQueueFull()); - if (true) { - bool should_create_new_stream = true; - makeNewPrediction(should_create_new_stream); + + if (!enableTwoTaken) { + // Make new prediction here and enqueue the fetch stream. + generateAndSetNewFetchStream(); + enqueueFetchStream(); } else { - DPRINTF(DecoupleBP || debugFlagOn, "FSQ is full: %lu\n", - fetchStreamQueue.size()); + // The stream entry has been set at the previous ideal_tick(), + // so at this point, we only need to enqueue the fetch stream. + enqueueFetchStream(); } + for (int i = 0; i < numStages; i++) { predsOfEachStage[i].valid = false; } @@ -2235,7 +2342,7 @@ DecoupledBPUWithFTB::makeLoopPredictions(FetchStream &entry, bool &endLoop, bool // this function enqueues fsq and update s0PC and s0History // use loop predictor and loop buffer here void -DecoupledBPUWithFTB::makeNewPrediction(bool create_new_stream) +DecoupledBPUWithFTB::generateAndSetNewFetchStream() { DPRINTF(DecoupleBP, "Try to make new prediction\n"); FetchStream entry_new; @@ -2249,9 +2356,6 @@ DecoupledBPUWithFTB::makeNewPrediction(bool create_new_stream) if (finalPred.controlAddr() == ObservingPC || finalPred.controlAddr() == ObservingPC2) { debugFlagOn = true; } - DPRINTF(DecoupleBP || debugFlagOn, "Make pred with %s, pred valid: %i, taken: %i\n", - create_new_stream ? "new stream" : "last missing stream", - finalPred.valid, finalPred.isTaken()); // if loop buffer is not activated, use normal prediction from branch predictors bool endLoop, isDouble, loopConf; @@ -2429,14 +2533,21 @@ DecoupledBPUWithFTB::makeNewPrediction(bool create_new_stream) DPRINTF(LoopBuffer, "now stream before loop:\n"); printStream(lb.streamBeforeLoop); - auto [insert_it, inserted] = fetchStreamQueue.emplace(fsqId, entry); + streamToEnqueue = entry; +} + + +void +DecoupledBPUWithFTB::enqueueFetchStream() +{ + auto [insert_it, inserted] = fetchStreamQueue.emplace(fsqId, streamToEnqueue); assert(inserted); dumpFsq("after insert new stream"); DPRINTF(DecoupleBP || debugFlagOn, "Insert fetch stream %lu\n", fsqId); fsqId++; - printStream(entry); + printStream(streamToEnqueue); dbpFtbStats.fsqEntryEnqueued++; } @@ -2515,4 +2626,4 @@ DecoupledBPUWithFTB::getPreservedReturnAddr(const DynInstPtr &dynInst) } // namespace branch_prediction -} // namespace gem5 +} // namespace gem5 \ No newline at end of file diff --git a/src/cpu/pred/ftb/decoupled_bpred.hh b/src/cpu/pred/ftb/decoupled_bpred.hh index 61ffaa52d5..3b766cacf1 100644 --- a/src/cpu/pred/ftb/decoupled_bpred.hh +++ b/src/cpu/pred/ftb/decoupled_bpred.hh @@ -195,6 +195,8 @@ class DecoupledBPUWithFTB : public BPredUnit JumpAheadPredictor jap; bool enableJumpAheadPredictor{true}; + bool enableTwoTaken{false}; + private: std::string _name; @@ -204,6 +206,7 @@ class DecoupledBPUWithFTB : public BPredUnit unsigned fetchStreamQueueSize; FetchStreamId fsqId{1}; FetchStream lastCommittedStream; + FetchStream streamToEnqueue; CPU *cpu; @@ -285,7 +288,7 @@ class DecoupledBPUWithFTB : public BPredUnit void tryEnqFetchTarget(); - void makeNewPrediction(bool create_new_stream); + void enqueueFetchStream(); void makeLoopPredictions(FetchStream &entry, bool &endLoop, bool &isDouble, bool &loopConf, std::vector &lpRedirectInfos, std::vector &fixNotExits, @@ -353,7 +356,11 @@ class DecoupledBPUWithFTB : public BPredUnit return fetchStreamQueue.size() >= fetchStreamQueueSize; } - void generateFinalPredAndCreateBubbles(); + // return number of bubbles created + int generateFinalPredAndCreateBubbles(); + + // set new fetch stream from final pred + void generateAndSetNewFetchStream(); // const bool dumpLoopPred; @@ -532,6 +539,7 @@ class DecoupledBPUWithFTB : public BPredUnit public: void tick(); + void ideal_tick(); bool trySupplyFetchWithTarget(Addr fetch_demand_pc, bool &fetchTargetInLoop);