From bab9b68b064037a326e581fe4118c43dd2d641ca Mon Sep 17 00:00:00 2001 From: lixin <1037997956@qq.com> Date: Tue, 26 Nov 2024 11:13:05 +0800 Subject: [PATCH 01/10] cpu-o3: Transform the lsqunit Transform the load/store execution logic into a multi-stage pipeline form Change-Id: Iaf7558ad75ed8fe2bbf4a776359db113b6126453 --- src/cpu/o3/iew.cc | 203 +++++++++++++++++---------------------- src/cpu/o3/iew.hh | 11 +++ src/cpu/o3/inst_queue.cc | 2 +- src/cpu/o3/lsq.cc | 41 +++++++- src/cpu/o3/lsq.hh | 9 ++ src/cpu/o3/lsq_unit.cc | 186 ++++++++++++++++++++++++++++++++++- src/cpu/o3/lsq_unit.hh | 53 +++++++++- 7 files changed, 386 insertions(+), 119 deletions(-) diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index 9e32532ec8..2715558765 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -1326,6 +1326,84 @@ IEW::printAvailableInsts() std::cout << "\n"; } +void +IEW::SquashCheckAfterExe(DynInstPtr inst) +{ + ThreadID tid = inst->threadNumber; + + if (!fetchRedirect[tid] || + !execWB->squash[tid] || + execWB->squashedSeqNum[tid] > inst->seqNum) { + + // Prevent testing for misprediction on load instructions, + // that have not been executed. + bool loadNotExecuted = !inst->isExecuted() && inst->isLoad(); + + if (inst->mispredicted() && !loadNotExecuted) { + fetchRedirect[tid] = true; + + DPRINTF(IEW, "[tid:%i] [sn:%llu] Execute: " + "Branch mispredict detected.\n", + tid, inst->seqNum); + DPRINTF(IEW, "[tid:%i] [sn:%llu] " + "Predicted target was PC: %s\n", + tid, inst->seqNum, inst->readPredTarg()); + DPRINTF(IEW, "[tid:%i] [sn:%llu] Execute: " + "Redirecting fetch to PC: %s\n", + tid, inst->seqNum, inst->pcState()); + // If incorrect, then signal the ROB that it must be squashed. + squashDueToBranch(inst, tid); + + ppMispredict->notify(inst); + + if (inst->readPredTaken()) { + iewStats.predictedTakenIncorrect++; + } else { + iewStats.predictedNotTakenIncorrect++; + } + } else if (ldstQueue.violation(tid)) { + assert(inst->isMemRef()); + // If there was an ordering violation, then get the + // DynInst that caused the violation. Note that this + // clears the violation signal. + DynInstPtr violator; + violator = ldstQueue.getMemDepViolator(tid); + + DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: %s " + "[sn:%lli], inst PC: %s [sn:%lli]. Addr is: %#x.\n", + violator->pcState(), violator->seqNum, + inst->pcState(), inst->seqNum, inst->physEffAddr); + + fetchRedirect[tid] = true; + + // Tell the instruction queue that a violation has occured. + instQueue.violation(inst, violator); + + // Squash. + squashDueToMemOrder(violator, tid); + + ++iewStats.memOrderViolationEvents; + } + } else { + // Reset any state associated with redirects that will not + // be used. + if (ldstQueue.violation(tid)) { + assert(inst->isMemRef()); + + DynInstPtr violator = ldstQueue.getMemDepViolator(tid); + + DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: " + "%s, inst PC: %s. Addr is: %#x.\n", + violator->pcState(), inst->pcState(), + inst->physEffAddr); + DPRINTF(IEW, "Violation will not be handled because " + "already squashing\n"); + + ++iewStats.memOrderViolationEvents; + } + } +} + void IEW::executeInsts() { @@ -1401,53 +1479,15 @@ IEW::executeInsts() // instruction must be deferred. 
DPRINTF(IEW, "Execute: Delayed translation, deferring " "store.\n"); - instQueue.deferMemInst(inst); + deferMemInst(inst); continue; } } else if (inst->isLoad()) { - // Loads will mark themselves as executed, and their writeback - // event adds the instruction to the queue to commit - fault = ldstQueue.executeLoad(inst); - - if (inst->isTranslationDelayed() && - fault == NoFault) { - // A hw page table walk is currently going on; the - // instruction must be deferred. - DPRINTF(IEW, "Execute: Delayed translation, deferring " - "load.\n"); - instQueue.deferMemInst(inst); - continue; - } - - if (inst->isDataPrefetch() || inst->isInstPrefetch()) { - inst->fault = NoFault; - } + // add this load inst to loadpipe S0. + ldstQueue.issueToLoadPipe(inst); } else if (inst->isStore()) { - fault = ldstQueue.executeStore(inst); - - if (inst->isTranslationDelayed() && - fault == NoFault) { - // A hw page table walk is currently going on; the - // instruction must be deferred. - DPRINTF(IEW, "Execute: Delayed translation, deferring " - "store.\n"); - instQueue.deferMemInst(inst); - continue; - } - - // If the store had a fault then it may not have a mem req - if (fault != NoFault || !inst->readPredicate() || - !inst->isStoreConditional()) { - // If the instruction faulted, then we need to send it - // along to commit without the instruction completing. - // Send this instruction to commit, also make sure iew - // stage realizes there is activity. - inst->setExecuted(); - instToCommit(inst); - activityThisCycle(); - } - - instQueue.notifyExecuted(inst); + // add this store inst to storepipe S0. + ldstQueue.issueToStorePipe(inst); // Store conditionals will mark themselves as // executed, and their writeback event will add the @@ -1486,81 +1526,14 @@ IEW::executeInsts() // This probably needs to prioritize the redirects if a different // scheduler is used. Currently the scheduler schedules the oldest // instruction first, so the branch resolution order will be correct. - ThreadID tid = inst->threadNumber; - - if (!fetchRedirect[tid] || - !execWB->squash[tid] || - execWB->squashedSeqNum[tid] > inst->seqNum) { - - // Prevent testing for misprediction on load instructions, - // that have not been executed. - bool loadNotExecuted = !inst->isExecuted() && inst->isLoad(); - - if (inst->mispredicted() && !loadNotExecuted) { - fetchRedirect[tid] = true; - - DPRINTF(IEW, "[tid:%i] [sn:%llu] Execute: " - "Branch mispredict detected.\n", - tid, inst->seqNum); - DPRINTF(IEW, "[tid:%i] [sn:%llu] " - "Predicted target was PC: %s\n", - tid, inst->seqNum, inst->readPredTarg()); - DPRINTF(IEW, "[tid:%i] [sn:%llu] Execute: " - "Redirecting fetch to PC: %s\n", - tid, inst->seqNum, inst->pcState()); - // If incorrect, then signal the ROB that it must be squashed. - squashDueToBranch(inst, tid); - - ppMispredict->notify(inst); - - if (inst->readPredTaken()) { - iewStats.predictedTakenIncorrect++; - } else { - iewStats.predictedNotTakenIncorrect++; - } - } else if (ldstQueue.violation(tid)) { - assert(inst->isMemRef()); - // If there was an ordering violation, then get the - // DynInst that caused the violation. Note that this - // clears the violation signal. - DynInstPtr violator; - violator = ldstQueue.getMemDepViolator(tid); - - DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: %s " - "[sn:%lli], inst PC: %s [sn:%lli]. 
Addr is: %#x.\n", - violator->pcState(), violator->seqNum, - inst->pcState(), inst->seqNum, inst->physEffAddr); - - fetchRedirect[tid] = true; - - // Tell the instruction queue that a violation has occured. - instQueue.violation(inst, violator); - - // Squash. - squashDueToMemOrder(violator, tid); - - ++iewStats.memOrderViolationEvents; - } - } else { - // Reset any state associated with redirects that will not - // be used. - if (ldstQueue.violation(tid)) { - assert(inst->isMemRef()); - - DynInstPtr violator = ldstQueue.getMemDepViolator(tid); - - DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: " - "%s, inst PC: %s. Addr is: %#x.\n", - violator->pcState(), inst->pcState(), - inst->physEffAddr); - DPRINTF(IEW, "Violation will not be handled because " - "already squashing\n"); - - ++iewStats.memOrderViolationEvents; - } + if (!(inst->isLoad() || inst->isStore())) { + // Load/Store will call this in `lsq_unit.cc` after execution + SquashCheckAfterExe(inst); } } + ldstQueue.executePipeSx(); + // Update and record activity if we processed any instructions. if (inst_num) { if (exeStatus == Idle) { diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index f41dfb9492..89825e217a 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -252,6 +252,17 @@ class IEW bool flushAllStores(ThreadID tid) { return ldstQueue.flushAllStores(tid); } + /** Check if we need to squash after a load/store/branch is executed. */ + void SquashCheckAfterExe(DynInstPtr inst); + + void notifyExecuted(const DynInstPtr &inst) { instQueue.notifyExecuted(inst); } + + /** + * Defers a memory instruction when its DTB translation incurs a hw + * page table walk. + */ + void deferMemInst(const DynInstPtr &deferred_inst) { instQueue.deferMemInst(deferred_inst); } + /** Check misprediction */ void checkMisprediction(const DynInstPtr &inst); diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc index ae7647ad7a..725acc5fd9 100644 --- a/src/cpu/o3/inst_queue.cc +++ b/src/cpu/o3/inst_queue.cc @@ -697,7 +697,7 @@ InstructionQueue::scheduleReadyInsts() assert(op_latency < 64); DPRINTF(Schedule, "[sn:%llu] start execute %u cycles\n", issued_inst->seqNum, op_latency); cpu->perfCCT->updateInstPos(issued_inst->seqNum, PerfRecord::AtFU); - if (op_latency <= 1) { + if (op_latency <= 1 || issued_inst->isLoad() || issued_inst->isStore()) { i2e_info->size++; instsToExecute.push_back(issued_inst); } diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index 3c3abac3b9..36140ca179 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -121,9 +121,10 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams ¶ms) } thread.reserve(numThreads); + // TODO: Parameterize the load/store pipeline stages for (ThreadID tid = 0; tid < numThreads; tid++) { thread.emplace_back(maxLQEntries, maxSQEntries, params.SbufferEntries, - params.SbufferEvictThreshold, params.storeBufferInactiveThreshold); + params.SbufferEvictThreshold, params.storeBufferInactiveThreshold, 4, 5); thread[tid].init(cpu, iew_ptr, params, this, tid); thread[tid].setDcachePort(&dcachePort); } @@ -190,6 +191,15 @@ LSQ::tick() usedLoadPorts = 0; usedStorePorts = 0; + // tick lsq_unit + std::list::iterator threads = activeThreads->begin(); + std::list::iterator end = activeThreads->end(); + + while (threads != end) { + ThreadID tid = *threads++; + thread[tid].tick(); + } + } Tick LSQ::getLastConflictCheckTick() @@ -280,6 +290,35 @@ LSQ::insertStore(const DynInstPtr &store_inst) thread[tid].insertStore(store_inst); } +void +LSQ::issueToLoadPipe(const DynInstPtr &inst) +{ 
+ ThreadID tid = inst->threadNumber; + + thread[tid].issueToLoadPipe(inst); +} + +void +LSQ::issueToStorePipe(const DynInstPtr &inst) +{ + ThreadID tid = inst->threadNumber; + + thread[tid].issueToStorePipe(inst); +} + +void +LSQ::executePipeSx() +{ + std::list::iterator threads = activeThreads->begin(); + std::list::iterator end = activeThreads->end(); + + while (threads != end) { + ThreadID tid = *threads++; + + thread[tid].executePipeSx(); + } +} + Fault LSQ::executeLoad(const DynInstPtr &inst) { diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 079cb54e52..6df7099cbd 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -748,6 +748,15 @@ class LSQ /** Executes a store. */ Fault executeStore(const DynInstPtr &inst); + /** Iq issues a load to load pipeline. */ + void issueToLoadPipe(const DynInstPtr &inst); + + /** Iq issues a store to store pipeline. */ + void issueToStorePipe(const DynInstPtr &inst); + + /** Process instructions in each load/store pipeline stages. */ + void executePipeSx(); + /** * Commits loads up until the given sequence number for a specific thread. */ diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index 86a40eebfc..37c1b5a046 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -430,7 +430,7 @@ LSQUnit::completeDataAccess(PacketPtr pkt) } LSQUnit::LSQUnit(uint32_t lqEntries, uint32_t sqEntries, uint32_t sbufferEntries, uint32_t sbufferEvictThreshold, - uint64_t storeBufferInactiveThreshold) + uint64_t storeBufferInactiveThreshold, uint32_t ldPipeStages, uint32_t stPipeStages) : sbufferEvictThreshold(sbufferEvictThreshold), sbufferEntries(sbufferEntries), storeBufferWritebackInactive(0), @@ -438,6 +438,8 @@ LSQUnit::LSQUnit(uint32_t lqEntries, uint32_t sqEntries, uint32_t sbufferEntries lsqID(-1), storeQueue(sqEntries), loadQueue(lqEntries), + loadPipe(ldPipeStages - 1, 0), + storePipe(stPipeStages - 1, 0), storesToWB(0), htmStarts(0), htmStops(0), @@ -452,9 +454,27 @@ LSQUnit::LSQUnit(uint32_t lqEntries, uint32_t sqEntries, uint32_t sbufferEntries // reserve space, we want if sq will be full, sbuffer will start evicting sqFullUpperLimit = sqEntries - 4; sqFullLowerLimit = sqFullUpperLimit - 4; + + loadPipeSx.resize(ldPipeStages); + storePipeSx.resize(stPipeStages); + + for (int i = 0; i < ldPipeStages; i++) { + loadPipeSx[i] = loadPipe.getWire(-i); + } + for (int i = 0; i < stPipeStages; i++) { + storePipeSx[i] = storePipe.getWire(-i); + } + assert(ldPipeStages >= 4 && stPipeStages >= 5); assert(sqFullLowerLimit > 0); } +void +LSQUnit::tick() +{ + loadPipe.advance(); + storePipe.advance(); +} + void LSQUnit::init(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams ¶ms, LSQ *lsq_ptr, unsigned id) @@ -900,8 +920,140 @@ LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt, return NoFault; } +void +LSQUnit::issueToLoadPipe(const DynInstPtr &inst) +{ + // push to loadPipeS0 + assert(loadPipeSx[0]->size < MaxWidth); + loadPipeSx[0]->insts[loadPipeSx[0]->size++] = inst; + DPRINTF(LSQUnit, "issueToLoadPipe: [sn:%lli]\n", inst->seqNum); + dumpLoadPipe(); +} + +void +LSQUnit::issueToStorePipe(const DynInstPtr &inst) +{ + // push to storePipeS0 + assert(storePipeSx[0]->size < MaxWidth); + storePipeSx[0]->insts[storePipeSx[0]->size++] = inst; + DPRINTF(LSQUnit, "issueToStorePipe: [sn:%lli]\n", inst->seqNum); + dumpStorePipe(); +} + +void +LSQUnit::executeLoadPipeSx() +{ + // TODO: execute operations in each load pipelines + Fault fault = NoFault; + for (int i = 0; i < loadPipeSx.size(); i++) { + auto& stage = loadPipeSx[i]; + 
switch (i) { + case 0: + break; + case 1: + for (int j = 0; j < stage->size; j++) { + auto& inst = stage->insts[j]; + if (!inst->isSquashed()) { + // Loads will mark themselves as executed, and their writeback + // event adds the instruction to the queue to commit + fault = executeLoad(inst); + if (inst->isTranslationDelayed() && + fault == NoFault) { + // A hw page table walk is currently going on; the + // instruction must be deferred. + DPRINTF(LSQUnit, "Execute: Delayed translation, deferring " + "load.\n"); + iewStage->deferMemInst(inst); + continue; + } + if (inst->isDataPrefetch() || inst->isInstPrefetch()) { + inst->fault = NoFault; + } + iewStage->SquashCheckAfterExe(inst); + } else { + DPRINTF(LSQUnit, "Execute: Instruction was squashed. PC: %s, [tid:%i]" + " [sn:%llu]\n", inst->pcState(), inst->threadNumber, + inst->seqNum); + inst->setExecuted(); + inst->setCanCommit(); + } + } + break; + case 2: + break; + case 3: + break; + default: + panic("unsupported loadpipe length"); + } + } +} +void +LSQUnit::executeStorePipeSx() +{ + // TODO: execute operations in each store pipelines + Fault fault = NoFault; + for (int i = 0; i < storePipeSx.size(); i++) { + auto& stage = storePipeSx[i]; + switch (i) { + case 0: + break; + case 1: + for (int j = 0; j < stage->size; j++) { + auto& inst = stage->insts[j]; + if (!inst->isSquashed()) { + fault = executeStore(inst); + if (inst->isTranslationDelayed() && + fault == NoFault) { + // A hw page table walk is currently going on; the + // instruction must be deferred. + DPRINTF(LSQUnit, "Execute: Delayed translation, deferring " + "store.\n"); + iewStage->deferMemInst(inst); + continue; + } + + // If the store had a fault then it may not have a mem req + if (fault != NoFault || !inst->readPredicate() || + !inst->isStoreConditional()) { + // If the instruction faulted, then we need to send it + // along to commit without the instruction completing. + // Send this instruction to commit, also make sure iew + // stage realizes there is activity. + inst->setExecuted(); + iewStage->instToCommit(inst); + iewStage->activityThisCycle(); + } + iewStage->notifyExecuted(inst); + iewStage->SquashCheckAfterExe(inst); + } else { + DPRINTF(LSQUnit, "Execute: Instruction was squashed. 
PC: %s, [tid:%i]" + " [sn:%llu]\n", inst->pcState(), inst->threadNumber, + inst->seqNum); + inst->setExecuted(); + inst->setCanCommit(); + } + } + break; + case 2: + break; + case 3: + break; + case 4: + break; + default: + panic("unsupported storepipe length"); + } + } +} +void +LSQUnit::executePipeSx() +{ + executeLoadPipeSx(); + executeStorePipeSx(); +} Fault LSQUnit::executeLoad(const DynInstPtr &inst) @@ -1968,6 +2120,38 @@ LSQUnit::recvRetry() } } +void +LSQUnit::dumpLoadPipe() +{ + DPRINTF(LSQUnit, "Dumping LoadPipe:\n"); + for (int i = 0; i < loadPipeSx.size(); i++) { + DPRINTF(LSQUnit, "Load S%d:, size: %d\n", i, loadPipeSx[i]->size); + for (int j = 0; j < loadPipeSx[i]->size; j++) { + DPRINTF(LSQUnit, " PC: %s, [tid:%i] [sn:%lli]\n", + loadPipeSx[i]->insts[j]->pcState(), + loadPipeSx[i]->insts[j]->threadNumber, + loadPipeSx[i]->insts[j]->seqNum + ); + } + } +} + +void +LSQUnit::dumpStorePipe() +{ + DPRINTF(LSQUnit, "Dumping StorePipe:\n"); + for (int i = 0; i < storePipeSx.size(); i++) { + DPRINTF(LSQUnit, "Store S%d:, size: %d\n", i, storePipeSx[i]->size); + for (int j = 0; j < storePipeSx[i]->size; j++) { + DPRINTF(LSQUnit, " PC: %s, [tid:%i] [sn:%lli]\n", + storePipeSx[i]->insts[j]->pcState(), + storePipeSx[i]->insts[j]->threadNumber, + storePipeSx[i]->insts[j]->seqNum + ); + } + } +} + void LSQUnit::dumpInsts() const { diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index 802d359cb6..b96119e1bb 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -290,7 +290,8 @@ class LSQUnit public: /** Constructs an LSQ unit. init() must be called prior to use. */ LSQUnit(uint32_t lqEntries, uint32_t sqEntries, uint32_t sbufferEntries, - uint32_t sbufferEvictThreshold, uint64_t storeBufferInactiveThreshold); + uint32_t sbufferEvictThreshold, uint64_t storeBufferInactiveThreshold, + uint32_t ldPipeStages, uint32_t stPipeStages); /** We cannot copy LSQUnit because it has stats for which copy * contructor is deleted explicitly. However, STL vector requires @@ -342,6 +343,9 @@ class LSQUnit /** Executes a load instruction. */ Fault executeLoad(const DynInstPtr &inst); + /** Iq issues a load to load pipeline. */ + void issueToLoadPipe(const DynInstPtr &inst); + Fault executeLoad(int lq_idx) { panic("Not implemented"); return NoFault; } bool triggerStorePFTrain(int sq_idx); @@ -349,6 +353,9 @@ class LSQUnit /** Executes a store instruction. */ Fault executeStore(const DynInstPtr& inst); + /** Iq issues a store to store pipeline. */ + void issueToStorePipe(const DynInstPtr &inst); + /** Commits the head load. */ void commitLoad(); /** Commits loads older than a specific sequence number. */ @@ -487,9 +494,29 @@ class LSQUnit bool sbufferSendPacket(PacketPtr data_pkt); + /** Debugging function to dump instructions in the LoadPipe. */ + void dumpLoadPipe(); + + /** Debugging function to dump instructions in the storePipe. */ + void dumpStorePipe(); + /** Debugging function to dump instructions in the LSQ. */ void dumpInsts() const; + /** Ticks + * causing load/store pipe to run for one cycle. + */ + void tick(); + + /** Process instructions in each load pipeline stages. */ + void executeLoadPipeSx(); + + /** Process instructions in each store pipeline stages. */ + void executeStorePipeSx(); + + /** Wrap function. */ + void executePipeSx(); + /** Schedule event for the cpu. */ void schedule(Event& ev, Tick when); @@ -586,6 +613,30 @@ class LSQUnit /** The load queue. */ LoadQueue loadQueue; + /** Struct that defines the information passed through Load Pipeline. 
*/
+ struct LoadPipeStruct
+ {
+ int size;
+
+ DynInstPtr insts[MaxWidth];
+ };
+ /** The load pipeline TimeBuffer. */
+ TimeBuffer<LoadPipeStruct> loadPipe;
+ /** Each stage in load pipeline. loadPipeSx[0] means load pipe S0 */
+ std::vector<TimeBuffer<LoadPipeStruct>::wire> loadPipeSx;
+
+ /** Struct that defines the information passed through Store Pipeline. */
+ struct StorePipeStruct
+ {
+ int size;
+
+ DynInstPtr insts[MaxWidth];
+ };
+ /** The store pipeline TimeBuffer. */
+ TimeBuffer<StorePipeStruct> storePipe;
+ /** Each stage in store pipeline. storePipeSx[0] means store pipe S0 */
+ std::vector<TimeBuffer<StorePipeStruct>::wire> storePipeSx;
+
 private:
 /** The number of places to shift addresses in the LSQ before checking
 * for dependency violations

From 9b3f0ff12e25b393201702acc2004378c1ff5cf0 Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Mon, 9 Dec 2024 11:27:29 +0800
Subject: [PATCH 02/10] arch-riscv: change fence's opType

Originally, the fence instruction was dispatched to mem's dispatchQueue,
but its opType was No_OpClass, which made it wait for the integer issue
queue IQ2 (IntMisc) to have a free entry before it could execute. If
instructions younger than the fence occupied intIQ2, the fence could
never execute and the CPU got stuck. Therefore, change the opType of the
fence instruction to MemReadOp to prevent this situation (in fact, the
fence is never dispatched to an IQ).

Change-Id: Ie38a901e038db9906c43f78675e69391e847c88b
---
 src/arch/riscv/isa/decoder.isa | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/arch/riscv/isa/decoder.isa b/src/arch/riscv/isa/decoder.isa
index 858ecece1e..c5f98cd5b0 100644
--- a/src/arch/riscv/isa/decoder.isa
+++ b/src/arch/riscv/isa/decoder.isa
@@ -613,7 +613,7 @@ decode QUADRANT default Unknown::unknown() {
 0x03: decode FUNCT3 {
 format FenceOp {
 0x0: fence({{
- }}, uint64_t, IsReadBarrier, IsWriteBarrier, No_OpClass);
+ }}, uint64_t, IsReadBarrier, IsWriteBarrier, MemReadOp);
 0x1: fence_i({{
 }}, uint64_t, IsNonSpeculative, IsSerializeAfter, No_OpClass);
 }

From 5fc37d5a5b4e97b53430e1be7caa1d328ab4ddda Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Mon, 9 Dec 2024 11:38:47 +0800
Subject: [PATCH 03/10] cpu-o3: Split operations in the ldst pipeline

initiateAcc now performs only the TLB access and sits at S0 of the
load/store pipeline. A load accesses the cache and checks for ordering
violations at S1, receives the cache response at S2, and writes back at
S3. A store updates the SQ and checks for violations at S1, and writes
back at S4. AMO operations are now executed via `executeAmo`.

Change-Id: Iac678b7de3a690329f279c70fdcd22be4ed22715
---
 src/cpu/o3/iew.cc | 2 +-
 src/cpu/o3/lsq.cc | 32 ++-
 src/cpu/o3/lsq.hh | 19 +-
 src/cpu/o3/lsq_unit.cc | 599 +++++++++++++++++++++++++++++------------
 src/cpu/o3/lsq_unit.hh | 27 +-
 5 files changed, 480 insertions(+), 199 deletions(-)

diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc
index 2715558765..dfaf13e63e 100644
--- a/src/cpu/o3/iew.cc
+++ b/src/cpu/o3/iew.cc
@@ -1471,7 +1471,7 @@ IEW::executeInsts()

 // Tell the LDSTQ to execute this instruction (if it is a load).
if (inst->isAtomic()) { // AMOs are treated like store requests - fault = ldstQueue.executeStore(inst); + fault = ldstQueue.executeAmo(inst); if (inst->isTranslationDelayed() && fault == NoFault) { diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index 36140ca179..0043894702 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -43,6 +43,7 @@ #include #include +#include #include #include @@ -51,6 +52,7 @@ #include "base/trace.hh" #include "cpu/o3/cpu.hh" #include "cpu/o3/dyn_inst.hh" +#include "cpu/o3/dyn_inst_ptr.hh" #include "cpu/o3/iew.hh" #include "cpu/o3/limits.hh" #include "debug/Drain.hh" @@ -320,19 +322,11 @@ LSQ::executePipeSx() } Fault -LSQ::executeLoad(const DynInstPtr &inst) +LSQ::executeAmo(const DynInstPtr &inst) { ThreadID tid = inst->threadNumber; - return thread[tid].executeLoad(inst); -} - -Fault -LSQ::executeStore(const DynInstPtr &inst) -{ - ThreadID tid = inst->threadNumber; - - return thread[tid].executeStore(inst); + return thread[tid].executeAmo(inst); } void @@ -562,8 +556,12 @@ LSQ::recvFunctionalCustomSignal(PacketPtr pkt, int sig) LSQRequest *request = dynamic_cast(pkt->getPrimarySenderState()); panic_if(!request, "Got packet back with unknown sender state\n"); if (sig == DcacheRespType::Miss) { - // notify cache miss - iewStage->loadCancel(request->instruction()); + if (request->instruction()->isLoad()) { + // notify cache miss + iewStage->loadCancel(request->instruction()); + // set cache miss flag in pipeline + thread[request->_port.lsqID].setFlagInPipeLine(request->instruction(), LdStFlags::CacheMiss); + } } else { panic("unsupported sig %d in recvFunctionalCustomSignal\n", sig); } @@ -955,6 +953,14 @@ LSQ::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, request->initiateTranslation(); } + if (!isLoad && !isAtomic) { + // store inst temporally saves its data in memData + inst->memData = new uint8_t[size]; + memcpy(inst->memData, data, size); + } + + inst->effSize = size; + if (!isLoad && !inst->isVector() && size > 1 && addr % size != 0) { warn( "Store misaligned: size: %u, Addr: %#lx, code: %d\n", size, addr, RiscvISA::ExceptionCode::STORE_ADDR_MISALIGNED); @@ -964,7 +970,7 @@ LSQ::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, } /* This is the place were instructions get the effAddr. */ - if (request->isTranslationComplete()) { + if (inst->isAtomic() && request->isTranslationComplete()) { if (request->isMemAccessRequired()) { inst->effAddr = request->getVaddr(); inst->effSize = size; diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 6df7099cbd..f2213a3f90 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -76,6 +76,18 @@ class IEW; class LSQUnit; class StoreBufferEntry; +/** The Flag of Load/Store inst in Pipeline. */ +enum LdStFlags +{ + Valid = 0, + Replayed, + CacheMiss, + Squashed, + Num_Flags +}; + +constexpr uint64_t LdStFlagNum = LdStFlags::Num_Flags; + class LSQ { public: @@ -742,11 +754,8 @@ class LSQ /** Inserts a store into the LSQ. */ void insertStore(const DynInstPtr &store_inst); - /** Executes a load. */ - Fault executeLoad(const DynInstPtr &inst); - - /** Executes a store. */ - Fault executeStore(const DynInstPtr &inst); + /** Executes an amo inst. */ + Fault executeAmo(const DynInstPtr &inst); /** Iq issues a load to load pipeline. 
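 * (Called from IEW::executeInsts(); the stages themselves are then
 * driven once per cycle by executePipeSx().)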
*/ void issueToLoadPipe(const DynInstPtr &inst); diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index 37c1b5a046..158a022201 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -920,12 +920,48 @@ LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt, return NoFault; } +void +LSQUnit::setFlagInPipeLine(DynInstPtr inst, LdStFlags f) +{ + bool found = false; + if (inst->isLoad()) { + for (int i = (loadPipeSx.size() - 1); i >= 0; i--) { + for (int j = 0; j < loadPipeSx[i]->size; j++) { + if (inst == loadPipeSx[i]->insts[j]) { + found = true; + (loadPipeSx[i]->flags[j])[f] = true; + break; + } + } + } + } else { + for (int i = (storePipeSx.size() - 1); i >= 0; i--) { + for (int j = 0; j < storePipeSx[i]->size; j++) { + if (inst == storePipeSx[i]->insts[j]) { + found = true; + (storePipeSx[i]->flags[j])[f] = true; + break; + } + } + } + } + + if (!found) { + warn("[sn:%ld] Can not found corresponding inst in PipeLine, isLoad: %d\n", inst->seqNum, inst->isLoad()); + } +} + void LSQUnit::issueToLoadPipe(const DynInstPtr &inst) { // push to loadPipeS0 assert(loadPipeSx[0]->size < MaxWidth); - loadPipeSx[0]->insts[loadPipeSx[0]->size++] = inst; + int idx = loadPipeSx[0]->size; + + loadPipeSx[0]->insts[idx] = inst; + loadPipeSx[0]->flags[idx][LdStFlags::Valid] = true; + loadPipeSx[0]->size++; + DPRINTF(LSQUnit, "issueToLoadPipe: [sn:%lli]\n", inst->seqNum); dumpLoadPipe(); } @@ -935,143 +971,75 @@ LSQUnit::issueToStorePipe(const DynInstPtr &inst) { // push to storePipeS0 assert(storePipeSx[0]->size < MaxWidth); - storePipeSx[0]->insts[storePipeSx[0]->size++] = inst; + int idx = storePipeSx[0]->size; + + storePipeSx[0]->insts[idx] = inst; + storePipeSx[0]->flags[idx][LdStFlags::Valid] = true; + storePipeSx[0]->size++; + DPRINTF(LSQUnit, "issueToStorePipe: [sn:%lli]\n", inst->seqNum); dumpStorePipe(); } -void -LSQUnit::executeLoadPipeSx() -{ - // TODO: execute operations in each load pipelines - Fault fault = NoFault; - for (int i = 0; i < loadPipeSx.size(); i++) { - auto& stage = loadPipeSx[i]; - switch (i) { - case 0: - break; - case 1: - for (int j = 0; j < stage->size; j++) { - auto& inst = stage->insts[j]; - if (!inst->isSquashed()) { - // Loads will mark themselves as executed, and their writeback - // event adds the instruction to the queue to commit - fault = executeLoad(inst); - if (inst->isTranslationDelayed() && - fault == NoFault) { - // A hw page table walk is currently going on; the - // instruction must be deferred. - DPRINTF(LSQUnit, "Execute: Delayed translation, deferring " - "load.\n"); - iewStage->deferMemInst(inst); - continue; - } - if (inst->isDataPrefetch() || inst->isInstPrefetch()) { - inst->fault = NoFault; - } - iewStage->SquashCheckAfterExe(inst); - } else { - DPRINTF(LSQUnit, "Execute: Instruction was squashed. 
PC: %s, [tid:%i]" - " [sn:%llu]\n", inst->pcState(), inst->threadNumber, - inst->seqNum); - inst->setExecuted(); - inst->setCanCommit(); - } - } - break; - case 2: - break; - case 3: - break; - default: - panic("unsupported loadpipe length"); - } - } -} - -void -LSQUnit::executeStorePipeSx() +Fault +LSQUnit::loadPipeS0(const DynInstPtr &inst, const std::bitset &flag) { - // TODO: execute operations in each store pipelines - Fault fault = NoFault; - for (int i = 0; i < storePipeSx.size(); i++) { - auto& stage = storePipeSx[i]; - switch (i) { - case 0: - break; - case 1: - for (int j = 0; j < stage->size; j++) { - auto& inst = stage->insts[j]; - if (!inst->isSquashed()) { - fault = executeStore(inst); - if (inst->isTranslationDelayed() && - fault == NoFault) { - // A hw page table walk is currently going on; the - // instruction must be deferred. - DPRINTF(LSQUnit, "Execute: Delayed translation, deferring " - "store.\n"); - iewStage->deferMemInst(inst); - continue; - } + DPRINTF(LSQUnit, "LoadPipeS0: Executing load PC %s, [sn:%lli] " + "flags: valid[%d], replayed[%d], cachemiss[%d], squashed[%d]\n", + inst->pcState(), inst->seqNum, + flag[LdStFlags::Valid], flag[LdStFlags::Replayed], + flag[LdStFlags::CacheMiss], flag[LdStFlags::Squashed]); + assert(!inst->isSquashed()); - // If the store had a fault then it may not have a mem req - if (fault != NoFault || !inst->readPredicate() || - !inst->isStoreConditional()) { - // If the instruction faulted, then we need to send it - // along to commit without the instruction completing. - // Send this instruction to commit, also make sure iew - // stage realizes there is activity. - inst->setExecuted(); - iewStage->instToCommit(inst); - iewStage->activityThisCycle(); - } - iewStage->notifyExecuted(inst); - iewStage->SquashCheckAfterExe(inst); - } else { - DPRINTF(LSQUnit, "Execute: Instruction was squashed. PC: %s, [tid:%i]" - " [sn:%llu]\n", inst->pcState(), inst->threadNumber, - inst->seqNum); - inst->setExecuted(); - inst->setCanCommit(); - } - } - break; - case 2: - break; - case 3: - break; - case 4: - break; - default: - panic("unsupported storepipe length"); - } - } -} + Fault load_fault = NoFault; + // Now initiateAcc only does TLB access + load_fault = inst->initiateAcc(); -void -LSQUnit::executePipeSx() -{ - executeLoadPipeSx(); - executeStorePipeSx(); + return load_fault; } Fault -LSQUnit::executeLoad(const DynInstPtr &inst) +LSQUnit::loadPipeS1(const DynInstPtr &inst, const std::bitset &flag) { - // Execute a specific load. - Fault load_fault = NoFault; - - DPRINTF(LSQUnit, "Executing load PC %s, [sn:%lli]\n", - inst->pcState(), inst->seqNum); - + DPRINTF(LSQUnit, "LoadPipeS1: Executing load PC %s, [sn:%lli] " + "flags: valid[%d], replayed[%d], cachemiss[%d], squashed[%d]\n", + inst->pcState(), inst->seqNum, + flag[LdStFlags::Valid], flag[LdStFlags::Replayed], + flag[LdStFlags::CacheMiss], flag[LdStFlags::Squashed]); assert(!inst->isSquashed()); - load_fault = inst->initiateAcc(); + Fault load_fault = inst->getFault(); + LSQRequest* request = inst->savedRequest; + + // Cache access + if (request && request->isTranslationComplete()) { + if (request->isMemAccessRequired()) { + inst->effAddr = request->getVaddr(); + inst->effAddrValid(true); + + Fault fault; + fault = read(request, inst->lqIdx); + // inst->getFault() may have the first-fault of a + // multi-access split request at this point. + // Overwrite that only if we got another type of fault + // (e.g. re-exec). 
+ if (fault != NoFault) { + inst->getFault() = fault; + load_fault = fault; + } + } else { + inst->setMemAccPredicate(false); + // Commit will have to clean up whatever happened. Set this + // instruction as executed. + inst->setExecuted(); + } + } if (!inst->translationCompleted()) { + // TLB miss iewStage->loadCancel(inst); } else { - DPRINTF(LSQUnit, "load tlb hit [sn:%lli]\n", + DPRINTF(LSQUnit, "LoadPipeS1: load tlb hit [sn:%lli]\n", inst->seqNum); } @@ -1108,7 +1076,7 @@ LSQUnit::executeLoad(const DynInstPtr &inst) // commit. if (!inst->readPredicate()) inst->forwardOldRegs(); - DPRINTF(LSQUnit, "Load [sn:%lli] not executed from %s\n", + DPRINTF(LSQUnit, "LoadPipeS1: Load [sn:%lli] not executed from %s\n", inst->seqNum, (load_fault != NoFault ? "fault" : "predication")); if (!(inst->hasRequest() && inst->strictlyOrdered()) || @@ -1123,87 +1091,175 @@ LSQUnit::executeLoad(const DynInstPtr &inst) ++it; if (checkLoads) - return checkViolations(it, inst); + load_fault = checkViolations(it, inst); } } return load_fault; } -bool -LSQUnit::triggerStorePFTrain(int sq_idx) +Fault +LSQUnit::loadPipeS2(const DynInstPtr &inst, const std::bitset &flag) +{ + Fault fault = inst->getFault(); + DPRINTF(LSQUnit, "LoadPipeS2: Executing load PC %s, [sn:%lli] " + "flags: valid[%d], replayed[%d], cachemiss[%d], squashed[%d]\n", + inst->pcState(), inst->seqNum, + flag[LdStFlags::Valid], flag[LdStFlags::Replayed], + flag[LdStFlags::CacheMiss], flag[LdStFlags::Squashed]); + assert(!inst->isSquashed()); + return fault; +} + +Fault +LSQUnit::loadPipeS3(const DynInstPtr &inst, const std::bitset &flag) +{ + Fault fault = inst->getFault(); + DPRINTF(LSQUnit, "LoadPipeS3: Executing load PC %s, [sn:%lli] " + "flags: valid[%d], replayed[%d], cachemiss[%d], squashed[%d]\n", + inst->pcState(), inst->seqNum, + flag[LdStFlags::Valid], flag[LdStFlags::Replayed], + flag[LdStFlags::CacheMiss], flag[LdStFlags::Squashed]); + assert(!inst->isSquashed()); + return fault; +} + +void +LSQUnit::executeLoadPipeSx() { - auto inst = storeQueue[sq_idx].instruction(); - assert(inst->translationCompleted()); - Addr vaddr = inst->effAddr; - Addr pc = inst->pcState().instAddr(); - // create request - RequestPtr req = - std::make_shared(vaddr, 1, Request::STORE_PF_TRAIN, inst->requestorId(), pc, inst->contextId()); - req->setPaddr(inst->physEffAddr); + // TODO: execute operations in each load pipelines + Fault fault = NoFault; + for (int i = 0; i < loadPipeSx.size(); i++) { + auto& stage = loadPipeSx[i]; + for (int j = 0; j < stage->size; j++) { + auto& inst = stage->insts[j]; + auto& flag = stage->flags[j]; + if (!inst->isSquashed()) { + switch (i) { + case 0: + fault = loadPipeS0(inst, flag); + break; + case 1: + // Loads will mark themselves as executed, and their writeback + // event adds the instruction to the queue to commit + fault = loadPipeS1(inst, flag); - // create packet - PacketPtr pkt = Packet::createPFtrain(req); + if (inst->isTranslationDelayed() && fault == NoFault) { + // A hw page table walk is currently going on; the + // instruction must be deferred. 
+ DPRINTF(LSQUnit, "Execute: Delayed translation, deferring " + "load.\n"); + iewStage->deferMemInst(inst); + flag[LdStFlags::Replayed] = true; + } - // send packet - bool success = dcachePort->sendTimingReq(pkt); - assert(success); // must be true + if (inst->isDataPrefetch() || inst->isInstPrefetch()) { + inst->fault = NoFault; + } - return true; + iewStage->SquashCheckAfterExe(inst); + break; + case 2: + fault = loadPipeS2(inst, flag); + break; + case 3: + fault = loadPipeS3(inst, flag); + break; + default: + panic("unsupported loadpipe length"); + } + } else { + DPRINTF(LSQUnit, "Execute: Instruction was squashed. PC: %s, [tid:%i]" + " [sn:%llu]\n", inst->pcState(), inst->threadNumber, + inst->seqNum); + inst->setExecuted(); + inst->setCanCommit(); + flag[LdStFlags::Squashed] = true; + } + } + } } Fault -LSQUnit::executeStore(const DynInstPtr &store_inst) +LSQUnit::storePipeS0(const DynInstPtr &inst, const std::bitset &flag) { // Make sure that a store exists. assert(storeQueue.size() != 0); + assert(!inst->isSquashed()); + + DPRINTF(LSQUnit, "StorePipeS0: Executing store PC %s [sn:%lli] " + "flags: valid[%d], replayed[%d], squashed[%d]\n", + inst->pcState(), inst->seqNum, + flag[LdStFlags::Valid], flag[LdStFlags::Replayed], flag[LdStFlags::Squashed]); - ssize_t store_idx = store_inst->sqIdx; + // Now initiateAcc only does TLB access + Fault store_fault = inst->initiateAcc(); - DPRINTF(LSQUnit, "Executing store PC %s [sn:%lli]\n", - store_inst->pcState(), store_inst->seqNum); + return store_fault; +} - assert(!store_inst->isSquashed()); +Fault +LSQUnit::storePipeS1(const DynInstPtr &inst, const std::bitset &flag) +{ + // Make sure that a store exists. + assert(storeQueue.size() != 0); + + ssize_t store_idx = inst->sqIdx; + LSQRequest* request = inst->savedRequest; + + DPRINTF(LSQUnit, "StorePipeS1: Executing store PC %s [sn:%lli] " + "flags: valid[%d], replayed[%d], squashed[%d]\n", + inst->pcState(), inst->seqNum, + flag[LdStFlags::Valid], flag[LdStFlags::Replayed], flag[LdStFlags::Squashed]); // Check the recently completed loads to see if any match this store's // address. If so, then we have a memory ordering violation. - typename LoadQueue::iterator loadIt = store_inst->lqIt; + typename LoadQueue::iterator loadIt = inst->lqIt; - Fault store_fault = store_inst->initiateAcc(); + /* This is the place were instructions get the effAddr. 
*/ + if (request && request->isTranslationComplete()) { + if (request->isMemAccessRequired() && (inst->getFault() == NoFault)) { + inst->effAddr = request->getVaddr(); + inst->effAddrValid(true); - if (store_inst->isTranslationDelayed() && + if (cpu->checker) { + inst->reqToVerify = std::make_shared(*request->req()); + } + Fault fault; + fault = write(request, inst->memData, inst->sqIdx); + // release temporal data + delete [] inst->memData; + inst->memData = nullptr; + + if (fault != NoFault) + inst->getFault() = fault; + } + } + + Fault store_fault = inst->getFault(); + + if (inst->isTranslationDelayed() && store_fault == NoFault) return store_fault; - if (!store_inst->readPredicate()) { - DPRINTF(LSQUnit, "Store [sn:%lli] not executed from predication\n", - store_inst->seqNum); - store_inst->forwardOldRegs(); + if (!inst->readPredicate()) { + DPRINTF(LSQUnit, "StorePipeS1: Store [sn:%lli] not executed from predication\n", + inst->seqNum); + inst->forwardOldRegs(); return store_fault; } if (storeQueue[store_idx].size() == 0) { - DPRINTF(LSQUnit,"Fault on Store PC %s, [sn:%lli], Size = 0\n", - store_inst->pcState(), store_inst->seqNum); - - if (store_inst->isAtomic()) { - // If the instruction faulted, then we need to send it along - // to commit without the instruction completing. - if (!(store_inst->hasRequest() && store_inst->strictlyOrdered()) || - store_inst->isAtCommit()) { - store_inst->setExecuted(); - } - iewStage->instToCommit(store_inst); - iewStage->activityThisCycle(); - } + DPRINTF(LSQUnit, "StorePipeS1: Fault on Store PC %s, [sn:%lli], Size = 0\n", + inst->pcState(), inst->seqNum); return store_fault; } assert(store_fault == NoFault); - if (store_inst->isStoreConditional() || store_inst->isAtomic()) { - // Store conditionals and Atomics need to set themselves as able to + if (inst->isStoreConditional()) { + // Store conditionals need to set themselves as able to // writeback if we haven't had a fault by here. 
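+ // (AMOs no longer pass through here; they take the separate
+ // executeAmo() path instead.)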
storeQueue[store_idx].canWB() = true; @@ -1214,8 +1270,195 @@ LSQUnit::executeStore(const DynInstPtr &store_inst) } } - return checkViolations(loadIt, store_inst); + return checkViolations(loadIt, inst); +} + +Fault +LSQUnit::storePipeS2(const DynInstPtr &inst, const std::bitset &flag) +{ + Fault fault = inst->getFault(); + assert(!inst->isSquashed()); + + DPRINTF(LSQUnit, "StorePipeS2: Executing store PC %s [sn:%lli] " + "flags: valid[%d], replayed[%d], squashed[%d]\n", + inst->pcState(), inst->seqNum, + flag[LdStFlags::Valid], flag[LdStFlags::Replayed], flag[LdStFlags::Squashed]); + return fault; +} + +Fault +LSQUnit::storePipeS3(const DynInstPtr &inst, const std::bitset &flag) +{ + Fault fault = inst->getFault(); + assert(!inst->isSquashed()); + + DPRINTF(LSQUnit, "StorePipeS3: Executing store PC %s [sn:%lli] " + "flags: valid[%d], replayed[%d], squashed[%d]\n", + inst->pcState(), inst->seqNum, + flag[LdStFlags::Valid], flag[LdStFlags::Replayed], flag[LdStFlags::Squashed]); + return fault; +} + +Fault +LSQUnit::storePipeS4(const DynInstPtr &inst, const std::bitset &flag) +{ + Fault fault = inst->getFault(); + assert(!inst->isSquashed()); + + DPRINTF(LSQUnit, "StorePipeS4: Executing store PC %s [sn:%lli] " + "flags: valid[%d], replayed[%d], squashed[%d]\n", + inst->pcState(), inst->seqNum, + flag[LdStFlags::Valid], flag[LdStFlags::Replayed], flag[LdStFlags::Squashed]); + return fault; +} + +void +LSQUnit::executeStorePipeSx() +{ + // TODO: execute operations in each store pipelines + Fault fault = NoFault; + for (int i = 0; i < storePipeSx.size(); i++) { + auto& stage = storePipeSx[i]; + for (int j = 0; j < stage->size; j++) { + auto& inst = stage->insts[j]; + auto& flag = stage->flags[j]; + if (!inst->isSquashed()) { + switch (i) { + case 0: + fault = storePipeS0(inst, flag); + break; + case 1: + fault = storePipeS1(inst, flag); + if (inst->isTranslationDelayed() && fault == NoFault) { + // A hw page table walk is currently going on; the + // instruction must be deferred. + DPRINTF(LSQUnit, "Execute: Delayed translation, deferring " + "store.\n"); + iewStage->deferMemInst(inst); + flag[LdStFlags::Replayed] = true; + continue; + } + + iewStage->notifyExecuted(inst); + iewStage->SquashCheckAfterExe(inst); + break; + case 2: + fault = storePipeS2(inst, flag); + break; + case 3: + fault = storePipeS3(inst, flag); + // If the store had a fault then it may not have a mem req + if (fault != NoFault || !inst->readPredicate() || !inst->isStoreConditional()) { + // If the instruction faulted, then we need to send it + // along to commit without the instruction completing. + // Send this instruction to commit, also make sure iew + // stage realizes there is activity. + if (!flag[LdStFlags::Replayed]) { + inst->setExecuted(); + iewStage->instToCommit(inst); + iewStage->activityThisCycle(); + } + } + break; + case 4: + fault = storePipeS4(inst, flag); + break; + default: + panic("unsupported storepipe length"); + } + } else { + DPRINTF(LSQUnit, "Execute: Instruction was squashed. 
PC: %s, [tid:%i]" + " [sn:%llu]\n", inst->pcState(), inst->threadNumber, + inst->seqNum); + inst->setExecuted(); + inst->setCanCommit(); + flag[LdStFlags::Squashed] = true; + } + } + } +} + +void +LSQUnit::executePipeSx() +{ + executeLoadPipeSx(); + executeStorePipeSx(); +} + +bool +LSQUnit::triggerStorePFTrain(int sq_idx) +{ + auto inst = storeQueue[sq_idx].instruction(); + assert(inst->translationCompleted()); + Addr vaddr = inst->effAddr; + Addr pc = inst->pcState().instAddr(); + // create request + RequestPtr req = + std::make_shared(vaddr, 1, Request::STORE_PF_TRAIN, inst->requestorId(), pc, inst->contextId()); + req->setPaddr(inst->physEffAddr); + + // create packet + PacketPtr pkt = Packet::createPFtrain(req); + + // send packet + bool success = dcachePort->sendTimingReq(pkt); + assert(success); // must be true + + return true; +} + +Fault +LSQUnit::executeAmo(const DynInstPtr &amo_inst) +{ + // Make sure that a store exists. + assert(storeQueue.size() != 0); + + ssize_t amo_idx = amo_inst->sqIdx; + + DPRINTF(LSQUnit, "Executing AMO PC %s [sn:%lli]\n", + amo_inst->pcState(), amo_inst->seqNum); + + assert(!amo_inst->isSquashed()); + + // Check the recently completed loads to see if any match this amo's + // address. If so, then we have a memory ordering violation. + typename LoadQueue::iterator loadIt = amo_inst->lqIt; + + Fault amo_fault = amo_inst->initiateAcc(); + + if (amo_inst->isTranslationDelayed() && amo_fault == NoFault) + return amo_fault; + + if (!amo_inst->readPredicate()) { + DPRINTF(LSQUnit, "AMO [sn:%lli] not executed from predication\n", + amo_inst->seqNum); + amo_inst->forwardOldRegs(); + return amo_fault; + } + + if (storeQueue[amo_idx].size() == 0) { + DPRINTF(LSQUnit,"Fault on AMO PC %s, [sn:%lli], Size = 0\n", + amo_inst->pcState(), amo_inst->seqNum); + + // If the amo instruction faulted, then we need to send it along + // to commit without the instruction completing. + if (!(amo_inst->hasRequest() && amo_inst->strictlyOrdered()) || + amo_inst->isAtCommit()) { + amo_inst->setExecuted(); + } + iewStage->instToCommit(amo_inst); + iewStage->activityThisCycle(); + + return amo_fault; + } + + assert(amo_fault == NoFault); + + // Atomics need to set themselves as able to writeback if we haven't had a fault by here. 
+ storeQueue[amo_idx].canWB() = true; + ++storesToWB; + return checkViolations(loadIt, amo_inst); } void @@ -2127,10 +2370,15 @@ LSQUnit::dumpLoadPipe() for (int i = 0; i < loadPipeSx.size(); i++) { DPRINTF(LSQUnit, "Load S%d:, size: %d\n", i, loadPipeSx[i]->size); for (int j = 0; j < loadPipeSx[i]->size; j++) { - DPRINTF(LSQUnit, " PC: %s, [tid:%i] [sn:%lli]\n", + DPRINTF(LSQUnit, " PC: %s, [tid:%i] [sn:%lli] " + "flags: valid[%d], replayed[%d], cachemiss[%d], squashed[%d]\n", loadPipeSx[i]->insts[j]->pcState(), loadPipeSx[i]->insts[j]->threadNumber, - loadPipeSx[i]->insts[j]->seqNum + loadPipeSx[i]->insts[j]->seqNum, + (loadPipeSx[i]->flags[j])[LdStFlags::Valid], + (loadPipeSx[i]->flags[j])[LdStFlags::Replayed], + (loadPipeSx[i]->flags[j])[LdStFlags::CacheMiss], + (loadPipeSx[i]->flags[j])[LdStFlags::Squashed] ); } } @@ -2143,10 +2391,14 @@ LSQUnit::dumpStorePipe() for (int i = 0; i < storePipeSx.size(); i++) { DPRINTF(LSQUnit, "Store S%d:, size: %d\n", i, storePipeSx[i]->size); for (int j = 0; j < storePipeSx[i]->size; j++) { - DPRINTF(LSQUnit, " PC: %s, [tid:%i] [sn:%lli]\n", + DPRINTF(LSQUnit, " PC: %s, [tid:%i] [sn:%lli] " + "flags: valid[%d], replayed[%d], squashed[%d]\n", storePipeSx[i]->insts[j]->pcState(), storePipeSx[i]->insts[j]->threadNumber, - storePipeSx[i]->insts[j]->seqNum + storePipeSx[i]->insts[j]->seqNum, + (storePipeSx[i]->flags[j])[LdStFlags::Valid], + (storePipeSx[i]->flags[j])[LdStFlags::Replayed], + (storePipeSx[i]->flags[j])[LdStFlags::Squashed] ); } } @@ -2562,6 +2814,7 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) } if (!request->isSent()) { iewStage->blockMemInst(load_inst); + setFlagInPipeLine(load_inst, LdStFlags::Replayed); } return NoFault; @@ -2572,9 +2825,9 @@ LSQUnit::write(LSQRequest *request, uint8_t *data, ssize_t store_idx) { assert(storeQueue[store_idx].valid()); - DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x | storeHead:%i " + DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x | storeHead:%i, size: %d" "[sn:%llu]\n", - store_idx - 1, request->req()->getPaddr(), storeQueue.head() - 1, + store_idx - 1, request->req()->getPaddr(), storeQueue.head() - 1, request->_size, storeQueue[store_idx].instruction()->seqNum); storeQueue[store_idx].setRequest(request); diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index b96119e1bb..e9a0c94612 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -43,6 +43,7 @@ #define __CPU_O3_LSQ_UNIT_HH__ #include +#include #include #include #include @@ -62,6 +63,7 @@ #include "cpu/o3/comm.hh" #include "cpu/o3/cpu.hh" #include "cpu/o3/dyn_inst_ptr.hh" +#include "cpu/o3/limits.hh" #include "cpu/o3/lsq.hh" #include "cpu/timebuf.hh" #include "debug/HtmCpu.hh" @@ -340,18 +342,13 @@ class LSQUnit */ void checkSnoop(PacketPtr pkt); - /** Executes a load instruction. */ - Fault executeLoad(const DynInstPtr &inst); - /** Iq issues a load to load pipeline. */ void issueToLoadPipe(const DynInstPtr &inst); - Fault executeLoad(int lq_idx) { panic("Not implemented"); return NoFault; } - bool triggerStorePFTrain(int sq_idx); - /** Executes a store instruction. */ - Fault executeStore(const DynInstPtr& inst); + /** Executes an amo instruction. */ + Fault executeAmo(const DynInstPtr& inst); /** Iq issues a store to store pipeline. */ void issueToStorePipe(const DynInstPtr &inst); @@ -511,9 +508,20 @@ class LSQUnit /** Process instructions in each load pipeline stages. 
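 * Applies the per-stage handlers (loadPipeS0()..loadPipeS3()) to every
 * load latched in each stage this cycle.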
*/ void executeLoadPipeSx(); + Fault loadPipeS0(const DynInstPtr &inst, const std::bitset &flag); + Fault loadPipeS1(const DynInstPtr &inst, const std::bitset &flag); + Fault loadPipeS2(const DynInstPtr &inst, const std::bitset &flag); + Fault loadPipeS3(const DynInstPtr &inst, const std::bitset &flag); + /** Process instructions in each store pipeline stages. */ void executeStorePipeSx(); + Fault storePipeS0(const DynInstPtr &inst, const std::bitset &flag); + Fault storePipeS1(const DynInstPtr &inst, const std::bitset &flag); + Fault storePipeS2(const DynInstPtr &inst, const std::bitset &flag); + Fault storePipeS3(const DynInstPtr &inst, const std::bitset &flag); + Fault storePipeS4(const DynInstPtr &inst, const std::bitset &flag); + /** Wrap function. */ void executePipeSx(); @@ -619,6 +627,7 @@ class LSQUnit int size; DynInstPtr insts[MaxWidth]; + std::bitset flags[MaxWidth]; }; /** The load pipeline TimeBuffer. */ TimeBuffer loadPipe; @@ -631,12 +640,16 @@ class LSQUnit int size; DynInstPtr insts[MaxWidth]; + std::bitset flags[MaxWidth]; }; /** The store pipeline TimeBuffer. */ TimeBuffer storePipe; /** Each stage in store pipeline. storePipeSx[0] means store pipe S0 */ std::vector::wire> storePipeSx; + /** Find inst in Load/Store Pipeline, set corresponding flag to true */ + void setFlagInPipeLine(DynInstPtr inst, LdStFlags f); + private: /** The number of places to shift addresses in the LSQ before checking * for dependency violations From 61adddf6f3404283fce652c4d18f7ceac4b2097c Mon Sep 17 00:00:00 2001 From: lixin <1037997956@qq.com> Date: Thu, 21 Nov 2024 18:37:35 +0800 Subject: [PATCH 04/10] cpu-o3: replay cache missed load from replayQ This commit is only for normal load. The uncache/amo load is the same as the original process. Change-Id: Idc98ee18a6e94a39774ebba0f772820699b834de --- src/cpu/o3/dyn_inst.hh | 13 +++++ src/cpu/o3/iew.cc | 6 +++ src/cpu/o3/iew.hh | 3 ++ src/cpu/o3/inst_queue.cc | 49 ++++++++++++++++++- src/cpu/o3/inst_queue.hh | 21 +++++++++ src/cpu/o3/lsq.cc | 81 ++++++++++++++++++++++---------- src/mem/cache/base.cc | 10 ++-- src/mem/packet.hh | 2 + src/mem/request.hh | 1 + src/mem/ruby/system/RubyPort.cc | 1 + src/mem/ruby/system/Sequencer.cc | 6 +-- 11 files changed, 160 insertions(+), 33 deletions(-) diff --git a/src/cpu/o3/dyn_inst.hh b/src/cpu/o3/dyn_inst.hh index f519b0c504..310d7c1e9c 100644 --- a/src/cpu/o3/dyn_inst.hh +++ b/src/cpu/o3/dyn_inst.hh @@ -194,6 +194,7 @@ class DynInst : public ExecContext, public RefCounted NotAnInst, TranslationStarted, TranslationCompleted, + CacheRefilledAfterMiss, PossibleLoadViolation, HitExternalSnoop, EffAddrValid, @@ -462,6 +463,14 @@ class DynInst : public ExecContext, public RefCounted } void translationCompleted(bool f) { instFlags[TranslationCompleted] = f; } + /** True if Dcache refilled after Dcache miss. */ + bool + cacheRefilledAfterMiss() const + { + return instFlags[CacheRefilledAfterMiss]; + } + void cacheRefilledAfterMiss(bool f) { instFlags[CacheRefilledAfterMiss] = f; } + /** True if this address was found to match a previous load and they issued * out of order. If that happend, then it's only a problem if an incoming * snoop invalidate modifies the line, in which case we need to squash. 
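A minimal sketch of the intended replay handshake around this flag
(illustration only, not part of the patch; the two helper functions are
hypothetical — the real logic lives in LSQ::recvFunctionalCustomSignal,
the recvTimingResp paths, and InstructionQueue::getCacheMissInstToExecute
later in this patch):

    // Hypothetical helper: the D-cache reports a miss for a normal load.
    void onDcacheMiss(const DynInstPtr &ld)
    {
        ld->cacheRefilledAfterMiss(false); // line not refilled yet
        iew->loadCancel(ld);               // cancel dependents' speculative wakeup
        iew->cacheMissLdReplay(ld);        // park the load in the IQ's replay set
    }

    // Hypothetical helper: the refill response arrives; setting the flag
    // lets getCacheMissInstToExecute() reissue the load via retryMem().
    void onDcacheRefill(const DynInstPtr &ld)
    {
        ld->cacheRefilledAfterMiss(true);
    }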
@@ -1395,6 +1404,10 @@ class DynInst : public ExecContext, public RefCounted return squashVer.getVersion(); } + ssize_t getLqIdx() + { + return lqIdx; + } Addr getPC() { diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index dfaf13e63e..b3fa6f517c 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -682,6 +682,12 @@ IEW::blockMemInst(const DynInstPtr& inst) instQueue.blockMemInst(inst); } +void +IEW::cacheMissLdReplay(const DynInstPtr& inst) +{ + instQueue.cacheMissLdReplay(inst); +} + void IEW::cacheUnblocked() { diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index 89825e217a..32c62c65d2 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -209,6 +209,9 @@ class IEW /** Moves memory instruction onto the list of cache blocked instructions */ void blockMemInst(const DynInstPtr &inst); + /** Moves load instruction onto the Set of cache missed instructions */ + void cacheMissLdReplay(const DynInstPtr &inst); + /** Notifies that the cache has become unblocked */ void cacheUnblocked(); diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc index 725acc5fd9..803ee9f17f 100644 --- a/src/cpu/o3/inst_queue.cc +++ b/src/cpu/o3/inst_queue.cc @@ -94,6 +94,12 @@ InstructionQueue::FUCompletion::description() const return "Functional unit completion"; } +size_t +InstructionQueue::CacheMissLdInstsHash::operator()(const DynInstPtr& ptr) const +{ + return ptr->getLqIdx(); +} + InstructionQueue::InstructionQueue(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams ¶ms) : cpu(cpu_ptr), @@ -352,6 +358,7 @@ InstructionQueue::resetState() nonSpecInsts.clear(); deferredMemInsts.clear(); + cacheMissLdInsts.clear(); blockedMemInsts.clear(); retryMemInsts.clear(); wbOutstanding = 0; @@ -650,6 +657,10 @@ InstructionQueue::scheduleReadyInsts() IssueStruct *i2e_info = issueToExecuteQueue->access(0); DynInstPtr mem_inst; + while ((mem_inst = getCacheMissInstToExecute())) { + mem_inst->issueQue->retryMem(mem_inst); + } + while ((mem_inst = getDeferredMemInstToExecute())) { mem_inst->issueQue->retryMem(mem_inst); } @@ -721,7 +732,7 @@ InstructionQueue::scheduleReadyInsts() // @todo If the way deferred memory instructions are handeled due to // translation changes then the deferredMemInsts condition should be // removed from the code below. 
- if (total_issued || !retryMemInsts.empty() || !deferredMemInsts.empty()) {
+ if (total_issued || !retryMemInsts.empty() || !deferredMemInsts.empty() || !cacheMissLdInsts.empty()) {
 cpu->activityThisCycle();
 } else {
 DPRINTF(IQ, "Not able to schedule any instructions.\n");
 }
@@ -860,6 +871,19 @@ InstructionQueue::deferMemInst(const DynInstPtr &deferred_inst)
 deferredMemInsts.push_back(deferred_inst);
 }

+void
+InstructionQueue::cacheMissLdReplay(const DynInstPtr &deferred_inst)
+{
+ DPRINTF(IQ, "Got a cache-missed load, inserting into the replay queue "
+ "[sn:%llu]\n", deferred_inst->seqNum);
+ // Reset DTB translation state
+ deferred_inst->translationStarted(false);
+ deferred_inst->translationCompleted(false);
+
+ deferred_inst->clearCanIssue();
+ cacheMissLdInsts.insert(deferred_inst);
+}
+
 void
 InstructionQueue::blockMemInst(const DynInstPtr &blocked_inst)
 {
@@ -902,6 +926,29 @@ InstructionQueue::getDeferredMemInstToExecute()
 return nullptr;
 }

+DynInstPtr
+InstructionQueue::getCacheMissInstToExecute()
+{
+ for (auto it = cacheMissLdInsts.begin(); it != cacheMissLdInsts.end();
+ ++it) {
+ if ((*it)->cacheRefilledAfterMiss() || (*it)->isSquashed()) {
+ DPRINTF(IQ, "Cache-missed load inst [sn:%llu] PC %s is ready to "
+ "execute\n", (*it)->seqNum, (*it)->pcState());
+ DynInstPtr mem_inst = std::move(*it);
+ cacheMissLdInsts.erase(it);
+ return mem_inst;
+ }
+ if (!(*it)->cacheRefilledAfterMiss()) {
+ DPRINTF(
+ IQ,
+ "Cache-missed load inst [sn:%llu] PC %s has not been woken up "
+ "by the Dcache\n",
+ (*it)->seqNum, (*it)->pcState());
+ }
+ }
+ return nullptr;
+}
+
 DynInstPtr
 InstructionQueue::getBlockedMemInstToExecute()
 {
diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh
index 0d1b780d61..0d0f333e43 100644
--- a/src/cpu/o3/inst_queue.hh
+++ b/src/cpu/o3/inst_queue.hh
@@ -45,6 +45,7 @@
 #include <list>
 #include <map>
 #include <queue>
+#include <unordered_set>
 #include <vector>

 #include "base/statistics.hh"
@@ -199,6 +200,11 @@ class InstructionQueue
 */
 DynInstPtr getDeferredMemInstToExecute();

+ /** Gets a load instruction that was deferred due to a Dcache miss,
+ * if it is now ready to execute. NULL if none available.
+ */
+ DynInstPtr getCacheMissInstToExecute();
+
 /** Gets a memory instruction that was blocked on the cache. NULL if none
 * available.
 */
@@ -242,6 +248,11 @@ class InstructionQueue
 */
 void deferMemInst(const DynInstPtr &deferred_inst);

+ /**
+ * Defers a load instruction after a Dcache miss.
+ */
+ void cacheMissLdReplay(const DynInstPtr &deferred_inst);
+
 /** Defers a memory instruction when it is cache blocked. */
 void blockMemInst(const DynInstPtr &blocked_inst);

@@ -302,6 +313,16 @@ class InstructionQueue
 */
 std::list<DynInstPtr> deferredMemInsts;

+ /** Set of load instructions waiting for a Dcache refill.
+ * An unordered_set prevents duplicate enqueues, since
+ * SplitDataRequest may call `cacheMissLdReplay` multiple times.
+ */
+ struct CacheMissLdInstsHash
+ {
+ size_t operator()(const DynInstPtr& ptr) const;
+ };
+ std::unordered_set<DynInstPtr, CacheMissLdInstsHash> cacheMissLdInsts;
+
 /** List of instructions that have been cache blocked.
*/ std::list blockedMemInsts; diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index 0043894702..97bc0b73a3 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -555,13 +555,26 @@ LSQ::recvFunctionalCustomSignal(PacketPtr pkt, int sig) LSQRequest *request = dynamic_cast(pkt->getPrimarySenderState()); panic_if(!request, "Got packet back with unknown sender state\n"); - if (sig == DcacheRespType::Miss) { - if (request->instruction()->isLoad()) { - // notify cache miss - iewStage->loadCancel(request->instruction()); + if (sig == DcacheRespType::Miss || sig == DcacheRespType::Block_Not_Ready) { + DPRINTF(LSQ, "recvFunctionalCustomSignal: Resp type: %d, [sn:%ld], lqidx: %ld\n", + sig, request->instruction()->seqNum, request->instruction()->lqIdx); + if (request->mainReq()->isLLSC() || request->mainReq()->isUncacheable()) { + // do not replay Amo/Uncache Load + DPRINTF(LSQ, "Recv Amo/Uncache Load: [sn:%ld], No Need to Replay\n", + request->instruction()->seqNum); + } else { + // clear state in this instruction + request->instruction()->cacheRefilledAfterMiss(false); + request->instruction()->effAddrValid(false); + // clear request in loadQueue + thread[request->_port.lsqID].loadQueue[request->instruction()->lqIdx].setRequest(nullptr); // set cache miss flag in pipeline thread[request->_port.lsqID].setFlagInPipeLine(request->instruction(), LdStFlags::CacheMiss); + // insert to missed load replay queue + iewStage->cacheMissLdReplay(request->instruction()); } + // cancel subsequent dependent insts of this load + iewStage->loadCancel(request->instruction()); } else { panic("unsupported sig %d in recvFunctionalCustomSignal\n", sig); } @@ -1394,21 +1407,32 @@ LSQ::SbufferRequest::recvTimingResp(PacketPtr pkt) bool LSQ::SingleDataRequest::recvTimingResp(PacketPtr pkt) { + LSQRequest *request = dynamic_cast(pkt->senderState); + bool isNormalLd = isLoad() && !request->mainReq()->isLLSC() && !request->mainReq()->isUncacheable(); // Dump inst num, request addr, and packet addr - DPRINTF(LSQ, "Single Req::recvTimingResp: inst: %llu, pkt: %#lx\n", pkt->req->getReqInstSeqNum(), - pkt->getAddr()); + DPRINTF(LSQ, "Single Req::recvTimingResp: inst: %llu, pkt: %#lx, isLoad: %d, " + "isLLSC: %d, isUncache: %d, isCacheSatisfied: %d\n", + pkt->req->getReqInstSeqNum(), pkt->getAddr(), isLoad(), request->mainReq()->isLLSC(), + request->mainReq()->isUncacheable(), pkt->cacheSatisfied); assert(_numOutstandingPackets == 1); - flags.set(Flag::Complete); - assert(pkt == _packets.front()); - forward(); - _port.completeDataAccess(pkt); - _hasStaleTranslation = false; + if (isNormalLd && !pkt->cacheSatisfied) { + // Data in Dcache is ready, wake up missed load in replay queue + LSQRequest::_inst->cacheRefilledAfterMiss(true); + discard(); + } else { + flags.set(Flag::Complete); + assert(pkt == _packets.front()); + forward(); + _port.completeDataAccess(pkt); + _hasStaleTranslation = false; + } return true; } bool LSQ::SplitDataRequest::recvTimingResp(PacketPtr pkt) { + LSQRequest *request = dynamic_cast(pkt->senderState); DPRINTF(LSQ, "Spilt Req::recvTimingResp: inst: %llu, pkt: %#lx\n", pkt->req->getReqInstSeqNum(), pkt->getAddr()); uint32_t pktIdx = 0; @@ -1417,21 +1441,28 @@ LSQ::SplitDataRequest::recvTimingResp(PacketPtr pkt) assert(pktIdx < _packets.size()); numReceivedPackets++; if (numReceivedPackets == _packets.size()) { - flags.set(Flag::Complete); - /* Assemble packets. */ - PacketPtr resp = isLoad() - ? 
Packet::createRead(_mainReq) - : Packet::createWrite(_mainReq); - if (isLoad()) - resp->dataStatic(_inst->memData); - else - resp->dataStatic(_data); - resp->senderState = this; - forward(); - _port.completeDataAccess(resp); - delete resp; + bool isNormalLd = isLoad() && !request->mainReq()->isLLSC() && !request->mainReq()->isUncacheable(); + if (isNormalLd && !pkt->cacheSatisfied) { + // Data in Dcache is ready, wake up missed load in replay queue + LSQRequest::_inst->cacheRefilledAfterMiss(true); + discard(); + } else { + flags.set(Flag::Complete); + /* Assemble packets. */ + PacketPtr resp = isLoad() + ? Packet::createRead(_mainReq) + : Packet::createWrite(_mainReq); + if (isLoad()) + resp->dataStatic(_inst->memData); + else + resp->dataStatic(_data); + resp->senderState = this; + forward(); + _port.completeDataAccess(resp); + delete resp; + _hasStaleTranslation = false; + } } - _hasStaleTranslation = false; return true; } diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc index ab0abeb998..02f46389ed 100644 --- a/src/mem/cache/base.cc +++ b/src/mem/cache/base.cc @@ -642,14 +642,16 @@ BaseCache::recvTimingReq(PacketPtr pkt) } handleTimingReqHit(pkt, blk, request_time, first_acc_after_pf); - if (cacheLevel == 1 && pkt->isResponse() && pkt->isRead() && lat > 1) { - // send cache miss signal - cpuSidePort.sendCustomSignal(pkt, DcacheRespType::Miss); + if (cacheLevel == 1 && pkt->isResponse() && pkt->isRead() && !pkt->isWrite() && lat > 1) { + // cache block not ready, send cancel signal + cpuSidePort.sendCustomSignal(pkt, DcacheRespType::Block_Not_Ready); + pkt->cacheSatisfied = false; } } else { - if (cacheLevel == 1 && pkt->needsResponse() && pkt->isRead()) { + if (cacheLevel == 1 && pkt->needsResponse() && pkt->isRead() && !pkt->isWrite()) { // send cache miss signal cpuSidePort.sendCustomSignal(pkt, DcacheRespType::Miss); + pkt->cacheSatisfied = false; } // ArchDB: for now we only track packet which has PC diff --git a/src/mem/packet.hh b/src/mem/packet.hh index a62d05de04..8964904215 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -1598,6 +1598,8 @@ class Packet : public Printable bool tagReadFail = false; + bool cacheSatisfied = true; + bool fromBOP() const { return pfSource == PrefetchSourceType::HWP_BOP; } PrefetchSourceType getPFSource() const { return static_cast(pfSource); } diff --git a/src/mem/request.hh b/src/mem/request.hh index acbd793c0a..075949a2d9 100644 --- a/src/mem/request.hh +++ b/src/mem/request.hh @@ -91,6 +91,7 @@ enum DcacheRespType { NONE = 0, Hit, + Block_Not_Ready, Miss, NUM_Resp_Type }; diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc index e4ee650f85..50b97c742a 100644 --- a/src/mem/ruby/system/RubyPort.cc +++ b/src/mem/ruby/system/RubyPort.cc @@ -497,6 +497,7 @@ RubyPort::ruby_custom_signal_callback(PacketPtr pkt) DPRINTF(RubyPort, "Sent custom signal back to LSQ with sender state %#lx\n", sender_state); port->sendCustomSignal(pkt, DcacheRespType::Miss); + pkt->cacheSatisfied = false; } void diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc index 8f6213b70c..0b442ad1f5 100644 --- a/src/mem/ruby/system/Sequencer.cc +++ b/src/mem/ruby/system/Sequencer.cc @@ -383,7 +383,7 @@ Sequencer::insertRequest(PacketPtr pkt, RubyRequestType primary_type, if (seq_req_list.size() > 1) { if (cache_block_busy) { - if (pkt->isRead()) { + if (pkt->isRead() && !pkt->isWrite()) { DPRINTF(RubySequencer, "Pkt %#lx %s is delayed because blk is busy doing ruby stuff\n", pkt, pkt->cmdString()); 
                ruby_custom_signal_callback(pkt);
@@ -649,7 +649,7 @@ Sequencer::notifyMissCallback(Addr address, bool is_upgrade, bool is_busy)

     // cancel pending loads' speculation
     for (auto &seq_req: seq_req_list) {
-        if (seq_req.pkt->isRead()) {
+        if (seq_req.pkt->isRead() && !seq_req.pkt->isWrite()) {
             ruby_custom_signal_callback(seq_req.pkt);
             stat.loadcancel++;
         }
@@ -693,7 +693,7 @@ Sequencer::TBEFullCancel(Addr address)

     // cancel pending loads' speculation
     for (auto &seq_req: seq_req_list) {
-        if (seq_req.pkt->isRead()) {
+        if (seq_req.pkt->isRead() && !seq_req.pkt->isWrite()) {
             ruby_custom_signal_callback(seq_req.pkt);
             stat.loadcancel++;
         }

From 2adee2e4985763581d19493074291a8c56340657 Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Fri, 13 Dec 2024 11:19:10 +0800
Subject: [PATCH 05/10] arch: use strictly order-preserving LRSC

Add a fence before and after the LRSC instruction.

Change-Id: I66021d0a5a653d2a7e30cd262166363a84184ed6
---
 src/arch/riscv/isa/formats/amo.isa | 34 ++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/src/arch/riscv/isa/formats/amo.isa b/src/arch/riscv/isa/formats/amo.isa
index f7e9b5bcc6..b104c07657 100644
--- a/src/arch/riscv/isa/formats/amo.isa
+++ b/src/arch/riscv/isa/formats/amo.isa
@@ -151,6 +151,36 @@ def template LRSCMacroConstructor {{
     }
 }};

+// Strictly order-preserving LRSC
+def template LRSCStrictMacroConstructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst):
+        %(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
+    {
+        %(constructor)s;
+
+        StaticInstPtr rel_fence;
+        StaticInstPtr lrsc;
+        StaticInstPtr acq_fence;
+
+        rel_fence = new MemFenceMicro(machInst, No_OpClass);
+        rel_fence->setFlag(IsFirstMicroop);
+        rel_fence->setFlag(IsReadBarrier);
+        rel_fence->setFlag(IsWriteBarrier);
+        rel_fence->setFlag(IsDelayedCommit);
+
+        // set up atomic rmw op
+        lrsc = new %(class_name)sMicro(machInst, this);
+        lrsc->setFlag(IsDelayedCommit);
+
+        acq_fence = new MemFenceMicro(machInst, No_OpClass);
+        acq_fence->setFlag(IsLastMicroop);
+        acq_fence->setFlag(IsReadBarrier);
+        acq_fence->setFlag(IsWriteBarrier);
+
+        microops = {rel_fence, lrsc, acq_fence};
+    }
+}};
+
 def template LRSCMicroConstructor {{
     %(class_name)s::%(class_name)sMicro::%(class_name)sMicro(
         ExtMachInst machInst, %(class_name)s *_p)
@@ -435,7 +465,7 @@ def format LoadReserved(memacc_code, postacc_code={{ }}, ea_code={{EA = Rs1;}},
     macro_iop = InstObjParams(name, Name, 'LoadReserved', macro_ea_code,
         macro_inst_flags)
     header_output = LRSCDeclare.subst(macro_iop)
-    decoder_output = LRSCMacroConstructor.subst(macro_iop)
+    decoder_output = LRSCStrictMacroConstructor.subst(macro_iop)
     decode_block = BasicDecode.subst(macro_iop)
     exec_output = ''

@@ -463,7 +493,7 @@ def format StoreCond(memacc_code, postacc_code={{ }}, ea_code={{EA = Rs1;}},
     macro_iop = InstObjParams(name, Name, 'StoreCond', macro_ea_code,
         macro_inst_flags)
     header_output = LRSCDeclare.subst(macro_iop)
-    decoder_output = LRSCMacroConstructor.subst(macro_iop)
+    decoder_output = LRSCStrictMacroConstructor.subst(macro_iop)
     decode_block = BasicDecode.subst(macro_iop)
     exec_output = ''
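A note for readers tracing [PATCH 05/10]: bracketing the LR/SC micro-op with
full read/write barriers makes the macro-op behave like an LR/SC carrying both
acquire and release semantics, at the cost of ordering all surrounding memory
accesses. Below is a minimal standalone C++ sketch, not gem5 code (Microop and
its flags are simplified stand-ins), of the flag invariants the constructor
above establishes:

#include <cassert>
#include <cstdio>
#include <vector>

// Simplified stand-in for a gem5 micro-op and the flags used above.
struct Microop
{
    const char *name;
    bool isFirst = false, isLast = false, delayedCommit = false;
    bool readBarrier = false, writeBarrier = false;
};

int main()
{
    Microop rel_fence{"rel_fence"}, lrsc{"lrsc"}, acq_fence{"acq_fence"};
    rel_fence.isFirst = true;
    rel_fence.readBarrier = rel_fence.writeBarrier = true;
    rel_fence.delayedCommit = true;   // cannot commit ahead of the LR/SC
    lrsc.delayedCommit = true;        // retires together with acq_fence
    acq_fence.isLast = true;
    acq_fence.readBarrier = acq_fence.writeBarrier = true;

    std::vector<Microop> microops{rel_fence, lrsc, acq_fence};

    // Exactly one first and one last micro-op, and every micro-op before
    // the last delays commit, so the macro-op retires as a unit.
    assert(microops.front().isFirst && microops.back().isLast);
    for (size_t i = 0; i + 1 < microops.size(); i++)
        assert(microops[i].delayedCommit && !microops[i].isLast);

    for (const auto &op : microops)
        std::printf("%s: rbar=%d wbar=%d\n", op.name, op.readBarrier,
                    op.writeBarrier);
    return 0;
}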
From 72a0a53c38669bb0046e7e95a043538eea8afcf9 Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Tue, 17 Dec 2024 13:34:15 +0800
Subject: [PATCH 06/10] mem: let loads have a deterministic latency in the
 ruby cache

Change-Id: Ifc1a586df8beab65772d48a75106155f9e723cba
---
 src/mem/ruby/system/RubyPort.cc | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc
index 50b97c742a..f5790d608e 100644
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -49,6 +49,7 @@
 #include "mem/packet_access.hh"
 #include "mem/ruby/protocol/AccessPermission.hh"
 #include "mem/ruby/slicc_interface/AbstractController.hh"
+#include "mem/ruby/slicc_interface/RubySlicc_Util.hh"
 #include "mem/simple_mem.hh"
 #include "sim/full_system.hh"
 #include "sim/system.hh"
@@ -676,7 +677,12 @@ RubyPort::MemResponsePort::hitCallback(PacketPtr pkt)
         // Send a response in the same cycle. There is no need to delay the
         // response because the response latency is already incurred in the
         // Ruby protocol.
-        schedTimingResp(pkt, curTick());
+        if (pkt->isRead() && !pkt->isWrite() && !pkt->fromCache()) {
+            // send the resp right away so the load sees a deterministic latency
+            respQueue.sendTiming(pkt);
+        } else {
+            schedTimingResp(pkt, curTick());
+        }
     } else {
         delete pkt;
     }
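Why this helps: schedTimingResp() queues the response, so a hit's observed
latency can stretch when the response queue is backed up, which defeats the
fixed load-to-use timing the LSQ pipeline patches in this series assume. A
rough standalone sketch of the two response paths; Pkt, RespPort, and sendNow
here are invented stand-ins, not the RubyPort API:

#include <cstdio>
#include <queue>

struct Pkt { bool read = false, write = false, fromCache = false; };

struct RespPort
{
    std::queue<Pkt> respQueue;  // responses drained on later ticks

    void hitCallback(const Pkt &pkt)
    {
        if (pkt.read && !pkt.write && !pkt.fromCache) {
            sendNow(pkt);           // demand load: fixed-latency path
        } else {
            respQueue.push(pkt);    // may slip behind queued responses
        }
    }
    void sendNow(const Pkt &) { std::printf("resp sent this cycle\n"); }
};

int main()
{
    RespPort port;
    port.hitCallback({true, false, false});  // demand load: immediate
    port.hitCallback({true, false, true});   // cache-initiated read: queued
    std::printf("queued responses: %zu\n", port.respQueue.size());
    return 0;
}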
From 5ec5505b6a8a11f357740dcee2a966143da9d447 Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Tue, 17 Dec 2024 13:43:35 +0800
Subject: [PATCH 07/10] cpu-o3: tune the behavior of the ldst pipeline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adjust the cache-miss load replay logic: replay all loads that cannot
get data at load s2, so the cache no longer needs to send
`sendCustomSignal` on a miss.

Add RAW nuke replay at load s1 & s2.

Move most of the writeback logic to load s2 and do the actual writeback
at s3.

Change-Id: Idfd3480969958826f4820349168f17c9522f791e
---
 src/cpu/o3/dyn_inst.hh   |  10 +-
 src/cpu/o3/inst_queue.cc |   4 +-
 src/cpu/o3/lsq.cc        |  94 ++++++-----
 src/cpu/o3/lsq.hh        |  36 ++++-
 src/cpu/o3/lsq_unit.cc   | 340 ++++++++++++++++++++++++++-------------
 src/cpu/o3/lsq_unit.hh   |  40 +++--
 6 files changed, 355 insertions(+), 169 deletions(-)

diff --git a/src/cpu/o3/dyn_inst.hh b/src/cpu/o3/dyn_inst.hh
index 310d7c1e9c..5bcdd14d34 100644
--- a/src/cpu/o3/dyn_inst.hh
+++ b/src/cpu/o3/dyn_inst.hh
@@ -194,7 +194,7 @@ class DynInst : public ExecContext, public RefCounted
         NotAnInst,
         TranslationStarted,
         TranslationCompleted,
-        CacheRefilledAfterMiss,
+        WaitingCacheRefill,
         PossibleLoadViolation,
         HitExternalSnoop,
         EffAddrValid,
@@ -463,13 +463,13 @@ class DynInst : public ExecContext, public RefCounted
     }
     void translationCompleted(bool f) { instFlags[TranslationCompleted] = f; }

-    /** True if Dcache refilled after Dcache miss. */
+    /** True if inst is waiting for Dcache refill. */
     bool
-    cacheRefilledAfterMiss() const
+    waitingCacheRefill() const
     {
-        return instFlags[CacheRefilledAfterMiss];
+        return instFlags[WaitingCacheRefill];
     }
-    void cacheRefilledAfterMiss(bool f) { instFlags[CacheRefilledAfterMiss] = f; }
+    void waitingCacheRefill(bool f) { instFlags[WaitingCacheRefill] = f; }

     /** True if this address was found to match a previous load and they issued
      * out of order. If that happened, then it's only a problem if an incoming
diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc
index 803ee9f17f..090765a79d 100644
--- a/src/cpu/o3/inst_queue.cc
+++ b/src/cpu/o3/inst_queue.cc
@@ -931,14 +931,14 @@ InstructionQueue::getCacheMissInstToExecute()
 {
     for (auto it = cacheMissLdInsts.begin(); it != cacheMissLdInsts.end();
          ++it) {
-        if ((*it)->cacheRefilledAfterMiss() || (*it)->isSquashed()) {
+        if (!(*it)->waitingCacheRefill() || (*it)->isSquashed()) {
             DPRINTF(IQ, "CacheMissed load inst [sn:%llu] PC %s is ready to "
                     "execute\n", (*it)->seqNum, (*it)->pcState());
             DynInstPtr mem_inst = std::move(*it);
             cacheMissLdInsts.erase(it);
             return mem_inst;
         }
-        if (!(*it)->cacheRefilledAfterMiss()) {
+        if ((*it)->waitingCacheRefill()) {
             DPRINTF(
                 IQ,
                 "CacheMissed load inst [sn:%llu] PC %s has not been woken up "
diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index 97bc0b73a3..ad09bf8288 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -42,6 +42,7 @@
 #include "cpu/o3/lsq.hh"

 #include
+#include
 #include
 #include
 #include
@@ -65,6 +66,7 @@
 #include "debug/TagReadFail.hh"
 #include "debug/Writeback.hh"
 #include "mem/packet_access.hh"
+#include "mem/request.hh"
 #include "params/BaseO3CPU.hh"

 namespace gem5
@@ -558,21 +560,6 @@ LSQ::recvFunctionalCustomSignal(PacketPtr pkt, int sig)
     if (sig == DcacheRespType::Miss || sig == DcacheRespType::Block_Not_Ready) {
         DPRINTF(LSQ, "recvFunctionalCustomSignal: Resp type: %d, [sn:%ld], lqidx: %ld\n",
                 sig, request->instruction()->seqNum, request->instruction()->lqIdx);
-        if (request->mainReq()->isLLSC() || request->mainReq()->isUncacheable()) {
-            // do not replay Amo/Uncache Load
-            DPRINTF(LSQ, "Recv Amo/Uncache Load: [sn:%ld], No Need to Replay\n",
-                    request->instruction()->seqNum);
-        } else {
-            // clear state in this instruction
-            request->instruction()->cacheRefilledAfterMiss(false);
-            request->instruction()->effAddrValid(false);
-            // clear request in loadQueue
-            thread[request->_port.lsqID].loadQueue[request->instruction()->lqIdx].setRequest(nullptr);
-            // set cache miss flag in pipeline
-            thread[request->_port.lsqID].setFlagInPipeLine(request->instruction(), LdStFlags::CacheMiss);
-            // insert to missed load replay queue
-            iewStage->cacheMissLdReplay(request->instruction());
-        }
         // cancel subsequent dependent insts of this load
         iewStage->loadCancel(request->instruction());
     } else {
@@ -1252,7 +1239,7 @@ LSQ::LSQRequest::LSQRequest(
     : _state(State::NotIssued),
       numTranslatedFragments(0),
       numInTranslationFragments(0),
-      _port(*port), _inst(inst), _data(data),
+      _port(*port), _inst(inst), _data(data), _fwd_data_pkt(nullptr),
       _res(res), _addr(addr), _size(size),
       _flags(flags_),
       _numOutstandingPackets(0),
@@ -1341,6 +1328,9 @@ LSQ::LSQRequest::~LSQRequest()
     for (auto r: _packets)
         delete r;
+
+    if (_fwd_data_pkt)
+        delete _fwd_data_pkt;
 };

 ContextID
@@ -1407,23 +1397,28 @@ LSQ::SbufferRequest::recvTimingResp(PacketPtr pkt)
 bool
 LSQ::SingleDataRequest::recvTimingResp(PacketPtr pkt)
 {
-    LSQRequest *request = dynamic_cast<LSQRequest *>(pkt->senderState);
-    bool isNormalLd = isLoad() && !request->mainReq()->isLLSC() && !request->mainReq()->isUncacheable();
+    bool isNormalLd = this->isNormalLd();
     // Dump inst num, request addr, and packet addr
-    DPRINTF(LSQ, "Single Req::recvTimingResp: inst: %llu, pkt: %#lx, isLoad: %d, "
-            "isLLSC: %d, isUncache: %d, isCacheSatisfied: %d\n",
-            pkt->req->getReqInstSeqNum(), pkt->getAddr(), isLoad(), request->mainReq()->isLLSC(),
-            request->mainReq()->isUncacheable(), pkt->cacheSatisfied);
+    DPRINTF(LSQ, "Single Req::recvTimingResp: inst: %llu, pkt: %#lx, isLoad: %d, "
            "isLLSC: %d, isUncache: 
%d, isCacheSatisfied: %d, data: %d\n", + pkt->req->getReqInstSeqNum(), pkt->getAddr(), isLoad(), mainReq()->isLLSC(), + mainReq()->isUncacheable(), pkt->cacheSatisfied, *(pkt->getPtr())); assert(_numOutstandingPackets == 1); - if (isNormalLd && !pkt->cacheSatisfied) { + if (isNormalLd && LSQRequest::_inst->waitingCacheRefill()) { // Data in Dcache is ready, wake up missed load in replay queue - LSQRequest::_inst->cacheRefilledAfterMiss(true); + LSQRequest::_inst->waitingCacheRefill(false); discard(); } else { flags.set(Flag::Complete); assert(pkt == _packets.front()); - forward(); - _port.completeDataAccess(pkt); + if (isNormalLd) { + // cache satisfied load, assemblePackets at load s2 + _port.setFlagInPipeLine(_inst, LdStFlags::CacheHit); + } else { + // cache satisfied other kinds of request + assert(pkt == mainPacket()); + assemblePackets(); + } _hasStaleTranslation = false; } return true; @@ -1432,7 +1427,6 @@ LSQ::SingleDataRequest::recvTimingResp(PacketPtr pkt) bool LSQ::SplitDataRequest::recvTimingResp(PacketPtr pkt) { - LSQRequest *request = dynamic_cast(pkt->senderState); DPRINTF(LSQ, "Spilt Req::recvTimingResp: inst: %llu, pkt: %#lx\n", pkt->req->getReqInstSeqNum(), pkt->getAddr()); uint32_t pktIdx = 0; @@ -1441,31 +1435,49 @@ LSQ::SplitDataRequest::recvTimingResp(PacketPtr pkt) assert(pktIdx < _packets.size()); numReceivedPackets++; if (numReceivedPackets == _packets.size()) { - bool isNormalLd = isLoad() && !request->mainReq()->isLLSC() && !request->mainReq()->isUncacheable(); - if (isNormalLd && !pkt->cacheSatisfied) { + bool isNormalLd = this->isNormalLd(); + if (isNormalLd && LSQRequest::_inst->waitingCacheRefill()) { // Data in Dcache is ready, wake up missed load in replay queue - LSQRequest::_inst->cacheRefilledAfterMiss(true); + LSQRequest::_inst->waitingCacheRefill(false); discard(); } else { flags.set(Flag::Complete); - /* Assemble packets. */ - PacketPtr resp = isLoad() - ? Packet::createRead(_mainReq) - : Packet::createWrite(_mainReq); - if (isLoad()) - resp->dataStatic(_inst->memData); - else - resp->dataStatic(_data); - resp->senderState = this; - forward(); - _port.completeDataAccess(resp); - delete resp; + if (isNormalLd) { + // cache satisfied load, assemblePackets at load s2 + _port.setFlagInPipeLine(_inst, LdStFlags::CacheHit); + } else { + // Assemble packets, cache satisfied other kinds of request + assemblePackets(); + } _hasStaleTranslation = false; } } return true; } +void +LSQ::SingleDataRequest::assemblePackets() +{ + forward(); + _port.completeDataAccess(mainPacket()); +} + +void +LSQ::SplitDataRequest::assemblePackets() +{ + PacketPtr resp = isLoad() + ? 
Packet::createRead(_mainReq)
+        : Packet::createWrite(_mainReq);
+    if (isLoad())
+        resp->dataStatic(_inst->memData);
+    else
+        resp->dataStatic(_data);
+    resp->senderState = this;
+    forward();
+    _port.completeDataAccess(resp);
+    delete resp;
+}
+
 void
 LSQ::SbufferRequest::buildPackets()
 {
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index f2213a3f90..5181fc9b2f 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -47,6 +47,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
@@ -60,6 +62,7 @@
 #include "cpu/o3/dyn_inst_xsmeta.hh"
 #include "cpu/utils.hh"
 #include "enums/SMTQueuePolicy.hh"
+#include "mem/packet.hh"
 #include "mem/port.hh"
 #include "sim/sim_object.hh"
@@ -81,13 +84,32 @@ enum LdStFlags
 {
     Valid = 0,
     Replayed,
-    CacheMiss,
+    CacheHit,
+    Nuke,
+    FullForward,
+    LocalAccess,
+    HasFault,
+    readNotPredicate,
+    readMemAccNotPredicate,
     Squashed,
     Num_Flags
 };
 constexpr uint64_t LdStFlagNum = LdStFlags::Num_Flags;

+const std::string LdStFlagName[LdStFlagNum] = {
+    "Valid",
+    "Replayed",
+    "CacheHit",
+    "Nuke",
+    "FullForward",
+    "LocalAccess",
+    "HasFault",
+    "readNotPredicate",
+    "readMemAccNotPredicate",
+    "Squashed"
+};
+
 class LSQ
 {
   public:
@@ -268,6 +290,7 @@
         PacketDataPtr _data;
         std::vector<PacketPtr> _packets;
         std::vector<RequestPtr> _reqs;
+        PacketPtr _fwd_data_pkt;
         std::vector<Fault> _fault;
         uint64_t* _res;
         const Addr _addr;
@@ -475,6 +498,8 @@
          */
        virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask) = 0;

+        virtual void assemblePackets() { panic("assemblePackets not implemented!\n"); }
+
         /** Update the status to reflect that a packet was sent. */
         void
         packetSent()
@@ -606,6 +631,13 @@
             flags.set(Flag::Complete);
         }

+        /* Load instruction which is not an LR or MMIO type of load. */
+        bool
+        isNormalLd()
+        {
+            return isLoad() && !mainReq()->isLLSC() && !mainReq()->isUncacheable();
+        }
+
         virtual std::string name() const { return "LSQRequest"; }
     };

@@ -625,6 +657,7 @@
         virtual void finish(const Fault &fault, const RequestPtr &req,
                 gem5::ThreadContext* tc, BaseMMU::Mode mode);
         virtual bool recvTimingResp(PacketPtr pkt);
+        virtual void assemblePackets();
         virtual bool sendPacketToCache();
         virtual void buildPackets();
         virtual Cycles handleLocalAccess(
@@ -690,6 +723,7 @@
         virtual void finish(const Fault &fault, const RequestPtr &req,
                 gem5::ThreadContext* tc, BaseMMU::Mode mode);
         virtual bool recvTimingResp(PacketPtr pkt);
+        virtual void assemblePackets();
         virtual void initiateTranslation();
         virtual bool sendPacketToCache();
         virtual void buildPackets();
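The pipeLineNukeCheck() added in the lsq_unit.cc diff below reduces the
load/store RAW (nuke) test to a closed-interval overlap of the two byte
ranges after coarsening by depCheckShift. A minimal standalone sketch of just
that address math; the function and parameter names here are illustrative,
not the real interface:

#include <cassert>
#include <cstdint>

using Addr = uint64_t;

// Two accesses conflict iff their address ranges, shifted right by
// depCheckShift, intersect (closed-interval overlap test).
bool nukeOverlap(Addr ld_addr, unsigned ld_size,
                 Addr st_addr, unsigned st_size, unsigned depCheckShift)
{
    Addr ld_lo = ld_addr >> depCheckShift;
    Addr ld_hi = (ld_addr + ld_size - 1) >> depCheckShift;
    Addr st_lo = st_addr >> depCheckShift;
    Addr st_hi = (st_addr + st_size - 1) >> depCheckShift;
    return ld_lo <= st_hi && st_lo <= ld_hi;
}

int main()
{
    assert(nukeOverlap(0x1000, 8, 0x1004, 4, 0));   // true byte overlap
    assert(!nukeOverlap(0x1000, 4, 0x1004, 4, 0));  // adjacent, disjoint
    // A nonzero shift makes the test conservative: these two accesses do
    // not share bytes but land in the same 8-byte granule.
    assert(nukeOverlap(0x1000, 4, 0x1004, 4, 3));
    return 0;
}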
raw nuke"), ADD_STAT(ignoredResponses, statistics::units::Count::get(), "Number of memory responses ignored because the instruction is " "squashed"), @@ -724,6 +732,27 @@ LSQUnit::insertStore(const DynInstPtr& store_inst) storeQueue.back().set(store_inst); } +bool +LSQUnit::pipeLineNukeCheck(const DynInstPtr &load_inst, const DynInstPtr &store_inst) +{ + Addr load_eff_addr1 = load_inst->effAddr >> depCheckShift; + Addr load_eff_addr2 = (load_inst->effAddr + load_inst->effSize - 1) >> depCheckShift; + + Addr store_eff_addr1 = store_inst->effAddr >> depCheckShift; + Addr store_eff_addr2 = (store_inst->effAddr + store_inst->effSize - 1) >> depCheckShift; + + LSQRequest* store_req = store_inst->savedRequest; + bool load_need_check = load_inst->effAddrValid() && (load_inst->lqIt >= store_inst->lqIt); + bool store_need_check = store_req && store_req->isTranslationComplete() && + store_req->isMemAccessRequired() && (store_inst->getFault() == NoFault); + if (load_need_check && store_need_check) { + if (load_eff_addr1 <= store_eff_addr2 && store_eff_addr1 <= load_eff_addr2) { + return true; + } + } + return false; +} + DynInstPtr LSQUnit::getMemDepViolator() { @@ -834,6 +863,23 @@ LSQUnit::checkSnoop(PacketPtr pkt) return; } +bool +LSQUnit::skipNukeReplay(const DynInstPtr& load_inst) +{ + // if the load_inst has been marked as `Nuke` + // load will be replayed, so no Raw violation happens. + for (int i = 1; i <= 2; i++) { + // check loadPipe s1 & s2 + auto& stage = loadPipeSx[i]; + for (int j = 0; j < stage->size; j++) { + if (load_inst == stage->insts[j] && stage->flags[j][LdStFlags::Nuke]) { + return true; + } + } + } + return false; +} + Fault LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt, const DynInstPtr& inst) @@ -896,6 +942,14 @@ LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt, if (memDepViolator && ld_inst->seqNum > memDepViolator->seqNum) break; + // if this load has been marked as Nuke, the load will then be replayed + // So next time this load replaying to pipeline will forward from store correctly + // And no RAW violation happens + if (skipNukeReplay(ld_inst)) { + ++loadIt; + continue; + } + DPRINTF(LSQUnit, "ld_eff_addr1: %#x, ld_eff_addr2: %#x, " "inst_eff_addr1: %#x, inst_eff_addr2: %#x\n", @@ -947,7 +1001,7 @@ LSQUnit::setFlagInPipeLine(DynInstPtr inst, LdStFlags f) } if (!found) { - warn("[sn:%ld] Can not found corresponding inst in PipeLine, isLoad: %d\n", inst->seqNum, inst->isLoad()); + panic("[sn:%ld] Can not found corresponding inst in PipeLine, isLoad: %d\n", inst->seqNum, inst->isLoad()); } } @@ -982,13 +1036,10 @@ LSQUnit::issueToStorePipe(const DynInstPtr &inst) } Fault -LSQUnit::loadPipeS0(const DynInstPtr &inst, const std::bitset &flag) +LSQUnit::loadPipeS0(const DynInstPtr &inst, std::bitset &flag) { - DPRINTF(LSQUnit, "LoadPipeS0: Executing load PC %s, [sn:%lli] " - "flags: valid[%d], replayed[%d], cachemiss[%d], squashed[%d]\n", - inst->pcState(), inst->seqNum, - flag[LdStFlags::Valid], flag[LdStFlags::Replayed], - flag[LdStFlags::CacheMiss], flag[LdStFlags::Squashed]); + DPRINTF(LSQUnit, "LoadPipeS0: Executing load PC %s, [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); assert(!inst->isSquashed()); Fault load_fault = NoFault; @@ -999,13 +1050,10 @@ LSQUnit::loadPipeS0(const DynInstPtr &inst, const std::bitset &flag } Fault -LSQUnit::loadPipeS1(const DynInstPtr &inst, const std::bitset &flag) +LSQUnit::loadPipeS1(const DynInstPtr &inst, std::bitset &flag) { - DPRINTF(LSQUnit, "LoadPipeS1: Executing load PC 
%s, [sn:%lli] " - "flags: valid[%d], replayed[%d], cachemiss[%d], squashed[%d]\n", - inst->pcState(), inst->seqNum, - flag[LdStFlags::Valid], flag[LdStFlags::Replayed], - flag[LdStFlags::CacheMiss], flag[LdStFlags::Squashed]); + DPRINTF(LSQUnit, "LoadPipeS1: Executing load PC %s, [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); assert(!inst->isSquashed()); Fault load_fault = inst->getFault(); @@ -1044,11 +1092,7 @@ LSQUnit::loadPipeS1(const DynInstPtr &inst, const std::bitset &flag } if (load_fault == NoFault && !inst->readMemAccPredicate()) { - assert(inst->readPredicate()); - inst->setExecuted(); - inst->completeAcc(nullptr); - iewStage->instToCommit(inst); - iewStage->activityThisCycle(); + flag[LdStFlags::readMemAccNotPredicate] = true; return NoFault; } @@ -1067,26 +1111,20 @@ LSQUnit::loadPipeS1(const DynInstPtr &inst, const std::bitset &flag return NoFault; } - // If the instruction faulted or predicated false, then we need to send it - // along to commit without the instruction completing. if (load_fault != NoFault || !inst->readPredicate()) { - // Send this instruction to commit, also make sure iew stage - // realizes there is activity. Mark it as executed unless it - // is a strictly ordered load that needs to hit the head of - // commit. - if (!inst->readPredicate()) - inst->forwardOldRegs(); - DPRINTF(LSQUnit, "LoadPipeS1: Load [sn:%lli] not executed from %s\n", - inst->seqNum, - (load_fault != NoFault ? "fault" : "predication")); - if (!(inst->hasRequest() && inst->strictlyOrdered()) || - inst->isAtCommit()) { - inst->setExecuted(); - } - iewStage->instToCommit(inst); - iewStage->activityThisCycle(); + flag[LdStFlags::HasFault] = load_fault != NoFault; + flag[LdStFlags::readNotPredicate] = !inst->readPredicate(); } else { if (inst->effAddrValid()) { + // raw violation check (nuke replay) + for (int i = 0; i < storePipeSx[1]->size; i++) { + auto& store_inst = storePipeSx[1]->insts[i]; + if (pipeLineNukeCheck(inst, store_inst)) { + flag[LdStFlags::Nuke] = true; + break; + } + } + // rar violation check auto it = inst->lqIt; ++it; @@ -1099,27 +1137,117 @@ LSQUnit::loadPipeS1(const DynInstPtr &inst, const std::bitset &flag } Fault -LSQUnit::loadPipeS2(const DynInstPtr &inst, const std::bitset &flag) +LSQUnit::loadPipeS2(const DynInstPtr &inst, std::bitset &flag) { Fault fault = inst->getFault(); - DPRINTF(LSQUnit, "LoadPipeS2: Executing load PC %s, [sn:%lli] " - "flags: valid[%d], replayed[%d], cachemiss[%d], squashed[%d]\n", - inst->pcState(), inst->seqNum, - flag[LdStFlags::Valid], flag[LdStFlags::Replayed], - flag[LdStFlags::CacheMiss], flag[LdStFlags::Squashed]); + DPRINTF(LSQUnit, "LoadPipeS2: Executing load PC %s, [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); assert(!inst->isSquashed()); + LSQRequest* request = inst->savedRequest; + + if (flag[LdStFlags::readMemAccNotPredicate]) { + assert(inst->readPredicate() && fault == NoFault); + inst->setExecuted(); + inst->completeAcc(nullptr); + iewStage->instToCommit(inst); + iewStage->activityThisCycle(); + return NoFault; + } + + // If the instruction faulted or predicated false, then we need to send it + // along to commit without the instruction completing. + if (flag[LdStFlags::HasFault] || flag[LdStFlags::readNotPredicate]) { + // Send this instruction to commit, also make sure iew stage + // realizes there is activity. Mark it as executed unless it + // is a strictly ordered load that needs to hit the head of + // commit. 
+        if (flag[LdStFlags::readNotPredicate])
+            inst->forwardOldRegs();
+        DPRINTF(LSQUnit, "LoadPipeS2: Load [sn:%lli] not executed from %s\n",
+                inst->seqNum, (fault != NoFault ? "fault" : "predication"));
+        if (!(inst->hasRequest() && inst->strictlyOrdered()) || inst->isAtCommit()) {
+            inst->setExecuted();
+        }
+        iewStage->instToCommit(inst);
+        iewStage->activityThisCycle();
+        return fault;
+    }
+
+    if (flag[LdStFlags::Replayed] || flag[LdStFlags::LocalAccess]) {
+        return fault;
+    }
+
+    // raw violation check (nuke replay)
+    for (int i = 0; i < storePipeSx[1]->size; i++) {
+        auto& store_inst = storePipeSx[1]->insts[i];
+        if (pipeLineNukeCheck(inst, store_inst)) {
+            flag[LdStFlags::Nuke] = true;
+            break;
+        }
+    }
+
+    // check if cache hit & get cache response?
+    // NOTE: cache miss replay has higher priority than nuke replay!
+    if (request && request->isNormalLd() && !flag[LdStFlags::FullForward] && !flag[LdStFlags::CacheHit]) {
+        // cannot get cache data at load s2, replay this load
+        // clear state in this instruction
+        inst->effAddrValid(false);
+        // set it as waiting for dcache refill
+        inst->waitingCacheRefill(true);
+        // clear request in loadQueue
+        loadQueue[inst->lqIdx].setRequest(nullptr);
+        // set cache miss & replayed flag in pipeline
+        flag[Replayed] = true;
+        // insert to missed load replay queue
+        iewStage->cacheMissLdReplay(inst);
+        // cancel subsequent dependent insts of this load
+        iewStage->loadCancel(inst);
+        return fault;
+    }
+
+    if (flag[LdStFlags::Nuke]) {
+        // replay load if nuke happens
+        request->discard();
+        inst->savedRequest = nullptr;
+        // clear state in this instruction
+        inst->translationStarted(false);
+        inst->translationCompleted(false);
+        inst->clearCanIssue();
+        inst->effAddrValid(false);
+        // clear request in loadQueue
+        loadQueue[inst->lqIdx].setRequest(nullptr);
+        // set replayed flag in pipeline
+        flag[LdStFlags::Replayed] = true;
+        // nuke fast replay
+        inst->issueQue->retryMem(inst);
+        stats.pipeRawNukeReplay++;
+        // cancel subsequent dependent insts of this load
+        iewStage->loadCancel(inst);
+    } else {
+        // no nuke happens, prepare the inst data
+        request = inst->savedRequest;
+        if (flag[LdStFlags::FullForward]) {
+            // this load gets full data from sq
+            assert(request && request->_fwd_data_pkt);
+            writeback(inst, request->_fwd_data_pkt);
+            request->writebackDone();
+        } else {
+            if (request && request->isNormalLd()) {
+                // assemble cache & sbuffer forwarded data and completeDataAccess
+                request->assemblePackets();
+            }
+        }
+    }
+
     return fault;
 }

 Fault
-LSQUnit::loadPipeS3(const DynInstPtr &inst, const std::bitset<LdStFlagNum> &flag)
+LSQUnit::loadPipeS3(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag)
 {
     Fault fault = inst->getFault();
-    DPRINTF(LSQUnit, "LoadPipeS3: Executing load PC %s, [sn:%lli] "
-            "flags: valid[%d], replayed[%d], cachemiss[%d], squashed[%d]\n",
-            inst->pcState(), inst->seqNum,
-            flag[LdStFlags::Valid], flag[LdStFlags::Replayed],
-            flag[LdStFlags::CacheMiss], flag[LdStFlags::Squashed]);
+    DPRINTF(LSQUnit, "LoadPipeS3: Executing load PC %s, [sn:%lli] flags: %s\n",
+            inst->pcState(), inst->seqNum, getLdStFlagStr(flag));
     assert(!inst->isSquashed());
     return fault;
 }
@@ -1152,15 +1280,14 @@ LSQUnit::executeLoadPipeSx()
                     iewStage->deferMemInst(inst);
                     flag[LdStFlags::Replayed] = true;
                 }
-
-                if (inst->isDataPrefetch() || inst->isInstPrefetch()) {
-                    inst->fault = NoFault;
-                }
-
                 iewStage->SquashCheckAfterExe(inst);
                 break;
             case 2:
                 fault = loadPipeS2(inst, flag);
+
+                if (inst->isDataPrefetch() || inst->isInstPrefetch()) {
+                    inst->fault = NoFault;
+                }
                 break;
             case 3:
                 fault 
= loadPipeS3(inst, flag); @@ -1181,16 +1308,14 @@ LSQUnit::executeLoadPipeSx() } Fault -LSQUnit::storePipeS0(const DynInstPtr &inst, const std::bitset &flag) +LSQUnit::storePipeS0(const DynInstPtr &inst, std::bitset &flag) { // Make sure that a store exists. assert(storeQueue.size() != 0); assert(!inst->isSquashed()); - DPRINTF(LSQUnit, "StorePipeS0: Executing store PC %s [sn:%lli] " - "flags: valid[%d], replayed[%d], squashed[%d]\n", - inst->pcState(), inst->seqNum, - flag[LdStFlags::Valid], flag[LdStFlags::Replayed], flag[LdStFlags::Squashed]); + DPRINTF(LSQUnit, "StorePipeS0: Executing store PC %s [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); // Now initiateAcc only does TLB access Fault store_fault = inst->initiateAcc(); @@ -1199,7 +1324,7 @@ LSQUnit::storePipeS0(const DynInstPtr &inst, const std::bitset &fla } Fault -LSQUnit::storePipeS1(const DynInstPtr &inst, const std::bitset &flag) +LSQUnit::storePipeS1(const DynInstPtr &inst, std::bitset &flag) { // Make sure that a store exists. assert(storeQueue.size() != 0); @@ -1207,10 +1332,8 @@ LSQUnit::storePipeS1(const DynInstPtr &inst, const std::bitset &fla ssize_t store_idx = inst->sqIdx; LSQRequest* request = inst->savedRequest; - DPRINTF(LSQUnit, "StorePipeS1: Executing store PC %s [sn:%lli] " - "flags: valid[%d], replayed[%d], squashed[%d]\n", - inst->pcState(), inst->seqNum, - flag[LdStFlags::Valid], flag[LdStFlags::Replayed], flag[LdStFlags::Squashed]); + DPRINTF(LSQUnit, "StorePipeS1: Executing store PC %s [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); // Check the recently completed loads to see if any match this store's // address. If so, then we have a memory ordering violation. @@ -1246,13 +1369,14 @@ LSQUnit::storePipeS1(const DynInstPtr &inst, const std::bitset &fla DPRINTF(LSQUnit, "StorePipeS1: Store [sn:%lli] not executed from predication\n", inst->seqNum); inst->forwardOldRegs(); + flag[LdStFlags::readNotPredicate] = true; return store_fault; } if (storeQueue[store_idx].size() == 0) { DPRINTF(LSQUnit, "StorePipeS1: Fault on Store PC %s, [sn:%lli], Size = 0\n", inst->pcState(), inst->seqNum); - + flag[LdStFlags::HasFault] = true; return store_fault; } @@ -1274,41 +1398,48 @@ LSQUnit::storePipeS1(const DynInstPtr &inst, const std::bitset &fla } Fault -LSQUnit::storePipeS2(const DynInstPtr &inst, const std::bitset &flag) +LSQUnit::storePipeS2(const DynInstPtr &inst, std::bitset &flag) { Fault fault = inst->getFault(); assert(!inst->isSquashed()); - DPRINTF(LSQUnit, "StorePipeS2: Executing store PC %s [sn:%lli] " - "flags: valid[%d], replayed[%d], squashed[%d]\n", - inst->pcState(), inst->seqNum, - flag[LdStFlags::Valid], flag[LdStFlags::Replayed], flag[LdStFlags::Squashed]); + DPRINTF(LSQUnit, "StorePipeS2: Executing store PC %s [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); return fault; } Fault -LSQUnit::storePipeS3(const DynInstPtr &inst, const std::bitset &flag) +LSQUnit::storePipeS3(const DynInstPtr &inst, std::bitset &flag) { Fault fault = inst->getFault(); assert(!inst->isSquashed()); - DPRINTF(LSQUnit, "StorePipeS3: Executing store PC %s [sn:%lli] " - "flags: valid[%d], replayed[%d], squashed[%d]\n", - inst->pcState(), inst->seqNum, - flag[LdStFlags::Valid], flag[LdStFlags::Replayed], flag[LdStFlags::Squashed]); + DPRINTF(LSQUnit, "StorePipeS3: Executing store PC %s [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); return fault; } Fault -LSQUnit::storePipeS4(const DynInstPtr &inst, const 
std::bitset &flag) +LSQUnit::storePipeS4(const DynInstPtr &inst, std::bitset &flag) { Fault fault = inst->getFault(); assert(!inst->isSquashed()); - DPRINTF(LSQUnit, "StorePipeS4: Executing store PC %s [sn:%lli] " - "flags: valid[%d], replayed[%d], squashed[%d]\n", - inst->pcState(), inst->seqNum, - flag[LdStFlags::Valid], flag[LdStFlags::Replayed], flag[LdStFlags::Squashed]); + DPRINTF(LSQUnit, "StorePipeS4: Executing store PC %s [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); + + // If the store had a fault then it may not have a mem req + if (fault != NoFault || !inst->readPredicate() || !inst->isStoreConditional()) { + // If the instruction faulted, then we need to send it + // along to commit without the instruction completing. + // Send this instruction to commit, also make sure iew + // stage realizes there is activity. + if (!flag[LdStFlags::Replayed]) { + inst->setExecuted(); + iewStage->instToCommit(inst); + iewStage->activityThisCycle(); + } + } return fault; } @@ -1347,18 +1478,6 @@ LSQUnit::executeStorePipeSx() break; case 3: fault = storePipeS3(inst, flag); - // If the store had a fault then it may not have a mem req - if (fault != NoFault || !inst->readPredicate() || !inst->isStoreConditional()) { - // If the instruction faulted, then we need to send it - // along to commit without the instruction completing. - // Send this instruction to commit, also make sure iew - // stage realizes there is activity. - if (!flag[LdStFlags::Replayed]) { - inst->setExecuted(); - iewStage->instToCommit(inst); - iewStage->activityThisCycle(); - } - } break; case 4: fault = storePipeS4(inst, flag); @@ -2370,15 +2489,11 @@ LSQUnit::dumpLoadPipe() for (int i = 0; i < loadPipeSx.size(); i++) { DPRINTF(LSQUnit, "Load S%d:, size: %d\n", i, loadPipeSx[i]->size); for (int j = 0; j < loadPipeSx[i]->size; j++) { - DPRINTF(LSQUnit, " PC: %s, [tid:%i] [sn:%lli] " - "flags: valid[%d], replayed[%d], cachemiss[%d], squashed[%d]\n", + DPRINTF(LSQUnit, " PC: %s, [tid:%i] [sn:%lli] flags: %s\n", loadPipeSx[i]->insts[j]->pcState(), loadPipeSx[i]->insts[j]->threadNumber, loadPipeSx[i]->insts[j]->seqNum, - (loadPipeSx[i]->flags[j])[LdStFlags::Valid], - (loadPipeSx[i]->flags[j])[LdStFlags::Replayed], - (loadPipeSx[i]->flags[j])[LdStFlags::CacheMiss], - (loadPipeSx[i]->flags[j])[LdStFlags::Squashed] + getLdStFlagStr(loadPipeSx[i]->flags[j]) ); } } @@ -2391,14 +2506,11 @@ LSQUnit::dumpStorePipe() for (int i = 0; i < storePipeSx.size(); i++) { DPRINTF(LSQUnit, "Store S%d:, size: %d\n", i, storePipeSx[i]->size); for (int j = 0; j < storePipeSx[i]->size; j++) { - DPRINTF(LSQUnit, " PC: %s, [tid:%i] [sn:%lli] " - "flags: valid[%d], replayed[%d], squashed[%d]\n", + DPRINTF(LSQUnit, " PC: %s, [tid:%i] [sn:%lli] flags: %s\n", storePipeSx[i]->insts[j]->pcState(), storePipeSx[i]->insts[j]->threadNumber, storePipeSx[i]->insts[j]->seqNum, - (storePipeSx[i]->flags[j])[LdStFlags::Valid], - (storePipeSx[i]->flags[j])[LdStFlags::Replayed], - (storePipeSx[i]->flags[j])[LdStFlags::Squashed] + getLdStFlagStr(storePipeSx[i]->flags[j]) ); } } @@ -2490,6 +2602,7 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) // rescheduled eventually iewStage->rescheduleMemInst(load_inst); load_inst->effAddrValid(false); + setFlagInPipeLine(load_inst, LdStFlags::Replayed); ++stats.rescheduledLoads; DPRINTF(LSQUnit, "Strictly ordered load [sn:%lli] PC %s\n", load_inst->seqNum, load_inst->pcState()); @@ -2538,6 +2651,7 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) WritebackEvent *wb = new 
WritebackEvent(load_inst, main_pkt, this); cpu->schedule(wb, cpu->clockEdge(delay)); + setFlagInPipeLine(load_inst, LdStFlags::LocalAccess); return NoFault; } @@ -2689,13 +2803,12 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) request->discard(); } - WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, - this); - - // We'll say this has a 1 cycle load-store forwarding latency - // for now. - // @todo: Need to make this a parameter. - cpu->schedule(wb, curTick()); + // set FullForward flag, save the forward result(data_pkt) in _fwd_data_pkt + // then this load will be written back at s2 + // @todo: make sure _fwd_data_pkt no memory leak! + assert(request->_fwd_data_pkt == nullptr); + request->_fwd_data_pkt = data_pkt; + setFlagInPipeLine(load_inst, LdStFlags::FullForward); // Don't need to do anything special for split loads. ++stats.forwLoads; @@ -2725,6 +2838,7 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) // rescheduled eventually iewStage->rescheduleMemInst(load_inst); load_inst->effAddrValid(false); + setFlagInPipeLine(load_inst, LdStFlags::Replayed); ++stats.rescheduledLoads; // Do not generate a writeback event as this instruction is not @@ -2772,9 +2886,13 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) request->discard(); } - WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, - this); - cpu->schedule(wb, curTick()); + // set FullForward flag, save the forward result(data_pkt) in _fwd_data_pkt + // then this load will be written back at s2 + // @todo: make sure _fwd_data_pkt no memory leak! + assert(request->_fwd_data_pkt == nullptr); + request->_fwd_data_pkt = data_pkt; + setFlagInPipeLine(load_inst, LdStFlags::FullForward); + return NoFault; } // if not fully forward, need to clear buffer diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index e9a0c94612..8b8b6f3fea 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -391,6 +391,12 @@ class LSQUnit /** Returns the memory ordering violator. */ DynInstPtr getMemDepViolator(); + /** Check if store should skip this raw violation because of nuke replay. */ + bool skipNukeReplay(const DynInstPtr& load_inst); + + /** Check if there exists raw nuke between load and store. */ + bool pipeLineNukeCheck(const DynInstPtr &load_inst, const DynInstPtr &store_inst); + /** Returns the number of free LQ entries. */ unsigned numFreeLoadEntries(); @@ -446,6 +452,19 @@ class LSQUnit /** Returns the number of stores to writeback. */ int numStoresToSbuffer() { return storesToWB; } + /** get description string from load/store pipeLine flag. */ + std::string getLdStFlagStr(const std::bitset& flag) { + std::string res{}; + for (int i = 0; i < LdStFlagNum; i++) { + if (flag.test(i)) { + res += LdStFlagName[i] + ": [1] "; + } else { + res += LdStFlagName[i] + ": [0] "; + } + } + return res; + } + /** Returns if the LSQ unit will writeback on this cycle. */ bool willWB() @@ -508,19 +527,19 @@ class LSQUnit /** Process instructions in each load pipeline stages. 
*/
    void executeLoadPipeSx();

-    Fault loadPipeS0(const DynInstPtr &inst, const std::bitset<LdStFlagNum> &flag);
-    Fault loadPipeS1(const DynInstPtr &inst, const std::bitset<LdStFlagNum> &flag);
-    Fault loadPipeS2(const DynInstPtr &inst, const std::bitset<LdStFlagNum> &flag);
-    Fault loadPipeS3(const DynInstPtr &inst, const std::bitset<LdStFlagNum> &flag);
+    Fault loadPipeS0(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault loadPipeS1(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault loadPipeS2(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault loadPipeS3(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);

     /** Process instructions in each store pipeline stages. */
     void executeStorePipeSx();

-    Fault storePipeS0(const DynInstPtr &inst, const std::bitset<LdStFlagNum> &flag);
-    Fault storePipeS1(const DynInstPtr &inst, const std::bitset<LdStFlagNum> &flag);
-    Fault storePipeS2(const DynInstPtr &inst, const std::bitset<LdStFlagNum> &flag);
-    Fault storePipeS3(const DynInstPtr &inst, const std::bitset<LdStFlagNum> &flag);
-    Fault storePipeS4(const DynInstPtr &inst, const std::bitset<LdStFlagNum> &flag);
+    Fault storePipeS0(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault storePipeS1(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault storePipeS2(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault storePipeS3(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault storePipeS4(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);

     /** Wrap function. */
     void executePipeSx();
@@ -722,6 +741,9 @@ class LSQUnit
             /** Total number of squashed loads. */
             statistics::Scalar squashedLoads;

+            /** Total number of pipeline-detected RAW nukes. */
+            statistics::Scalar pipeRawNukeReplay;
+
             /** Total number of responses from the memory system that are
              * ignored due to the instruction already being squashed. */
             statistics::Scalar ignoredResponses;

From df4e7a7f80ee3aa026e0553de5e64f1efa049a5a Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Tue, 17 Dec 2024 19:04:18 +0800
Subject: [PATCH 08/10] mem: send TimingResp in advance

Use `hint_wakeup_ahead_cycles` in Cache.py to control it; for now
`hint_wakeup_ahead_cycles` is set to 3.

Change-Id: Ie93de7cbe66ce09988101a44db819d1cad1d27d2
---
 configs/example/xiangshan.py | 1 +
 src/mem/cache/Cache.py | 3 +++
 src/mem/cache/base.cc | 3 ++-
 src/mem/cache/base.hh | 7 +++++--
 src/mem/cache/cache.cc | 13 +++++++++++--
 5 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/configs/example/xiangshan.py b/configs/example/xiangshan.py
index b4400b88b2..a0e00583af 100644
--- a/configs/example/xiangshan.py
+++ b/configs/example/xiangshan.py
@@ -362,6 +362,7 @@ def setKmhV3IdealParams(args, system):
     if args.caches:
         cpu.icache.size = '128kB'
         cpu.dcache.size = '128kB'
+        cpu.dcache.hint_wakeup_ahead_cycles = 0
         cpu.icache.enable_wayprediction = False
         cpu.dcache.enable_wayprediction = False
         cpu.dcache.tag_load_read_ports = 100 # 3->100
diff --git a/src/mem/cache/Cache.py b/src/mem/cache/Cache.py
index 79adc68fae..45e185e8a1 100644
--- a/src/mem/cache/Cache.py
+++ b/src/mem/cache/Cache.py
@@ -154,6 +154,9 @@ class BaseCache(ClockedObject):
     tag_load_read_ports = Param.Unsigned(3,
         "Total tag read ports for load/prefetcher(in L1 Cache)")

+    hint_wakeup_ahead_cycles = Param.Unsigned(3, "How many cycles " \
+        "in advance to send the LSU a response waking up a missed load")
+
     force_hit = Param.Bool(False, "Force some PC to hit in L1")
     way_entries = Param.MemorySize(
         "64",
diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc
index 02f46389ed..c3d5adce35 100644
--- a/src/mem/cache/base.cc
+++ b/src/mem/cache/base.cc
@@ -145,9 +145,10 @@ BaseCache::BaseCache(const BaseCacheParams &p, unsigned blk_size)
missCount(p.max_miss_count), addrRanges(p.addr_ranges.begin(), p.addr_ranges.end()), archDBer(p.arch_db), + cacheLevel(p.cache_level), + hintWakeUpAheadCycles(p.hint_wakeup_ahead_cycles), system(p.system), stats(*this), - cacheLevel(p.cache_level), forceHit(p.force_hit) { // the MSHR queue has no reserve entries as we check the MSHR diff --git a/src/mem/cache/base.hh b/src/mem/cache/base.hh index f40411785d..cc15f85e46 100644 --- a/src/mem/cache/base.hh +++ b/src/mem/cache/base.hh @@ -1052,6 +1052,11 @@ class BaseCache : public ClockedObject, CacheAccessor /** ArchDB */ ArchDBer *archDBer; + /** Cache Level, 1 means L1 */ + const unsigned cacheLevel{0}; + + Cycles hintWakeUpAheadCycles; + int squashedWays; public: @@ -1503,8 +1508,6 @@ class BaseCache : public ClockedObject, CacheAccessor private: - const unsigned cacheLevel{0}; - //const unsigned maxCacheLevel; const bool dumpMissPC{false}; diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc index d34b367fce..c8e3bad0ab 100644 --- a/src/mem/cache/cache.cc +++ b/src/mem/cache/cache.cc @@ -56,6 +56,7 @@ #include "debug/CacheTags.hh" #include "debug/CacheVerbose.hh" #include "enums/Clusivity.hh" +#include "mem/cache/base.hh" #include "mem/cache/cache_blk.hh" #include "mem/cache/mshr.hh" #include "mem/cache/tags/base.hh" @@ -802,8 +803,16 @@ Cache::serviceMSHRTargets(MSHR *mshr, const PacketPtr pkt, CacheBlk *blk) // responseLatency is the latency of the return path // from lower level caches/memory to an upper level cache or // the core. - completion_time += clockEdge(responseLatency) + - (transfer_offset ? pkt->payloadDelay : 0); + if ((cacheLevel == 1 && !isReadOnly) && + tgt_pkt->isRead() && !tgt_pkt->isWrite() && !tgt_pkt->isLLSC()) { + // Send TimingResp to LSU a few cycles in advance so that it can be replayed from ReplayQ earlier. + assert(hintWakeUpAheadCycles <= responseLatency); + completion_time += clockEdge(responseLatency - hintWakeUpAheadCycles) + + (transfer_offset ? pkt->payloadDelay : 0); + } else { + completion_time += clockEdge(responseLatency) + + (transfer_offset ? 
pkt->payloadDelay : 0);
+        }

         assert(!tgt_pkt->req->isUncacheable());

From 93517e81ed41998187f91242d10f8aec4258c842 Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Wed, 18 Dec 2024 16:33:26 +0800
Subject: [PATCH 09/10] cpu-o3: add params to control nuke and miss replay

Set `EnableLdMissReplay` to True to enable replaying missed loads from
the replayQ.

Set `EnablePipeNukeCheck` to True to detect RAW nuke replays in the
loadpipe.

NOTE: if `EnableLdMissReplay` is False, `EnablePipeNukeCheck` cannot be
set to True.

Change-Id: Ic4235bffba01d5dc4c39cec8ae92f2d27b28d98a
---
 configs/example/xiangshan.py | 4 +++-
 src/cpu/o3/BaseO3CPU.py | 2 ++
 src/cpu/o3/lsq.cc | 15 +++++++++++----
 src/cpu/o3/lsq.hh | 6 ++++++
 src/cpu/o3/lsq_unit.cc | 22 +++++++++++++---------
 src/cpu/o3/lsq_unit.hh | 5 ++++-
 6 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/configs/example/xiangshan.py b/configs/example/xiangshan.py
index a0e00583af..6d19885413 100644
--- a/configs/example/xiangshan.py
+++ b/configs/example/xiangshan.py
@@ -337,8 +337,10 @@ def setKmhV3IdealParams(args, system):
     cpu.mmu.itb.size = 96

     cpu.BankConflictCheck = False # real bank conflict 0.2 score
+    cpu.EnableLdMissReplay = False
+    cpu.EnablePipeNukeCheck = False

-    cpu.scheduler = IdealScheduler()
+    cpu.scheduler = IdealScheduler() # use centralized load/store issue queue, for hmmer

     # ideal decoupled frontend
diff --git a/src/cpu/o3/BaseO3CPU.py b/src/cpu/o3/BaseO3CPU.py
index 718939d85f..9f14fc94de 100644
--- a/src/cpu/o3/BaseO3CPU.py
+++ b/src/cpu/o3/BaseO3CPU.py
@@ -188,6 +188,8 @@ def support_take_over(cls):
     LFSTEntrySize = Param.Unsigned(4,"The number of store table insts each LFST entry can contain")
     SSITSize = Param.Unsigned(8192, "Store set ID table size")
     BankConflictCheck = Param.Bool(True, "open Bank conflict check")
+    EnableLdMissReplay = Param.Bool(True, "Replay cache-missed load instructions from the ReplayQ if True")
+    EnablePipeNukeCheck = Param.Bool(True, "Replay loads when a RAW violation is detected in the loadPipe if True")

     numRobs = Param.Unsigned(1, "Number of Reorder Buffers");

diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index ad09bf8288..f35a63ee93 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -86,6 +86,8 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams &params)
       cacheLoadPorts(params.cacheLoadPorts), usedLoadPorts(0),lastConflictCheckTick(0), recentlyloadAddr(8),
       enableBankConflictCheck(params.BankConflictCheck),
+      _enableLdMissReplay(params.EnableLdMissReplay),
+      _enablePipeNukeCheck(params.EnablePipeNukeCheck),
       waitingForStaleTranslation(false),
       staleTranslationWaitTxnId(0),
       lsqPolicy(params.smtLSQPolicy),
@@ -99,6 +101,9 @@
       numThreads(params.numThreads)
 {
     assert(numThreads > 0 && numThreads <= MaxThreads);
+    if (!_enableLdMissReplay && _enablePipeNukeCheck) {
+        panic("LSQ cannot support pipeline nuke replay when EnableLdMissReplay is False");
+    }

     //**********************************************
     //************ Handle SMT Parameters ***********
@@ -1398,20 +1403,21 @@
 bool
 LSQ::SingleDataRequest::recvTimingResp(PacketPtr pkt)
 {
     bool isNormalLd = this->isNormalLd();
+    bool enableLdMissReplay = this->_port.getLsq()->enableLdMissReplay();
     // Dump inst num, request addr, and packet addr
     DPRINTF(LSQ, "Single Req::recvTimingResp: inst: %llu, pkt: %#lx, isLoad: %d, "
             "isLLSC: %d, isUncache: %d, isCacheSatisfied: %d, data: %d\n",
             pkt->req->getReqInstSeqNum(), pkt->getAddr(), isLoad(), mainReq()->isLLSC(),
             mainReq()->isUncacheable(), 
pkt->cacheSatisfied, *(pkt->getPtr())); assert(_numOutstandingPackets == 1); - if (isNormalLd && LSQRequest::_inst->waitingCacheRefill()) { + if (enableLdMissReplay && isNormalLd && LSQRequest::_inst->waitingCacheRefill()) { // Data in Dcache is ready, wake up missed load in replay queue LSQRequest::_inst->waitingCacheRefill(false); discard(); } else { flags.set(Flag::Complete); assert(pkt == _packets.front()); - if (isNormalLd) { + if (enableLdMissReplay && isNormalLd) { // cache satisfied load, assemblePackets at load s2 _port.setFlagInPipeLine(_inst, LdStFlags::CacheHit); } else { @@ -1436,13 +1442,14 @@ LSQ::SplitDataRequest::recvTimingResp(PacketPtr pkt) numReceivedPackets++; if (numReceivedPackets == _packets.size()) { bool isNormalLd = this->isNormalLd(); - if (isNormalLd && LSQRequest::_inst->waitingCacheRefill()) { + bool enableLdMissReplay = this->_port.getLsq()->enableLdMissReplay(); + if (enableLdMissReplay && isNormalLd && LSQRequest::_inst->waitingCacheRefill()) { // Data in Dcache is ready, wake up missed load in replay queue LSQRequest::_inst->waitingCacheRefill(false); discard(); } else { flags.set(Flag::Complete); - if (isNormalLd) { + if (enableLdMissReplay && isNormalLd) { // cache satisfied load, assemblePackets at load s2 _port.setFlagInPipeLine(_inst, LdStFlags::CacheHit); } else { diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 5181fc9b2f..1a44bea2a7 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -1013,6 +1013,9 @@ class LSQ RequestPort &getDataPort() { return dcachePort; } + bool enableLdMissReplay() const { return _enableLdMissReplay; } + bool enablePipeNukeCheck() const { return _enablePipeNukeCheck; } + protected: /** D-cache is blocked */ bool _cacheBlocked; @@ -1033,6 +1036,9 @@ class LSQ bool enableBankConflictCheck; + bool _enableLdMissReplay; + bool _enablePipeNukeCheck; + /** If the LSQ is currently waiting for stale translations */ bool waitingForStaleTranslation; /** The ID if the transaction that made translations stale */ diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index 238097afa9..89907d0c80 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -745,7 +745,7 @@ LSQUnit::pipeLineNukeCheck(const DynInstPtr &load_inst, const DynInstPtr &store_ bool load_need_check = load_inst->effAddrValid() && (load_inst->lqIt >= store_inst->lqIt); bool store_need_check = store_req && store_req->isTranslationComplete() && store_req->isMemAccessRequired() && (store_inst->getFault() == NoFault); - if (load_need_check && store_need_check) { + if (lsq->enablePipeNukeCheck() && load_need_check && store_need_check) { if (load_eff_addr1 <= store_eff_addr2 && store_eff_addr1 <= load_eff_addr2) { return true; } @@ -868,12 +868,14 @@ LSQUnit::skipNukeReplay(const DynInstPtr& load_inst) { // if the load_inst has been marked as `Nuke` // load will be replayed, so no Raw violation happens. - for (int i = 1; i <= 2; i++) { - // check loadPipe s1 & s2 - auto& stage = loadPipeSx[i]; - for (int j = 0; j < stage->size; j++) { - if (load_inst == stage->insts[j] && stage->flags[j][LdStFlags::Nuke]) { - return true; + if (lsq->enablePipeNukeCheck()) { + for (int i = 1; i <= 2; i++) { + // check loadPipe s1 & s2 + auto& stage = loadPipeSx[i]; + for (int j = 0; j < stage->size; j++) { + if (load_inst == stage->insts[j] && stage->flags[j][LdStFlags::Nuke]) { + return true; + } } } } @@ -1188,7 +1190,8 @@ LSQUnit::loadPipeS2(const DynInstPtr &inst, std::bitset &flag) // check if cache hit & get cache response? 
// NOTE: cache miss replay has higher priority than nuke replay!
-    if (request && request->isNormalLd() && !flag[LdStFlags::FullForward] && !flag[LdStFlags::CacheHit]) {
+    if (lsq->enableLdMissReplay() &&
+        request && request->isNormalLd() && !flag[LdStFlags::FullForward] && !flag[LdStFlags::CacheHit]) {
         // cannot get cache data at load s2, replay this load
         // clear state in this instruction
         inst->effAddrValid(false);
@@ -1206,6 +1209,7 @@ LSQUnit::loadPipeS2(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag)
     }

     if (flag[LdStFlags::Nuke]) {
+        assert(lsq->enablePipeNukeCheck());
         // replay load if nuke happens
         request->discard();
         inst->savedRequest = nullptr;
@@ -1232,7 +1236,7 @@ LSQUnit::loadPipeS2(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag)
             writeback(inst, request->_fwd_data_pkt);
             request->writebackDone();
         } else {
-            if (request && request->isNormalLd()) {
+            if (lsq->enableLdMissReplay() && request && request->isNormalLd()) {
                 // assemble cache & sbuffer forwarded data and completeDataAccess
                 request->assemblePackets();
             }
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 8b8b6f3fea..56d0290d5a 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -453,7 +453,8 @@ class LSQUnit
     int numStoresToSbuffer() { return storesToWB; }

     /** get description string from load/store pipeLine flag. */
-    std::string getLdStFlagStr(const std::bitset<LdStFlagNum>& flag) {
+    std::string getLdStFlagStr(const std::bitset<LdStFlagNum>& flag)
+    {
         std::string res{};
         for (int i = 0; i < LdStFlagNum; i++) {
             if (flag.test(i)) {
@@ -465,6 +466,8 @@ class LSQUnit
         return res;
     }

+    LSQ* getLsq() { return lsq; }
+
     /** Returns if the LSQ unit will writeback on this cycle. */
     bool
     willWB()

From 2f898df893adde163aed04533f1da2e17d7ed8fe Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Fri, 20 Dec 2024 13:57:09 +0800
Subject: [PATCH 10/10] cpu-o3: make store wb stage configurable

Stores write back at S4 by default; when using --ideal-kmhv3, stores
write back at S2.

Change-Id: I6a318ff6c182daca0ab041840d76575a16e45d82
---
 configs/example/xiangshan.py | 1 +
 src/cpu/o3/BaseO3CPU.py | 2 ++
 src/cpu/o3/lsq.cc | 2 ++
 src/cpu/o3/lsq.hh | 3 +++
 src/cpu/o3/lsq_unit.cc | 26 ++++++++++++++------------
 5 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/configs/example/xiangshan.py b/configs/example/xiangshan.py
index 6d19885413..01269e7ec2 100644
--- a/configs/example/xiangshan.py
+++ b/configs/example/xiangshan.py
@@ -339,6 +339,7 @@ def setKmhV3IdealParams(args, system):
     cpu.BankConflictCheck = False # real bank conflict 0.2 score
     cpu.EnableLdMissReplay = False
     cpu.EnablePipeNukeCheck = False
+    cpu.StoreWbStage = 2 # store writeback at s2

     cpu.scheduler = IdealScheduler() # use centralized load/store issue queue, for hmmer

diff --git a/src/cpu/o3/BaseO3CPU.py b/src/cpu/o3/BaseO3CPU.py
index 9f14fc94de..cd6e69aef6 100644
--- a/src/cpu/o3/BaseO3CPU.py
+++ b/src/cpu/o3/BaseO3CPU.py
@@ -175,6 +175,8 @@ def support_take_over(cls):
     SbufferEvictThreshold = Param.Unsigned(7, "store buffer eviction threshold")
     storeBufferInactiveThreshold = Param.Unsigned(800, "store buffer writeback timeout threshold")

+    StoreWbStage = Param.Unsigned(4, "Which pipeline stage store instructions write back at; 4 means S4")
+
     LSQDepCheckShift = Param.Unsigned(0,
         "Number of places to shift addr before check")
     LSQCheckLoads = Param.Bool(True,
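In the lsq.cc and lsq_unit.cc hunks below, the writeback hook moves out of
storePipeS4() into the stage loop, keyed on the new parameter. A standalone
C++ sketch of just that gating, heavily simplified (the real loop dispatches
to storePipeS0()..storePipeS4() and also checks faults, predication, and
squashes):

#include <cstdio>

constexpr int kNumStoreStages = 5;  // S0..S4

// Mirrors the shape of the executeStorePipeSx() change: the writeback
// side effect fires at one configurable point instead of being
// hard-coded into a single stage function.
void executeStorePipeSx(int storeWbStage)
{
    for (int i = 0; i < kNumStoreStages; i++) {
        // per-stage work elided (TLB access at S0, checks at S1, ...)
        if (i == storeWbStage - 1) {
            std::printf("writeback fires with StoreWbStage=%d at loop "
                        "index %d\n", storeWbStage, i);
        }
    }
}

int main()
{
    executeStorePipeSx(4);  // default configuration
    executeStorePipeSx(2);  // the --ideal-kmhv3 setting
    return 0;
}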
diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index f35a63ee93..23fdd4f5e1 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -88,6 +88,7 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams &params)
       enableBankConflictCheck(params.BankConflictCheck),
       _enableLdMissReplay(params.EnableLdMissReplay),
       _enablePipeNukeCheck(params.EnablePipeNukeCheck),
+      _storeWbStage(params.StoreWbStage),
       waitingForStaleTranslation(false),
       staleTranslationWaitTxnId(0),
       lsqPolicy(params.smtLSQPolicy),
@@ -104,6 +105,7 @@
     if (!_enableLdMissReplay && _enablePipeNukeCheck) {
         panic("LSQ cannot support pipeline nuke replay when EnableLdMissReplay is False");
     }
+    assert(_storeWbStage >= 2 && _storeWbStage <= 4);

diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index 1a44bea2a7..dd321344ea 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -1015,6 +1015,7 @@ class LSQ
     bool enableLdMissReplay() const { return _enableLdMissReplay; }
     bool enablePipeNukeCheck() const { return _enablePipeNukeCheck; }
+    int storeWbStage() const { return _storeWbStage; }

   protected:
     /** D-cache is blocked */
@@ -1039,6 +1040,8 @@
     bool _enableLdMissReplay;
     bool _enablePipeNukeCheck;

+    int _storeWbStage;
+
     /** If the LSQ is currently waiting for stale translations */
diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc
index 89907d0c80..6cce3eb492 100644
--- a/src/cpu/o3/lsq_unit.cc
+++ b/src/cpu/o3/lsq_unit.cc
@@ -1432,18 +1432,6 @@ LSQUnit::storePipeS4(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag)
     DPRINTF(LSQUnit, "StorePipeS4: Executing store PC %s [sn:%lli] flags: %s\n",
             inst->pcState(), inst->seqNum, getLdStFlagStr(flag));

-    // If the store had a fault then it may not have a mem req
-    if (fault != NoFault || !inst->readPredicate() || !inst->isStoreConditional()) {
-        // If the instruction faulted, then we need to send it
-        // along to commit without the instruction completing.
-        // Send this instruction to commit, also make sure iew
-        // stage realizes there is activity.
-        if (!flag[LdStFlags::Replayed]) {
-            inst->setExecuted();
-            iewStage->instToCommit(inst);
-            iewStage->activityThisCycle();
-        }
-    }
     return fault;
 }

@@ -1489,6 +1477,20 @@ LSQUnit::executeStorePipeSx()
                     break;
                 default:
                     panic("unsupported storepipe length");
             }
+            if (i == (lsq->storeWbStage() - 1)) {
+                // If the store had a fault then it may not have a mem req
+                if (fault != NoFault || !inst->readPredicate() || !inst->isStoreConditional()) {
+                    // If the instruction faulted, then we need to send it
+                    // along to commit without the instruction completing.
+                    // Send this instruction to commit, also make sure iew
+                    // stage realizes there is activity.
+                    if (!flag[LdStFlags::Replayed]) {
+                        inst->setExecuted();
+                        iewStage->instToCommit(inst);
+                        iewStage->activityThisCycle();
+                    }
+                }
+            }
         } else {
             DPRINTF(LSQUnit, "Execute: Instruction was squashed. PC: %s, [tid:%i]"
                     " [sn:%llu]\n", inst->pcState(), inst->threadNumber,
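Stepping back, the cache-miss replay handshake that patches 04, 07, and 09
build up works as follows: a normal load that cannot get its data at load s2
parks in the instruction queue's replay set with waitingCacheRefill set; the
later timing response for the refill clears the flag; and the queue re-issues
only loads whose flag is clear (or that were squashed). A condensed standalone
C++ sketch with assumed names, carrying none of the real LSQ/IQ machinery:

#include <cstdio>
#include <memory>
#include <unordered_set>

struct DynInst
{
    unsigned long seqNum = 0;
    bool waitingCacheRefill = false;
    bool squashed = false;
};
using DynInstPtr = std::shared_ptr<DynInst>;

std::unordered_set<DynInstPtr> cacheMissLdInsts;

void cacheMissLdReplay(const DynInstPtr &ld)   // load s2 miss path
{
    ld->waitingCacheRefill = true;
    cacheMissLdInsts.insert(ld);               // set dedups re-insertion
}

void recvTimingResp(const DynInstPtr &ld)      // refill data arrived
{
    ld->waitingCacheRefill = false;
}

DynInstPtr getCacheMissInstToExecute()         // IQ drain, once per cycle
{
    for (auto it = cacheMissLdInsts.begin(); it != cacheMissLdInsts.end();
         ++it) {
        if (!(*it)->waitingCacheRefill || (*it)->squashed) {
            DynInstPtr ready = *it;
            cacheMissLdInsts.erase(it);
            return ready;
        }
    }
    return nullptr;
}

int main()
{
    auto ld = std::make_shared<DynInst>();
    ld->seqNum = 42;
    cacheMissLdReplay(ld);
    std::printf("ready before refill? %d\n", getCacheMissInstToExecute() != nullptr);
    recvTimingResp(ld);
    std::printf("ready after refill?  %d\n", getCacheMissInstToExecute() != nullptr);
    return 0;
}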