diff --git a/configs/example/xiangshan.py b/configs/example/xiangshan.py index b4400b88b2..01269e7ec2 100644 --- a/configs/example/xiangshan.py +++ b/configs/example/xiangshan.py @@ -337,8 +337,11 @@ def setKmhV3IdealParams(args, system): cpu.mmu.itb.size = 96 cpu.BankConflictCheck = False # real bank conflict 0.2 score + cpu.EnableLdMissReplay = False + cpu.EnablePipeNukeCheck = False + cpu.StoreWbStage = 2 # store writeback at s2 - cpu.scheduler = IdealScheduler() + cpu.scheduler = IdealScheduler() # use centralized load/store issue queue, for hmmer # ideal decoupled frontend @@ -362,6 +365,7 @@ def setKmhV3IdealParams(args, system): if args.caches: cpu.icache.size = '128kB' cpu.dcache.size = '128kB' + cpu.dcache.hint_wakeup_ahead_cycles = 0; cpu.icache.enable_wayprediction = False cpu.dcache.enable_wayprediction = False cpu.dcache.tag_load_read_ports = 100 # 3->100 diff --git a/src/arch/riscv/isa/decoder.isa b/src/arch/riscv/isa/decoder.isa index 858ecece1e..c5f98cd5b0 100644 --- a/src/arch/riscv/isa/decoder.isa +++ b/src/arch/riscv/isa/decoder.isa @@ -613,7 +613,7 @@ decode QUADRANT default Unknown::unknown() { 0x03: decode FUNCT3 { format FenceOp { 0x0: fence({{ - }}, uint64_t, IsReadBarrier, IsWriteBarrier, No_OpClass); + }}, uint64_t, IsReadBarrier, IsWriteBarrier, MemReadOp); 0x1: fence_i({{ }}, uint64_t, IsNonSpeculative, IsSerializeAfter, No_OpClass); } diff --git a/src/arch/riscv/isa/formats/amo.isa b/src/arch/riscv/isa/formats/amo.isa index f7e9b5bcc6..b104c07657 100644 --- a/src/arch/riscv/isa/formats/amo.isa +++ b/src/arch/riscv/isa/formats/amo.isa @@ -151,6 +151,36 @@ def template LRSCMacroConstructor {{ } }}; +// Strictly order-preserving LRSC +def template LRSCStrictMacroConstructor {{ + %(class_name)s::%(class_name)s(ExtMachInst machInst): + %(base_class)s("%(mnemonic)s", machInst, %(op_class)s) + { + %(constructor)s; + + StaticInstPtr rel_fence; + StaticInstPtr lrsc; + StaticInstPtr acq_fence; + + rel_fence = new MemFenceMicro(machInst, No_OpClass); + rel_fence->setFlag(IsFirstMicroop); + rel_fence->setFlag(IsReadBarrier); + rel_fence->setFlag(IsWriteBarrier); + rel_fence->setFlag(IsDelayedCommit); + + // set up atomic rmw op + lrsc = new %(class_name)sMicro(machInst, this); + lrsc->setFlag(IsDelayedCommit); + + acq_fence = new MemFenceMicro(machInst, No_OpClass); + acq_fence->setFlag(IsLastMicroop); + acq_fence->setFlag(IsReadBarrier); + acq_fence->setFlag(IsWriteBarrier); + + microops = {rel_fence, lrsc, acq_fence}; + } +}}; + def template LRSCMicroConstructor {{ %(class_name)s::%(class_name)sMicro::%(class_name)sMicro( ExtMachInst machInst, %(class_name)s *_p) @@ -435,7 +465,7 @@ def format LoadReserved(memacc_code, postacc_code={{ }}, ea_code={{EA = Rs1;}}, macro_iop = InstObjParams(name, Name, 'LoadReserved', macro_ea_code, macro_inst_flags) header_output = LRSCDeclare.subst(macro_iop) - decoder_output = LRSCMacroConstructor.subst(macro_iop) + decoder_output = LRSCStrictMacroConstructor.subst(macro_iop) decode_block = BasicDecode.subst(macro_iop) exec_output = '' @@ -463,7 +493,7 @@ def format StoreCond(memacc_code, postacc_code={{ }}, ea_code={{EA = Rs1;}}, macro_iop = InstObjParams(name, Name, 'StoreCond', macro_ea_code, macro_inst_flags) header_output = LRSCDeclare.subst(macro_iop) - decoder_output = LRSCMacroConstructor.subst(macro_iop) + decoder_output = LRSCStrictMacroConstructor.subst(macro_iop) decode_block = BasicDecode.subst(macro_iop) exec_output = '' diff --git a/src/cpu/o3/BaseO3CPU.py b/src/cpu/o3/BaseO3CPU.py index 718939d85f..cd6e69aef6 100644 --- 
a/src/cpu/o3/BaseO3CPU.py +++ b/src/cpu/o3/BaseO3CPU.py @@ -175,6 +175,8 @@ def support_take_over(cls): SbufferEvictThreshold = Param.Unsigned(7, "store buffer eviction threshold") storeBufferInactiveThreshold = Param.Unsigned(800, "store buffer writeback timeout threshold") + StoreWbStage = Param.Unsigned(4, "Pipeline stage at which a store writes back; 4 means S4") + LSQDepCheckShift = Param.Unsigned(0, "Number of places to shift addr before check") LSQCheckLoads = Param.Bool(True, @@ -188,6 +190,8 @@ def support_take_over(cls): LFSTEntrySize = Param.Unsigned(4,"The number of store table inst in every entry of LFST can contain") SSITSize = Param.Unsigned(8192, "Store set ID table size") BankConflictCheck = Param.Bool(True, "open Bank conflict check") + EnableLdMissReplay = Param.Bool(True, "Replay cache-missed load instructions from the ReplayQ if True") + EnablePipeNukeCheck = Param.Bool(True, "Replay a load when a RAW violation is detected in the loadPipe if True") numRobs = Param.Unsigned(1, "Number of Reorder Buffers"); diff --git a/src/cpu/o3/dyn_inst.hh b/src/cpu/o3/dyn_inst.hh index f519b0c504..5bcdd14d34 100644 --- a/src/cpu/o3/dyn_inst.hh +++ b/src/cpu/o3/dyn_inst.hh @@ -194,6 +194,7 @@ class DynInst : public ExecContext, public RefCounted NotAnInst, TranslationStarted, TranslationCompleted, + WaitingCacheRefill, PossibleLoadViolation, HitExternalSnoop, EffAddrValid, @@ -462,6 +463,14 @@ class DynInst : public ExecContext, public RefCounted } void translationCompleted(bool f) { instFlags[TranslationCompleted] = f; } + /** True if inst is waiting for Dcache refill. */ + bool + waitingCacheRefill() const + { + return instFlags[WaitingCacheRefill]; + } + void waitingCacheRefill(bool f) { instFlags[WaitingCacheRefill] = f; } + /** True if this address was found to match a previous load and they issued * out of order. If that happend, then it's only a problem if an incoming * snoop invalidate modifies the line, in which case we need to squash. @@ -1395,6 +1404,10 @@ class DynInst : public ExecContext, public RefCounted return squashVer.getVersion(); } + ssize_t getLqIdx() + { + return lqIdx; + } Addr getPC() { diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index 9e32532ec8..b3fa6f517c 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -682,6 +682,12 @@ IEW::blockMemInst(const DynInstPtr& inst) instQueue.blockMemInst(inst); } +void +IEW::cacheMissLdReplay(const DynInstPtr& inst) +{ + instQueue.cacheMissLdReplay(inst); +} + void IEW::cacheUnblocked() { @@ -1326,6 +1332,84 @@ IEW::printAvailableInsts() std::cout << "\n"; } +void +IEW::SquashCheckAfterExe(DynInstPtr inst) +{ + ThreadID tid = inst->threadNumber; + + if (!fetchRedirect[tid] || + !execWB->squash[tid] || + execWB->squashedSeqNum[tid] > inst->seqNum) { + + // Prevent testing for misprediction on load instructions, + // that have not been executed. + bool loadNotExecuted = !inst->isExecuted() && inst->isLoad(); + + if (inst->mispredicted() && !loadNotExecuted) { + fetchRedirect[tid] = true; + + DPRINTF(IEW, "[tid:%i] [sn:%llu] Execute: " + "Branch mispredict detected.\n", + tid, inst->seqNum); + DPRINTF(IEW, "[tid:%i] [sn:%llu] " + "Predicted target was PC: %s\n", + tid, inst->seqNum, inst->readPredTarg()); + DPRINTF(IEW, "[tid:%i] [sn:%llu] Execute: " + "Redirecting fetch to PC: %s\n", + tid, inst->seqNum, inst->pcState()); + // If incorrect, then signal the ROB that it must be squashed.
+ squashDueToBranch(inst, tid); + + ppMispredict->notify(inst); + + if (inst->readPredTaken()) { + iewStats.predictedTakenIncorrect++; + } else { + iewStats.predictedNotTakenIncorrect++; + } + } else if (ldstQueue.violation(tid)) { + assert(inst->isMemRef()); + // If there was an ordering violation, then get the + // DynInst that caused the violation. Note that this + // clears the violation signal. + DynInstPtr violator; + violator = ldstQueue.getMemDepViolator(tid); + + DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: %s " + "[sn:%lli], inst PC: %s [sn:%lli]. Addr is: %#x.\n", + violator->pcState(), violator->seqNum, + inst->pcState(), inst->seqNum, inst->physEffAddr); + + fetchRedirect[tid] = true; + + // Tell the instruction queue that a violation has occured. + instQueue.violation(inst, violator); + + // Squash. + squashDueToMemOrder(violator, tid); + + ++iewStats.memOrderViolationEvents; + } + } else { + // Reset any state associated with redirects that will not + // be used. + if (ldstQueue.violation(tid)) { + assert(inst->isMemRef()); + + DynInstPtr violator = ldstQueue.getMemDepViolator(tid); + + DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: " + "%s, inst PC: %s. Addr is: %#x.\n", + violator->pcState(), inst->pcState(), + inst->physEffAddr); + DPRINTF(IEW, "Violation will not be handled because " + "already squashing\n"); + + ++iewStats.memOrderViolationEvents; + } + } +} + void IEW::executeInsts() { @@ -1393,7 +1477,7 @@ IEW::executeInsts() // Tell the LDSTQ to execute this instruction (if it is a load). if (inst->isAtomic()) { // AMOs are treated like store requests - fault = ldstQueue.executeStore(inst); + fault = ldstQueue.executeAmo(inst); if (inst->isTranslationDelayed() && fault == NoFault) { @@ -1401,53 +1485,15 @@ IEW::executeInsts() // instruction must be deferred. DPRINTF(IEW, "Execute: Delayed translation, deferring " "store.\n"); - instQueue.deferMemInst(inst); + deferMemInst(inst); continue; } } else if (inst->isLoad()) { - // Loads will mark themselves as executed, and their writeback - // event adds the instruction to the queue to commit - fault = ldstQueue.executeLoad(inst); - - if (inst->isTranslationDelayed() && - fault == NoFault) { - // A hw page table walk is currently going on; the - // instruction must be deferred. - DPRINTF(IEW, "Execute: Delayed translation, deferring " - "load.\n"); - instQueue.deferMemInst(inst); - continue; - } - - if (inst->isDataPrefetch() || inst->isInstPrefetch()) { - inst->fault = NoFault; - } + // add this load inst to loadpipe S0. + ldstQueue.issueToLoadPipe(inst); } else if (inst->isStore()) { - fault = ldstQueue.executeStore(inst); - - if (inst->isTranslationDelayed() && - fault == NoFault) { - // A hw page table walk is currently going on; the - // instruction must be deferred. - DPRINTF(IEW, "Execute: Delayed translation, deferring " - "store.\n"); - instQueue.deferMemInst(inst); - continue; - } - - // If the store had a fault then it may not have a mem req - if (fault != NoFault || !inst->readPredicate() || - !inst->isStoreConditional()) { - // If the instruction faulted, then we need to send it - // along to commit without the instruction completing. - // Send this instruction to commit, also make sure iew - // stage realizes there is activity. - inst->setExecuted(); - instToCommit(inst); - activityThisCycle(); - } - - instQueue.notifyExecuted(inst); + // add this store inst to storepipe S0. 
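+ // (address translation, ordering checks and writeback for stores now happen stage by stage in ldstQueue.executePipeSx(), called below)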
+ ldstQueue.issueToStorePipe(inst); // Store conditionals will mark themselves as // executed, and their writeback event will add the @@ -1486,81 +1532,14 @@ IEW::executeInsts() // This probably needs to prioritize the redirects if a different // scheduler is used. Currently the scheduler schedules the oldest // instruction first, so the branch resolution order will be correct. - ThreadID tid = inst->threadNumber; - - if (!fetchRedirect[tid] || - !execWB->squash[tid] || - execWB->squashedSeqNum[tid] > inst->seqNum) { - - // Prevent testing for misprediction on load instructions, - // that have not been executed. - bool loadNotExecuted = !inst->isExecuted() && inst->isLoad(); - - if (inst->mispredicted() && !loadNotExecuted) { - fetchRedirect[tid] = true; - - DPRINTF(IEW, "[tid:%i] [sn:%llu] Execute: " - "Branch mispredict detected.\n", - tid, inst->seqNum); - DPRINTF(IEW, "[tid:%i] [sn:%llu] " - "Predicted target was PC: %s\n", - tid, inst->seqNum, inst->readPredTarg()); - DPRINTF(IEW, "[tid:%i] [sn:%llu] Execute: " - "Redirecting fetch to PC: %s\n", - tid, inst->seqNum, inst->pcState()); - // If incorrect, then signal the ROB that it must be squashed. - squashDueToBranch(inst, tid); - - ppMispredict->notify(inst); - - if (inst->readPredTaken()) { - iewStats.predictedTakenIncorrect++; - } else { - iewStats.predictedNotTakenIncorrect++; - } - } else if (ldstQueue.violation(tid)) { - assert(inst->isMemRef()); - // If there was an ordering violation, then get the - // DynInst that caused the violation. Note that this - // clears the violation signal. - DynInstPtr violator; - violator = ldstQueue.getMemDepViolator(tid); - - DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: %s " - "[sn:%lli], inst PC: %s [sn:%lli]. Addr is: %#x.\n", - violator->pcState(), violator->seqNum, - inst->pcState(), inst->seqNum, inst->physEffAddr); - - fetchRedirect[tid] = true; - - // Tell the instruction queue that a violation has occured. - instQueue.violation(inst, violator); - - // Squash. - squashDueToMemOrder(violator, tid); - - ++iewStats.memOrderViolationEvents; - } - } else { - // Reset any state associated with redirects that will not - // be used. - if (ldstQueue.violation(tid)) { - assert(inst->isMemRef()); - - DynInstPtr violator = ldstQueue.getMemDepViolator(tid); - - DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: " - "%s, inst PC: %s. Addr is: %#x.\n", - violator->pcState(), inst->pcState(), - inst->physEffAddr); - DPRINTF(IEW, "Violation will not be handled because " - "already squashing\n"); - - ++iewStats.memOrderViolationEvents; - } + if (!(inst->isLoad() || inst->isStore())) { + // Load/Store will call this in `lsq_unit.cc` after execution + SquashCheckAfterExe(inst); } } + ldstQueue.executePipeSx(); + // Update and record activity if we processed any instructions. if (inst_num) { if (exeStatus == Idle) { diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index f41dfb9492..32c62c65d2 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -209,6 +209,9 @@ class IEW /** Moves memory instruction onto the list of cache blocked instructions */ void blockMemInst(const DynInstPtr &inst); + /** Moves load instruction onto the Set of cache missed instructions */ + void cacheMissLdReplay(const DynInstPtr &inst); + /** Notifies that the cache has become unblocked */ void cacheUnblocked(); @@ -252,6 +255,17 @@ class IEW bool flushAllStores(ThreadID tid) { return ldstQueue.flushAllStores(tid); } + /** Check if we need to squash after a load/store/branch is executed. 
*/ + void SquashCheckAfterExe(DynInstPtr inst); + + void notifyExecuted(const DynInstPtr &inst) { instQueue.notifyExecuted(inst); } + + /** + * Defers a memory instruction when its DTB translation incurs a hw + * page table walk. + */ + void deferMemInst(const DynInstPtr &deferred_inst) { instQueue.deferMemInst(deferred_inst); } + /** Check misprediction */ void checkMisprediction(const DynInstPtr &inst); diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc index ae7647ad7a..090765a79d 100644 --- a/src/cpu/o3/inst_queue.cc +++ b/src/cpu/o3/inst_queue.cc @@ -94,6 +94,12 @@ InstructionQueue::FUCompletion::description() const return "Functional unit completion"; } +size_t +InstructionQueue::CacheMissLdInstsHash::operator()(const DynInstPtr& ptr) const +{ + return ptr->getLqIdx(); +} + InstructionQueue::InstructionQueue(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams ¶ms) : cpu(cpu_ptr), @@ -352,6 +358,7 @@ InstructionQueue::resetState() nonSpecInsts.clear(); deferredMemInsts.clear(); + cacheMissLdInsts.clear(); blockedMemInsts.clear(); retryMemInsts.clear(); wbOutstanding = 0; @@ -650,6 +657,10 @@ InstructionQueue::scheduleReadyInsts() IssueStruct *i2e_info = issueToExecuteQueue->access(0); DynInstPtr mem_inst; + while ((mem_inst = getCacheMissInstToExecute())) { + mem_inst->issueQue->retryMem(mem_inst); + } + while ((mem_inst = getDeferredMemInstToExecute())) { mem_inst->issueQue->retryMem(mem_inst); } @@ -697,7 +708,7 @@ InstructionQueue::scheduleReadyInsts() assert(op_latency < 64); DPRINTF(Schedule, "[sn:%llu] start execute %u cycles\n", issued_inst->seqNum, op_latency); cpu->perfCCT->updateInstPos(issued_inst->seqNum, PerfRecord::AtFU); - if (op_latency <= 1) { + if (op_latency <= 1 || issued_inst->isLoad() || issued_inst->isStore()) { i2e_info->size++; instsToExecute.push_back(issued_inst); } @@ -721,7 +732,7 @@ InstructionQueue::scheduleReadyInsts() // @todo If the way deferred memory instructions are handeled due to // translation changes then the deferredMemInsts condition should be // removed from the code below. 
- if (total_issued || !retryMemInsts.empty() || !deferredMemInsts.empty()) { + if (total_issued || !retryMemInsts.empty() || !deferredMemInsts.empty() || !cacheMissLdInsts.empty()) { cpu->activityThisCycle(); } else { DPRINTF(IQ, "Not able to schedule any instructions.\n"); @@ -860,6 +871,19 @@ InstructionQueue::deferMemInst(const DynInstPtr &deferred_inst) deferredMemInsts.push_back(deferred_inst); } +void +InstructionQueue::cacheMissLdReplay(const DynInstPtr &deferred_inst) +{ + DPRINTF(IQ, "Get Cache Missed Load, insert to Replay Queue " + "[sn:%llu]\n", deferred_inst->seqNum); + // Reset DTB translation state + deferred_inst->translationStarted(false); + deferred_inst->translationCompleted(false); + + deferred_inst->clearCanIssue(); + cacheMissLdInsts.insert(deferred_inst); +} + void InstructionQueue::blockMemInst(const DynInstPtr &blocked_inst) { @@ -902,6 +926,29 @@ InstructionQueue::getDeferredMemInstToExecute() return nullptr; } +DynInstPtr +InstructionQueue::getCacheMissInstToExecute() +{ + for (auto it = cacheMissLdInsts.begin(); it != cacheMissLdInsts.end(); + ++it) { + if (!(*it)->waitingCacheRefill() || (*it)->isSquashed()) { + DPRINTF(IQ, "CacheMissed load inst [sn:%llu] PC %s is ready to " + "execute\n", (*it)->seqNum, (*it)->pcState()); + DynInstPtr mem_inst = std::move(*it); + cacheMissLdInsts.erase(it); + return mem_inst; + } + if ((*it)->waitingCacheRefill()) { + DPRINTF( + IQ, + "CacheMissed load inst [sn:%llu] PC %s has not been waken up " + "by Dcache\n", + (*it)->seqNum, (*it)->pcState()); + } + } + return nullptr; +} + DynInstPtr InstructionQueue::getBlockedMemInstToExecute() { diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh index 0d1b780d61..0d0f333e43 100644 --- a/src/cpu/o3/inst_queue.hh +++ b/src/cpu/o3/inst_queue.hh @@ -45,6 +45,7 @@ #include #include #include +#include #include #include "base/statistics.hh" @@ -199,6 +200,11 @@ class InstructionQueue */ DynInstPtr getDeferredMemInstToExecute(); + /** Gets a load instruction that was referred due to Dcache miss + * if it is now ready to execute. NULL if none available. + */ + DynInstPtr getCacheMissInstToExecute(); + /** Gets a memory instruction that was blocked on the cache. NULL if none * available. */ @@ -242,6 +248,11 @@ class InstructionQueue */ void deferMemInst(const DynInstPtr &deferred_inst); + /** + * Defers a load instruction when Dcache miss. + */ + void cacheMissLdReplay(const DynInstPtr &deferred_inst); + /** Defers a memory instruction when it is cache blocked. */ void blockMemInst(const DynInstPtr &blocked_inst); @@ -302,6 +313,16 @@ class InstructionQueue */ std::list deferredMemInsts; + /** Set of load instructions waiting for Dcache refill + * use unordered_set to prevent repeat enqueue, + * SplitDataRequest may call `cacheMissLdReplay` multiple times. + */ + struct CacheMissLdInstsHash + { + size_t operator()(const DynInstPtr& ptr) const; + }; + std::unordered_set cacheMissLdInsts; + /** List of instructions that have been cache blocked. 
*/ std::list blockedMemInsts; diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index 3c3abac3b9..23fdd4f5e1 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -42,7 +42,9 @@ #include "cpu/o3/lsq.hh" #include +#include #include +#include #include #include @@ -51,6 +53,7 @@ #include "base/trace.hh" #include "cpu/o3/cpu.hh" #include "cpu/o3/dyn_inst.hh" +#include "cpu/o3/dyn_inst_ptr.hh" #include "cpu/o3/iew.hh" #include "cpu/o3/limits.hh" #include "debug/Drain.hh" @@ -63,6 +66,7 @@ #include "debug/TagReadFail.hh" #include "debug/Writeback.hh" #include "mem/packet_access.hh" +#include "mem/request.hh" #include "params/BaseO3CPU.hh" namespace gem5 @@ -82,6 +86,9 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams ¶ms) cacheLoadPorts(params.cacheLoadPorts), usedLoadPorts(0),lastConflictCheckTick(0), recentlyloadAddr(8), enableBankConflictCheck(params.BankConflictCheck), + _enableLdMissReplay(params.EnableLdMissReplay), + _enablePipeNukeCheck(params.EnablePipeNukeCheck), + _storeWbStage(params.StoreWbStage), waitingForStaleTranslation(false), staleTranslationWaitTxnId(0), lsqPolicy(params.smtLSQPolicy), @@ -95,6 +102,10 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams ¶ms) numThreads(params.numThreads) { assert(numThreads > 0 && numThreads <= MaxThreads); + if (!_enableLdMissReplay && _enablePipeNukeCheck) { + panic("LSQ can not support pipeline nuke replay when EnableLdMissReplay is False"); + } + assert(_storeWbStage >= 2 && _storeWbStage <= 4); //********************************************** //************ Handle SMT Parameters *********** @@ -121,9 +132,10 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams ¶ms) } thread.reserve(numThreads); + // TODO: Parameterize the load/store pipeline stages for (ThreadID tid = 0; tid < numThreads; tid++) { thread.emplace_back(maxLQEntries, maxSQEntries, params.SbufferEntries, - params.SbufferEvictThreshold, params.storeBufferInactiveThreshold); + params.SbufferEvictThreshold, params.storeBufferInactiveThreshold, 4, 5); thread[tid].init(cpu, iew_ptr, params, this, tid); thread[tid].setDcachePort(&dcachePort); } @@ -190,6 +202,15 @@ LSQ::tick() usedLoadPorts = 0; usedStorePorts = 0; + // tick lsq_unit + std::list::iterator threads = activeThreads->begin(); + std::list::iterator end = activeThreads->end(); + + while (threads != end) { + ThreadID tid = *threads++; + thread[tid].tick(); + } + } Tick LSQ::getLastConflictCheckTick() @@ -280,20 +301,41 @@ LSQ::insertStore(const DynInstPtr &store_inst) thread[tid].insertStore(store_inst); } -Fault -LSQ::executeLoad(const DynInstPtr &inst) +void +LSQ::issueToLoadPipe(const DynInstPtr &inst) { ThreadID tid = inst->threadNumber; - return thread[tid].executeLoad(inst); + thread[tid].issueToLoadPipe(inst); +} + +void +LSQ::issueToStorePipe(const DynInstPtr &inst) +{ + ThreadID tid = inst->threadNumber; + + thread[tid].issueToStorePipe(inst); +} + +void +LSQ::executePipeSx() +{ + std::list::iterator threads = activeThreads->begin(); + std::list::iterator end = activeThreads->end(); + + while (threads != end) { + ThreadID tid = *threads++; + + thread[tid].executePipeSx(); + } } Fault -LSQ::executeStore(const DynInstPtr &inst) +LSQ::executeAmo(const DynInstPtr &inst) { ThreadID tid = inst->threadNumber; - return thread[tid].executeStore(inst); + return thread[tid].executeAmo(inst); } void @@ -522,8 +564,10 @@ LSQ::recvFunctionalCustomSignal(PacketPtr pkt, int sig) LSQRequest *request = dynamic_cast(pkt->getPrimarySenderState()); panic_if(!request, "Got packet back with unknown 
sender state\n"); - if (sig == DcacheRespType::Miss) { - // notify cache miss + if (sig == DcacheRespType::Miss || sig == DcacheRespType::Block_Not_Ready) { + DPRINTF(LSQ, "recvFunctionalCustomSignal: Resp type: %d, [sn:%ld], lqidx: %ld\n", + sig, request->instruction()->seqNum, request->instruction()->lqIdx); + // cancel subsequent dependent insts of this load iewStage->loadCancel(request->instruction()); } else { panic("unsupported sig %d in recvFunctionalCustomSignal\n", sig); @@ -916,6 +960,14 @@ LSQ::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, request->initiateTranslation(); } + if (!isLoad && !isAtomic) { + // store inst temporally saves its data in memData + inst->memData = new uint8_t[size]; + memcpy(inst->memData, data, size); + } + + inst->effSize = size; + if (!isLoad && !inst->isVector() && size > 1 && addr % size != 0) { warn( "Store misaligned: size: %u, Addr: %#lx, code: %d\n", size, addr, RiscvISA::ExceptionCode::STORE_ADDR_MISALIGNED); @@ -925,7 +977,7 @@ LSQ::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, } /* This is the place were instructions get the effAddr. */ - if (request->isTranslationComplete()) { + if (inst->isAtomic() && request->isTranslationComplete()) { if (request->isMemAccessRequired()) { inst->effAddr = request->getVaddr(); inst->effSize = size; @@ -1194,7 +1246,7 @@ LSQ::LSQRequest::LSQRequest( : _state(State::NotIssued), numTranslatedFragments(0), numInTranslationFragments(0), - _port(*port), _inst(inst), _data(data), + _port(*port), _inst(inst), _data(data), _fwd_data_pkt(nullptr), _res(res), _addr(addr), _size(size), _flags(flags_), _numOutstandingPackets(0), @@ -1283,6 +1335,9 @@ LSQ::LSQRequest::~LSQRequest() for (auto r: _packets) delete r; + + if (_fwd_data_pkt) + delete _fwd_data_pkt; }; ContextID @@ -1349,15 +1404,31 @@ LSQ::SbufferRequest::recvTimingResp(PacketPtr pkt) bool LSQ::SingleDataRequest::recvTimingResp(PacketPtr pkt) { + bool isNormalLd = this->isNormalLd(); + bool enableLdMissReplay = this->_port.getLsq()->enableLdMissReplay(); // Dump inst num, request addr, and packet addr - DPRINTF(LSQ, "Single Req::recvTimingResp: inst: %llu, pkt: %#lx\n", pkt->req->getReqInstSeqNum(), - pkt->getAddr()); + DPRINTF(LSQ, "Single Req::recvTimingResp: inst: %llu, pkt: %#lx, isLoad: %d, " + "isLLSC: %d, isUncache: %d, isCacheSatisfied: %d, data: %d\n", + pkt->req->getReqInstSeqNum(), pkt->getAddr(), isLoad(), mainReq()->isLLSC(), + mainReq()->isUncacheable(), pkt->cacheSatisfied, *(pkt->getPtr())); assert(_numOutstandingPackets == 1); - flags.set(Flag::Complete); - assert(pkt == _packets.front()); - forward(); - _port.completeDataAccess(pkt); - _hasStaleTranslation = false; + if (enableLdMissReplay && isNormalLd && LSQRequest::_inst->waitingCacheRefill()) { + // Data in Dcache is ready, wake up missed load in replay queue + LSQRequest::_inst->waitingCacheRefill(false); + discard(); + } else { + flags.set(Flag::Complete); + assert(pkt == _packets.front()); + if (enableLdMissReplay && isNormalLd) { + // cache satisfied load, assemblePackets at load s2 + _port.setFlagInPipeLine(_inst, LdStFlags::CacheHit); + } else { + // cache satisfied other kinds of request + assert(pkt == mainPacket()); + assemblePackets(); + } + _hasStaleTranslation = false; + } return true; } @@ -1372,24 +1443,50 @@ LSQ::SplitDataRequest::recvTimingResp(PacketPtr pkt) assert(pktIdx < _packets.size()); numReceivedPackets++; if (numReceivedPackets == _packets.size()) { - flags.set(Flag::Complete); - /* Assemble packets. 
*/ - PacketPtr resp = isLoad() - ? Packet::createRead(_mainReq) - : Packet::createWrite(_mainReq); - if (isLoad()) - resp->dataStatic(_inst->memData); - else - resp->dataStatic(_data); - resp->senderState = this; - forward(); - _port.completeDataAccess(resp); - delete resp; + bool isNormalLd = this->isNormalLd(); + bool enableLdMissReplay = this->_port.getLsq()->enableLdMissReplay(); + if (enableLdMissReplay && isNormalLd && LSQRequest::_inst->waitingCacheRefill()) { + // Data in Dcache is ready, wake up missed load in replay queue + LSQRequest::_inst->waitingCacheRefill(false); + discard(); + } else { + flags.set(Flag::Complete); + if (enableLdMissReplay && isNormalLd) { + // cache satisfied load, assemblePackets at load s2 + _port.setFlagInPipeLine(_inst, LdStFlags::CacheHit); + } else { + // Assemble packets, cache satisfied other kinds of request + assemblePackets(); + } + _hasStaleTranslation = false; + } } - _hasStaleTranslation = false; return true; } +void +LSQ::SingleDataRequest::assemblePackets() +{ + forward(); + _port.completeDataAccess(mainPacket()); +} + +void +LSQ::SplitDataRequest::assemblePackets() +{ + PacketPtr resp = isLoad() + ? Packet::createRead(_mainReq) + : Packet::createWrite(_mainReq); + if (isLoad()) + resp->dataStatic(_inst->memData); + else + resp->dataStatic(_data); + resp->senderState = this; + forward(); + _port.completeDataAccess(resp); + delete resp; +} + void LSQ::SbufferRequest::buildPackets() { diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 079cb54e52..dd321344ea 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -47,6 +47,8 @@ #include #include #include +#include +#include #include #include @@ -60,6 +62,7 @@ #include "cpu/o3/dyn_inst_xsmeta.hh" #include "cpu/utils.hh" #include "enums/SMTQueuePolicy.hh" +#include "mem/packet.hh" #include "mem/port.hh" #include "sim/sim_object.hh" @@ -76,6 +79,37 @@ class IEW; class LSQUnit; class StoreBufferEntry; +/** The Flag of Load/Store inst in Pipeline. */ +enum LdStFlags +{ + Valid = 0, + Replayed, + CacheHit, + Nuke, + FullForward, + LocalAccess, + HasFault, + readNotPredicate, + readMemAccNotPredicate, + Squashed, + Num_Flags +}; + +constexpr uint64_t LdStFlagNum = LdStFlags::Num_Flags; + +const std::string LdStFlagName[LdStFlagNum] = { + "Valid", + "Replayed", + "CacheHit", + "Nuke", + "FullForward", + "LocalAccess", + "HasFault", + "readNotPredicate", + "readMemAccNotPredicate", + "Squashed" +}; + class LSQ { public: @@ -256,6 +290,7 @@ class LSQ PacketDataPtr _data; std::vector _packets; std::vector _reqs; + PacketPtr _fwd_data_pkt; std::vector _fault; uint64_t* _res; const Addr _addr; @@ -463,6 +498,8 @@ class LSQ */ virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask) = 0; + virtual void assemblePackets() { panic("assemblePackets not implemented!\n"); } + /** Update the status to reflect that a packet was sent. */ void packetSent() @@ -594,6 +631,13 @@ class LSQ flags.set(Flag::Complete); } + /* Load instrutcion which is not LR or MMIO type of Load. 
*/ + bool + isNormalLd() + { + return isLoad() && !mainReq()->isLLSC() && !mainReq()->isUncacheable(); + } + virtual std::string name() const { return "LSQRequest"; } }; @@ -613,6 +657,7 @@ class LSQ virtual void finish(const Fault &fault, const RequestPtr &req, gem5::ThreadContext* tc, BaseMMU::Mode mode); virtual bool recvTimingResp(PacketPtr pkt); + virtual void assemblePackets(); virtual bool sendPacketToCache(); virtual void buildPackets(); virtual Cycles handleLocalAccess( @@ -678,6 +723,7 @@ class LSQ virtual void finish(const Fault &fault, const RequestPtr &req, gem5::ThreadContext* tc, BaseMMU::Mode mode); virtual bool recvTimingResp(PacketPtr pkt); + virtual void assemblePackets(); virtual void initiateTranslation(); virtual bool sendPacketToCache(); virtual void buildPackets(); @@ -742,11 +788,17 @@ class LSQ /** Inserts a store into the LSQ. */ void insertStore(const DynInstPtr &store_inst); - /** Executes a load. */ - Fault executeLoad(const DynInstPtr &inst); + /** Executes an amo inst. */ + Fault executeAmo(const DynInstPtr &inst); + + /** Iq issues a load to load pipeline. */ + void issueToLoadPipe(const DynInstPtr &inst); + + /** Iq issues a store to store pipeline. */ + void issueToStorePipe(const DynInstPtr &inst); - /** Executes a store. */ - Fault executeStore(const DynInstPtr &inst); + /** Process instructions in each load/store pipeline stages. */ + void executePipeSx(); /** * Commits loads up until the given sequence number for a specific thread. @@ -961,6 +1013,10 @@ class LSQ RequestPort &getDataPort() { return dcachePort; } + bool enableLdMissReplay() const { return _enableLdMissReplay; } + bool enablePipeNukeCheck() const { return _enablePipeNukeCheck; } + int storeWbStage() const { return _storeWbStage; } + protected: /** D-cache is blocked */ bool _cacheBlocked; @@ -981,6 +1037,11 @@ class LSQ bool enableBankConflictCheck; + bool _enableLdMissReplay; + bool _enablePipeNukeCheck; + + int _storeWbStage; + /** If the LSQ is currently waiting for stale translations */ bool waitingForStaleTranslation; /** The ID if the transaction that made translations stale */ diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index 86a40eebfc..6cce3eb492 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -41,15 +41,21 @@ #include "cpu/o3/lsq_unit.hh" +#include + #include "arch/generic/debugfaults.hh" #include "arch/riscv/faults.hh" +#include "base/logging.hh" #include "base/str.hh" #include "base/trace.hh" +#include "base/types.hh" #include "config/the_isa.hh" #include "cpu/base.hh" #include "cpu/checker/cpu.hh" #include "cpu/golden_global_mem.hh" #include "cpu/o3/dyn_inst.hh" +#include "cpu/o3/dyn_inst_ptr.hh" +#include "cpu/o3/issue_queue.hh" #include "cpu/o3/limits.hh" #include "cpu/o3/lsq.hh" #include "cpu/utils.hh" @@ -430,7 +436,7 @@ LSQUnit::completeDataAccess(PacketPtr pkt) } LSQUnit::LSQUnit(uint32_t lqEntries, uint32_t sqEntries, uint32_t sbufferEntries, uint32_t sbufferEvictThreshold, - uint64_t storeBufferInactiveThreshold) + uint64_t storeBufferInactiveThreshold, uint32_t ldPipeStages, uint32_t stPipeStages) : sbufferEvictThreshold(sbufferEvictThreshold), sbufferEntries(sbufferEntries), storeBufferWritebackInactive(0), @@ -438,6 +444,8 @@ LSQUnit::LSQUnit(uint32_t lqEntries, uint32_t sqEntries, uint32_t sbufferEntries lsqID(-1), storeQueue(sqEntries), loadQueue(lqEntries), + loadPipe(ldPipeStages - 1, 0), + storePipe(stPipeStages - 1, 0), storesToWB(0), htmStarts(0), htmStops(0), @@ -452,9 +460,27 @@ LSQUnit::LSQUnit(uint32_t lqEntries, 
uint32_t sqEntries, uint32_t sbufferEntries // reserve space, we want if sq will be full, sbuffer will start evicting sqFullUpperLimit = sqEntries - 4; sqFullLowerLimit = sqFullUpperLimit - 4; + + loadPipeSx.resize(ldPipeStages); + storePipeSx.resize(stPipeStages); + + for (int i = 0; i < ldPipeStages; i++) { + loadPipeSx[i] = loadPipe.getWire(-i); + } + for (int i = 0; i < stPipeStages; i++) { + storePipeSx[i] = storePipe.getWire(-i); + } + assert(ldPipeStages >= 4 && stPipeStages >= 5); assert(sqFullLowerLimit > 0); } +void +LSQUnit::tick() +{ + loadPipe.advance(); + storePipe.advance(); +} + void LSQUnit::init(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams ¶ms, LSQ *lsq_ptr, unsigned id) @@ -547,6 +573,8 @@ LSQUnit::LSQUnitStats::LSQUnitStats(statistics::Group *parent) "Number of loads that had data forwarded from stores"), ADD_STAT(squashedLoads, statistics::units::Count::get(), "Number of loads squashed"), + ADD_STAT(pipeRawNukeReplay, statistics::units::Count::get(), + "Number of pipeline detected raw nuke"), ADD_STAT(ignoredResponses, statistics::units::Count::get(), "Number of memory responses ignored because the instruction is " "squashed"), @@ -704,6 +732,27 @@ LSQUnit::insertStore(const DynInstPtr& store_inst) storeQueue.back().set(store_inst); } +bool +LSQUnit::pipeLineNukeCheck(const DynInstPtr &load_inst, const DynInstPtr &store_inst) +{ + Addr load_eff_addr1 = load_inst->effAddr >> depCheckShift; + Addr load_eff_addr2 = (load_inst->effAddr + load_inst->effSize - 1) >> depCheckShift; + + Addr store_eff_addr1 = store_inst->effAddr >> depCheckShift; + Addr store_eff_addr2 = (store_inst->effAddr + store_inst->effSize - 1) >> depCheckShift; + + LSQRequest* store_req = store_inst->savedRequest; + bool load_need_check = load_inst->effAddrValid() && (load_inst->lqIt >= store_inst->lqIt); + bool store_need_check = store_req && store_req->isTranslationComplete() && + store_req->isMemAccessRequired() && (store_inst->getFault() == NoFault); + if (lsq->enablePipeNukeCheck() && load_need_check && store_need_check) { + if (load_eff_addr1 <= store_eff_addr2 && store_eff_addr1 <= load_eff_addr2) { + return true; + } + } + return false; +} + DynInstPtr LSQUnit::getMemDepViolator() { @@ -814,6 +863,25 @@ LSQUnit::checkSnoop(PacketPtr pkt) return; } +bool +LSQUnit::skipNukeReplay(const DynInstPtr& load_inst) +{ + // if the load_inst has been marked as `Nuke` + // load will be replayed, so no Raw violation happens. 
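+ // a Nuke-marked load is replayed when it reaches load S2, so only S1/S2 can still hold one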
+ if (lsq->enablePipeNukeCheck()) { + for (int i = 1; i <= 2; i++) { + // check loadPipe s1 & s2 + auto& stage = loadPipeSx[i]; + for (int j = 0; j < stage->size; j++) { + if (load_inst == stage->insts[j] && stage->flags[j][LdStFlags::Nuke]) { + return true; + } + } + } + } + return false; +} + Fault LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt, const DynInstPtr& inst) @@ -876,6 +944,14 @@ LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt, if (memDepViolator && ld_inst->seqNum > memDepViolator->seqNum) break; + // if this load has been marked as Nuke, the load will then be replayed + // So next time this load replaying to pipeline will forward from store correctly + // And no RAW violation happens + if (skipNukeReplay(ld_inst)) { + ++loadIt; + continue; + } + DPRINTF(LSQUnit, "ld_eff_addr1: %#x, ld_eff_addr2: %#x, " "inst_eff_addr1: %#x, inst_eff_addr2: %#x\n", @@ -900,35 +976,125 @@ LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt, return NoFault; } +void +LSQUnit::setFlagInPipeLine(DynInstPtr inst, LdStFlags f) +{ + bool found = false; + if (inst->isLoad()) { + for (int i = (loadPipeSx.size() - 1); i >= 0; i--) { + for (int j = 0; j < loadPipeSx[i]->size; j++) { + if (inst == loadPipeSx[i]->insts[j]) { + found = true; + (loadPipeSx[i]->flags[j])[f] = true; + break; + } + } + } + } else { + for (int i = (storePipeSx.size() - 1); i >= 0; i--) { + for (int j = 0; j < storePipeSx[i]->size; j++) { + if (inst == storePipeSx[i]->insts[j]) { + found = true; + (storePipeSx[i]->flags[j])[f] = true; + break; + } + } + } + } + + if (!found) { + panic("[sn:%ld] Can not found corresponding inst in PipeLine, isLoad: %d\n", inst->seqNum, inst->isLoad()); + } +} +void +LSQUnit::issueToLoadPipe(const DynInstPtr &inst) +{ + // push to loadPipeS0 + assert(loadPipeSx[0]->size < MaxWidth); + int idx = loadPipeSx[0]->size; + loadPipeSx[0]->insts[idx] = inst; + loadPipeSx[0]->flags[idx][LdStFlags::Valid] = true; + loadPipeSx[0]->size++; + + DPRINTF(LSQUnit, "issueToLoadPipe: [sn:%lli]\n", inst->seqNum); + dumpLoadPipe(); +} + +void +LSQUnit::issueToStorePipe(const DynInstPtr &inst) +{ + // push to storePipeS0 + assert(storePipeSx[0]->size < MaxWidth); + int idx = storePipeSx[0]->size; + + storePipeSx[0]->insts[idx] = inst; + storePipeSx[0]->flags[idx][LdStFlags::Valid] = true; + storePipeSx[0]->size++; + + DPRINTF(LSQUnit, "issueToStorePipe: [sn:%lli]\n", inst->seqNum); + dumpStorePipe(); +} Fault -LSQUnit::executeLoad(const DynInstPtr &inst) +LSQUnit::loadPipeS0(const DynInstPtr &inst, std::bitset &flag) { - // Execute a specific load. 
+ DPRINTF(LSQUnit, "LoadPipeS0: Executing load PC %s, [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); + assert(!inst->isSquashed()); + Fault load_fault = NoFault; + // Now initiateAcc only does TLB access + load_fault = inst->initiateAcc(); - DPRINTF(LSQUnit, "Executing load PC %s, [sn:%lli]\n", - inst->pcState(), inst->seqNum); + return load_fault; +} +Fault +LSQUnit::loadPipeS1(const DynInstPtr &inst, std::bitset &flag) +{ + DPRINTF(LSQUnit, "LoadPipeS1: Executing load PC %s, [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); assert(!inst->isSquashed()); - load_fault = inst->initiateAcc(); + Fault load_fault = inst->getFault(); + LSQRequest* request = inst->savedRequest; + + // Cache access + if (request && request->isTranslationComplete()) { + if (request->isMemAccessRequired()) { + inst->effAddr = request->getVaddr(); + inst->effAddrValid(true); + + Fault fault; + fault = read(request, inst->lqIdx); + // inst->getFault() may have the first-fault of a + // multi-access split request at this point. + // Overwrite that only if we got another type of fault + // (e.g. re-exec). + if (fault != NoFault) { + inst->getFault() = fault; + load_fault = fault; + } + } else { + inst->setMemAccPredicate(false); + // Commit will have to clean up whatever happened. Set this + // instruction as executed. + inst->setExecuted(); + } + } if (!inst->translationCompleted()) { + // TLB miss iewStage->loadCancel(inst); } else { - DPRINTF(LSQUnit, "load tlb hit [sn:%lli]\n", + DPRINTF(LSQUnit, "LoadPipeS1: load tlb hit [sn:%lli]\n", inst->seqNum); } if (load_fault == NoFault && !inst->readMemAccPredicate()) { - assert(inst->readPredicate()); - inst->setExecuted(); - inst->completeAcc(nullptr); - iewStage->instToCommit(inst); - iewStage->activityThisCycle(); + flag[LdStFlags::readMemAccNotPredicate] = true; return NoFault; } @@ -947,35 +1113,401 @@ LSQUnit::executeLoad(const DynInstPtr &inst) return NoFault; } + if (load_fault != NoFault || !inst->readPredicate()) { + flag[LdStFlags::HasFault] = load_fault != NoFault; + flag[LdStFlags::readNotPredicate] = !inst->readPredicate(); + } else { + if (inst->effAddrValid()) { + // raw violation check (nuke replay) + for (int i = 0; i < storePipeSx[1]->size; i++) { + auto& store_inst = storePipeSx[1]->insts[i]; + if (pipeLineNukeCheck(inst, store_inst)) { + flag[LdStFlags::Nuke] = true; + break; + } + } + // rar violation check + auto it = inst->lqIt; + ++it; + + if (checkLoads) + load_fault = checkViolations(it, inst); + } + } + + return load_fault; +} + +Fault +LSQUnit::loadPipeS2(const DynInstPtr &inst, std::bitset &flag) +{ + Fault fault = inst->getFault(); + DPRINTF(LSQUnit, "LoadPipeS2: Executing load PC %s, [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); + assert(!inst->isSquashed()); + LSQRequest* request = inst->savedRequest; + + if (flag[LdStFlags::readMemAccNotPredicate]) { + assert(inst->readPredicate() && fault == NoFault); + inst->setExecuted(); + inst->completeAcc(nullptr); + iewStage->instToCommit(inst); + iewStage->activityThisCycle(); + return NoFault; + } + // If the instruction faulted or predicated false, then we need to send it // along to commit without the instruction completing. - if (load_fault != NoFault || !inst->readPredicate()) { + if (flag[LdStFlags::HasFault] || flag[LdStFlags::readNotPredicate]) { // Send this instruction to commit, also make sure iew stage // realizes there is activity. 
Mark it as executed unless it // is a strictly ordered load that needs to hit the head of // commit. - if (!inst->readPredicate()) + if (flag[LdStFlags::readNotPredicate]) inst->forwardOldRegs(); - DPRINTF(LSQUnit, "Load [sn:%lli] not executed from %s\n", - inst->seqNum, - (load_fault != NoFault ? "fault" : "predication")); - if (!(inst->hasRequest() && inst->strictlyOrdered()) || - inst->isAtCommit()) { + DPRINTF(LSQUnit, "LoadPipeS2: Load [sn:%lli] not executed from %s\n", + inst->seqNum, (fault != NoFault ? "fault" : "predication")); + if (!(inst->hasRequest() && inst->strictlyOrdered()) || inst->isAtCommit()) { inst->setExecuted(); } iewStage->instToCommit(inst); iewStage->activityThisCycle(); + return fault; + } + + if (flag[LdStFlags::Replayed] || flag[LdStFlags::LocalAccess]) { + return fault; + } + + // raw violation check (nuke replay) + for (int i = 0; i < storePipeSx[1]->size; i++) { + auto& store_inst = storePipeSx[1]->insts[i]; + if (pipeLineNukeCheck(inst, store_inst)) { + flag[LdStFlags::Nuke] = true; + break; + } + } + + // check if cache hit & get cache response? + // NOTE: cache miss replay has higher priority than nuke replay! + if (lsq->enableLdMissReplay() && + request && request->isNormalLd() && !flag[LdStFlags::FullForward] && !flag[LdStFlags::CacheHit]) { + // cannot get cache data at load s2, replay this load + // clear state in this instruction + inst->effAddrValid(false); + // set it as waiting for dcache refill + inst->waitingCacheRefill(true); + // clear request in loadQueue + loadQueue[inst->lqIdx].setRequest(nullptr); + // set cache miss & replayed flag in pipeline + flag[Replayed] = true; + // insert to missed load replay queue + iewStage->cacheMissLdReplay(inst); + // cancel subsequent dependent insts of this load + iewStage->loadCancel(inst); + return fault; + } + + if (flag[LdStFlags::Nuke]) { + assert(lsq->enablePipeNukeCheck()); + // replay load if nuke happens + request->discard(); + inst->savedRequest = nullptr; + // clear state in this instruction + inst->translationStarted(false); + inst->translationCompleted(false); + inst->clearCanIssue(); + inst->effAddrValid(false); + // clear request in loadQueue + loadQueue[inst->lqIdx].setRequest(nullptr); + // set replayed flag in pipeline + flag[LdStFlags::Replayed] = true; + // nuke fast replay + inst->issueQue->retryMem(inst); + stats.pipeRawNukeReplay++; + // cancel subsequent dependent insts of this load + iewStage->loadCancel(inst); } else { - if (inst->effAddrValid()) { - auto it = inst->lqIt; - ++it; + // no nuke happens, prepare the inst data + request = inst->savedRequest; + if (flag[LdStFlags::FullForward]) { + // this load gets full data from sq + assert(request && request->_fwd_data_pkt); + writeback(inst, request->_fwd_data_pkt); + request->writebackDone(); + } else { + if (lsq->enableLdMissReplay() && request && request->isNormalLd()) { + // assemble cache & sbuffer forwarded data and completeDataAcess + request->assemblePackets(); + } + } + } - if (checkLoads) - return checkViolations(it, inst); + return fault; +} + +Fault +LSQUnit::loadPipeS3(const DynInstPtr &inst, std::bitset &flag) +{ + Fault fault = inst->getFault(); + DPRINTF(LSQUnit, "LoadPipeS3: Executing load PC %s, [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); + assert(!inst->isSquashed()); + return fault; +} + +void +LSQUnit::executeLoadPipeSx() +{ + // TODO: execute operations in each load pipelines + Fault fault = NoFault; + for (int i = 0; i < loadPipeSx.size(); i++) { + auto& stage = 
loadPipeSx[i]; + for (int j = 0; j < stage->size; j++) { + auto& inst = stage->insts[j]; + auto& flag = stage->flags[j]; + if (!inst->isSquashed()) { + switch (i) { + case 0: + fault = loadPipeS0(inst, flag); + break; + case 1: + // Loads will mark themselves as executed, and their writeback + // event adds the instruction to the queue to commit + fault = loadPipeS1(inst, flag); + + if (inst->isTranslationDelayed() && fault == NoFault) { + // A hw page table walk is currently going on; the + // instruction must be deferred. + DPRINTF(LSQUnit, "Execute: Delayed translation, deferring " + "load.\n"); + iewStage->deferMemInst(inst); + flag[LdStFlags::Replayed] = true; + } + iewStage->SquashCheckAfterExe(inst); + break; + case 2: + fault = loadPipeS2(inst, flag); + + if (inst->isDataPrefetch() || inst->isInstPrefetch()) { + inst->fault = NoFault; + } + break; + case 3: + fault = loadPipeS3(inst, flag); + break; + default: + panic("unsupported loadpipe length"); + } + } else { + DPRINTF(LSQUnit, "Execute: Instruction was squashed. PC: %s, [tid:%i]" + " [sn:%llu]\n", inst->pcState(), inst->threadNumber, + inst->seqNum); + inst->setExecuted(); + inst->setCanCommit(); + flag[LdStFlags::Squashed] = true; + } } } +} - return load_fault; +Fault +LSQUnit::storePipeS0(const DynInstPtr &inst, std::bitset &flag) +{ + // Make sure that a store exists. + assert(storeQueue.size() != 0); + assert(!inst->isSquashed()); + + DPRINTF(LSQUnit, "StorePipeS0: Executing store PC %s [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); + + // Now initiateAcc only does TLB access + Fault store_fault = inst->initiateAcc(); + + return store_fault; +} + +Fault +LSQUnit::storePipeS1(const DynInstPtr &inst, std::bitset &flag) +{ + // Make sure that a store exists. + assert(storeQueue.size() != 0); + + ssize_t store_idx = inst->sqIdx; + LSQRequest* request = inst->savedRequest; + + DPRINTF(LSQUnit, "StorePipeS1: Executing store PC %s [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); + + // Check the recently completed loads to see if any match this store's + // address. If so, then we have a memory ordering violation. + typename LoadQueue::iterator loadIt = inst->lqIt; + + /* This is the place were instructions get the effAddr. */ + if (request && request->isTranslationComplete()) { + if (request->isMemAccessRequired() && (inst->getFault() == NoFault)) { + inst->effAddr = request->getVaddr(); + inst->effAddrValid(true); + + if (cpu->checker) { + inst->reqToVerify = std::make_shared(*request->req()); + } + Fault fault; + fault = write(request, inst->memData, inst->sqIdx); + // release temporal data + delete [] inst->memData; + inst->memData = nullptr; + + if (fault != NoFault) + inst->getFault() = fault; + } + } + + Fault store_fault = inst->getFault(); + + if (inst->isTranslationDelayed() && + store_fault == NoFault) + return store_fault; + + if (!inst->readPredicate()) { + DPRINTF(LSQUnit, "StorePipeS1: Store [sn:%lli] not executed from predication\n", + inst->seqNum); + inst->forwardOldRegs(); + flag[LdStFlags::readNotPredicate] = true; + return store_fault; + } + + if (storeQueue[store_idx].size() == 0) { + DPRINTF(LSQUnit, "StorePipeS1: Fault on Store PC %s, [sn:%lli], Size = 0\n", + inst->pcState(), inst->seqNum); + flag[LdStFlags::HasFault] = true; + return store_fault; + } + + assert(store_fault == NoFault); + + if (inst->isStoreConditional()) { + // Store conditionals need to set themselves as able to + // writeback if we haven't had a fault by here. 
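+ // (other stores are instead marked executed and sent to commit once they reach lsq->storeWbStage())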
+ storeQueue[store_idx].canWB() = true; + + ++storesToWB; + } else { + if (enableStorePrefetchTrain) { + triggerStorePFTrain(store_idx); + } + } + + return checkViolations(loadIt, inst); +} + +Fault +LSQUnit::storePipeS2(const DynInstPtr &inst, std::bitset &flag) +{ + Fault fault = inst->getFault(); + assert(!inst->isSquashed()); + + DPRINTF(LSQUnit, "StorePipeS2: Executing store PC %s [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); + return fault; +} + +Fault +LSQUnit::storePipeS3(const DynInstPtr &inst, std::bitset &flag) +{ + Fault fault = inst->getFault(); + assert(!inst->isSquashed()); + + DPRINTF(LSQUnit, "StorePipeS3: Executing store PC %s [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); + return fault; +} + +Fault +LSQUnit::storePipeS4(const DynInstPtr &inst, std::bitset &flag) +{ + Fault fault = inst->getFault(); + assert(!inst->isSquashed()); + + DPRINTF(LSQUnit, "StorePipeS4: Executing store PC %s [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); + + return fault; +} + +void +LSQUnit::executeStorePipeSx() +{ + // TODO: execute operations in each store pipelines + Fault fault = NoFault; + for (int i = 0; i < storePipeSx.size(); i++) { + auto& stage = storePipeSx[i]; + for (int j = 0; j < stage->size; j++) { + auto& inst = stage->insts[j]; + auto& flag = stage->flags[j]; + if (!inst->isSquashed()) { + switch (i) { + case 0: + fault = storePipeS0(inst, flag); + break; + case 1: + fault = storePipeS1(inst, flag); + if (inst->isTranslationDelayed() && fault == NoFault) { + // A hw page table walk is currently going on; the + // instruction must be deferred. + DPRINTF(LSQUnit, "Execute: Delayed translation, deferring " + "store.\n"); + iewStage->deferMemInst(inst); + flag[LdStFlags::Replayed] = true; + continue; + } + + iewStage->notifyExecuted(inst); + iewStage->SquashCheckAfterExe(inst); + break; + case 2: + fault = storePipeS2(inst, flag); + break; + case 3: + fault = storePipeS3(inst, flag); + break; + case 4: + fault = storePipeS4(inst, flag); + break; + default: + panic("unsupported storepipe length"); + } + if (i == (lsq->storeWbStage() - 1)) { + // If the store had a fault then it may not have a mem req + if (fault != NoFault || !inst->readPredicate() || !inst->isStoreConditional()) { + // If the instruction faulted, then we need to send it + // along to commit without the instruction completing. + // Send this instruction to commit, also make sure iew + // stage realizes there is activity. + if (!flag[LdStFlags::Replayed]) { + inst->setExecuted(); + iewStage->instToCommit(inst); + iewStage->activityThisCycle(); + } + } + } + } else { + DPRINTF(LSQUnit, "Execute: Instruction was squashed. PC: %s, [tid:%i]" + " [sn:%llu]\n", inst->pcState(), inst->threadNumber, + inst->seqNum); + inst->setExecuted(); + inst->setCanCommit(); + flag[LdStFlags::Squashed] = true; + } + } + } +} + +void +LSQUnit::executePipeSx() +{ + executeLoadPipeSx(); + executeStorePipeSx(); } bool @@ -1001,69 +1533,57 @@ LSQUnit::triggerStorePFTrain(int sq_idx) } Fault -LSQUnit::executeStore(const DynInstPtr &store_inst) +LSQUnit::executeAmo(const DynInstPtr &amo_inst) { // Make sure that a store exists. 
assert(storeQueue.size() != 0); - ssize_t store_idx = store_inst->sqIdx; + ssize_t amo_idx = amo_inst->sqIdx; - DPRINTF(LSQUnit, "Executing store PC %s [sn:%lli]\n", - store_inst->pcState(), store_inst->seqNum); + DPRINTF(LSQUnit, "Executing AMO PC %s [sn:%lli]\n", + amo_inst->pcState(), amo_inst->seqNum); - assert(!store_inst->isSquashed()); + assert(!amo_inst->isSquashed()); - // Check the recently completed loads to see if any match this store's + // Check the recently completed loads to see if any match this amo's // address. If so, then we have a memory ordering violation. - typename LoadQueue::iterator loadIt = store_inst->lqIt; + typename LoadQueue::iterator loadIt = amo_inst->lqIt; - Fault store_fault = store_inst->initiateAcc(); + Fault amo_fault = amo_inst->initiateAcc(); - if (store_inst->isTranslationDelayed() && - store_fault == NoFault) - return store_fault; + if (amo_inst->isTranslationDelayed() && amo_fault == NoFault) + return amo_fault; - if (!store_inst->readPredicate()) { - DPRINTF(LSQUnit, "Store [sn:%lli] not executed from predication\n", - store_inst->seqNum); - store_inst->forwardOldRegs(); - return store_fault; + if (!amo_inst->readPredicate()) { + DPRINTF(LSQUnit, "AMO [sn:%lli] not executed from predication\n", + amo_inst->seqNum); + amo_inst->forwardOldRegs(); + return amo_fault; } - if (storeQueue[store_idx].size() == 0) { - DPRINTF(LSQUnit,"Fault on Store PC %s, [sn:%lli], Size = 0\n", - store_inst->pcState(), store_inst->seqNum); - - if (store_inst->isAtomic()) { - // If the instruction faulted, then we need to send it along - // to commit without the instruction completing. - if (!(store_inst->hasRequest() && store_inst->strictlyOrdered()) || - store_inst->isAtCommit()) { - store_inst->setExecuted(); - } - iewStage->instToCommit(store_inst); - iewStage->activityThisCycle(); + if (storeQueue[amo_idx].size() == 0) { + DPRINTF(LSQUnit,"Fault on AMO PC %s, [sn:%lli], Size = 0\n", + amo_inst->pcState(), amo_inst->seqNum); + + // If the amo instruction faulted, then we need to send it along + // to commit without the instruction completing. + if (!(amo_inst->hasRequest() && amo_inst->strictlyOrdered()) || + amo_inst->isAtCommit()) { + amo_inst->setExecuted(); } + iewStage->instToCommit(amo_inst); + iewStage->activityThisCycle(); - return store_fault; + return amo_fault; } - assert(store_fault == NoFault); + assert(amo_fault == NoFault); - if (store_inst->isStoreConditional() || store_inst->isAtomic()) { - // Store conditionals and Atomics need to set themselves as able to - // writeback if we haven't had a fault by here. - storeQueue[store_idx].canWB() = true; - - ++storesToWB; - } else { - if (enableStorePrefetchTrain) { - triggerStorePFTrain(store_idx); - } - } - - return checkViolations(loadIt, store_inst); + // Atomics need to set themselves as able to writeback if we haven't had a fault by here. 
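+ // (store conditionals now take the equivalent path in storePipeS1)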
+ storeQueue[amo_idx].canWB() = true; + ++storesToWB; + return checkViolations(loadIt, amo_inst); } void @@ -1968,6 +2488,40 @@ LSQUnit::recvRetry() } } +void +LSQUnit::dumpLoadPipe() +{ + DPRINTF(LSQUnit, "Dumping LoadPipe:\n"); + for (int i = 0; i < loadPipeSx.size(); i++) { + DPRINTF(LSQUnit, "Load S%d:, size: %d\n", i, loadPipeSx[i]->size); + for (int j = 0; j < loadPipeSx[i]->size; j++) { + DPRINTF(LSQUnit, " PC: %s, [tid:%i] [sn:%lli] flags: %s\n", + loadPipeSx[i]->insts[j]->pcState(), + loadPipeSx[i]->insts[j]->threadNumber, + loadPipeSx[i]->insts[j]->seqNum, + getLdStFlagStr(loadPipeSx[i]->flags[j]) + ); + } + } +} + +void +LSQUnit::dumpStorePipe() +{ + DPRINTF(LSQUnit, "Dumping StorePipe:\n"); + for (int i = 0; i < storePipeSx.size(); i++) { + DPRINTF(LSQUnit, "Store S%d:, size: %d\n", i, storePipeSx[i]->size); + for (int j = 0; j < storePipeSx[i]->size; j++) { + DPRINTF(LSQUnit, " PC: %s, [tid:%i] [sn:%lli] flags: %s\n", + storePipeSx[i]->insts[j]->pcState(), + storePipeSx[i]->insts[j]->threadNumber, + storePipeSx[i]->insts[j]->seqNum, + getLdStFlagStr(storePipeSx[i]->flags[j]) + ); + } + } +} + void LSQUnit::dumpInsts() const { @@ -2054,6 +2608,7 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) // rescheduled eventually iewStage->rescheduleMemInst(load_inst); load_inst->effAddrValid(false); + setFlagInPipeLine(load_inst, LdStFlags::Replayed); ++stats.rescheduledLoads; DPRINTF(LSQUnit, "Strictly ordered load [sn:%lli] PC %s\n", load_inst->seqNum, load_inst->pcState()); @@ -2102,6 +2657,7 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) WritebackEvent *wb = new WritebackEvent(load_inst, main_pkt, this); cpu->schedule(wb, cpu->clockEdge(delay)); + setFlagInPipeLine(load_inst, LdStFlags::LocalAccess); return NoFault; } @@ -2253,13 +2809,12 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) request->discard(); } - WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, - this); - - // We'll say this has a 1 cycle load-store forwarding latency - // for now. - // @todo: Need to make this a parameter. - cpu->schedule(wb, curTick()); + // set FullForward flag, save the forward result(data_pkt) in _fwd_data_pkt + // then this load will be written back at s2 + // @todo: make sure _fwd_data_pkt no memory leak! + assert(request->_fwd_data_pkt == nullptr); + request->_fwd_data_pkt = data_pkt; + setFlagInPipeLine(load_inst, LdStFlags::FullForward); // Don't need to do anything special for split loads. ++stats.forwLoads; @@ -2289,6 +2844,7 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) // rescheduled eventually iewStage->rescheduleMemInst(load_inst); load_inst->effAddrValid(false); + setFlagInPipeLine(load_inst, LdStFlags::Replayed); ++stats.rescheduledLoads; // Do not generate a writeback event as this instruction is not @@ -2336,9 +2892,13 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) request->discard(); } - WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, - this); - cpu->schedule(wb, curTick()); + // set FullForward flag, save the forward result(data_pkt) in _fwd_data_pkt + // then this load will be written back at s2 + // @todo: make sure _fwd_data_pkt no memory leak! 
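+ // (~LSQRequest() already frees _fwd_data_pkt if the request still owns it)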
@@ -2388,9 +2949,9 @@ LSQUnit::write(LSQRequest *request, uint8_t *data, ssize_t store_idx)
 {
     assert(storeQueue[store_idx].valid());
 
-    DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x | storeHead:%i "
+    DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x | storeHead:%i, size: %d"
             "[sn:%llu]\n",
-            store_idx - 1, request->req()->getPaddr(), storeQueue.head() - 1,
+            store_idx - 1, request->req()->getPaddr(), storeQueue.head() - 1, request->_size,
             storeQueue[store_idx].instruction()->seqNum);
 
     storeQueue[store_idx].setRequest(request);
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 802d359cb6..56d0290d5a 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -43,6 +43,7 @@
 #define __CPU_O3_LSQ_UNIT_HH__
 
 #include <algorithm>
+#include <bitset>
 #include <cstring>
 #include <map>
 #include <queue>
@@ -62,6 +63,7 @@
 #include "cpu/o3/comm.hh"
 #include "cpu/o3/cpu.hh"
 #include "cpu/o3/dyn_inst_ptr.hh"
+#include "cpu/o3/limits.hh"
 #include "cpu/o3/lsq.hh"
 #include "cpu/timebuf.hh"
 #include "debug/HtmCpu.hh"
@@ -290,7 +292,8 @@
   public:
     /** Constructs an LSQ unit. init() must be called prior to use. */
     LSQUnit(uint32_t lqEntries, uint32_t sqEntries, uint32_t sbufferEntries,
-            uint32_t sbufferEvictThreshold, uint64_t storeBufferInactiveThreshold);
+            uint32_t sbufferEvictThreshold, uint64_t storeBufferInactiveThreshold,
+            uint32_t ldPipeStages, uint32_t stPipeStages);
 
     /** We cannot copy LSQUnit because it has stats for which copy
      * contructor is deleted explicitly. However, STL vector requires
@@ -339,15 +342,16 @@
      */
     void checkSnoop(PacketPtr pkt);
 
-    /** Executes a load instruction. */
-    Fault executeLoad(const DynInstPtr &inst);
-
-    Fault executeLoad(int lq_idx) { panic("Not implemented"); return NoFault; }
+    /** The IQ issues a load to the load pipeline. */
+    void issueToLoadPipe(const DynInstPtr &inst);
 
     bool triggerStorePFTrain(int sq_idx);
 
-    /** Executes a store instruction. */
-    Fault executeStore(const DynInstPtr& inst);
+    /** Executes an AMO instruction. */
+    Fault executeAmo(const DynInstPtr& inst);
+
+    /** The IQ issues a store to the store pipeline. */
+    void issueToStorePipe(const DynInstPtr &inst);
 
     /** Commits the head load. */
     void commitLoad();
@@ -387,6 +391,12 @@
     /** Returns the memory ordering violator. */
     DynInstPtr getMemDepViolator();
 
+    /** Check if the store should skip this RAW violation because of a nuke replay. */
+    bool skipNukeReplay(const DynInstPtr& load_inst);
+
+    /** Check if there exists a RAW nuke between the load and the store. */
+    bool pipeLineNukeCheck(const DynInstPtr &load_inst, const DynInstPtr &store_inst);
+
     /** Returns the number of free LQ entries. */
     unsigned numFreeLoadEntries();
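
skipNukeReplay() and pipeLineNukeCheck() suggest that an executing store now checks younger loads still in the load pipe for a read-after-write "nuke" and replays them rather than escalating to a full squash. The patch does not show the check itself, so the snippet below is only a hedged, self-contained sketch of the kind of age-and-overlap test such a function would perform; MemOp and rawNuke() are made-up names, not part of the patch.

// Stand-alone sketch of a RAW-nuke test between a store and a younger load.
#include <cstdint>
#include <cstdio>

struct MemOp { uint64_t seqNum; uint64_t addr; unsigned size; };

// True if the load is younger than the store and their byte ranges overlap.
bool rawNuke(const MemOp &load, const MemOp &store)
{
    bool younger = load.seqNum > store.seqNum;
    bool overlap = load.addr < store.addr + store.size &&
                   store.addr < load.addr + load.size;
    return younger && overlap;
}

int main()
{
    MemOp store{100, 0x1000, 8};
    MemOp hitLoad{105, 0x1004, 4};   // younger and overlapping -> replay
    MemOp missLoad{106, 0x2000, 4};  // younger but disjoint    -> no action

    std::printf("overlapping load: nuke=%d\n", rawNuke(hitLoad, store));
    std::printf("disjoint load:    nuke=%d\n", rawNuke(missLoad, store));
    return 0;
}
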
@@ -442,6 +452,22 @@
     /** Returns the number of stores to writeback. */
     int numStoresToSbuffer() { return storesToWB; }
 
+    /** Get a description string from a load/store pipeline flag set.
+     */
+    std::string getLdStFlagStr(const std::bitset<LdStFlagNum>& flag)
+    {
+        std::string res{};
+        for (int i = 0; i < LdStFlagNum; i++) {
+            if (flag.test(i)) {
+                res += LdStFlagName[i] + ": [1] ";
+            } else {
+                res += LdStFlagName[i] + ": [0] ";
+            }
+        }
+        return res;
+    }
+
+    LSQ* getLsq() { return lsq; }
+
     /** Returns if the LSQ unit will writeback on this cycle. */
     bool
     willWB()
@@ -487,9 +513,40 @@
 
     bool sbufferSendPacket(PacketPtr data_pkt);
 
+    /** Debugging function to dump instructions in the LoadPipe. */
+    void dumpLoadPipe();
+
+    /** Debugging function to dump instructions in the StorePipe. */
+    void dumpStorePipe();
+
     /** Debugging function to dump instructions in the LSQ. */
     void dumpInsts() const;
 
+    /** Ticks the LSQ unit, causing the load/store pipes to run for
+     * one cycle.
+     */
+    void tick();
+
+    /** Process instructions in each load pipeline stage. */
+    void executeLoadPipeSx();
+
+    Fault loadPipeS0(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault loadPipeS1(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault loadPipeS2(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault loadPipeS3(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+
+    /** Process instructions in each store pipeline stage. */
+    void executeStorePipeSx();
+
+    Fault storePipeS0(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault storePipeS1(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault storePipeS2(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault storePipeS3(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault storePipeS4(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+
+    /** Wrap function that runs both pipelines. */
+    void executePipeSx();
+
     /** Schedule event for the cpu. */
     void schedule(Event& ev, Tick when);
@@ -586,6 +643,35 @@
     /** The load queue. */
     LoadQueue loadQueue;
 
+    /** Struct that defines the information passed through the Load Pipeline. */
+    struct LoadPipeStruct
+    {
+        int size;
+
+        DynInstPtr insts[MaxWidth];
+        std::bitset<LdStFlagNum> flags[MaxWidth];
+    };
+    /** The load pipeline TimeBuffer. */
+    TimeBuffer<LoadPipeStruct> loadPipe;
+    /** Each stage in the load pipeline. loadPipeSx[0] means load pipe S0. */
+    std::vector<TimeBuffer<LoadPipeStruct>::wire> loadPipeSx;
+
+    /** Struct that defines the information passed through the Store Pipeline. */
+    struct StorePipeStruct
+    {
+        int size;
+
+        DynInstPtr insts[MaxWidth];
+        std::bitset<LdStFlagNum> flags[MaxWidth];
+    };
+    /** The store pipeline TimeBuffer. */
+    TimeBuffer<StorePipeStruct> storePipe;
+    /** Each stage in the store pipeline. storePipeSx[0] means store pipe S0. */
+    std::vector<TimeBuffer<StorePipeStruct>::wire> storePipeSx;
+
+    /** Find the inst in the load/store pipeline and set the corresponding flag to true. */
+    void setFlagInPipeLine(DynInstPtr inst, LdStFlags f);
+
   private:
     /** The number of places to shift addresses in the LSQ before checking
      * for dependency violations
@@ -658,6 +744,9 @@
         /** Total number of squashed loads. */
         statistics::Scalar squashedLoads;
 
+        /** Total number of pipeline-detected RAW nukes. */
+        statistics::Scalar pipeRawNukeReplay;
+
        /** Total number of responses from the memory system that are
          * ignored due to the instruction already being squashed.
          */
         statistics::Scalar ignoredResponses;
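
The new loadPipe/storePipe members model the load/store pipelines as TimeBuffers whose per-stage wires (loadPipeSx[0] for S0, and so on) each see what the previous stage wrote one cycle earlier. The stand-alone toy below (not gem5 code) shows the same advance-by-one-slot-per-cycle idea with a plain vector and a rotating head, which may help when reading executeLoadPipeSx()/executeStorePipeSx().

// Toy model of a time-buffered pipeline: what S0 writes this cycle is
// visible to S1 next cycle, S2 the cycle after, and so on.
#include <array>
#include <cstdio>
#include <vector>

struct PipeEntry { int size = 0; std::array<int, 4> insts{}; };

int main()
{
    constexpr int stages = 4;
    std::vector<PipeEntry> buf(stages);   // backing storage, one slot per stage
    int head = 0;                         // slot currently acting as S0

    for (int cycle = 0; cycle < 6; cycle++) {
        // "wire" for stage s: the slot written s cycles ago
        auto stageSlot = [&](int s) -> PipeEntry & {
            return buf[(head + s) % stages];
        };

        // S0 accepts a newly issued load this cycle.
        PipeEntry &s0 = stageSlot(0);
        s0.size = 1;
        s0.insts[0] = cycle;              // pretend the seqNum is the cycle

        // Older stages see what was written in earlier cycles.
        for (int s = 1; s < stages; s++) {
            PipeEntry &e = stageSlot(s);
            if (e.size)
                std::printf("cycle %d: S%d sees load issued at cycle %d\n",
                            cycle, s, e.insts[0]);
        }

        // Advance the buffer: the oldest slot is recycled as next cycle's S0.
        head = (head + stages - 1) % stages;
        stageSlot(0) = PipeEntry{};       // clear the slot that becomes S0
    }
    return 0;
}
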
diff --git a/src/mem/cache/Cache.py b/src/mem/cache/Cache.py
index 79adc68fae..45e185e8a1 100644
--- a/src/mem/cache/Cache.py
+++ b/src/mem/cache/Cache.py
@@ -154,6 +154,9 @@ class BaseCache(ClockedObject):
     tag_load_read_ports = Param.Unsigned(3, "Total tag read ports for load/prefetcher(in L1 Cache)")
 
+    hint_wakeup_ahead_cycles = Param.Unsigned(3, "How many cycles " \
+        "in advance the cache responds to the LSU to wake up a missed load")
+
     force_hit = Param.Bool(False, "Force some PC to hit in L1")
 
     way_entries = Param.MemorySize(
         "64",
diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc
index ab0abeb998..c3d5adce35 100644
--- a/src/mem/cache/base.cc
+++ b/src/mem/cache/base.cc
@@ -145,9 +145,10 @@ BaseCache::BaseCache(const BaseCacheParams &p, unsigned blk_size)
       missCount(p.max_miss_count),
       addrRanges(p.addr_ranges.begin(), p.addr_ranges.end()),
       archDBer(p.arch_db),
+      cacheLevel(p.cache_level),
+      hintWakeUpAheadCycles(p.hint_wakeup_ahead_cycles),
       system(p.system),
       stats(*this),
-      cacheLevel(p.cache_level),
       forceHit(p.force_hit)
 {
     // the MSHR queue has no reserve entries as we check the MSHR
@@ -642,14 +643,16 @@ BaseCache::recvTimingReq(PacketPtr pkt)
         }
 
         handleTimingReqHit(pkt, blk, request_time, first_acc_after_pf);
-        if (cacheLevel == 1 && pkt->isResponse() && pkt->isRead() && lat > 1) {
-            // send cache miss signal
-            cpuSidePort.sendCustomSignal(pkt, DcacheRespType::Miss);
+        if (cacheLevel == 1 && pkt->isResponse() && pkt->isRead() && !pkt->isWrite() && lat > 1) {
+            // cache block not ready, send cancel signal
+            cpuSidePort.sendCustomSignal(pkt, DcacheRespType::Block_Not_Ready);
+            pkt->cacheSatisfied = false;
         }
     } else {
-        if (cacheLevel == 1 && pkt->needsResponse() && pkt->isRead()) {
+        if (cacheLevel == 1 && pkt->needsResponse() && pkt->isRead() && !pkt->isWrite()) {
             // send cache miss signal
             cpuSidePort.sendCustomSignal(pkt, DcacheRespType::Miss);
+            pkt->cacheSatisfied = false;
         }
 
         // ArchDB: for now we only track packet which has PC
diff --git a/src/mem/cache/base.hh b/src/mem/cache/base.hh
index f40411785d..cc15f85e46 100644
--- a/src/mem/cache/base.hh
+++ b/src/mem/cache/base.hh
@@ -1052,6 +1052,11 @@ class BaseCache : public ClockedObject, CacheAccessor
     /** ArchDB */
     ArchDBer *archDBer;
 
+    /** Cache level, 1 means L1. */
+    const unsigned cacheLevel{0};
+
+    Cycles hintWakeUpAheadCycles;
+
     int squashedWays;
 
   public:
@@ -1503,8 +1508,6 @@ class BaseCache : public ClockedObject, CacheAccessor
 
   private:
 
-    const unsigned cacheLevel{0};
-
     //const unsigned maxCacheLevel;
 
     const bool dumpMissPC{false};
diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc
index d34b367fce..c8e3bad0ab 100644
--- a/src/mem/cache/cache.cc
+++ b/src/mem/cache/cache.cc
@@ -56,6 +56,7 @@
 #include "debug/CacheTags.hh"
 #include "debug/CacheVerbose.hh"
 #include "enums/Clusivity.hh"
+#include "mem/cache/base.hh"
 #include "mem/cache/cache_blk.hh"
 #include "mem/cache/mshr.hh"
 #include "mem/cache/tags/base.hh"
@@ -802,8 +803,16 @@ Cache::serviceMSHRTargets(MSHR *mshr, const PacketPtr pkt, CacheBlk *blk)
                 // responseLatency is the latency of the return path
                 // from lower level caches/memory to an upper level cache or
                 // the core.
-                completion_time += clockEdge(responseLatency) +
-                    (transfer_offset ? pkt->payloadDelay : 0);
+                if ((cacheLevel == 1 && !isReadOnly) &&
+                    tgt_pkt->isRead() && !tgt_pkt->isWrite() && !tgt_pkt->isLLSC()) {
+                    // Send the TimingResp to the LSU a few cycles in advance so the missed load can be replayed from the ReplayQ earlier.
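
hint_wakeup_ahead_cycles shortens the refill response path seen by the LSU so that a missed load can be replayed from the ReplayQ before the data formally writes back. The stand-alone example below just does that arithmetic with assumed numbers (a 1000-tick cycle, a responseLatency of 4, and the Cache.py default of 3 ahead-cycles); it ignores clockEdge() rounding and payloadDelay, so it is an approximation rather than the simulator's exact computation.

// Stand-alone arithmetic for the early-wakeup hint.
#include <cstdio>

int main()
{
    const unsigned long long ticksPerCycle = 1000;  // assumed clock period in ticks
    const unsigned long long now = 50000;           // tick at which the refill is serviced
    const unsigned responseLatency = 4;             // assumed cycles back to the core
    const unsigned hintAhead = 3;                   // hint_wakeup_ahead_cycles (Cache.py default)

    unsigned long long normalResp = now + responseLatency * ticksPerCycle;
    unsigned long long earlyResp = now + (responseLatency - hintAhead) * ticksPerCycle;

    std::printf("normal writeback tick: %llu\n", normalResp);
    std::printf("early wakeup tick:     %llu (%u cycles sooner)\n", earlyResp, hintAhead);
    return 0;
}
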
+                    assert(hintWakeUpAheadCycles <= responseLatency);
+                    completion_time += clockEdge(responseLatency - hintWakeUpAheadCycles) +
+                        (transfer_offset ? pkt->payloadDelay : 0);
+                } else {
+                    completion_time += clockEdge(responseLatency) +
+                        (transfer_offset ? pkt->payloadDelay : 0);
+                }
 
                 assert(!tgt_pkt->req->isUncacheable());
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index a62d05de04..8964904215 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -1598,6 +1598,8 @@ class Packet : public Printable
 
     bool tagReadFail = false;
 
+    bool cacheSatisfied = true;
+
     bool fromBOP() const { return pfSource == PrefetchSourceType::HWP_BOP; }
 
     PrefetchSourceType getPFSource() const { return static_cast<PrefetchSourceType>(pfSource); }
diff --git a/src/mem/request.hh b/src/mem/request.hh
index acbd793c0a..075949a2d9 100644
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -91,6 +91,7 @@ enum DcacheRespType
 {
     NONE = 0,
     Hit,
+    Block_Not_Ready,
     Miss,
     NUM_Resp_Type
 };
diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc
index e4ee650f85..f5790d608e 100644
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -49,6 +49,7 @@
 #include "mem/packet_access.hh"
 #include "mem/ruby/protocol/AccessPermission.hh"
 #include "mem/ruby/slicc_interface/AbstractController.hh"
+#include "mem/ruby/slicc_interface/RubySlicc_Util.hh"
 #include "mem/simple_mem.hh"
 #include "sim/full_system.hh"
 #include "sim/system.hh"
@@ -497,6 +498,7 @@ RubyPort::ruby_custom_signal_callback(PacketPtr pkt)
     DPRINTF(RubyPort, "Sent custom signal back to LSQ with sender state %#lx\n",
             sender_state);
     port->sendCustomSignal(pkt, DcacheRespType::Miss);
+    pkt->cacheSatisfied = false;
 }
 
 void
@@ -675,7 +677,12 @@ RubyPort::MemResponsePort::hitCallback(PacketPtr pkt)
         // Send a response in the same cycle. There is no need to delay the
         // response because the response latency is already incurred in the
         // Ruby protocol.
-        schedTimingResp(pkt, curTick());
+        if (pkt->isRead() && !pkt->isWrite() && !pkt->fromCache()) {
+            // send the response right away so the load sees a fixed latency
+            respQueue.sendTiming(pkt);
+        } else {
+            schedTimingResp(pkt, curTick());
+        }
     } else {
         delete pkt;
     }
diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc
index 8f6213b70c..0b442ad1f5 100644
--- a/src/mem/ruby/system/Sequencer.cc
+++ b/src/mem/ruby/system/Sequencer.cc
@@ -383,7 +383,7 @@ Sequencer::insertRequest(PacketPtr pkt, RubyRequestType primary_type,
     if (seq_req_list.size() > 1) {
         if (cache_block_busy) {
-            if (pkt->isRead()) {
+            if (pkt->isRead() && !pkt->isWrite()) {
                 DPRINTF(RubySequencer,
                         "Pkt %#lx %s is delayed because blk is busy doing ruby stuff\n",
                         pkt, pkt->cmdString());
                 ruby_custom_signal_callback(pkt);
@@ -649,7 +649,7 @@ Sequencer::notifyMissCallback(Addr address, bool is_upgrade, bool is_busy)
 
     // cancel pending loads' speculation
     for (auto &seq_req: seq_req_list) {
-        if (seq_req.pkt->isRead()) {
+        if (seq_req.pkt->isRead() && !seq_req.pkt->isWrite()) {
             ruby_custom_signal_callback(seq_req.pkt);
             stat.loadcancel++;
         }
@@ -693,7 +693,7 @@ Sequencer::TBEFullCancel(Addr address)
 
     // cancel pending loads' speculation
     for (auto &seq_req: seq_req_list) {
-        if (seq_req.pkt->isRead()) {
+        if (seq_req.pkt->isRead() && !seq_req.pkt->isWrite()) {
             ruby_custom_signal_callback(seq_req.pkt);
             stat.loadcancel++;
         }
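
Several checks above tighten "is a load" from isRead() to isRead() && !isWrite(). Packets for atomic read-modify-write commands carry both the read and the write attribute, so without the extra term the cancel and early-wakeup hints meant for plain loads would also fire for AMOs. The self-contained illustration below (FakePkt is a made-up stand-in, not gem5's Packet class) shows which packets the tightened predicate accepts.

// Stand-alone illustration of the isRead() && !isWrite() filter.
#include <cstdio>

struct FakePkt { bool read; bool write; };

// The tightened predicate: a "plain load" reads but does not also write.
static bool isPlainLoad(const FakePkt &p) { return p.read && !p.write; }

int main()
{
    FakePkt load{true, false};    // e.g. a ReadReq
    FakePkt store{false, true};   // e.g. a WriteReq
    FakePkt amo{true, true};      // read-modify-write: both attributes set

    std::printf("load  -> %d (hint applies)\n", isPlainLoad(load));
    std::printf("store -> %d\n", isPlainLoad(store));
    std::printf("amo   -> %d (now filtered out)\n", isPlainLoad(amo));
    return 0;
}
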