diff --git a/configs/example/xiangshan.py b/configs/example/xiangshan.py index b4400b88b2..01269e7ec2 100644 --- a/configs/example/xiangshan.py +++ b/configs/example/xiangshan.py @@ -337,8 +337,11 @@ def setKmhV3IdealParams(args, system): cpu.mmu.itb.size = 96 cpu.BankConflictCheck = False # real bank conflict 0.2 score + cpu.EnableLdMissReplay = False + cpu.EnablePipeNukeCheck = False + cpu.StoreWbStage = 2 # store writeback at s2 - cpu.scheduler = IdealScheduler() + cpu.scheduler = IdealScheduler() # use centralized load/store issue queue, for hmmer # ideal decoupled frontend @@ -362,6 +365,7 @@ def setKmhV3IdealParams(args, system): if args.caches: cpu.icache.size = '128kB' cpu.dcache.size = '128kB' + cpu.dcache.hint_wakeup_ahead_cycles = 0; cpu.icache.enable_wayprediction = False cpu.dcache.enable_wayprediction = False cpu.dcache.tag_load_read_ports = 100 # 3->100 diff --git a/src/arch/riscv/isa/decoder.isa b/src/arch/riscv/isa/decoder.isa index 858ecece1e..c5f98cd5b0 100644 --- a/src/arch/riscv/isa/decoder.isa +++ b/src/arch/riscv/isa/decoder.isa @@ -613,7 +613,7 @@ decode QUADRANT default Unknown::unknown() { 0x03: decode FUNCT3 { format FenceOp { 0x0: fence({{ - }}, uint64_t, IsReadBarrier, IsWriteBarrier, No_OpClass); + }}, uint64_t, IsReadBarrier, IsWriteBarrier, MemReadOp); 0x1: fence_i({{ }}, uint64_t, IsNonSpeculative, IsSerializeAfter, No_OpClass); } diff --git a/src/arch/riscv/isa/formats/amo.isa b/src/arch/riscv/isa/formats/amo.isa index f7e9b5bcc6..b104c07657 100644 --- a/src/arch/riscv/isa/formats/amo.isa +++ b/src/arch/riscv/isa/formats/amo.isa @@ -151,6 +151,36 @@ def template LRSCMacroConstructor {{ } }}; +// Strictly order-preserving LRSC +def template LRSCStrictMacroConstructor {{ + %(class_name)s::%(class_name)s(ExtMachInst machInst): + %(base_class)s("%(mnemonic)s", machInst, %(op_class)s) + { + %(constructor)s; + + StaticInstPtr rel_fence; + StaticInstPtr lrsc; + StaticInstPtr acq_fence; + + rel_fence = new MemFenceMicro(machInst, No_OpClass); + rel_fence->setFlag(IsFirstMicroop); + rel_fence->setFlag(IsReadBarrier); + rel_fence->setFlag(IsWriteBarrier); + rel_fence->setFlag(IsDelayedCommit); + + // set up atomic rmw op + lrsc = new %(class_name)sMicro(machInst, this); + lrsc->setFlag(IsDelayedCommit); + + acq_fence = new MemFenceMicro(machInst, No_OpClass); + acq_fence->setFlag(IsLastMicroop); + acq_fence->setFlag(IsReadBarrier); + acq_fence->setFlag(IsWriteBarrier); + + microops = {rel_fence, lrsc, acq_fence}; + } +}}; + def template LRSCMicroConstructor {{ %(class_name)s::%(class_name)sMicro::%(class_name)sMicro( ExtMachInst machInst, %(class_name)s *_p) @@ -435,7 +465,7 @@ def format LoadReserved(memacc_code, postacc_code={{ }}, ea_code={{EA = Rs1;}}, macro_iop = InstObjParams(name, Name, 'LoadReserved', macro_ea_code, macro_inst_flags) header_output = LRSCDeclare.subst(macro_iop) - decoder_output = LRSCMacroConstructor.subst(macro_iop) + decoder_output = LRSCStrictMacroConstructor.subst(macro_iop) decode_block = BasicDecode.subst(macro_iop) exec_output = '' @@ -463,7 +493,7 @@ def format StoreCond(memacc_code, postacc_code={{ }}, ea_code={{EA = Rs1;}}, macro_iop = InstObjParams(name, Name, 'StoreCond', macro_ea_code, macro_inst_flags) header_output = LRSCDeclare.subst(macro_iop) - decoder_output = LRSCMacroConstructor.subst(macro_iop) + decoder_output = LRSCStrictMacroConstructor.subst(macro_iop) decode_block = BasicDecode.subst(macro_iop) exec_output = '' diff --git a/src/cpu/o3/BaseO3CPU.py b/src/cpu/o3/BaseO3CPU.py index 718939d85f..cd6e69aef6 100644 --- 
a/src/cpu/o3/BaseO3CPU.py +++ b/src/cpu/o3/BaseO3CPU.py @@ -175,6 +175,8 @@ def support_take_over(cls): SbufferEvictThreshold = Param.Unsigned(7, "store buffer eviction threshold") storeBufferInactiveThreshold = Param.Unsigned(800, "store buffer writeback timeout threshold") + StoreWbStage = Param.Unsigned(4, "Pipeline stage at which a store writes back; 4 means S4") + LSQDepCheckShift = Param.Unsigned(0, "Number of places to shift addr before check") LSQCheckLoads = Param.Bool(True, @@ -188,6 +190,8 @@ def support_take_over(cls): LFSTEntrySize = Param.Unsigned(4,"The number of store table inst in every entry of LFST can contain") SSITSize = Param.Unsigned(8192, "Store set ID table size") BankConflictCheck = Param.Bool(True, "open Bank conflict check") + EnableLdMissReplay = Param.Bool(True, "Replay cache-missed load instructions from the ReplayQ if True") + EnablePipeNukeCheck = Param.Bool(True, "Replay a load when a RAW violation is detected in the loadPipe if True") numRobs = Param.Unsigned(1, "Number of Reorder Buffers"); diff --git a/src/cpu/o3/dyn_inst.hh b/src/cpu/o3/dyn_inst.hh index f519b0c504..5bcdd14d34 100644 --- a/src/cpu/o3/dyn_inst.hh +++ b/src/cpu/o3/dyn_inst.hh @@ -194,6 +194,7 @@ class DynInst : public ExecContext, public RefCounted NotAnInst, TranslationStarted, TranslationCompleted, + WaitingCacheRefill, PossibleLoadViolation, HitExternalSnoop, EffAddrValid, @@ -462,6 +463,14 @@ class DynInst : public ExecContext, public RefCounted } void translationCompleted(bool f) { instFlags[TranslationCompleted] = f; } + /** True if inst is waiting for Dcache refill. */ + bool + waitingCacheRefill() const + { + return instFlags[WaitingCacheRefill]; + } + void waitingCacheRefill(bool f) { instFlags[WaitingCacheRefill] = f; } + /** True if this address was found to match a previous load and they issued * out of order. If that happend, then it's only a problem if an incoming * snoop invalidate modifies the line, in which case we need to squash. @@ -1395,6 +1404,10 @@ class DynInst : public ExecContext, public RefCounted return squashVer.getVersion(); } + ssize_t getLqIdx() + { + return lqIdx; + } Addr getPC() { diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index 9e32532ec8..b3fa6f517c 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -682,6 +682,12 @@ IEW::blockMemInst(const DynInstPtr& inst) instQueue.blockMemInst(inst); } +void +IEW::cacheMissLdReplay(const DynInstPtr& inst) +{ + instQueue.cacheMissLdReplay(inst); +} + void IEW::cacheUnblocked() { @@ -1326,6 +1332,84 @@ IEW::printAvailableInsts() std::cout << "\n"; } +void +IEW::SquashCheckAfterExe(DynInstPtr inst) +{ + ThreadID tid = inst->threadNumber; + + if (!fetchRedirect[tid] || + !execWB->squash[tid] || + execWB->squashedSeqNum[tid] > inst->seqNum) { + + // Prevent testing for misprediction on load instructions, + // that have not been executed. + bool loadNotExecuted = !inst->isExecuted() && inst->isLoad(); + + if (inst->mispredicted() && !loadNotExecuted) { + fetchRedirect[tid] = true; + + DPRINTF(IEW, "[tid:%i] [sn:%llu] Execute: " + "Branch mispredict detected.\n", + tid, inst->seqNum); + DPRINTF(IEW, "[tid:%i] [sn:%llu] " + "Predicted target was PC: %s\n", + tid, inst->seqNum, inst->readPredTarg()); + DPRINTF(IEW, "[tid:%i] [sn:%llu] Execute: " + "Redirecting fetch to PC: %s\n", + tid, inst->seqNum, inst->pcState()); + // If incorrect, then signal the ROB that it must be squashed.
+ squashDueToBranch(inst, tid); + + ppMispredict->notify(inst); + + if (inst->readPredTaken()) { + iewStats.predictedTakenIncorrect++; + } else { + iewStats.predictedNotTakenIncorrect++; + } + } else if (ldstQueue.violation(tid)) { + assert(inst->isMemRef()); + // If there was an ordering violation, then get the + // DynInst that caused the violation. Note that this + // clears the violation signal. + DynInstPtr violator; + violator = ldstQueue.getMemDepViolator(tid); + + DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: %s " + "[sn:%lli], inst PC: %s [sn:%lli]. Addr is: %#x.\n", + violator->pcState(), violator->seqNum, + inst->pcState(), inst->seqNum, inst->physEffAddr); + + fetchRedirect[tid] = true; + + // Tell the instruction queue that a violation has occured. + instQueue.violation(inst, violator); + + // Squash. + squashDueToMemOrder(violator, tid); + + ++iewStats.memOrderViolationEvents; + } + } else { + // Reset any state associated with redirects that will not + // be used. + if (ldstQueue.violation(tid)) { + assert(inst->isMemRef()); + + DynInstPtr violator = ldstQueue.getMemDepViolator(tid); + + DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: " + "%s, inst PC: %s. Addr is: %#x.\n", + violator->pcState(), inst->pcState(), + inst->physEffAddr); + DPRINTF(IEW, "Violation will not be handled because " + "already squashing\n"); + + ++iewStats.memOrderViolationEvents; + } + } +} + void IEW::executeInsts() { @@ -1393,7 +1477,7 @@ IEW::executeInsts() // Tell the LDSTQ to execute this instruction (if it is a load). if (inst->isAtomic()) { // AMOs are treated like store requests - fault = ldstQueue.executeStore(inst); + fault = ldstQueue.executeAmo(inst); if (inst->isTranslationDelayed() && fault == NoFault) { @@ -1401,53 +1485,15 @@ IEW::executeInsts() // instruction must be deferred. DPRINTF(IEW, "Execute: Delayed translation, deferring " "store.\n"); - instQueue.deferMemInst(inst); + deferMemInst(inst); continue; } } else if (inst->isLoad()) { - // Loads will mark themselves as executed, and their writeback - // event adds the instruction to the queue to commit - fault = ldstQueue.executeLoad(inst); - - if (inst->isTranslationDelayed() && - fault == NoFault) { - // A hw page table walk is currently going on; the - // instruction must be deferred. - DPRINTF(IEW, "Execute: Delayed translation, deferring " - "load.\n"); - instQueue.deferMemInst(inst); - continue; - } - - if (inst->isDataPrefetch() || inst->isInstPrefetch()) { - inst->fault = NoFault; - } + // add this load inst to loadpipe S0. + ldstQueue.issueToLoadPipe(inst); } else if (inst->isStore()) { - fault = ldstQueue.executeStore(inst); - - if (inst->isTranslationDelayed() && - fault == NoFault) { - // A hw page table walk is currently going on; the - // instruction must be deferred. - DPRINTF(IEW, "Execute: Delayed translation, deferring " - "store.\n"); - instQueue.deferMemInst(inst); - continue; - } - - // If the store had a fault then it may not have a mem req - if (fault != NoFault || !inst->readPredicate() || - !inst->isStoreConditional()) { - // If the instruction faulted, then we need to send it - // along to commit without the instruction completing. - // Send this instruction to commit, also make sure iew - // stage realizes there is activity. - inst->setExecuted(); - instToCommit(inst); - activityThisCycle(); - } - - instQueue.notifyExecuted(inst); + // add this store inst to storepipe S0. 
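+ // (address translation, ordering checks and writeback for stores now happen stage by stage in ldstQueue.executePipeSx(), called below)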
+ ldstQueue.issueToStorePipe(inst); // Store conditionals will mark themselves as // executed, and their writeback event will add the @@ -1486,81 +1532,14 @@ IEW::executeInsts() // This probably needs to prioritize the redirects if a different // scheduler is used. Currently the scheduler schedules the oldest // instruction first, so the branch resolution order will be correct. - ThreadID tid = inst->threadNumber; - - if (!fetchRedirect[tid] || - !execWB->squash[tid] || - execWB->squashedSeqNum[tid] > inst->seqNum) { - - // Prevent testing for misprediction on load instructions, - // that have not been executed. - bool loadNotExecuted = !inst->isExecuted() && inst->isLoad(); - - if (inst->mispredicted() && !loadNotExecuted) { - fetchRedirect[tid] = true; - - DPRINTF(IEW, "[tid:%i] [sn:%llu] Execute: " - "Branch mispredict detected.\n", - tid, inst->seqNum); - DPRINTF(IEW, "[tid:%i] [sn:%llu] " - "Predicted target was PC: %s\n", - tid, inst->seqNum, inst->readPredTarg()); - DPRINTF(IEW, "[tid:%i] [sn:%llu] Execute: " - "Redirecting fetch to PC: %s\n", - tid, inst->seqNum, inst->pcState()); - // If incorrect, then signal the ROB that it must be squashed. - squashDueToBranch(inst, tid); - - ppMispredict->notify(inst); - - if (inst->readPredTaken()) { - iewStats.predictedTakenIncorrect++; - } else { - iewStats.predictedNotTakenIncorrect++; - } - } else if (ldstQueue.violation(tid)) { - assert(inst->isMemRef()); - // If there was an ordering violation, then get the - // DynInst that caused the violation. Note that this - // clears the violation signal. - DynInstPtr violator; - violator = ldstQueue.getMemDepViolator(tid); - - DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: %s " - "[sn:%lli], inst PC: %s [sn:%lli]. Addr is: %#x.\n", - violator->pcState(), violator->seqNum, - inst->pcState(), inst->seqNum, inst->physEffAddr); - - fetchRedirect[tid] = true; - - // Tell the instruction queue that a violation has occured. - instQueue.violation(inst, violator); - - // Squash. - squashDueToMemOrder(violator, tid); - - ++iewStats.memOrderViolationEvents; - } - } else { - // Reset any state associated with redirects that will not - // be used. - if (ldstQueue.violation(tid)) { - assert(inst->isMemRef()); - - DynInstPtr violator = ldstQueue.getMemDepViolator(tid); - - DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: " - "%s, inst PC: %s. Addr is: %#x.\n", - violator->pcState(), inst->pcState(), - inst->physEffAddr); - DPRINTF(IEW, "Violation will not be handled because " - "already squashing\n"); - - ++iewStats.memOrderViolationEvents; - } + if (!(inst->isLoad() || inst->isStore())) { + // Load/Store will call this in `lsq_unit.cc` after execution + SquashCheckAfterExe(inst); } } + ldstQueue.executePipeSx(); + // Update and record activity if we processed any instructions. if (inst_num) { if (exeStatus == Idle) { diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index f41dfb9492..32c62c65d2 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -209,6 +209,9 @@ class IEW /** Moves memory instruction onto the list of cache blocked instructions */ void blockMemInst(const DynInstPtr &inst); + /** Moves load instruction onto the Set of cache missed instructions */ + void cacheMissLdReplay(const DynInstPtr &inst); + /** Notifies that the cache has become unblocked */ void cacheUnblocked(); @@ -252,6 +255,17 @@ class IEW bool flushAllStores(ThreadID tid) { return ldstQueue.flushAllStores(tid); } + /** Check if we need to squash after a load/store/branch is executed. 
*/ + void SquashCheckAfterExe(DynInstPtr inst); + + void notifyExecuted(const DynInstPtr &inst) { instQueue.notifyExecuted(inst); } + + /** + * Defers a memory instruction when its DTB translation incurs a hw + * page table walk. + */ + void deferMemInst(const DynInstPtr &deferred_inst) { instQueue.deferMemInst(deferred_inst); } + /** Check misprediction */ void checkMisprediction(const DynInstPtr &inst); diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc index ae7647ad7a..090765a79d 100644 --- a/src/cpu/o3/inst_queue.cc +++ b/src/cpu/o3/inst_queue.cc @@ -94,6 +94,12 @@ InstructionQueue::FUCompletion::description() const return "Functional unit completion"; } +size_t +InstructionQueue::CacheMissLdInstsHash::operator()(const DynInstPtr& ptr) const +{ + return ptr->getLqIdx(); +} + InstructionQueue::InstructionQueue(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams ¶ms) : cpu(cpu_ptr), @@ -352,6 +358,7 @@ InstructionQueue::resetState() nonSpecInsts.clear(); deferredMemInsts.clear(); + cacheMissLdInsts.clear(); blockedMemInsts.clear(); retryMemInsts.clear(); wbOutstanding = 0; @@ -650,6 +657,10 @@ InstructionQueue::scheduleReadyInsts() IssueStruct *i2e_info = issueToExecuteQueue->access(0); DynInstPtr mem_inst; + while ((mem_inst = getCacheMissInstToExecute())) { + mem_inst->issueQue->retryMem(mem_inst); + } + while ((mem_inst = getDeferredMemInstToExecute())) { mem_inst->issueQue->retryMem(mem_inst); } @@ -697,7 +708,7 @@ InstructionQueue::scheduleReadyInsts() assert(op_latency < 64); DPRINTF(Schedule, "[sn:%llu] start execute %u cycles\n", issued_inst->seqNum, op_latency); cpu->perfCCT->updateInstPos(issued_inst->seqNum, PerfRecord::AtFU); - if (op_latency <= 1) { + if (op_latency <= 1 || issued_inst->isLoad() || issued_inst->isStore()) { i2e_info->size++; instsToExecute.push_back(issued_inst); } @@ -721,7 +732,7 @@ InstructionQueue::scheduleReadyInsts() // @todo If the way deferred memory instructions are handeled due to // translation changes then the deferredMemInsts condition should be // removed from the code below. 
- if (total_issued || !retryMemInsts.empty() || !deferredMemInsts.empty()) { + if (total_issued || !retryMemInsts.empty() || !deferredMemInsts.empty() || !cacheMissLdInsts.empty()) { cpu->activityThisCycle(); } else { DPRINTF(IQ, "Not able to schedule any instructions.\n"); @@ -860,6 +871,19 @@ InstructionQueue::deferMemInst(const DynInstPtr &deferred_inst) deferredMemInsts.push_back(deferred_inst); } +void +InstructionQueue::cacheMissLdReplay(const DynInstPtr &deferred_inst) +{ + DPRINTF(IQ, "Get Cache Missed Load, insert to Replay Queue " + "[sn:%llu]\n", deferred_inst->seqNum); + // Reset DTB translation state + deferred_inst->translationStarted(false); + deferred_inst->translationCompleted(false); + + deferred_inst->clearCanIssue(); + cacheMissLdInsts.insert(deferred_inst); +} + void InstructionQueue::blockMemInst(const DynInstPtr &blocked_inst) { @@ -902,6 +926,29 @@ InstructionQueue::getDeferredMemInstToExecute() return nullptr; } +DynInstPtr +InstructionQueue::getCacheMissInstToExecute() +{ + for (auto it = cacheMissLdInsts.begin(); it != cacheMissLdInsts.end(); + ++it) { + if (!(*it)->waitingCacheRefill() || (*it)->isSquashed()) { + DPRINTF(IQ, "CacheMissed load inst [sn:%llu] PC %s is ready to " + "execute\n", (*it)->seqNum, (*it)->pcState()); + DynInstPtr mem_inst = std::move(*it); + cacheMissLdInsts.erase(it); + return mem_inst; + } + if ((*it)->waitingCacheRefill()) { + DPRINTF( + IQ, + "CacheMissed load inst [sn:%llu] PC %s has not been waken up " + "by Dcache\n", + (*it)->seqNum, (*it)->pcState()); + } + } + return nullptr; +} + DynInstPtr InstructionQueue::getBlockedMemInstToExecute() { diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh index 0d1b780d61..0d0f333e43 100644 --- a/src/cpu/o3/inst_queue.hh +++ b/src/cpu/o3/inst_queue.hh @@ -45,6 +45,7 @@ #include #include #include +#include #include #include "base/statistics.hh" @@ -199,6 +200,11 @@ class InstructionQueue */ DynInstPtr getDeferredMemInstToExecute(); + /** Gets a load instruction that was referred due to Dcache miss + * if it is now ready to execute. NULL if none available. + */ + DynInstPtr getCacheMissInstToExecute(); + /** Gets a memory instruction that was blocked on the cache. NULL if none * available. */ @@ -242,6 +248,11 @@ class InstructionQueue */ void deferMemInst(const DynInstPtr &deferred_inst); + /** + * Defers a load instruction when Dcache miss. + */ + void cacheMissLdReplay(const DynInstPtr &deferred_inst); + /** Defers a memory instruction when it is cache blocked. */ void blockMemInst(const DynInstPtr &blocked_inst); @@ -302,6 +313,16 @@ class InstructionQueue */ std::list deferredMemInsts; + /** Set of load instructions waiting for Dcache refill + * use unordered_set to prevent repeat enqueue, + * SplitDataRequest may call `cacheMissLdReplay` multiple times. + */ + struct CacheMissLdInstsHash + { + size_t operator()(const DynInstPtr& ptr) const; + }; + std::unordered_set cacheMissLdInsts; + /** List of instructions that have been cache blocked. 
*/ std::list blockedMemInsts; diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index 3c3abac3b9..23fdd4f5e1 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -42,7 +42,9 @@ #include "cpu/o3/lsq.hh" #include +#include #include +#include #include #include @@ -51,6 +53,7 @@ #include "base/trace.hh" #include "cpu/o3/cpu.hh" #include "cpu/o3/dyn_inst.hh" +#include "cpu/o3/dyn_inst_ptr.hh" #include "cpu/o3/iew.hh" #include "cpu/o3/limits.hh" #include "debug/Drain.hh" @@ -63,6 +66,7 @@ #include "debug/TagReadFail.hh" #include "debug/Writeback.hh" #include "mem/packet_access.hh" +#include "mem/request.hh" #include "params/BaseO3CPU.hh" namespace gem5 @@ -82,6 +86,9 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams ¶ms) cacheLoadPorts(params.cacheLoadPorts), usedLoadPorts(0),lastConflictCheckTick(0), recentlyloadAddr(8), enableBankConflictCheck(params.BankConflictCheck), + _enableLdMissReplay(params.EnableLdMissReplay), + _enablePipeNukeCheck(params.EnablePipeNukeCheck), + _storeWbStage(params.StoreWbStage), waitingForStaleTranslation(false), staleTranslationWaitTxnId(0), lsqPolicy(params.smtLSQPolicy), @@ -95,6 +102,10 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams ¶ms) numThreads(params.numThreads) { assert(numThreads > 0 && numThreads <= MaxThreads); + if (!_enableLdMissReplay && _enablePipeNukeCheck) { + panic("LSQ can not support pipeline nuke replay when EnableLdMissReplay is False"); + } + assert(_storeWbStage >= 2 && _storeWbStage <= 4); //********************************************** //************ Handle SMT Parameters *********** @@ -121,9 +132,10 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams ¶ms) } thread.reserve(numThreads); + // TODO: Parameterize the load/store pipeline stages for (ThreadID tid = 0; tid < numThreads; tid++) { thread.emplace_back(maxLQEntries, maxSQEntries, params.SbufferEntries, - params.SbufferEvictThreshold, params.storeBufferInactiveThreshold); + params.SbufferEvictThreshold, params.storeBufferInactiveThreshold, 4, 5); thread[tid].init(cpu, iew_ptr, params, this, tid); thread[tid].setDcachePort(&dcachePort); } @@ -190,6 +202,15 @@ LSQ::tick() usedLoadPorts = 0; usedStorePorts = 0; + // tick lsq_unit + std::list::iterator threads = activeThreads->begin(); + std::list::iterator end = activeThreads->end(); + + while (threads != end) { + ThreadID tid = *threads++; + thread[tid].tick(); + } + } Tick LSQ::getLastConflictCheckTick() @@ -280,20 +301,41 @@ LSQ::insertStore(const DynInstPtr &store_inst) thread[tid].insertStore(store_inst); } -Fault -LSQ::executeLoad(const DynInstPtr &inst) +void +LSQ::issueToLoadPipe(const DynInstPtr &inst) { ThreadID tid = inst->threadNumber; - return thread[tid].executeLoad(inst); + thread[tid].issueToLoadPipe(inst); +} + +void +LSQ::issueToStorePipe(const DynInstPtr &inst) +{ + ThreadID tid = inst->threadNumber; + + thread[tid].issueToStorePipe(inst); +} + +void +LSQ::executePipeSx() +{ + std::list::iterator threads = activeThreads->begin(); + std::list::iterator end = activeThreads->end(); + + while (threads != end) { + ThreadID tid = *threads++; + + thread[tid].executePipeSx(); + } } Fault -LSQ::executeStore(const DynInstPtr &inst) +LSQ::executeAmo(const DynInstPtr &inst) { ThreadID tid = inst->threadNumber; - return thread[tid].executeStore(inst); + return thread[tid].executeAmo(inst); } void @@ -522,8 +564,10 @@ LSQ::recvFunctionalCustomSignal(PacketPtr pkt, int sig) LSQRequest *request = dynamic_cast(pkt->getPrimarySenderState()); panic_if(!request, "Got packet back with unknown 
sender state\n"); - if (sig == DcacheRespType::Miss) { - // notify cache miss + if (sig == DcacheRespType::Miss || sig == DcacheRespType::Block_Not_Ready) { + DPRINTF(LSQ, "recvFunctionalCustomSignal: Resp type: %d, [sn:%ld], lqidx: %ld\n", + sig, request->instruction()->seqNum, request->instruction()->lqIdx); + // cancel subsequent dependent insts of this load iewStage->loadCancel(request->instruction()); } else { panic("unsupported sig %d in recvFunctionalCustomSignal\n", sig); @@ -916,6 +960,14 @@ LSQ::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, request->initiateTranslation(); } + if (!isLoad && !isAtomic) { + // store inst temporally saves its data in memData + inst->memData = new uint8_t[size]; + memcpy(inst->memData, data, size); + } + + inst->effSize = size; + if (!isLoad && !inst->isVector() && size > 1 && addr % size != 0) { warn( "Store misaligned: size: %u, Addr: %#lx, code: %d\n", size, addr, RiscvISA::ExceptionCode::STORE_ADDR_MISALIGNED); @@ -925,7 +977,7 @@ LSQ::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, } /* This is the place were instructions get the effAddr. */ - if (request->isTranslationComplete()) { + if (inst->isAtomic() && request->isTranslationComplete()) { if (request->isMemAccessRequired()) { inst->effAddr = request->getVaddr(); inst->effSize = size; @@ -1194,7 +1246,7 @@ LSQ::LSQRequest::LSQRequest( : _state(State::NotIssued), numTranslatedFragments(0), numInTranslationFragments(0), - _port(*port), _inst(inst), _data(data), + _port(*port), _inst(inst), _data(data), _fwd_data_pkt(nullptr), _res(res), _addr(addr), _size(size), _flags(flags_), _numOutstandingPackets(0), @@ -1283,6 +1335,9 @@ LSQ::LSQRequest::~LSQRequest() for (auto r: _packets) delete r; + + if (_fwd_data_pkt) + delete _fwd_data_pkt; }; ContextID @@ -1349,15 +1404,31 @@ LSQ::SbufferRequest::recvTimingResp(PacketPtr pkt) bool LSQ::SingleDataRequest::recvTimingResp(PacketPtr pkt) { + bool isNormalLd = this->isNormalLd(); + bool enableLdMissReplay = this->_port.getLsq()->enableLdMissReplay(); // Dump inst num, request addr, and packet addr - DPRINTF(LSQ, "Single Req::recvTimingResp: inst: %llu, pkt: %#lx\n", pkt->req->getReqInstSeqNum(), - pkt->getAddr()); + DPRINTF(LSQ, "Single Req::recvTimingResp: inst: %llu, pkt: %#lx, isLoad: %d, " + "isLLSC: %d, isUncache: %d, isCacheSatisfied: %d, data: %d\n", + pkt->req->getReqInstSeqNum(), pkt->getAddr(), isLoad(), mainReq()->isLLSC(), + mainReq()->isUncacheable(), pkt->cacheSatisfied, *(pkt->getPtr())); assert(_numOutstandingPackets == 1); - flags.set(Flag::Complete); - assert(pkt == _packets.front()); - forward(); - _port.completeDataAccess(pkt); - _hasStaleTranslation = false; + if (enableLdMissReplay && isNormalLd && LSQRequest::_inst->waitingCacheRefill()) { + // Data in Dcache is ready, wake up missed load in replay queue + LSQRequest::_inst->waitingCacheRefill(false); + discard(); + } else { + flags.set(Flag::Complete); + assert(pkt == _packets.front()); + if (enableLdMissReplay && isNormalLd) { + // cache satisfied load, assemblePackets at load s2 + _port.setFlagInPipeLine(_inst, LdStFlags::CacheHit); + } else { + // cache satisfied other kinds of request + assert(pkt == mainPacket()); + assemblePackets(); + } + _hasStaleTranslation = false; + } return true; } @@ -1372,24 +1443,50 @@ LSQ::SplitDataRequest::recvTimingResp(PacketPtr pkt) assert(pktIdx < _packets.size()); numReceivedPackets++; if (numReceivedPackets == _packets.size()) { - flags.set(Flag::Complete); - /* Assemble packets. 
*/ - PacketPtr resp = isLoad() - ? Packet::createRead(_mainReq) - : Packet::createWrite(_mainReq); - if (isLoad()) - resp->dataStatic(_inst->memData); - else - resp->dataStatic(_data); - resp->senderState = this; - forward(); - _port.completeDataAccess(resp); - delete resp; + bool isNormalLd = this->isNormalLd(); + bool enableLdMissReplay = this->_port.getLsq()->enableLdMissReplay(); + if (enableLdMissReplay && isNormalLd && LSQRequest::_inst->waitingCacheRefill()) { + // Data in Dcache is ready, wake up missed load in replay queue + LSQRequest::_inst->waitingCacheRefill(false); + discard(); + } else { + flags.set(Flag::Complete); + if (enableLdMissReplay && isNormalLd) { + // cache satisfied load, assemblePackets at load s2 + _port.setFlagInPipeLine(_inst, LdStFlags::CacheHit); + } else { + // Assemble packets, cache satisfied other kinds of request + assemblePackets(); + } + _hasStaleTranslation = false; + } } - _hasStaleTranslation = false; return true; } +void +LSQ::SingleDataRequest::assemblePackets() +{ + forward(); + _port.completeDataAccess(mainPacket()); +} + +void +LSQ::SplitDataRequest::assemblePackets() +{ + PacketPtr resp = isLoad() + ? Packet::createRead(_mainReq) + : Packet::createWrite(_mainReq); + if (isLoad()) + resp->dataStatic(_inst->memData); + else + resp->dataStatic(_data); + resp->senderState = this; + forward(); + _port.completeDataAccess(resp); + delete resp; +} + void LSQ::SbufferRequest::buildPackets() { diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 079cb54e52..dd321344ea 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -47,6 +47,8 @@ #include #include #include +#include +#include #include #include @@ -60,6 +62,7 @@ #include "cpu/o3/dyn_inst_xsmeta.hh" #include "cpu/utils.hh" #include "enums/SMTQueuePolicy.hh" +#include "mem/packet.hh" #include "mem/port.hh" #include "sim/sim_object.hh" @@ -76,6 +79,37 @@ class IEW; class LSQUnit; class StoreBufferEntry; +/** The Flag of Load/Store inst in Pipeline. */ +enum LdStFlags +{ + Valid = 0, + Replayed, + CacheHit, + Nuke, + FullForward, + LocalAccess, + HasFault, + readNotPredicate, + readMemAccNotPredicate, + Squashed, + Num_Flags +}; + +constexpr uint64_t LdStFlagNum = LdStFlags::Num_Flags; + +const std::string LdStFlagName[LdStFlagNum] = { + "Valid", + "Replayed", + "CacheHit", + "Nuke", + "FullForward", + "LocalAccess", + "HasFault", + "readNotPredicate", + "readMemAccNotPredicate", + "Squashed" +}; + class LSQ { public: @@ -256,6 +290,7 @@ class LSQ PacketDataPtr _data; std::vector _packets; std::vector _reqs; + PacketPtr _fwd_data_pkt; std::vector _fault; uint64_t* _res; const Addr _addr; @@ -463,6 +498,8 @@ class LSQ */ virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask) = 0; + virtual void assemblePackets() { panic("assemblePackets not implemented!\n"); } + /** Update the status to reflect that a packet was sent. */ void packetSent() @@ -594,6 +631,13 @@ class LSQ flags.set(Flag::Complete); } + /* Load instrutcion which is not LR or MMIO type of Load. 
*/ + bool + isNormalLd() + { + return isLoad() && !mainReq()->isLLSC() && !mainReq()->isUncacheable(); + } + virtual std::string name() const { return "LSQRequest"; } }; @@ -613,6 +657,7 @@ class LSQ virtual void finish(const Fault &fault, const RequestPtr &req, gem5::ThreadContext* tc, BaseMMU::Mode mode); virtual bool recvTimingResp(PacketPtr pkt); + virtual void assemblePackets(); virtual bool sendPacketToCache(); virtual void buildPackets(); virtual Cycles handleLocalAccess( @@ -678,6 +723,7 @@ class LSQ virtual void finish(const Fault &fault, const RequestPtr &req, gem5::ThreadContext* tc, BaseMMU::Mode mode); virtual bool recvTimingResp(PacketPtr pkt); + virtual void assemblePackets(); virtual void initiateTranslation(); virtual bool sendPacketToCache(); virtual void buildPackets(); @@ -742,11 +788,17 @@ class LSQ /** Inserts a store into the LSQ. */ void insertStore(const DynInstPtr &store_inst); - /** Executes a load. */ - Fault executeLoad(const DynInstPtr &inst); + /** Executes an amo inst. */ + Fault executeAmo(const DynInstPtr &inst); + + /** Iq issues a load to load pipeline. */ + void issueToLoadPipe(const DynInstPtr &inst); + + /** Iq issues a store to store pipeline. */ + void issueToStorePipe(const DynInstPtr &inst); - /** Executes a store. */ - Fault executeStore(const DynInstPtr &inst); + /** Process instructions in each load/store pipeline stages. */ + void executePipeSx(); /** * Commits loads up until the given sequence number for a specific thread. @@ -961,6 +1013,10 @@ class LSQ RequestPort &getDataPort() { return dcachePort; } + bool enableLdMissReplay() const { return _enableLdMissReplay; } + bool enablePipeNukeCheck() const { return _enablePipeNukeCheck; } + int storeWbStage() const { return _storeWbStage; } + protected: /** D-cache is blocked */ bool _cacheBlocked; @@ -981,6 +1037,11 @@ class LSQ bool enableBankConflictCheck; + bool _enableLdMissReplay; + bool _enablePipeNukeCheck; + + int _storeWbStage; + /** If the LSQ is currently waiting for stale translations */ bool waitingForStaleTranslation; /** The ID if the transaction that made translations stale */ diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index 86a40eebfc..6cce3eb492 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -41,15 +41,21 @@ #include "cpu/o3/lsq_unit.hh" +#include + #include "arch/generic/debugfaults.hh" #include "arch/riscv/faults.hh" +#include "base/logging.hh" #include "base/str.hh" #include "base/trace.hh" +#include "base/types.hh" #include "config/the_isa.hh" #include "cpu/base.hh" #include "cpu/checker/cpu.hh" #include "cpu/golden_global_mem.hh" #include "cpu/o3/dyn_inst.hh" +#include "cpu/o3/dyn_inst_ptr.hh" +#include "cpu/o3/issue_queue.hh" #include "cpu/o3/limits.hh" #include "cpu/o3/lsq.hh" #include "cpu/utils.hh" @@ -430,7 +436,7 @@ LSQUnit::completeDataAccess(PacketPtr pkt) } LSQUnit::LSQUnit(uint32_t lqEntries, uint32_t sqEntries, uint32_t sbufferEntries, uint32_t sbufferEvictThreshold, - uint64_t storeBufferInactiveThreshold) + uint64_t storeBufferInactiveThreshold, uint32_t ldPipeStages, uint32_t stPipeStages) : sbufferEvictThreshold(sbufferEvictThreshold), sbufferEntries(sbufferEntries), storeBufferWritebackInactive(0), @@ -438,6 +444,8 @@ LSQUnit::LSQUnit(uint32_t lqEntries, uint32_t sqEntries, uint32_t sbufferEntries lsqID(-1), storeQueue(sqEntries), loadQueue(lqEntries), + loadPipe(ldPipeStages - 1, 0), + storePipe(stPipeStages - 1, 0), storesToWB(0), htmStarts(0), htmStops(0), @@ -452,9 +460,27 @@ LSQUnit::LSQUnit(uint32_t lqEntries, 
uint32_t sqEntries, uint32_t sbufferEntries // reserve space, we want if sq will be full, sbuffer will start evicting sqFullUpperLimit = sqEntries - 4; sqFullLowerLimit = sqFullUpperLimit - 4; + + loadPipeSx.resize(ldPipeStages); + storePipeSx.resize(stPipeStages); + + for (int i = 0; i < ldPipeStages; i++) { + loadPipeSx[i] = loadPipe.getWire(-i); + } + for (int i = 0; i < stPipeStages; i++) { + storePipeSx[i] = storePipe.getWire(-i); + } + assert(ldPipeStages >= 4 && stPipeStages >= 5); assert(sqFullLowerLimit > 0); } +void +LSQUnit::tick() +{ + loadPipe.advance(); + storePipe.advance(); +} + void LSQUnit::init(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams ¶ms, LSQ *lsq_ptr, unsigned id) @@ -547,6 +573,8 @@ LSQUnit::LSQUnitStats::LSQUnitStats(statistics::Group *parent) "Number of loads that had data forwarded from stores"), ADD_STAT(squashedLoads, statistics::units::Count::get(), "Number of loads squashed"), + ADD_STAT(pipeRawNukeReplay, statistics::units::Count::get(), + "Number of pipeline detected raw nuke"), ADD_STAT(ignoredResponses, statistics::units::Count::get(), "Number of memory responses ignored because the instruction is " "squashed"), @@ -704,6 +732,27 @@ LSQUnit::insertStore(const DynInstPtr& store_inst) storeQueue.back().set(store_inst); } +bool +LSQUnit::pipeLineNukeCheck(const DynInstPtr &load_inst, const DynInstPtr &store_inst) +{ + Addr load_eff_addr1 = load_inst->effAddr >> depCheckShift; + Addr load_eff_addr2 = (load_inst->effAddr + load_inst->effSize - 1) >> depCheckShift; + + Addr store_eff_addr1 = store_inst->effAddr >> depCheckShift; + Addr store_eff_addr2 = (store_inst->effAddr + store_inst->effSize - 1) >> depCheckShift; + + LSQRequest* store_req = store_inst->savedRequest; + bool load_need_check = load_inst->effAddrValid() && (load_inst->lqIt >= store_inst->lqIt); + bool store_need_check = store_req && store_req->isTranslationComplete() && + store_req->isMemAccessRequired() && (store_inst->getFault() == NoFault); + if (lsq->enablePipeNukeCheck() && load_need_check && store_need_check) { + if (load_eff_addr1 <= store_eff_addr2 && store_eff_addr1 <= load_eff_addr2) { + return true; + } + } + return false; +} + DynInstPtr LSQUnit::getMemDepViolator() { @@ -814,6 +863,25 @@ LSQUnit::checkSnoop(PacketPtr pkt) return; } +bool +LSQUnit::skipNukeReplay(const DynInstPtr& load_inst) +{ + // if the load_inst has been marked as `Nuke` + // load will be replayed, so no Raw violation happens. 
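+ // a Nuke-marked load is replayed when it reaches load S2, so only S1/S2 can still hold one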
+ if (lsq->enablePipeNukeCheck()) { + for (int i = 1; i <= 2; i++) { + // check loadPipe s1 & s2 + auto& stage = loadPipeSx[i]; + for (int j = 0; j < stage->size; j++) { + if (load_inst == stage->insts[j] && stage->flags[j][LdStFlags::Nuke]) { + return true; + } + } + } + } + return false; +} + Fault LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt, const DynInstPtr& inst) @@ -876,6 +944,14 @@ LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt, if (memDepViolator && ld_inst->seqNum > memDepViolator->seqNum) break; + // if this load has been marked as Nuke, the load will then be replayed + // So next time this load replaying to pipeline will forward from store correctly + // And no RAW violation happens + if (skipNukeReplay(ld_inst)) { + ++loadIt; + continue; + } + DPRINTF(LSQUnit, "ld_eff_addr1: %#x, ld_eff_addr2: %#x, " "inst_eff_addr1: %#x, inst_eff_addr2: %#x\n", @@ -900,35 +976,125 @@ LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt, return NoFault; } +void +LSQUnit::setFlagInPipeLine(DynInstPtr inst, LdStFlags f) +{ + bool found = false; + if (inst->isLoad()) { + for (int i = (loadPipeSx.size() - 1); i >= 0; i--) { + for (int j = 0; j < loadPipeSx[i]->size; j++) { + if (inst == loadPipeSx[i]->insts[j]) { + found = true; + (loadPipeSx[i]->flags[j])[f] = true; + break; + } + } + } + } else { + for (int i = (storePipeSx.size() - 1); i >= 0; i--) { + for (int j = 0; j < storePipeSx[i]->size; j++) { + if (inst == storePipeSx[i]->insts[j]) { + found = true; + (storePipeSx[i]->flags[j])[f] = true; + break; + } + } + } + } + + if (!found) { + panic("[sn:%ld] Can not found corresponding inst in PipeLine, isLoad: %d\n", inst->seqNum, inst->isLoad()); + } +} +void +LSQUnit::issueToLoadPipe(const DynInstPtr &inst) +{ + // push to loadPipeS0 + assert(loadPipeSx[0]->size < MaxWidth); + int idx = loadPipeSx[0]->size; + loadPipeSx[0]->insts[idx] = inst; + loadPipeSx[0]->flags[idx][LdStFlags::Valid] = true; + loadPipeSx[0]->size++; + + DPRINTF(LSQUnit, "issueToLoadPipe: [sn:%lli]\n", inst->seqNum); + dumpLoadPipe(); +} + +void +LSQUnit::issueToStorePipe(const DynInstPtr &inst) +{ + // push to storePipeS0 + assert(storePipeSx[0]->size < MaxWidth); + int idx = storePipeSx[0]->size; + + storePipeSx[0]->insts[idx] = inst; + storePipeSx[0]->flags[idx][LdStFlags::Valid] = true; + storePipeSx[0]->size++; + + DPRINTF(LSQUnit, "issueToStorePipe: [sn:%lli]\n", inst->seqNum); + dumpStorePipe(); +} Fault -LSQUnit::executeLoad(const DynInstPtr &inst) +LSQUnit::loadPipeS0(const DynInstPtr &inst, std::bitset &flag) { - // Execute a specific load. 
+ DPRINTF(LSQUnit, "LoadPipeS0: Executing load PC %s, [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); + assert(!inst->isSquashed()); + Fault load_fault = NoFault; + // Now initiateAcc only does TLB access + load_fault = inst->initiateAcc(); - DPRINTF(LSQUnit, "Executing load PC %s, [sn:%lli]\n", - inst->pcState(), inst->seqNum); + return load_fault; +} +Fault +LSQUnit::loadPipeS1(const DynInstPtr &inst, std::bitset &flag) +{ + DPRINTF(LSQUnit, "LoadPipeS1: Executing load PC %s, [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); assert(!inst->isSquashed()); - load_fault = inst->initiateAcc(); + Fault load_fault = inst->getFault(); + LSQRequest* request = inst->savedRequest; + + // Cache access + if (request && request->isTranslationComplete()) { + if (request->isMemAccessRequired()) { + inst->effAddr = request->getVaddr(); + inst->effAddrValid(true); + + Fault fault; + fault = read(request, inst->lqIdx); + // inst->getFault() may have the first-fault of a + // multi-access split request at this point. + // Overwrite that only if we got another type of fault + // (e.g. re-exec). + if (fault != NoFault) { + inst->getFault() = fault; + load_fault = fault; + } + } else { + inst->setMemAccPredicate(false); + // Commit will have to clean up whatever happened. Set this + // instruction as executed. + inst->setExecuted(); + } + } if (!inst->translationCompleted()) { + // TLB miss iewStage->loadCancel(inst); } else { - DPRINTF(LSQUnit, "load tlb hit [sn:%lli]\n", + DPRINTF(LSQUnit, "LoadPipeS1: load tlb hit [sn:%lli]\n", inst->seqNum); } if (load_fault == NoFault && !inst->readMemAccPredicate()) { - assert(inst->readPredicate()); - inst->setExecuted(); - inst->completeAcc(nullptr); - iewStage->instToCommit(inst); - iewStage->activityThisCycle(); + flag[LdStFlags::readMemAccNotPredicate] = true; return NoFault; } @@ -947,35 +1113,401 @@ LSQUnit::executeLoad(const DynInstPtr &inst) return NoFault; } + if (load_fault != NoFault || !inst->readPredicate()) { + flag[LdStFlags::HasFault] = load_fault != NoFault; + flag[LdStFlags::readNotPredicate] = !inst->readPredicate(); + } else { + if (inst->effAddrValid()) { + // raw violation check (nuke replay) + for (int i = 0; i < storePipeSx[1]->size; i++) { + auto& store_inst = storePipeSx[1]->insts[i]; + if (pipeLineNukeCheck(inst, store_inst)) { + flag[LdStFlags::Nuke] = true; + break; + } + } + // rar violation check + auto it = inst->lqIt; + ++it; + + if (checkLoads) + load_fault = checkViolations(it, inst); + } + } + + return load_fault; +} + +Fault +LSQUnit::loadPipeS2(const DynInstPtr &inst, std::bitset &flag) +{ + Fault fault = inst->getFault(); + DPRINTF(LSQUnit, "LoadPipeS2: Executing load PC %s, [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); + assert(!inst->isSquashed()); + LSQRequest* request = inst->savedRequest; + + if (flag[LdStFlags::readMemAccNotPredicate]) { + assert(inst->readPredicate() && fault == NoFault); + inst->setExecuted(); + inst->completeAcc(nullptr); + iewStage->instToCommit(inst); + iewStage->activityThisCycle(); + return NoFault; + } + // If the instruction faulted or predicated false, then we need to send it // along to commit without the instruction completing. - if (load_fault != NoFault || !inst->readPredicate()) { + if (flag[LdStFlags::HasFault] || flag[LdStFlags::readNotPredicate]) { // Send this instruction to commit, also make sure iew stage // realizes there is activity. 
Mark it as executed unless it // is a strictly ordered load that needs to hit the head of // commit. - if (!inst->readPredicate()) + if (flag[LdStFlags::readNotPredicate]) inst->forwardOldRegs(); - DPRINTF(LSQUnit, "Load [sn:%lli] not executed from %s\n", - inst->seqNum, - (load_fault != NoFault ? "fault" : "predication")); - if (!(inst->hasRequest() && inst->strictlyOrdered()) || - inst->isAtCommit()) { + DPRINTF(LSQUnit, "LoadPipeS2: Load [sn:%lli] not executed from %s\n", + inst->seqNum, (fault != NoFault ? "fault" : "predication")); + if (!(inst->hasRequest() && inst->strictlyOrdered()) || inst->isAtCommit()) { inst->setExecuted(); } iewStage->instToCommit(inst); iewStage->activityThisCycle(); + return fault; + } + + if (flag[LdStFlags::Replayed] || flag[LdStFlags::LocalAccess]) { + return fault; + } + + // raw violation check (nuke replay) + for (int i = 0; i < storePipeSx[1]->size; i++) { + auto& store_inst = storePipeSx[1]->insts[i]; + if (pipeLineNukeCheck(inst, store_inst)) { + flag[LdStFlags::Nuke] = true; + break; + } + } + + // check if cache hit & get cache response? + // NOTE: cache miss replay has higher priority than nuke replay! + if (lsq->enableLdMissReplay() && + request && request->isNormalLd() && !flag[LdStFlags::FullForward] && !flag[LdStFlags::CacheHit]) { + // cannot get cache data at load s2, replay this load + // clear state in this instruction + inst->effAddrValid(false); + // set it as waiting for dcache refill + inst->waitingCacheRefill(true); + // clear request in loadQueue + loadQueue[inst->lqIdx].setRequest(nullptr); + // set cache miss & replayed flag in pipeline + flag[Replayed] = true; + // insert to missed load replay queue + iewStage->cacheMissLdReplay(inst); + // cancel subsequent dependent insts of this load + iewStage->loadCancel(inst); + return fault; + } + + if (flag[LdStFlags::Nuke]) { + assert(lsq->enablePipeNukeCheck()); + // replay load if nuke happens + request->discard(); + inst->savedRequest = nullptr; + // clear state in this instruction + inst->translationStarted(false); + inst->translationCompleted(false); + inst->clearCanIssue(); + inst->effAddrValid(false); + // clear request in loadQueue + loadQueue[inst->lqIdx].setRequest(nullptr); + // set replayed flag in pipeline + flag[LdStFlags::Replayed] = true; + // nuke fast replay + inst->issueQue->retryMem(inst); + stats.pipeRawNukeReplay++; + // cancel subsequent dependent insts of this load + iewStage->loadCancel(inst); } else { - if (inst->effAddrValid()) { - auto it = inst->lqIt; - ++it; + // no nuke happens, prepare the inst data + request = inst->savedRequest; + if (flag[LdStFlags::FullForward]) { + // this load gets full data from sq + assert(request && request->_fwd_data_pkt); + writeback(inst, request->_fwd_data_pkt); + request->writebackDone(); + } else { + if (lsq->enableLdMissReplay() && request && request->isNormalLd()) { + // assemble cache & sbuffer forwarded data and completeDataAcess + request->assemblePackets(); + } + } + } - if (checkLoads) - return checkViolations(it, inst); + return fault; +} + +Fault +LSQUnit::loadPipeS3(const DynInstPtr &inst, std::bitset &flag) +{ + Fault fault = inst->getFault(); + DPRINTF(LSQUnit, "LoadPipeS3: Executing load PC %s, [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); + assert(!inst->isSquashed()); + return fault; +} + +void +LSQUnit::executeLoadPipeSx() +{ + // TODO: execute operations in each load pipelines + Fault fault = NoFault; + for (int i = 0; i < loadPipeSx.size(); i++) { + auto& stage = 
loadPipeSx[i]; + for (int j = 0; j < stage->size; j++) { + auto& inst = stage->insts[j]; + auto& flag = stage->flags[j]; + if (!inst->isSquashed()) { + switch (i) { + case 0: + fault = loadPipeS0(inst, flag); + break; + case 1: + // Loads will mark themselves as executed, and their writeback + // event adds the instruction to the queue to commit + fault = loadPipeS1(inst, flag); + + if (inst->isTranslationDelayed() && fault == NoFault) { + // A hw page table walk is currently going on; the + // instruction must be deferred. + DPRINTF(LSQUnit, "Execute: Delayed translation, deferring " + "load.\n"); + iewStage->deferMemInst(inst); + flag[LdStFlags::Replayed] = true; + } + iewStage->SquashCheckAfterExe(inst); + break; + case 2: + fault = loadPipeS2(inst, flag); + + if (inst->isDataPrefetch() || inst->isInstPrefetch()) { + inst->fault = NoFault; + } + break; + case 3: + fault = loadPipeS3(inst, flag); + break; + default: + panic("unsupported loadpipe length"); + } + } else { + DPRINTF(LSQUnit, "Execute: Instruction was squashed. PC: %s, [tid:%i]" + " [sn:%llu]\n", inst->pcState(), inst->threadNumber, + inst->seqNum); + inst->setExecuted(); + inst->setCanCommit(); + flag[LdStFlags::Squashed] = true; + } } } +} - return load_fault; +Fault +LSQUnit::storePipeS0(const DynInstPtr &inst, std::bitset &flag) +{ + // Make sure that a store exists. + assert(storeQueue.size() != 0); + assert(!inst->isSquashed()); + + DPRINTF(LSQUnit, "StorePipeS0: Executing store PC %s [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); + + // Now initiateAcc only does TLB access + Fault store_fault = inst->initiateAcc(); + + return store_fault; +} + +Fault +LSQUnit::storePipeS1(const DynInstPtr &inst, std::bitset &flag) +{ + // Make sure that a store exists. + assert(storeQueue.size() != 0); + + ssize_t store_idx = inst->sqIdx; + LSQRequest* request = inst->savedRequest; + + DPRINTF(LSQUnit, "StorePipeS1: Executing store PC %s [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); + + // Check the recently completed loads to see if any match this store's + // address. If so, then we have a memory ordering violation. + typename LoadQueue::iterator loadIt = inst->lqIt; + + /* This is the place were instructions get the effAddr. */ + if (request && request->isTranslationComplete()) { + if (request->isMemAccessRequired() && (inst->getFault() == NoFault)) { + inst->effAddr = request->getVaddr(); + inst->effAddrValid(true); + + if (cpu->checker) { + inst->reqToVerify = std::make_shared(*request->req()); + } + Fault fault; + fault = write(request, inst->memData, inst->sqIdx); + // release temporal data + delete [] inst->memData; + inst->memData = nullptr; + + if (fault != NoFault) + inst->getFault() = fault; + } + } + + Fault store_fault = inst->getFault(); + + if (inst->isTranslationDelayed() && + store_fault == NoFault) + return store_fault; + + if (!inst->readPredicate()) { + DPRINTF(LSQUnit, "StorePipeS1: Store [sn:%lli] not executed from predication\n", + inst->seqNum); + inst->forwardOldRegs(); + flag[LdStFlags::readNotPredicate] = true; + return store_fault; + } + + if (storeQueue[store_idx].size() == 0) { + DPRINTF(LSQUnit, "StorePipeS1: Fault on Store PC %s, [sn:%lli], Size = 0\n", + inst->pcState(), inst->seqNum); + flag[LdStFlags::HasFault] = true; + return store_fault; + } + + assert(store_fault == NoFault); + + if (inst->isStoreConditional()) { + // Store conditionals need to set themselves as able to + // writeback if we haven't had a fault by here. 
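+ // (other stores are instead marked executed and sent to commit once they reach lsq->storeWbStage())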
+ storeQueue[store_idx].canWB() = true; + + ++storesToWB; + } else { + if (enableStorePrefetchTrain) { + triggerStorePFTrain(store_idx); + } + } + + return checkViolations(loadIt, inst); +} + +Fault +LSQUnit::storePipeS2(const DynInstPtr &inst, std::bitset &flag) +{ + Fault fault = inst->getFault(); + assert(!inst->isSquashed()); + + DPRINTF(LSQUnit, "StorePipeS2: Executing store PC %s [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); + return fault; +} + +Fault +LSQUnit::storePipeS3(const DynInstPtr &inst, std::bitset &flag) +{ + Fault fault = inst->getFault(); + assert(!inst->isSquashed()); + + DPRINTF(LSQUnit, "StorePipeS3: Executing store PC %s [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); + return fault; +} + +Fault +LSQUnit::storePipeS4(const DynInstPtr &inst, std::bitset &flag) +{ + Fault fault = inst->getFault(); + assert(!inst->isSquashed()); + + DPRINTF(LSQUnit, "StorePipeS4: Executing store PC %s [sn:%lli] flags: %s\n", + inst->pcState(), inst->seqNum, getLdStFlagStr(flag)); + + return fault; +} + +void +LSQUnit::executeStorePipeSx() +{ + // TODO: execute operations in each store pipelines + Fault fault = NoFault; + for (int i = 0; i < storePipeSx.size(); i++) { + auto& stage = storePipeSx[i]; + for (int j = 0; j < stage->size; j++) { + auto& inst = stage->insts[j]; + auto& flag = stage->flags[j]; + if (!inst->isSquashed()) { + switch (i) { + case 0: + fault = storePipeS0(inst, flag); + break; + case 1: + fault = storePipeS1(inst, flag); + if (inst->isTranslationDelayed() && fault == NoFault) { + // A hw page table walk is currently going on; the + // instruction must be deferred. + DPRINTF(LSQUnit, "Execute: Delayed translation, deferring " + "store.\n"); + iewStage->deferMemInst(inst); + flag[LdStFlags::Replayed] = true; + continue; + } + + iewStage->notifyExecuted(inst); + iewStage->SquashCheckAfterExe(inst); + break; + case 2: + fault = storePipeS2(inst, flag); + break; + case 3: + fault = storePipeS3(inst, flag); + break; + case 4: + fault = storePipeS4(inst, flag); + break; + default: + panic("unsupported storepipe length"); + } + if (i == (lsq->storeWbStage() - 1)) { + // If the store had a fault then it may not have a mem req + if (fault != NoFault || !inst->readPredicate() || !inst->isStoreConditional()) { + // If the instruction faulted, then we need to send it + // along to commit without the instruction completing. + // Send this instruction to commit, also make sure iew + // stage realizes there is activity. + if (!flag[LdStFlags::Replayed]) { + inst->setExecuted(); + iewStage->instToCommit(inst); + iewStage->activityThisCycle(); + } + } + } + } else { + DPRINTF(LSQUnit, "Execute: Instruction was squashed. PC: %s, [tid:%i]" + " [sn:%llu]\n", inst->pcState(), inst->threadNumber, + inst->seqNum); + inst->setExecuted(); + inst->setCanCommit(); + flag[LdStFlags::Squashed] = true; + } + } + } +} + +void +LSQUnit::executePipeSx() +{ + executeLoadPipeSx(); + executeStorePipeSx(); } bool @@ -1001,69 +1533,57 @@ LSQUnit::triggerStorePFTrain(int sq_idx) } Fault -LSQUnit::executeStore(const DynInstPtr &store_inst) +LSQUnit::executeAmo(const DynInstPtr &amo_inst) { // Make sure that a store exists. 
assert(storeQueue.size() != 0); - ssize_t store_idx = store_inst->sqIdx; + ssize_t amo_idx = amo_inst->sqIdx; - DPRINTF(LSQUnit, "Executing store PC %s [sn:%lli]\n", - store_inst->pcState(), store_inst->seqNum); + DPRINTF(LSQUnit, "Executing AMO PC %s [sn:%lli]\n", + amo_inst->pcState(), amo_inst->seqNum); - assert(!store_inst->isSquashed()); + assert(!amo_inst->isSquashed()); - // Check the recently completed loads to see if any match this store's + // Check the recently completed loads to see if any match this amo's // address. If so, then we have a memory ordering violation. - typename LoadQueue::iterator loadIt = store_inst->lqIt; + typename LoadQueue::iterator loadIt = amo_inst->lqIt; - Fault store_fault = store_inst->initiateAcc(); + Fault amo_fault = amo_inst->initiateAcc(); - if (store_inst->isTranslationDelayed() && - store_fault == NoFault) - return store_fault; + if (amo_inst->isTranslationDelayed() && amo_fault == NoFault) + return amo_fault; - if (!store_inst->readPredicate()) { - DPRINTF(LSQUnit, "Store [sn:%lli] not executed from predication\n", - store_inst->seqNum); - store_inst->forwardOldRegs(); - return store_fault; + if (!amo_inst->readPredicate()) { + DPRINTF(LSQUnit, "AMO [sn:%lli] not executed from predication\n", + amo_inst->seqNum); + amo_inst->forwardOldRegs(); + return amo_fault; } - if (storeQueue[store_idx].size() == 0) { - DPRINTF(LSQUnit,"Fault on Store PC %s, [sn:%lli], Size = 0\n", - store_inst->pcState(), store_inst->seqNum); - - if (store_inst->isAtomic()) { - // If the instruction faulted, then we need to send it along - // to commit without the instruction completing. - if (!(store_inst->hasRequest() && store_inst->strictlyOrdered()) || - store_inst->isAtCommit()) { - store_inst->setExecuted(); - } - iewStage->instToCommit(store_inst); - iewStage->activityThisCycle(); + if (storeQueue[amo_idx].size() == 0) { + DPRINTF(LSQUnit,"Fault on AMO PC %s, [sn:%lli], Size = 0\n", + amo_inst->pcState(), amo_inst->seqNum); + + // If the amo instruction faulted, then we need to send it along + // to commit without the instruction completing. + if (!(amo_inst->hasRequest() && amo_inst->strictlyOrdered()) || + amo_inst->isAtCommit()) { + amo_inst->setExecuted(); } + iewStage->instToCommit(amo_inst); + iewStage->activityThisCycle(); - return store_fault; + return amo_fault; } - assert(store_fault == NoFault); + assert(amo_fault == NoFault); - if (store_inst->isStoreConditional() || store_inst->isAtomic()) { - // Store conditionals and Atomics need to set themselves as able to - // writeback if we haven't had a fault by here. - storeQueue[store_idx].canWB() = true; - - ++storesToWB; - } else { - if (enableStorePrefetchTrain) { - triggerStorePFTrain(store_idx); - } - } - - return checkViolations(loadIt, store_inst); + // Atomics need to set themselves as able to writeback if we haven't had a fault by here. 
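+ // (store conditionals now take the equivalent path in storePipeS1)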
+ storeQueue[amo_idx].canWB() = true; + ++storesToWB; + return checkViolations(loadIt, amo_inst); } void @@ -1968,6 +2488,40 @@ LSQUnit::recvRetry() } } +void +LSQUnit::dumpLoadPipe() +{ + DPRINTF(LSQUnit, "Dumping LoadPipe:\n"); + for (int i = 0; i < loadPipeSx.size(); i++) { + DPRINTF(LSQUnit, "Load S%d:, size: %d\n", i, loadPipeSx[i]->size); + for (int j = 0; j < loadPipeSx[i]->size; j++) { + DPRINTF(LSQUnit, " PC: %s, [tid:%i] [sn:%lli] flags: %s\n", + loadPipeSx[i]->insts[j]->pcState(), + loadPipeSx[i]->insts[j]->threadNumber, + loadPipeSx[i]->insts[j]->seqNum, + getLdStFlagStr(loadPipeSx[i]->flags[j]) + ); + } + } +} + +void +LSQUnit::dumpStorePipe() +{ + DPRINTF(LSQUnit, "Dumping StorePipe:\n"); + for (int i = 0; i < storePipeSx.size(); i++) { + DPRINTF(LSQUnit, "Store S%d:, size: %d\n", i, storePipeSx[i]->size); + for (int j = 0; j < storePipeSx[i]->size; j++) { + DPRINTF(LSQUnit, " PC: %s, [tid:%i] [sn:%lli] flags: %s\n", + storePipeSx[i]->insts[j]->pcState(), + storePipeSx[i]->insts[j]->threadNumber, + storePipeSx[i]->insts[j]->seqNum, + getLdStFlagStr(storePipeSx[i]->flags[j]) + ); + } + } +} + void LSQUnit::dumpInsts() const { @@ -2054,6 +2608,7 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) // rescheduled eventually iewStage->rescheduleMemInst(load_inst); load_inst->effAddrValid(false); + setFlagInPipeLine(load_inst, LdStFlags::Replayed); ++stats.rescheduledLoads; DPRINTF(LSQUnit, "Strictly ordered load [sn:%lli] PC %s\n", load_inst->seqNum, load_inst->pcState()); @@ -2102,6 +2657,7 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) WritebackEvent *wb = new WritebackEvent(load_inst, main_pkt, this); cpu->schedule(wb, cpu->clockEdge(delay)); + setFlagInPipeLine(load_inst, LdStFlags::LocalAccess); return NoFault; } @@ -2253,13 +2809,12 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) request->discard(); } - WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, - this); - - // We'll say this has a 1 cycle load-store forwarding latency - // for now. - // @todo: Need to make this a parameter. - cpu->schedule(wb, curTick()); + // set FullForward flag, save the forward result(data_pkt) in _fwd_data_pkt + // then this load will be written back at s2 + // @todo: make sure _fwd_data_pkt no memory leak! + assert(request->_fwd_data_pkt == nullptr); + request->_fwd_data_pkt = data_pkt; + setFlagInPipeLine(load_inst, LdStFlags::FullForward); // Don't need to do anything special for split loads. ++stats.forwLoads; @@ -2289,6 +2844,7 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) // rescheduled eventually iewStage->rescheduleMemInst(load_inst); load_inst->effAddrValid(false); + setFlagInPipeLine(load_inst, LdStFlags::Replayed); ++stats.rescheduledLoads; // Do not generate a writeback event as this instruction is not @@ -2336,9 +2892,13 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) request->discard(); } - WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, - this); - cpu->schedule(wb, curTick()); + // set FullForward flag, save the forward result(data_pkt) in _fwd_data_pkt + // then this load will be written back at s2 + // @todo: make sure _fwd_data_pkt no memory leak! 
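+ // (~LSQRequest() already frees _fwd_data_pkt if the request still owns it)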
@@ -2388,9 +2949,9 @@ LSQUnit::write(LSQRequest *request, uint8_t *data, ssize_t store_idx)
 {
     assert(storeQueue[store_idx].valid());
 
-    DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x | storeHead:%i "
+    DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x | storeHead:%i, size: %d"
             "[sn:%llu]\n",
-            store_idx - 1, request->req()->getPaddr(), storeQueue.head() - 1,
+            store_idx - 1, request->req()->getPaddr(), storeQueue.head() - 1, request->_size,
             storeQueue[store_idx].instruction()->seqNum);
 
     storeQueue[store_idx].setRequest(request);
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 802d359cb6..56d0290d5a 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -43,6 +43,7 @@
 #define __CPU_O3_LSQ_UNIT_HH__
 
 #include <algorithm>
+#include <bitset>
 #include <cstring>
 #include <map>
 #include <queue>
@@ -62,6 +63,7 @@
 #include "cpu/o3/comm.hh"
 #include "cpu/o3/cpu.hh"
 #include "cpu/o3/dyn_inst_ptr.hh"
+#include "cpu/o3/limits.hh"
 #include "cpu/o3/lsq.hh"
 #include "cpu/timebuf.hh"
 #include "debug/HtmCpu.hh"
@@ -290,7 +292,8 @@
   public:
     /** Constructs an LSQ unit. init() must be called prior to use. */
     LSQUnit(uint32_t lqEntries, uint32_t sqEntries, uint32_t sbufferEntries,
-            uint32_t sbufferEvictThreshold, uint64_t storeBufferInactiveThreshold);
+            uint32_t sbufferEvictThreshold, uint64_t storeBufferInactiveThreshold,
+            uint32_t ldPipeStages, uint32_t stPipeStages);
 
     /** We cannot copy LSQUnit because it has stats for which copy
      * contructor is deleted explicitly. However, STL vector requires
@@ -339,15 +342,16 @@
      */
     void checkSnoop(PacketPtr pkt);
 
-    /** Executes a load instruction. */
-    Fault executeLoad(const DynInstPtr &inst);
-
-    Fault executeLoad(int lq_idx) { panic("Not implemented"); return NoFault; }
+    /** The IQ issues a load to the load pipeline. */
+    void issueToLoadPipe(const DynInstPtr &inst);
 
     bool triggerStorePFTrain(int sq_idx);
 
-    /** Executes a store instruction. */
-    Fault executeStore(const DynInstPtr& inst);
+    /** Executes an AMO instruction. */
+    Fault executeAmo(const DynInstPtr& inst);
+
+    /** The IQ issues a store to the store pipeline. */
+    void issueToStorePipe(const DynInstPtr &inst);
 
     /** Commits the head load. */
     void commitLoad();
@@ -387,6 +391,12 @@
     /** Returns the memory ordering violator. */
     DynInstPtr getMemDepViolator();
 
+    /** Check if the store should skip this RAW violation because of a nuke replay. */
+    bool skipNukeReplay(const DynInstPtr& load_inst);
+
+    /** Check if there exists a RAW nuke between the load and the store. */
+    bool pipeLineNukeCheck(const DynInstPtr &load_inst, const DynInstPtr &store_inst);
+
     /** Returns the number of free LQ entries. */
     unsigned numFreeLoadEntries();
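
skipNukeReplay() and pipeLineNukeCheck() suggest that an executing store now checks younger loads still in the load pipe for a read-after-write "nuke" and replays them rather than escalating to a full squash. The patch does not show the check itself, so the snippet below is only a hedged, self-contained sketch of the kind of age-and-overlap test such a function would perform; MemOp and rawNuke() are made-up names, not part of the patch.

// Stand-alone sketch of a RAW-nuke test between a store and a younger load.
#include <cstdint>
#include <cstdio>

struct MemOp { uint64_t seqNum; uint64_t addr; unsigned size; };

// True if the load is younger than the store and their byte ranges overlap.
bool rawNuke(const MemOp &load, const MemOp &store)
{
    bool younger = load.seqNum > store.seqNum;
    bool overlap = load.addr < store.addr + store.size &&
                   store.addr < load.addr + load.size;
    return younger && overlap;
}

int main()
{
    MemOp store{100, 0x1000, 8};
    MemOp hitLoad{105, 0x1004, 4};   // younger and overlapping -> replay
    MemOp missLoad{106, 0x2000, 4};  // younger but disjoint    -> no action

    std::printf("overlapping load: nuke=%d\n", rawNuke(hitLoad, store));
    std::printf("disjoint load:    nuke=%d\n", rawNuke(missLoad, store));
    return 0;
}
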
@@ -442,6 +452,22 @@
     /** Returns the number of stores to writeback. */
     int numStoresToSbuffer() { return storesToWB; }
 
+    /** Get a description string from a load/store pipeline flag set.
+     */
+    std::string getLdStFlagStr(const std::bitset<LdStFlagNum>& flag)
+    {
+        std::string res{};
+        for (int i = 0; i < LdStFlagNum; i++) {
+            if (flag.test(i)) {
+                res += LdStFlagName[i] + ": [1] ";
+            } else {
+                res += LdStFlagName[i] + ": [0] ";
+            }
+        }
+        return res;
+    }
+
+    LSQ* getLsq() { return lsq; }
+
     /** Returns if the LSQ unit will writeback on this cycle. */
     bool
     willWB()
@@ -487,9 +513,40 @@
 
     bool sbufferSendPacket(PacketPtr data_pkt);
 
+    /** Debugging function to dump instructions in the LoadPipe. */
+    void dumpLoadPipe();
+
+    /** Debugging function to dump instructions in the StorePipe. */
+    void dumpStorePipe();
+
     /** Debugging function to dump instructions in the LSQ. */
     void dumpInsts() const;
 
+    /** Ticks the LSQ unit, causing the load/store pipes to run for
+     * one cycle.
+     */
+    void tick();
+
+    /** Process instructions in each load pipeline stage. */
+    void executeLoadPipeSx();
+
+    Fault loadPipeS0(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault loadPipeS1(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault loadPipeS2(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault loadPipeS3(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+
+    /** Process instructions in each store pipeline stage. */
+    void executeStorePipeSx();
+
+    Fault storePipeS0(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault storePipeS1(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault storePipeS2(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault storePipeS3(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+    Fault storePipeS4(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag);
+
+    /** Wrap function that runs both pipelines. */
+    void executePipeSx();
+
     /** Schedule event for the cpu. */
     void schedule(Event& ev, Tick when);
@@ -586,6 +643,35 @@
     /** The load queue. */
     LoadQueue loadQueue;
 
+    /** Struct that defines the information passed through the Load Pipeline. */
+    struct LoadPipeStruct
+    {
+        int size;
+
+        DynInstPtr insts[MaxWidth];
+        std::bitset<LdStFlagNum> flags[MaxWidth];
+    };
+    /** The load pipeline TimeBuffer. */
+    TimeBuffer<LoadPipeStruct> loadPipe;
+    /** Each stage in the load pipeline. loadPipeSx[0] means load pipe S0. */
+    std::vector<TimeBuffer<LoadPipeStruct>::wire> loadPipeSx;
+
+    /** Struct that defines the information passed through the Store Pipeline. */
+    struct StorePipeStruct
+    {
+        int size;
+
+        DynInstPtr insts[MaxWidth];
+        std::bitset<LdStFlagNum> flags[MaxWidth];
+    };
+    /** The store pipeline TimeBuffer. */
+    TimeBuffer<StorePipeStruct> storePipe;
+    /** Each stage in the store pipeline. storePipeSx[0] means store pipe S0. */
+    std::vector<TimeBuffer<StorePipeStruct>::wire> storePipeSx;
+
+    /** Find the inst in the load/store pipeline and set the corresponding flag to true. */
+    void setFlagInPipeLine(DynInstPtr inst, LdStFlags f);
+
   private:
     /** The number of places to shift addresses in the LSQ before checking
      * for dependency violations
@@ -658,6 +744,9 @@
         /** Total number of squashed loads. */
         statistics::Scalar squashedLoads;
 
+        /** Total number of pipeline-detected RAW nukes. */
+        statistics::Scalar pipeRawNukeReplay;
+
        /** Total number of responses from the memory system that are
          * ignored due to the instruction already being squashed.
          */
         statistics::Scalar ignoredResponses;
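
The new loadPipe/storePipe members model the load/store pipelines as TimeBuffers whose per-stage wires (loadPipeSx[0] for S0, and so on) each see what the previous stage wrote one cycle earlier. The stand-alone toy below (not gem5 code) shows the same advance-by-one-slot-per-cycle idea with a plain vector and a rotating head, which may help when reading executeLoadPipeSx()/executeStorePipeSx().

// Toy model of a time-buffered pipeline: what S0 writes this cycle is
// visible to S1 next cycle, S2 the cycle after, and so on.
#include <array>
#include <cstdio>
#include <vector>

struct PipeEntry { int size = 0; std::array<int, 4> insts{}; };

int main()
{
    constexpr int stages = 4;
    std::vector<PipeEntry> buf(stages);   // backing storage, one slot per stage
    int head = 0;                         // slot currently acting as S0

    for (int cycle = 0; cycle < 6; cycle++) {
        // "wire" for stage s: the slot written s cycles ago
        auto stageSlot = [&](int s) -> PipeEntry & {
            return buf[(head + s) % stages];
        };

        // S0 accepts a newly issued load this cycle.
        PipeEntry &s0 = stageSlot(0);
        s0.size = 1;
        s0.insts[0] = cycle;              // pretend the seqNum is the cycle

        // Older stages see what was written in earlier cycles.
        for (int s = 1; s < stages; s++) {
            PipeEntry &e = stageSlot(s);
            if (e.size)
                std::printf("cycle %d: S%d sees load issued at cycle %d\n",
                            cycle, s, e.insts[0]);
        }

        // Advance the buffer: the oldest slot is recycled as next cycle's S0.
        head = (head + stages - 1) % stages;
        stageSlot(0) = PipeEntry{};       // clear the slot that becomes S0
    }
    return 0;
}
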
diff --git a/src/mem/cache/Cache.py b/src/mem/cache/Cache.py
index 79adc68fae..45e185e8a1 100644
--- a/src/mem/cache/Cache.py
+++ b/src/mem/cache/Cache.py
@@ -154,6 +154,9 @@ class BaseCache(ClockedObject):
     tag_load_read_ports = Param.Unsigned(3, "Total tag read ports for load/prefetcher(in L1 Cache)")
 
+    hint_wakeup_ahead_cycles = Param.Unsigned(3, "How many cycles " \
+        "in advance the cache responds to the LSU to wake up a missed load")
+
     force_hit = Param.Bool(False, "Force some PC to hit in L1")
 
     way_entries = Param.MemorySize(
         "64",
diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc
index ab0abeb998..c3d5adce35 100644
--- a/src/mem/cache/base.cc
+++ b/src/mem/cache/base.cc
@@ -145,9 +145,10 @@ BaseCache::BaseCache(const BaseCacheParams &p, unsigned blk_size)
       missCount(p.max_miss_count),
       addrRanges(p.addr_ranges.begin(), p.addr_ranges.end()),
       archDBer(p.arch_db),
+      cacheLevel(p.cache_level),
+      hintWakeUpAheadCycles(p.hint_wakeup_ahead_cycles),
       system(p.system),
       stats(*this),
-      cacheLevel(p.cache_level),
       forceHit(p.force_hit)
 {
     // the MSHR queue has no reserve entries as we check the MSHR
@@ -642,14 +643,16 @@ BaseCache::recvTimingReq(PacketPtr pkt)
         }
 
         handleTimingReqHit(pkt, blk, request_time, first_acc_after_pf);
-        if (cacheLevel == 1 && pkt->isResponse() && pkt->isRead() && lat > 1) {
-            // send cache miss signal
-            cpuSidePort.sendCustomSignal(pkt, DcacheRespType::Miss);
+        if (cacheLevel == 1 && pkt->isResponse() && pkt->isRead() && !pkt->isWrite() && lat > 1) {
+            // cache block not ready, send cancel signal
+            cpuSidePort.sendCustomSignal(pkt, DcacheRespType::Block_Not_Ready);
+            pkt->cacheSatisfied = false;
         }
     } else {
-        if (cacheLevel == 1 && pkt->needsResponse() && pkt->isRead()) {
+        if (cacheLevel == 1 && pkt->needsResponse() && pkt->isRead() && !pkt->isWrite()) {
             // send cache miss signal
             cpuSidePort.sendCustomSignal(pkt, DcacheRespType::Miss);
+            pkt->cacheSatisfied = false;
         }
 
         // ArchDB: for now we only track packet which has PC
diff --git a/src/mem/cache/base.hh b/src/mem/cache/base.hh
index f40411785d..cc15f85e46 100644
--- a/src/mem/cache/base.hh
+++ b/src/mem/cache/base.hh
@@ -1052,6 +1052,11 @@ class BaseCache : public ClockedObject, CacheAccessor
     /** ArchDB */
     ArchDBer *archDBer;
 
+    /** Cache level, 1 means L1. */
+    const unsigned cacheLevel{0};
+
+    Cycles hintWakeUpAheadCycles;
+
     int squashedWays;
 
   public:
@@ -1503,8 +1508,6 @@ class BaseCache : public ClockedObject, CacheAccessor
 
   private:
 
-    const unsigned cacheLevel{0};
-
     //const unsigned maxCacheLevel;
 
     const bool dumpMissPC{false};
diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc
index d34b367fce..c8e3bad0ab 100644
--- a/src/mem/cache/cache.cc
+++ b/src/mem/cache/cache.cc
@@ -56,6 +56,7 @@
 #include "debug/CacheTags.hh"
 #include "debug/CacheVerbose.hh"
 #include "enums/Clusivity.hh"
+#include "mem/cache/base.hh"
 #include "mem/cache/cache_blk.hh"
 #include "mem/cache/mshr.hh"
 #include "mem/cache/tags/base.hh"
@@ -802,8 +803,16 @@ Cache::serviceMSHRTargets(MSHR *mshr, const PacketPtr pkt, CacheBlk *blk)
                 // responseLatency is the latency of the return path
                 // from lower level caches/memory to an upper level cache or
                 // the core.
-                completion_time += clockEdge(responseLatency) +
-                    (transfer_offset ? pkt->payloadDelay : 0);
+                if ((cacheLevel == 1 && !isReadOnly) &&
+                    tgt_pkt->isRead() && !tgt_pkt->isWrite() && !tgt_pkt->isLLSC()) {
+                    // Send the TimingResp to the LSU a few cycles in advance so the missed load can be replayed from the ReplayQ earlier.
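
hint_wakeup_ahead_cycles shortens the refill response path seen by the LSU so that a missed load can be replayed from the ReplayQ before the data formally writes back. The stand-alone example below just does that arithmetic with assumed numbers (a 1000-tick cycle, a responseLatency of 4, and the Cache.py default of 3 ahead-cycles); it ignores clockEdge() rounding and payloadDelay, so it is an approximation rather than the simulator's exact computation.

// Stand-alone arithmetic for the early-wakeup hint.
#include <cstdio>

int main()
{
    const unsigned long long ticksPerCycle = 1000;  // assumed clock period in ticks
    const unsigned long long now = 50000;           // tick at which the refill is serviced
    const unsigned responseLatency = 4;             // assumed cycles back to the core
    const unsigned hintAhead = 3;                   // hint_wakeup_ahead_cycles (Cache.py default)

    unsigned long long normalResp = now + responseLatency * ticksPerCycle;
    unsigned long long earlyResp = now + (responseLatency - hintAhead) * ticksPerCycle;

    std::printf("normal writeback tick: %llu\n", normalResp);
    std::printf("early wakeup tick:     %llu (%u cycles sooner)\n", earlyResp, hintAhead);
    return 0;
}
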
+                    assert(hintWakeUpAheadCycles <= responseLatency);
+                    completion_time += clockEdge(responseLatency - hintWakeUpAheadCycles) +
+                        (transfer_offset ? pkt->payloadDelay : 0);
+                } else {
+                    completion_time += clockEdge(responseLatency) +
+                        (transfer_offset ? pkt->payloadDelay : 0);
+                }
 
                 assert(!tgt_pkt->req->isUncacheable());
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index a62d05de04..8964904215 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -1598,6 +1598,8 @@ class Packet : public Printable
 
     bool tagReadFail = false;
 
+    bool cacheSatisfied = true;
+
     bool fromBOP() const { return pfSource == PrefetchSourceType::HWP_BOP; }
 
     PrefetchSourceType getPFSource() const { return static_cast<PrefetchSourceType>(pfSource); }
diff --git a/src/mem/request.hh b/src/mem/request.hh
index acbd793c0a..075949a2d9 100644
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -91,6 +91,7 @@ enum DcacheRespType
 {
     NONE = 0,
     Hit,
+    Block_Not_Ready,
     Miss,
     NUM_Resp_Type
 };
diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc
index e4ee650f85..f5790d608e 100644
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -49,6 +49,7 @@
 #include "mem/packet_access.hh"
 #include "mem/ruby/protocol/AccessPermission.hh"
 #include "mem/ruby/slicc_interface/AbstractController.hh"
+#include "mem/ruby/slicc_interface/RubySlicc_Util.hh"
 #include "mem/simple_mem.hh"
 #include "sim/full_system.hh"
 #include "sim/system.hh"
@@ -497,6 +498,7 @@ RubyPort::ruby_custom_signal_callback(PacketPtr pkt)
     DPRINTF(RubyPort, "Sent custom signal back to LSQ with sender state %#lx\n",
             sender_state);
     port->sendCustomSignal(pkt, DcacheRespType::Miss);
+    pkt->cacheSatisfied = false;
 }
 
 void
@@ -675,7 +677,12 @@ RubyPort::MemResponsePort::hitCallback(PacketPtr pkt)
         // Send a response in the same cycle. There is no need to delay the
         // response because the response latency is already incurred in the
         // Ruby protocol.
-        schedTimingResp(pkt, curTick());
+        if (pkt->isRead() && !pkt->isWrite() && !pkt->fromCache()) {
+            // send the response right away so the load sees a fixed latency
+            respQueue.sendTiming(pkt);
+        } else {
+            schedTimingResp(pkt, curTick());
+        }
     } else {
         delete pkt;
     }
diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc
index 8f6213b70c..0b442ad1f5 100644
--- a/src/mem/ruby/system/Sequencer.cc
+++ b/src/mem/ruby/system/Sequencer.cc
@@ -383,7 +383,7 @@ Sequencer::insertRequest(PacketPtr pkt, RubyRequestType primary_type,
     if (seq_req_list.size() > 1) {
         if (cache_block_busy) {
-            if (pkt->isRead()) {
+            if (pkt->isRead() && !pkt->isWrite()) {
                 DPRINTF(RubySequencer,
                         "Pkt %#lx %s is delayed because blk is busy doing ruby stuff\n",
                         pkt, pkt->cmdString());
                 ruby_custom_signal_callback(pkt);
@@ -649,7 +649,7 @@ Sequencer::notifyMissCallback(Addr address, bool is_upgrade, bool is_busy)
 
     // cancel pending loads' speculation
     for (auto &seq_req: seq_req_list) {
-        if (seq_req.pkt->isRead()) {
+        if (seq_req.pkt->isRead() && !seq_req.pkt->isWrite()) {
             ruby_custom_signal_callback(seq_req.pkt);
             stat.loadcancel++;
         }
@@ -693,7 +693,7 @@ Sequencer::TBEFullCancel(Addr address)
 
     // cancel pending loads' speculation
     for (auto &seq_req: seq_req_list) {
-        if (seq_req.pkt->isRead()) {
+        if (seq_req.pkt->isRead() && !seq_req.pkt->isWrite()) {
             ruby_custom_signal_callback(seq_req.pkt);
             stat.loadcancel++;
         }
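
Several checks above tighten "is a load" from isRead() to isRead() && !isWrite(). Packets for atomic read-modify-write commands carry both the read and the write attribute, so without the extra term the cancel and early-wakeup hints meant for plain loads would also fire for AMOs. The self-contained illustration below (FakePkt is a made-up stand-in, not gem5's Packet class) shows which packets the tightened predicate accepts.

// Stand-alone illustration of the isRead() && !isWrite() filter.
#include <cstdio>

struct FakePkt { bool read; bool write; };

// The tightened predicate: a "plain load" reads but does not also write.
static bool isPlainLoad(const FakePkt &p) { return p.read && !p.write; }

int main()
{
    FakePkt load{true, false};    // e.g. a ReadReq
    FakePkt store{false, true};   // e.g. a WriteReq
    FakePkt amo{true, true};      // read-modify-write: both attributes set

    std::printf("load  -> %d (hint applies)\n", isPlainLoad(load));
    std::printf("store -> %d\n", isPlainLoad(store));
    std::printf("amo   -> %d (now filtered out)\n", isPlainLoad(amo));
    return 0;
}
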