Skip to content

Commit

Permalink
mem: align cache latencies using maprobe
Browse files Browse the repository at this point in the history
Change-Id: I9f07d78d2a205a4962ed83d3ac1f004274e14b5f
  • Loading branch information
tastynoob committed Dec 3, 2024
1 parent 3c1393c commit 40c6841
Show file tree
Hide file tree
Showing 12 changed files with 153 additions and 37 deletions.
16 changes: 4 additions & 12 deletions configs/common/CacheConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,16 +122,16 @@ def config_cache(options, system):
# same clock as the CPUs.
system.l2_caches = [l2_cache_class(clk_domain=system.cpu_clk_domain,
**_get_cache_opts('l2', options)) for i in range(options.num_cpus)]
system.tol2bus_list = [L2XBar(
clk_domain=system.cpu_clk_domain, width=256) for i in range(options.num_cpus)]
system.tol2bus_list = [L1ToL2Bus(
clk_domain=system.cpu_clk_domain) for i in range(options.num_cpus)]
for i in range(options.num_cpus):
# system.l2_caches.append(l2_cache_class(clk_domain=system.cpu_clk_domain,
# **_get_cache_opts('l2', options)))

# system.tol2bus_list.append(L2XBar(clk_domain = system.cpu_clk_domain, width=256))
system.l2_caches[i].cpu_side = system.tol2bus_list[i].mem_side_ports
system.tol2bus_list[i].snoop_filter.max_capacity = "16MB"
if options.kmh_align:
if system.l2_caches[i].prefetcher != NULL and options.kmh_align:
assert options.l2_hwp_type == 'L2CompositeWithWorkerPrefetcher'
system.l2_caches[i].prefetcher.enable_cmc = True
system.l2_caches[i].prefetcher.enable_bop = True
Expand Down Expand Up @@ -162,18 +162,10 @@ def config_cache(options, system):
system.l2_caches[i].response_latency = 66
system.l2_caches[i].writeback_clean = False

system.membus.frontend_latency = 0
system.membus.response_latency = 0
system.membus.forward_latency = 0
system.membus.header_latency = 0
system.membus.snoop_response_latency = 0
system.membus.width = 128 # byte per cycle


if options.l3cache:
system.l3 = L3Cache(clk_domain=system.cpu_clk_domain,
**_get_cache_opts('l3', options))
system.tol3bus = L2XBar(clk_domain=system.cpu_clk_domain, width=256)
system.tol3bus = L2ToL3Bus(clk_domain=system.cpu_clk_domain)
system.tol3bus.snoop_filter.max_capacity = "32MB"
system.l3.cpu_side = system.tol3bus.mem_side_ports
system.l3.mem_side = system.membus.cpu_side_ports
Expand Down
102 changes: 83 additions & 19 deletions configs/common/Caches.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,24 +66,26 @@ class L1_ICache(L1Cache):
data_latency = 1
sequential_access = False

response_latency = 4
response_latency = 0
enable_wayprediction = False

class L1_DCache(L1Cache):
mshrs = 32
mshrs = 16

writeback_clean = False

# aligned latency:
tag_latency = 1
data_latency = 1
sequential_access = False
# This is communication latency between l1 & l2
response_latency = 4
# recvTimingResp serviceMSHR latency, not really response latency
response_latency = 0

force_hit = False

demand_mshr_reserve = 8
demand_mshr_reserve = 6

enable_wayprediction = False

class L2Cache(Cache):
mshrs = 64
Expand All @@ -93,33 +95,27 @@ class L2Cache(Cache):
writeback_clean = True

# aligned latency:
tag_latency = 2
data_latency = 13
tag_latency = 1
data_latency = 2
sequential_access = True

# This is communication latency between l2 & l3
response_latency = 15
# recvTimingResp serviceMSHR latency
response_latency = 0

cache_level = 2
enable_wayprediction = False

class L3Cache(Cache):
mshrs = 128
mshrs = 64
tgts_per_mshr = 20
clusivity='mostly_excl'
writeback_clean = False

# aligned latency:
tag_latency = 2
data_latency = 17
data_latency = 5
sequential_access = True

# This is L3 miss latency, which act as padding for memory controller

# 5950x L3 miss latency (rand) = 70ns, L3 hit latency = 15ns, so Mem-->L3 = 55ns (165 cycle in 3GHz)
# But XS's miss latency on mcf (less random) is averagely 211 on padding=112.
# To make XS's L3 miss latency similar to 5950x, we reduce padding from 112 to (112 - (211-165)) = 66 cycle
response_latency = 66
# recvTimingResp serviceMSHR latency
response_latency = 0

cache_level = 3
enable_wayprediction = False
Expand Down Expand Up @@ -152,3 +148,71 @@ class PageTableWalkerCache(Cache):
is_read_only = True
# Writeback clean lines as well
writeback_clean = True

class L1ToL2Bus(CoherentXBar):
    """Coherent crossbar between the L1 caches and a private L2.

    Latencies here model the L1<->L2 communication delay explicitly,
    since the caches' own response_latency values are set to 0.
    """

    # Bus width in bytes per cycle. NOTE(review): the previous "256-bit
    # crossbar" comment was stale — 64 bytes/cycle is half a cache line
    # per cycle (assuming a 128-byte line; confirm line size).
    width = 64 # half one cacheline

    # Assume that most of this is covered by the cache latencies, with
    # no more than a single pipeline stage for any packet.
    frontend_latency = 0 # l1 -> l2 req additional latency
    forward_latency = 3 # l1 -> l2 req/snoop latency
    response_latency = 3 # l2 -> l1 resp latency
    snoop_response_latency = 1

    # Use a snoop-filter by default, and set the latency to zero as
    # the lookup is assumed to overlap with the frontend latency of
    # the crossbar
    snoop_filter = SnoopFilter(lookup_latency = 0)

    # This specialisation of the coherent crossbar is to be considered
    # the point of unification, it connects the dcache and the icache
    # to the first level of unified cache.
    point_of_unification = True

class L2ToL3Bus(CoherentXBar):
    """Coherent crossbar between the per-core L2 caches and the shared L3.

    Latencies here model the L2<->L3 communication delay explicitly,
    since the caches' own response_latency values are set to 0.
    """

    # Bus width in bytes per cycle. NOTE(review): the previous "256-bit
    # crossbar" comment was stale — width is 64 bytes/cycle.
    width = 64

    # Assume that most of this is covered by the cache latencies, with
    # no more than a single pipeline stage for any packet.
    frontend_latency = 0
    forward_latency = 5 # l2 -> l3 req/snoop latency
    response_latency = 5 # l3 -> l2 resp latency
    snoop_response_latency = 1

    # Use a snoop-filter by default, and set the latency to zero as
    # the lookup is assumed to overlap with the frontend latency of
    # the crossbar
    snoop_filter = SnoopFilter(lookup_latency = 0)

    # Marked as a point of unification like the L1-to-L2 bus.
    # NOTE(review): the "connects the dcache and the icache" wording was
    # copied from the L1-side bus; this bus connects L2s to the L3 —
    # confirm point_of_unification is intended here as well.
    point_of_unification = True

class L3ToMemBus(CoherentXBar):
    """Coherent crossbar between the L3 cache and the memory controllers.

    The large forward/response latencies act as padding that models the
    L3-miss-to-memory round trip, since the L3's own response_latency is 0.
    """

    # Bus width in bytes per cycle. NOTE(review): the previous "128-bit
    # crossbar" comment was stale — width is 64 bytes/cycle.
    width = 64

    # A handful pipeline stages for each portion of the latency
    # contributions.
    frontend_latency = 0
    forward_latency = 30 # l3 -> mem req latency (miss padding)
    response_latency = 30 # mem -> l3 resp latency (miss padding)
    snoop_response_latency = 4

    # Use a snoop-filter by default
    snoop_filter = SnoopFilter(lookup_latency = 1)

    # This specialisation of the coherent crossbar is to be considered
    # the point of coherency, as there are no (coherent) downstream
    # caches.
    point_of_coherency = True

    # This specialisation of the coherent crossbar is to be considered
    # the point of unification, it connects the dcache and the icache
    # to the first level of unified cache. This is needed for systems
    # without caches where the SystemXBar is also the point of
    # unification.
    point_of_unification = True
3 changes: 2 additions & 1 deletion configs/common/FSConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from m5.util import *
from common.Benchmarks import *
from common import ObjectList
from common.Caches import *

# Populate to reflect supported os types per target ISA
os_types = { 'mips' : [ 'linux' ],
Expand All @@ -64,7 +65,7 @@ class CowIdeDisk(IdeDisk):
def childImage(self, ci):
self.image.child.image_file = ci

class MemBus(SystemXBar):
class MemBus(L3ToMemBus):
badaddr_responder = BadAddr()
default = Self.badaddr_responder.pio

Expand Down
4 changes: 2 additions & 2 deletions src/cpu/o3/FuncUnitConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,8 @@ class ReadPort(FUDesc):
OpDesc(opClass='VectorWholeRegisterLoad', opLat=3)]

class WritePort(FUDesc):
opList = [ OpDesc(opClass='MemWrite', opLat=2),
OpDesc(opClass='FloatMemWrite', opLat=3),
opList = [ OpDesc(opClass='MemWrite', opLat=4),
OpDesc(opClass='FloatMemWrite', opLat=4),
OpDesc(opClass='VectorUnitStrideStore'),
OpDesc(opClass='VectorSegUnitStrideStore'),
OpDesc(opClass='VectorUnitStrideMaskStore'),
Expand Down
3 changes: 3 additions & 0 deletions src/cpu/o3/lsq.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1237,6 +1237,9 @@ LSQ::LSQRequest::addReq(Addr addr, unsigned size,
_inst->pcState().instAddr(), _inst->contextId(),
std::move(_amo_op));
req->setByteEnable(byte_enable);
if (isLoad()) {
req->setReqSeqNum(_port.allocateLoadReqSeqNum());
}

/* If the request is marked as NO_ACCESS, setup a local access */
if (_flags.isSet(Request::NO_ACCESS)) {
Expand Down
8 changes: 8 additions & 0 deletions src/cpu/o3/lsq.hh
Original file line number Diff line number Diff line change
Expand Up @@ -1033,6 +1033,14 @@ class LSQ

/** Number of Threads. */
ThreadID numThreads;

private:
uint64_t ldreq_seqNum=0;
public:
uint64_t allocateLoadReqSeqNum() {
ldreq_seqNum++;
return ldreq_seqNum;
}
};

} // namespace o3
Expand Down
25 changes: 22 additions & 3 deletions src/cpu/o3/lsq_unit.cc
Original file line number Diff line number Diff line change
Expand Up @@ -103,14 +103,17 @@ StoreBufferEntry::recordForward(PacketPtr pkt, LSQ::LSQRequest *req)
if (goffset > 0) {
assert(offset == 0);
}
bool partial_forward = false;
for (int i = 0; i < pkt->getSize(); i++) {
if (validMask[offset + i]) {
assert(goffset + i < req->_size);
req->forwardPackets.push_back(
LSQ::LSQRequest::FWDPacket{.idx = goffset + i, .byte = blockDatas[offset + i]});
} else {
partial_forward = true;
}
}
return false;
return !partial_forward;
}

void
Expand Down Expand Up @@ -557,6 +560,8 @@ LSQUnit::LSQUnitStats::LSQUnitStats(statistics::Group *parent)
ADD_STAT(sbufferEvictDuetoFull, statistics::units::Count::get(), ""),
ADD_STAT(sbufferEvictDuetoSQFull, statistics::units::Count::get(), ""),
ADD_STAT(sbufferEvictDuetoTimeout, statistics::units::Count::get(), ""),
ADD_STAT(sbufferPartialForward, statistics::units::Count::get(), ""),
ADD_STAT(sbufferFullyForward, statistics::units::Count::get(), ""),
ADD_STAT(loadToUse, "Distribution of cycle latency between the "
"first time a load is issued and its completion"),
ADD_STAT(loadTranslationLat, "Distribution of cycle latency between the "
Expand Down Expand Up @@ -1853,11 +1858,19 @@ LSQUnit::trySendPacket(bool isLoad, PacketPtr data_pkt, bool &bank_conflict, boo
if (isLoad) {
auto entry = storeBuffer.get(pkt->getAddr() & cacheBlockMask);
if (entry) {
bool fullyforward = false;
DPRINTF(StoreBuffer, "sbuffer entry[%#x] coverage %s\n", entry->blockPaddr, pkt->print());
entry->recordForward(pkt, request);
fullyforward |= entry->recordForward(pkt, request);
if (entry->vice) {
DPRINTF(StoreBuffer, "sbuffer vice entry coverage\n");
entry->vice->recordForward(pkt, request);
fullyforward |= entry->vice->recordForward(pkt, request);
}
if (request->forwardPackets.size() > 0) {
if (fullyforward) {
stats.sbufferFullyForward++;
} else {
stats.sbufferPartialForward++;
}
}
}
}
Expand Down Expand Up @@ -1983,6 +1996,12 @@ void LSQUnit::schedule(Event& ev, Tick when) { cpu->schedule(ev, when); }

BaseMMU *LSQUnit::getMMUPtr() { return cpu->mmu; }

// Allocate a load-request sequence number. Delegates to the enclosing
// LSQ so the counter is shared by all LSQ units (numbers are unique
// across units, not per unit).
uint64_t
LSQUnit::allocateLoadReqSeqNum()
{
return lsq->allocateLoadReqSeqNum();
}

unsigned int
LSQUnit::cacheLineSize()
{
Expand Down
4 changes: 4 additions & 0 deletions src/cpu/o3/lsq_unit.hh
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,8 @@ class LSQUnit

BaseMMU *getMMUPtr();

uint64_t allocateLoadReqSeqNum();

private:
System *system;

Expand Down Expand Up @@ -685,6 +687,8 @@ class LSQUnit
statistics::Scalar sbufferEvictDuetoFull;
statistics::Scalar sbufferEvictDuetoSQFull;
statistics::Scalar sbufferEvictDuetoTimeout;
statistics::Scalar sbufferPartialForward;
statistics::Scalar sbufferFullyForward;

/** Distribution of cycle latency between the first time a load
* is issued and its completion */
Expand Down
1 change: 1 addition & 0 deletions src/mem/cache/SConscript
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ DebugFlag('HWPrefetchOther')
DebugFlag('StridePrefetcher')
DebugFlag('MSHR')
DebugFlag('HWPrefetchQueue')
DebugFlag('CPUReqTrace')

# CacheTags is so outrageously verbose, printing the cache's entire tag
# array on each timing access, that you should probably have to ask for
Expand Down
13 changes: 13 additions & 0 deletions src/mem/cache/base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
#include "base/trace.hh"
#include "base/types.hh"
#include "debug/ArchDB.hh"
#include "debug/CPUReqTrace.hh"
#include "debug/Cache.hh"
#include "debug/CacheComp.hh"
#include "debug/CachePort.hh"
Expand Down Expand Up @@ -518,6 +519,11 @@ BaseCache::tryAccessTag(PacketPtr pkt)
void
BaseCache::recvTimingReq(PacketPtr pkt)
{
if (debug::CPUReqTrace) [[unlikely]] {
if (pkt->req->getSeqNum()) {
DPRINTF(CPUReqTrace, "[reqTrace %llu] request at L%d\n", pkt->req->getSeqNum(), cacheLevel);
}
}

if (pkt->isStorePFTrain()) {
// send store prefetch train request
Expand Down Expand Up @@ -738,6 +744,13 @@ BaseCache::handleUncacheableWriteResp(PacketPtr pkt)
void
BaseCache::recvTimingResp(PacketPtr pkt)
{

if (debug::CPUReqTrace) [[unlikely]] {
if (pkt->req->getSeqNum()) {
DPRINTF(CPUReqTrace, "[reqTrace %llu] response at L%d\n", pkt->req->getSeqNum(), cacheLevel);
}
}

assert(pkt->isResponse());

// all header delay should be paid for by the crossbar, unless
Expand Down
1 change: 1 addition & 0 deletions src/mem/packet_queue.cc
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ PacketQueue::schedSendEvent(Tick when)
for (auto it = transmitList.begin(); it != transmitList.end(); ++it) {
if (it->pkt->isSendRightAway()) {
sendRightAway = true;
break;
}
}

Expand Down
10 changes: 10 additions & 0 deletions src/mem/request.hh
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,8 @@ class Request

bool _isHInst = false;

uint64_t seqNum = 0;

public:

/**
Expand Down Expand Up @@ -655,6 +657,14 @@ class Request
reqNum = num;
}

    /**
     * Attach a load-request sequence number to this request, used by the
     * CPUReqTrace debug flag to trace a request through the cache levels.
     *
     * @param sn Sequence number; 0 is the default and means "unset".
     */
    void setReqSeqNum(uint64_t sn) {
        seqNum = sn;
    }

    /**
     * @return The load-request sequence number, or 0 if none was set.
     *
     * NOTE(review): this accessor does not modify state and could be
     * declared const — confirm and tighten in a follow-up.
     */
    uint64_t getSeqNum() {
        return seqNum;
    }

/**
* Set up Context numbers.
*/
Expand Down

0 comments on commit 40c6841

Please sign in to comment.