Skip to content

Commit

Permalink
mem: align cache latencies using maprobe
Browse files Browse the repository at this point in the history
Change-Id: I9f07d78d2a205a4962ed83d3ac1f004274e14b5f
  • Loading branch information
tastynoob committed Dec 3, 2024
1 parent 3c1393c commit 40c6841
Show file tree
Hide file tree
Showing 12 changed files with 153 additions and 37 deletions.
16 changes: 4 additions & 12 deletions configs/common/CacheConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,16 +122,16 @@ def config_cache(options, system):
# same clock as the CPUs.
system.l2_caches = [l2_cache_class(clk_domain=system.cpu_clk_domain,
**_get_cache_opts('l2', options)) for i in range(options.num_cpus)]
system.tol2bus_list = [L2XBar(
clk_domain=system.cpu_clk_domain, width=256) for i in range(options.num_cpus)]
system.tol2bus_list = [L1ToL2Bus(
clk_domain=system.cpu_clk_domain) for i in range(options.num_cpus)]
for i in range(options.num_cpus):
# system.l2_caches.append(l2_cache_class(clk_domain=system.cpu_clk_domain,
# **_get_cache_opts('l2', options)))

# system.tol2bus_list.append(L2XBar(clk_domain = system.cpu_clk_domain, width=256))
system.l2_caches[i].cpu_side = system.tol2bus_list[i].mem_side_ports
system.tol2bus_list[i].snoop_filter.max_capacity = "16MB"
if options.kmh_align:
if system.l2_caches[i].prefetcher != NULL and options.kmh_align:
assert options.l2_hwp_type == 'L2CompositeWithWorkerPrefetcher'
system.l2_caches[i].prefetcher.enable_cmc = True
system.l2_caches[i].prefetcher.enable_bop = True
Expand Down Expand Up @@ -162,18 +162,10 @@ def config_cache(options, system):
system.l2_caches[i].response_latency = 66
system.l2_caches[i].writeback_clean = False

system.membus.frontend_latency = 0
system.membus.response_latency = 0
system.membus.forward_latency = 0
system.membus.header_latency = 0
system.membus.snoop_response_latency = 0
system.membus.width = 128 # byte per cycle


if options.l3cache:
system.l3 = L3Cache(clk_domain=system.cpu_clk_domain,
**_get_cache_opts('l3', options))
system.tol3bus = L2XBar(clk_domain=system.cpu_clk_domain, width=256)
system.tol3bus = L2ToL3Bus(clk_domain=system.cpu_clk_domain)
system.tol3bus.snoop_filter.max_capacity = "32MB"
system.l3.cpu_side = system.tol3bus.mem_side_ports
system.l3.mem_side = system.membus.cpu_side_ports
Expand Down
102 changes: 83 additions & 19 deletions configs/common/Caches.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,24 +66,26 @@ class L1_ICache(L1Cache):
data_latency = 1
sequential_access = False

response_latency = 4
response_latency = 0
enable_wayprediction = False

class L1_DCache(L1Cache):
mshrs = 32
mshrs = 16

writeback_clean = False

# aligned latency:
tag_latency = 1
data_latency = 1
sequential_access = False
# This is communication latency between l1 & l2
response_latency = 4
# recvTimingResp serviceMSHR latency, not really response latency
response_latency = 0

force_hit = False

demand_mshr_reserve = 8
demand_mshr_reserve = 6

enable_wayprediction = False

class L2Cache(Cache):
mshrs = 64
Expand All @@ -93,33 +95,27 @@ class L2Cache(Cache):
writeback_clean = True

# aligned latency:
tag_latency = 2
data_latency = 13
tag_latency = 1
data_latency = 2
sequential_access = True

# This is communication latency between l2 & l3
response_latency = 15
# recvTimingResp serviceMSHR latency
response_latency = 0

cache_level = 2
enable_wayprediction = False

class L3Cache(Cache):
mshrs = 128
mshrs = 64
tgts_per_mshr = 20
clusivity='mostly_excl'
writeback_clean = False

# aligned latency:
tag_latency = 2
data_latency = 17
data_latency = 5
sequential_access = True

# This is L3 miss latency, which act as padding for memory controller

# 5950x L3 miss latency (rand) = 70ns, L3 hit latency = 15ns, so Mem-->L3 = 55ns (165 cycle in 3GHz)
# But XS's miss latency on mcf (less random) is averagely 211 on padding=112.
# To make XS's L3 miss latency similar to 5950x, we reduce padding from 112 to (112 - (211-165)) = 66 cycle
response_latency = 66
# recvTimingResp serviceMSHR latency
response_latency = 0

cache_level = 3
enable_wayprediction = False
Expand Down Expand Up @@ -152,3 +148,71 @@ class PageTableWalkerCache(Cache):
is_read_only = True
# Writeback clean lines as well
writeback_clean = True

class L1ToL2Bus(CoherentXBar):
    """Coherent crossbar between the L1 caches and a private L2.

    Latencies here model the L1<->L2 communication delay explicitly,
    since the caches' own response_latency values are set to 0.
    """

    # Bus width in bytes per cycle. NOTE(review): the previous "256-bit
    # crossbar" comment was stale — 64 bytes/cycle is half a cache line
    # per cycle (assuming a 128-byte line; confirm line size).
    width = 64 # half one cacheline

    # Assume that most of this is covered by the cache latencies, with
    # no more than a single pipeline stage for any packet.
    frontend_latency = 0 # l1 -> l2 req additional latency
    forward_latency = 3 # l1 -> l2 req/snoop latency
    response_latency = 3 # l2 -> l1 resp latency
    snoop_response_latency = 1

    # Use a snoop-filter by default, and set the latency to zero as
    # the lookup is assumed to overlap with the frontend latency of
    # the crossbar
    snoop_filter = SnoopFilter(lookup_latency = 0)

    # This specialisation of the coherent crossbar is to be considered
    # the point of unification, it connects the dcache and the icache
    # to the first level of unified cache.
    point_of_unification = True

class L2ToL3Bus(CoherentXBar):
    """Coherent crossbar between the per-core L2 caches and the shared L3.

    Latencies here model the L2<->L3 communication delay explicitly,
    since the caches' own response_latency values are set to 0.
    """

    # Bus width in bytes per cycle. NOTE(review): the previous "256-bit
    # crossbar" comment was stale — width is 64 bytes/cycle.
    width = 64

    # Assume that most of this is covered by the cache latencies, with
    # no more than a single pipeline stage for any packet.
    frontend_latency = 0
    forward_latency = 5 # l2 -> l3 req/snoop latency
    response_latency = 5 # l3 -> l2 resp latency
    snoop_response_latency = 1

    # Use a snoop-filter by default, and set the latency to zero as
    # the lookup is assumed to overlap with the frontend latency of
    # the crossbar
    snoop_filter = SnoopFilter(lookup_latency = 0)

    # Marked as a point of unification like the L1-to-L2 bus.
    # NOTE(review): the "connects the dcache and the icache" wording was
    # copied from the L1-side bus; this bus connects L2s to the L3 —
    # confirm point_of_unification is intended here as well.
    point_of_unification = True

class L3ToMemBus(CoherentXBar):
    """Coherent crossbar between the L3 cache and the memory controllers.

    The large forward/response latencies act as padding that models the
    L3-miss-to-memory round trip, since the L3's own response_latency is 0.
    """

    # Bus width in bytes per cycle. NOTE(review): the previous "128-bit
    # crossbar" comment was stale — width is 64 bytes/cycle.
    width = 64

    # A handful pipeline stages for each portion of the latency
    # contributions.
    frontend_latency = 0
    forward_latency = 30 # l3 -> mem req latency (miss padding)
    response_latency = 30 # mem -> l3 resp latency (miss padding)
    snoop_response_latency = 4

    # Use a snoop-filter by default
    snoop_filter = SnoopFilter(lookup_latency = 1)

    # This specialisation of the coherent crossbar is to be considered
    # the point of coherency, as there are no (coherent) downstream
    # caches.
    point_of_coherency = True

    # This specialisation of the coherent crossbar is to be considered
    # the point of unification, it connects the dcache and the icache
    # to the first level of unified cache. This is needed for systems
    # without caches where the SystemXBar is also the point of
    # unification.
    point_of_unification = True
3 changes: 2 additions & 1 deletion configs/common/FSConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from m5.util import *
from common.Benchmarks import *
from common import ObjectList
from common.Caches import *

# Populate to reflect supported os types per target ISA
os_types = { 'mips' : [ 'linux' ],
Expand All @@ -64,7 +65,7 @@ class CowIdeDisk(IdeDisk):
def childImage(self, ci):
self.image.child.image_file = ci

class MemBus(SystemXBar):
class MemBus(L3ToMemBus):
badaddr_responder = BadAddr()
default = Self.badaddr_responder.pio

Expand Down
4 changes: 2 additions & 2 deletions src/cpu/o3/FuncUnitConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,8 @@ class ReadPort(FUDesc):
OpDesc(opClass='VectorWholeRegisterLoad', opLat=3)]

class WritePort(FUDesc):
opList = [ OpDesc(opClass='MemWrite', opLat=2),
OpDesc(opClass='FloatMemWrite', opLat=3),
opList = [ OpDesc(opClass='MemWrite', opLat=4),
OpDesc(opClass='FloatMemWrite', opLat=4),
OpDesc(opClass='VectorUnitStrideStore'),
OpDesc(opClass='VectorSegUnitStrideStore'),
OpDesc(opClass='VectorUnitStrideMaskStore'),
Expand Down
3 changes: 3 additions & 0 deletions src/cpu/o3/lsq.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1237,6 +1237,9 @@ LSQ::LSQRequest::addReq(Addr addr, unsigned size,
_inst->pcState().instAddr(), _inst->contextId(),
std::move(_amo_op));
req->setByteEnable(byte_enable);
if (isLoad()) {
req->setReqSeqNum(_port.allocateLoadReqSeqNum());
}

/* If the request is marked as NO_ACCESS, setup a local access */
if (_flags.isSet(Request::NO_ACCESS)) {
Expand Down
8 changes: 8 additions & 0 deletions src/cpu/o3/lsq.hh
Original file line number Diff line number Diff line change
Expand Up @@ -1033,6 +1033,14 @@ class LSQ

/** Number of Threads. */
ThreadID numThreads;

private:
uint64_t ldreq_seqNum=0;
public:
uint64_t allocateLoadReqSeqNum() {
ldreq_seqNum++;
return ldreq_seqNum;
}
};

} // namespace o3
Expand Down
25 changes: 22 additions & 3 deletions src/cpu/o3/lsq_unit.cc
Original file line number Diff line number Diff line change
Expand Up @@ -103,14 +103,17 @@ StoreBufferEntry::recordForward(PacketPtr pkt, LSQ::LSQRequest *req)
if (goffset > 0) {
assert(offset == 0);
}
bool partial_forward = false;
for (int i = 0; i < pkt->getSize(); i++) {
if (validMask[offset + i]) {
assert(goffset + i < req->_size);
req->forwardPackets.push_back(
LSQ::LSQRequest::FWDPacket{.idx = goffset + i, .byte = blockDatas[offset + i]});
} else {
partial_forward = true;
}
}
return false;
return !partial_forward;
}

void
Expand Down Expand Up @@ -557,6 +560,8 @@ LSQUnit::LSQUnitStats::LSQUnitStats(statistics::Group *parent)
ADD_STAT(sbufferEvictDuetoFull, statistics::units::Count::get(), ""),
ADD_STAT(sbufferEvictDuetoSQFull, statistics::units::Count::get(), ""),
ADD_STAT(sbufferEvictDuetoTimeout, statistics::units::Count::get(), ""),
ADD_STAT(sbufferPartialForward, statistics::units::Count::get(), ""),
ADD_STAT(sbufferFullyForward, statistics::units::Count::get(), ""),
ADD_STAT(loadToUse, "Distribution of cycle latency between the "
"first time a load is issued and its completion"),
ADD_STAT(loadTranslationLat, "Distribution of cycle latency between the "
Expand Down Expand Up @@ -1853,11 +1858,19 @@ LSQUnit::trySendPacket(bool isLoad, PacketPtr data_pkt, bool &bank_conflict, boo
if (isLoad) {
auto entry = storeBuffer.get(pkt->getAddr() & cacheBlockMask);
if (entry) {
bool fullyforward = false;
DPRINTF(StoreBuffer, "sbuffer entry[%#x] coverage %s\n", entry->blockPaddr, pkt->print());
entry->recordForward(pkt, request);
fullyforward |= entry->recordForward(pkt, request);
if (entry->vice) {
DPRINTF(StoreBuffer, "sbuffer vice entry coverage\n");
entry->vice->recordForward(pkt, request);
fullyforward |= entry->vice->recordForward(pkt, request);
}
if (request->forwardPackets.size() > 0) {
if (fullyforward) {
stats.sbufferFullyForward++;
} else {
stats.sbufferPartialForward++;
}
}
}
}
Expand Down Expand Up @@ -1983,6 +1996,12 @@ void LSQUnit::schedule(Event& ev, Tick when) { cpu->schedule(ev, when); }

BaseMMU *LSQUnit::getMMUPtr() { return cpu->mmu; }

// Allocate a load-request sequence number. Delegates to the enclosing
// LSQ so the counter is shared by all LSQ units (numbers are unique
// across units, not per unit).
uint64_t
LSQUnit::allocateLoadReqSeqNum()
{
return lsq->allocateLoadReqSeqNum();
}

unsigned int
LSQUnit::cacheLineSize()
{
Expand Down
4 changes: 4 additions & 0 deletions src/cpu/o3/lsq_unit.hh
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,8 @@ class LSQUnit

BaseMMU *getMMUPtr();

uint64_t allocateLoadReqSeqNum();

private:
System *system;

Expand Down Expand Up @@ -685,6 +687,8 @@ class LSQUnit
statistics::Scalar sbufferEvictDuetoFull;
statistics::Scalar sbufferEvictDuetoSQFull;
statistics::Scalar sbufferEvictDuetoTimeout;
statistics::Scalar sbufferPartialForward;
statistics::Scalar sbufferFullyForward;

/** Distribution of cycle latency between the first time a load
* is issued and its completion */
Expand Down
1 change: 1 addition & 0 deletions src/mem/cache/SConscript
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ DebugFlag('HWPrefetchOther')
DebugFlag('StridePrefetcher')
DebugFlag('MSHR')
DebugFlag('HWPrefetchQueue')
DebugFlag('CPUReqTrace')

# CacheTags is so outrageously verbose, printing the cache's entire tag
# array on each timing access, that you should probably have to ask for
Expand Down
13 changes: 13 additions & 0 deletions src/mem/cache/base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
#include "base/trace.hh"
#include "base/types.hh"
#include "debug/ArchDB.hh"
#include "debug/CPUReqTrace.hh"
#include "debug/Cache.hh"
#include "debug/CacheComp.hh"
#include "debug/CachePort.hh"
Expand Down Expand Up @@ -518,6 +519,11 @@ BaseCache::tryAccessTag(PacketPtr pkt)
void
BaseCache::recvTimingReq(PacketPtr pkt)
{
if (debug::CPUReqTrace) [[unlikely]] {
if (pkt->req->getSeqNum()) {
DPRINTF(CPUReqTrace, "[reqTrace %llu] request at L%d\n", pkt->req->getSeqNum(), cacheLevel);
}
}

if (pkt->isStorePFTrain()) {
// send store prefetch train request
Expand Down Expand Up @@ -738,6 +744,13 @@ BaseCache::handleUncacheableWriteResp(PacketPtr pkt)
void
BaseCache::recvTimingResp(PacketPtr pkt)
{

if (debug::CPUReqTrace) [[unlikely]] {
if (pkt->req->getSeqNum()) {
DPRINTF(CPUReqTrace, "[reqTrace %llu] response at L%d\n", pkt->req->getSeqNum(), cacheLevel);
}
}

assert(pkt->isResponse());

// all header delay should be paid for by the crossbar, unless
Expand Down
1 change: 1 addition & 0 deletions src/mem/packet_queue.cc
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ PacketQueue::schedSendEvent(Tick when)
for (auto it = transmitList.begin(); it != transmitList.end(); ++it) {
if (it->pkt->isSendRightAway()) {
sendRightAway = true;
break;
}
}

Expand Down
10 changes: 10 additions & 0 deletions src/mem/request.hh
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,8 @@ class Request

bool _isHInst = false;

uint64_t seqNum = 0;

public:

/**
Expand Down Expand Up @@ -655,6 +657,14 @@ class Request
reqNum = num;
}

    /**
     * Attach a load-request sequence number to this request, used by the
     * CPUReqTrace debug flag to trace a request through the cache levels.
     *
     * @param sn Sequence number; 0 is the default and means "unset".
     */
    void setReqSeqNum(uint64_t sn) {
        seqNum = sn;
    }

    /**
     * @return The load-request sequence number, or 0 if none was set.
     *
     * NOTE(review): this accessor does not modify state and could be
     * declared const — confirm and tighten in a follow-up.
     */
    uint64_t getSeqNum() {
        return seqNum;
    }

/**
* Set up Context numbers.
*/
Expand Down

0 comments on commit 40c6841

Please sign in to comment.