cpu-o3: add mem stall topdown #257

Open · wants to merge 2 commits into base: xs-dev
46 changes: 44 additions & 2 deletions src/cpu/o3/fetch.cc
@@ -245,7 +245,23 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch)
ADD_STAT(decodeStallRate, statistics::units::Rate<
statistics::units::Count, statistics::units::Cycle>::get(),
"Number of decode stalls per cycle",
decodeStalls / cpu->baseStats.numCycles)
decodeStalls / cpu->baseStats.numCycles),
ADD_STAT(fetchBubbles, statistics::units::Count::get(),
"Unutilized issue-pipeline slots while there is no backend-stall"),
ADD_STAT(fetchBubbles_max, statistics::units::Count::get(),
"Cycles that fetch 0 instruction while there is no backend-stall"),
ADD_STAT(frontendBound, statistics::units::Rate<
statistics::units::Count, statistics::units::Cycle>::get(),
"Frontend Bound",
fetchBubbles / (cpu->baseStats.numCycles * fetch->decodeWidth)),
ADD_STAT(frontendLatencyBound, statistics::units::Rate<
statistics::units::Count, statistics::units::Cycle>::get(),
"Frontend Latency Bound",
fetchBubbles_max / cpu->baseStats.numCycles),
ADD_STAT(frontendBandwidthBound, statistics::units::Rate<
statistics::units::Count, statistics::units::Cycle>::get(),
"Frontend Bandwidth Bound",
frontendBound - frontendLatencyBound)
{
icacheStallCycles
.prereq(icacheStallCycles);
@@ -320,6 +336,16 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch)
.prereq(decodeStalls);
decodeStallRate
.flags(statistics::total);
fetchBubbles
.prereq(fetchBubbles);
fetchBubbles_max
.prereq(fetchBubbles_max);
frontendBound
.flags(statistics::total);
frontendLatencyBound
.flags(statistics::total);
frontendBandwidthBound
.flags(statistics::total);
}
void
Fetch::setTimeBuffer(TimeBuffer<TimeStruct> *time_buffer)
@@ -1163,6 +1189,9 @@ Fetch::tick()

wroteToTimeBuffer = false;

// get the distribution of fetch status
fetchStats.fetchStatusDist[fetchStatus[0]]++;

for (ThreadID i = 0; i < numThreads; ++i) {
issuePipelinedIfetch[i] = false;
}
@@ -1294,7 +1323,20 @@ Fetch::tick()

toDecode->fetchStallReason = stallReason;

fetchStats.fetchStatusDist[fetchStatus[*tid_itr]]++;
// Intel TopDown method for measuring frontend bubbles.
// Count unutilized issue slots when the backend is not stalled (decode not stalled).
// For an N-wide machine, if the frontend supplies 0 instructions:
// - fetchBubbles += N (count the total number of empty slots)
// - fetchBubbles_max += 1 (count the cycle in which all slots are empty)
if (!stalls[*tid_itr].decode) { // backend not stalled
int unused_slots = decode_width - insts_to_decode;
if (unused_slots > 0) { // has empty slots
fetchStats.fetchBubbles += unused_slots; // add number of empty slots
if (unused_slots == decode_width) { // all slots empty, insts_to_decode == 0
fetchStats.fetchBubbles_max++; // count max bubble occurrence
}
}
}

if (stalls[*tid_itr].decode) {
fetchStats.decodeStalls++;
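Note on the new frontend formulas: frontendBound divides the empty-slot count by the total slot count (numCycles × decodeWidth), frontendLatencyBound divides the all-slots-empty cycle count by numCycles, and frontendBandwidthBound is their difference. A minimal standalone sketch of that arithmetic, using an assumed decodeWidth of 6 and made-up counter values (not taken from this patch):

// Illustrative sketch only; decode width and counter values are hypothetical.
#include <cstdio>

int main()
{
    const double decodeWidth     = 6;        // assumed machine width
    const double numCycles       = 1000000;  // cpu->baseStats.numCycles
    const double fetchBubbles    = 1800000;  // empty decode slots while backend not stalled
    const double fetchBubblesMax = 200000;   // cycles delivering 0 instructions

    double frontendBound          = fetchBubbles / (numCycles * decodeWidth);
    double frontendLatencyBound   = fetchBubblesMax / numCycles;
    double frontendBandwidthBound = frontendBound - frontendLatencyBound;

    std::printf("FrontendBound          = %.3f\n", frontendBound);          // 0.300
    std::printf("FrontendLatencyBound   = %.3f\n", frontendLatencyBound);   // 0.200
    std::printf("FrontendBandwidthBound = %.3f\n", frontendBandwidthBound); // 0.100
    return 0;
}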
10 changes: 10 additions & 0 deletions src/cpu/o3/fetch.hh
@@ -652,6 +652,16 @@ class Fetch
statistics::Scalar decodeStalls;
/** Number of decode stalls per cycle */
statistics::Formula decodeStallRate;
/** Unutilized issue-pipeline slots while there is no backend stall */
statistics::Scalar fetchBubbles;
/** Cycles that fetch 0 instructions while there is no backend stall */
statistics::Scalar fetchBubbles_max;
/** Frontend Bound */
statistics::Formula frontendBound;
/** Frontend Latency Bound */
statistics::Formula frontendLatencyBound;
/** Frontend Bandwidth Bound */
statistics::Formula frontendBandwidthBound;
} fetchStats;

SquashVersion localSquashVer;
2 changes: 1 addition & 1 deletion src/cpu/o3/inst_queue.cc
@@ -123,7 +123,7 @@ InstructionQueue::InstructionQueue(CPU *cpu_ptr, IEW *iew_ptr,
memDepUnit[tid].setIQ(this);
}

scheduler->setCPU(cpu_ptr);
scheduler->setCPU(cpu_ptr, &iew_ptr->ldstQueue);
scheduler->resetDepGraph(numPhysRegs);
scheduler->setMemDepUnit(memDepUnit);

25 changes: 23 additions & 2 deletions src/cpu/o3/issue_queue.cc
@@ -623,6 +623,16 @@ Scheduler::SpecWakeupCompletion::description() const
return "Spec wakeup completion";
}

Scheduler::SchedulerStats::SchedulerStats(statistics::Group* parent)
: statistics::Group(parent),
ADD_STAT(exec_stall_cycle, "Cycles in which fewer than 4 instructions were issued to the FUs"),
ADD_STAT(memstall_any_load, "Issue-stall cycles with at least one in-flight load outstanding"),
ADD_STAT(memstall_l1miss, "Issue-stall cycles with an outstanding load that missed in the L1 cache"),
ADD_STAT(memstall_l2miss, "Issue-stall cycles with an outstanding load that missed in the L2 cache"),
ADD_STAT(memstall_l3miss, "Issue-stall cycles with an outstanding load that missed in the L3 cache")
{
}

bool
Scheduler::disp_policy::operator()(IssueQue* a, IssueQue* b) const
{
@@ -632,7 +642,7 @@ Scheduler::disp_policy::operator()(IssueQue* a, IssueQue* b) const
return p0 < p1;
}

Scheduler::Scheduler(const SchedulerParams& params) : SimObject(params), issueQues(params.IQs)
Scheduler::Scheduler(const SchedulerParams& params) : SimObject(params), stats(this), issueQues(params.IQs)
{
dispTable.resize(enums::OpClass::Num_OpClass);
opExecTimeTable.resize(enums::OpClass::Num_OpClass, 1);
@@ -719,9 +729,10 @@ Scheduler::Scheduler(const SchedulerParams& params) : SimObject(params), issueQu
}

void
Scheduler::setCPU(CPU* cpu)
Scheduler::setCPU(CPU* cpu, LSQ* lsq)
{
this->cpu = cpu;
this->lsq = lsq;
for (auto it : issueQues) {
it->setCPU(cpu);
}
Expand Down Expand Up @@ -764,6 +775,16 @@ Scheduler::issueAndSelect()
for (auto it : issueQues) {
it->issueToFu();
}
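// TopDown-style backend accounting for the new scheduler stats: a cycle in
// which fewer than 4 instructions issue to the FUs counts as an execution
// stall; if nothing issues at all and loads are still in flight, the stall is
// attributed to the memory hierarchy according to how deep the loads missed.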
if (instsToFu.size() < 4) {
stats.exec_stall_cycle++;
}
if (instsToFu.size() == 0) {
if (lsq->anyInflightLoadsNotComplete()) stats.memstall_any_load++;
if (lsq->anyInflightLoadsNotComplete(1)) stats.memstall_l1miss++;
if (lsq->anyInflightLoadsNotComplete(2)) stats.memstall_l2miss++;
if (lsq->anyInflightLoadsNotComplete(3)) stats.memstall_l3miss++;
}

// must wait until all insts have been issued
for (auto it : issueQues) {
it->selectInst();
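The memstall counters added above are nested (a load that misses in L3 also missed in L2 and L1), so TopDown-style exclusive buckets have to be derived afterwards. A hedged post-processing sketch with hypothetical counter values; the subtraction scheme is an assumption about how the counters would typically be consumed, not something this patch implements:

// Illustrative post-processing sketch; counter values are hypothetical.
#include <cstdio>

int main()
{
    const double numCycles       = 1000000;
    const double memstallAnyLoad = 400000;  // >= memstall_l1miss
    const double memstallL1miss  = 250000;  // >= memstall_l2miss
    const double memstallL2miss  = 120000;  // >= memstall_l3miss
    const double memstallL3miss  =  60000;

    // Exclusive buckets: stall cycles attributed to the level that served the load.
    double l1Bound  = (memstallAnyLoad - memstallL1miss) / numCycles; // served by L1
    double l2Bound  = (memstallL1miss  - memstallL2miss) / numCycles; // served by L2
    double l3Bound  = (memstallL2miss  - memstallL3miss) / numCycles; // served by L3
    double memBound =  memstallL3miss / numCycles;                    // served by memory

    std::printf("L1 %.3f  L2 %.3f  L3 %.3f  Mem %.3f\n",
                l1Bound, l2Bound, l3Bound, memBound);
    return 0;
}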
13 changes: 12 additions & 1 deletion src/cpu/o3/issue_queue.hh
@@ -204,6 +204,17 @@ class Scheduler : public SimObject

CPU* cpu;
MemDepUnit* memDepUnit;
LSQ* lsq;

struct SchedulerStats : public statistics::Group
{
SchedulerStats(statistics::Group* parent);
statistics::Scalar exec_stall_cycle;
statistics::Scalar memstall_any_load;
statistics::Scalar memstall_l1miss;
statistics::Scalar memstall_l2miss;
statistics::Scalar memstall_l3miss;
} stats;

struct disp_policy
{
@@ -246,7 +257,7 @@

public:
Scheduler(const SchedulerParams& params);
void setCPU(CPU* cpu);
void setCPU(CPU* cpu, LSQ* lsq);
void resetDepGraph(uint64_t numPhysRegs);
void setMemDepUnit(MemDepUnit* memDepUnit) { this->memDepUnit = memDepUnit; }

21 changes: 21 additions & 0 deletions src/cpu/o3/lsq.cc
@@ -381,6 +381,16 @@ int LSQ::getCount(ThreadID tid) { return thread.at(tid).getCount(); }

int LSQ::numLoads(ThreadID tid) { return thread.at(tid).numLoads(); }

bool LSQ::anyInflightLoadsNotComplete(int miss_level)
{
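// Reports whether any of thread 0's in-flight loads still has an outstanding
// request that has gone at least `miss_level` levels down the cache hierarchy;
// the default miss_level of -1 matches any outstanding load. This assumes
// mainReq()->depth records how deep the access has travelled.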
for (auto it : thread.at(0).inflightLoads) {
if (it->isAnyOutstandingRequest() && (it->mainReq()->depth >= miss_level)) {
return true;
}
}
return false;
}

int LSQ::numStores(ThreadID tid) { return thread.at(tid).numStores(); }

int
@@ -1352,6 +1362,12 @@ LSQ::SingleDataRequest::recvTimingResp(PacketPtr pkt)
// Dump inst num, request addr, and packet addr
DPRINTF(LSQ, "Single Req::recvTimingResp: inst: %llu, pkt: %#lx\n", pkt->req->getReqInstSeqNum(),
pkt->getAddr());
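// The load's response has arrived, so drop it from the in-flight list used by
// the scheduler's mem-stall accounting.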
if (isLoad()) {
auto it = std::find(lsqUnit()->inflightLoads.begin(), lsqUnit()->inflightLoads.end(), this);
if (it != lsqUnit()->inflightLoads.end()) {
lsqUnit()->inflightLoads.erase(it);
}
}
assert(_numOutstandingPackets == 1);
flags.set(Flag::Complete);
assert(pkt == _packets.front());
@@ -1520,6 +1536,11 @@ LSQ::SingleDataRequest::sendPacketToCache()
bool tag_read_fail = false;
bool success = lsqUnit()->trySendPacket(isLoad(), _packets.at(0), bank_conflict, tag_read_fail);
if (success) {
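// Track the load as in-flight once its packet is accepted by the cache; it is
// removed again in recvTimingResp() or when the load is squashed.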
if (isLoad()) {
assert(lsqUnit()->inflightLoads.size() < lsqUnit()->numLoads());
lsqUnit()->inflightLoads.emplace_back(this);
}

if (!bank_conflict) {
_numOutstandingPackets = 1;
}
38 changes: 20 additions & 18 deletions src/cpu/o3/lsq.hh
@@ -283,24 +283,6 @@ class LSQ
uint64_t* res=nullptr, AtomicOpFunctorPtr amo_op=nullptr,
bool stale_translation=false);

bool
isLoad() const
{
return flags.isSet(Flag::IsLoad);
}

bool
isHInst() const
{
return flags.isSet(Flag::IsHInst);
}

bool
isAtomic() const
{
return flags.isSet(Flag::IsAtomic);
}

/** Install the request in the LQ/SQ. */
void install();

@@ -344,6 +326,24 @@

public:

bool
isLoad() const
{
return flags.isSet(Flag::IsLoad);
}

bool
isHInst() const
{
return flags.isSet(Flag::IsHInst);
}

bool
isAtomic() const
{
return flags.isSet(Flag::IsAtomic);
}

void forward();

/** Convenience getters/setters. */
@@ -803,6 +803,8 @@
/** Returns the total number of loads for a single thread. */
int numLoads(ThreadID tid);

bool anyInflightLoadsNotComplete(int miss_level = -1);

/** Returns the total number of stores in the store queue. */
int numStores();
/** Returns the total number of stores for a single thread. */
6 changes: 6 additions & 0 deletions src/cpu/o3/lsq_unit.cc
@@ -1520,6 +1520,12 @@ LSQUnit::squash(const InstSeqNum &squashed_num)
DPRINTF(HtmCpu, ">> htmStarts (%d) : htmStops-- (%d)\n",
htmStarts, htmStops);
}
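// A squashed load must also be dropped from inflightLoads, otherwise its stale
// request would keep anyInflightLoadsNotComplete() returning true.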
auto request = loadQueue.back().request();
auto it = std::find(inflightLoads.begin(), inflightLoads.end(), request);
if (it != inflightLoads.end()) {
inflightLoads.erase(it);
}

// Clear the smart pointer to make sure it is decremented.
loadQueue.back().instruction()->setSquashed();
loadQueue.back().clear();
2 changes: 2 additions & 0 deletions src/cpu/o3/lsq_unit.hh
@@ -287,6 +287,8 @@ class LSQUnit
using LoadQueue = CircularQueue<LQEntry>;
using StoreQueue = CircularQueue<SQEntry>;

std::vector<LSQRequest*> inflightLoads;

public:
/** Constructs an LSQ unit. init() must be called prior to use. */
LSQUnit(uint32_t lqEntries, uint32_t sqEntries, uint32_t sbufferEntries,