cpu-o3: add mem stall topdown #257

Open · wants to merge 2 commits into base: xs-dev
46 changes: 44 additions & 2 deletions src/cpu/o3/fetch.cc
@@ -245,7 +245,23 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch)
ADD_STAT(decodeStallRate, statistics::units::Rate<
statistics::units::Count, statistics::units::Cycle>::get(),
"Number of decode stalls per cycle",
decodeStalls / cpu->baseStats.numCycles)
decodeStalls / cpu->baseStats.numCycles),
ADD_STAT(fetchBubbles, statistics::units::Count::get(),
"Unutilized issue-pipeline slots while there is no backend-stall"),
ADD_STAT(fetchBubbles_max, statistics::units::Count::get(),
"Cycles that fetch 0 instruction while there is no backend-stall"),
ADD_STAT(frontendBound, statistics::units::Rate<
statistics::units::Count, statistics::units::Cycle>::get(),
"Frontend Bound",
fetchBubbles / (cpu->baseStats.numCycles * fetch->decodeWidth)),
ADD_STAT(frontendLatencyBound, statistics::units::Rate<
statistics::units::Count, statistics::units::Cycle>::get(),
"Frontend Latency Bound",
fetchBubbles_max / cpu->baseStats.numCycles),
ADD_STAT(frontendBandwidthBound, statistics::units::Rate<
statistics::units::Count, statistics::units::Cycle>::get(),
"Frontend Bandwidth Bound",
frontendBound - frontendLatencyBound)
{
icacheStallCycles
.prereq(icacheStallCycles);
@@ -320,6 +336,16 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch)
.prereq(decodeStalls);
decodeStallRate
.flags(statistics::total);
fetchBubbles
.prereq(fetchBubbles);
fetchBubbles_max
.prereq(fetchBubbles_max);
frontendBound
.flags(statistics::total);
frontendLatencyBound
.flags(statistics::total);
frontendBandwidthBound
.flags(statistics::total);
}
void
Fetch::setTimeBuffer(TimeBuffer<TimeStruct> *time_buffer)
@@ -1163,6 +1189,9 @@ Fetch::tick()

wroteToTimeBuffer = false;

// get the distribution of fetch status
fetchStats.fetchStatusDist[fetchStatus[0]]++;

for (ThreadID i = 0; i < numThreads; ++i) {
issuePipelinedIfetch[i] = false;
}
@@ -1294,7 +1323,20 @@ Fetch::tick()

toDecode->fetchStallReason = stallReason;

fetchStats.fetchStatusDist[fetchStatus[*tid_itr]]++;
// Intel TopDown method for measuring frontend bubbles.
// Count unutilized issue slots when the backend is not stalled (decode not stalled).
// For an N-wide machine, if the frontend supplies 0 instructions:
// - fetchBubbles += N (count the total number of empty slots)
// - fetchBubbles_max += 1 (count the cycle in which all slots are empty)
if (!stalls[*tid_itr].decode) { // backend not stalled
int unused_slots = decode_width - insts_to_decode;
if (unused_slots > 0) { // has empty slots
fetchStats.fetchBubbles += unused_slots; // add number of empty slots
if (unused_slots == decode_width) { // all slots empty, insts_to_decode == 0
fetchStats.fetchBubbles_max++; // count max bubble occurrence
}
}
}

if (stalls[*tid_itr].decode) {
fetchStats.decodeStalls++;
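Note on the new frontend formulas: frontendBound divides the empty-slot count by the total slot count (numCycles × decodeWidth), frontendLatencyBound divides the all-slots-empty cycle count by numCycles, and frontendBandwidthBound is their difference. A minimal standalone sketch of that arithmetic, using an assumed decodeWidth of 6 and made-up counter values (not taken from this patch):

// Illustrative sketch only; decode width and counter values are hypothetical.
#include <cstdio>

int main()
{
    const double decodeWidth     = 6;        // assumed machine width
    const double numCycles       = 1000000;  // cpu->baseStats.numCycles
    const double fetchBubbles    = 1800000;  // empty decode slots while backend not stalled
    const double fetchBubblesMax = 200000;   // cycles delivering 0 instructions

    double frontendBound          = fetchBubbles / (numCycles * decodeWidth);
    double frontendLatencyBound   = fetchBubblesMax / numCycles;
    double frontendBandwidthBound = frontendBound - frontendLatencyBound;

    std::printf("FrontendBound          = %.3f\n", frontendBound);          // 0.300
    std::printf("FrontendLatencyBound   = %.3f\n", frontendLatencyBound);   // 0.200
    std::printf("FrontendBandwidthBound = %.3f\n", frontendBandwidthBound); // 0.100
    return 0;
}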
10 changes: 10 additions & 0 deletions src/cpu/o3/fetch.hh
@@ -652,6 +652,16 @@ class Fetch
statistics::Scalar decodeStalls;
/** Number of decode stalls per cycle */
statistics::Formula decodeStallRate;
/** Unutilized issue-pipeline slots while there is no backend stall */
statistics::Scalar fetchBubbles;
/** Cycles that fetch 0 instructions while there is no backend stall */
statistics::Scalar fetchBubbles_max;
/** Frontend Bound */
statistics::Formula frontendBound;
/** Frontend Latency Bound */
statistics::Formula frontendLatencyBound;
/** Frontend Bandwidth Bound */
statistics::Formula frontendBandwidthBound;
} fetchStats;

SquashVersion localSquashVer;
2 changes: 1 addition & 1 deletion src/cpu/o3/inst_queue.cc
@@ -123,7 +123,7 @@ InstructionQueue::InstructionQueue(CPU *cpu_ptr, IEW *iew_ptr,
memDepUnit[tid].setIQ(this);
}

scheduler->setCPU(cpu_ptr);
scheduler->setCPU(cpu_ptr, &iew_ptr->ldstQueue);
scheduler->resetDepGraph(numPhysRegs);
scheduler->setMemDepUnit(memDepUnit);

25 changes: 23 additions & 2 deletions src/cpu/o3/issue_queue.cc
@@ -623,6 +623,16 @@ Scheduler::SpecWakeupCompletion::description() const
return "Spec wakeup completion";
}

Scheduler::SchedulerStats::SchedulerStats(statistics::Group* parent)
: statistics::Group(parent),
ADD_STAT(exec_stall_cycle, "Cycles in which fewer than 4 instructions were issued to the FUs"),
ADD_STAT(memstall_any_load, "Issue-stall cycles with at least one in-flight load outstanding"),
ADD_STAT(memstall_l1miss, "Issue-stall cycles with an outstanding load that missed in the L1 cache"),
ADD_STAT(memstall_l2miss, "Issue-stall cycles with an outstanding load that missed in the L2 cache"),
ADD_STAT(memstall_l3miss, "Issue-stall cycles with an outstanding load that missed in the L3 cache")
{
}

bool
Scheduler::disp_policy::operator()(IssueQue* a, IssueQue* b) const
{
@@ -632,7 +642,7 @@ Scheduler::disp_policy::operator()(IssueQue* a, IssueQue* b) const
return p0 < p1;
}

Scheduler::Scheduler(const SchedulerParams& params) : SimObject(params), issueQues(params.IQs)
Scheduler::Scheduler(const SchedulerParams& params) : SimObject(params), stats(this), issueQues(params.IQs)
{
dispTable.resize(enums::OpClass::Num_OpClass);
opExecTimeTable.resize(enums::OpClass::Num_OpClass, 1);
@@ -719,9 +729,10 @@ Scheduler::Scheduler(const SchedulerParams& params) : SimObject(params), issueQu
}

void
Scheduler::setCPU(CPU* cpu)
Scheduler::setCPU(CPU* cpu, LSQ* lsq)
{
this->cpu = cpu;
this->lsq = lsq;
for (auto it : issueQues) {
it->setCPU(cpu);
}
Expand Down Expand Up @@ -764,6 +775,16 @@ Scheduler::issueAndSelect()
for (auto it : issueQues) {
it->issueToFu();
}
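// TopDown-style backend accounting for the new scheduler stats: a cycle in
// which fewer than 4 instructions issue to the FUs counts as an execution
// stall; if nothing issues at all and loads are still in flight, the stall is
// attributed to the memory hierarchy according to how deep the loads missed.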
if (instsToFu.size() < 4) {
stats.exec_stall_cycle++;
}
if (instsToFu.size() == 0) {
if (lsq->anyInflightLoadsNotComplete()) stats.memstall_any_load++;
if (lsq->anyInflightLoadsNotComplete(1)) stats.memstall_l1miss++;
if (lsq->anyInflightLoadsNotComplete(2)) stats.memstall_l2miss++;
if (lsq->anyInflightLoadsNotComplete(3)) stats.memstall_l3miss++;
}

// must wait until all insts have been issued
for (auto it : issueQues) {
it->selectInst();
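The memstall counters added above are nested (a load that misses in L3 also missed in L2 and L1), so TopDown-style exclusive buckets have to be derived afterwards. A hedged post-processing sketch with hypothetical counter values; the subtraction scheme is an assumption about how the counters would typically be consumed, not something this patch implements:

// Illustrative post-processing sketch; counter values are hypothetical.
#include <cstdio>

int main()
{
    const double numCycles       = 1000000;
    const double memstallAnyLoad = 400000;  // >= memstall_l1miss
    const double memstallL1miss  = 250000;  // >= memstall_l2miss
    const double memstallL2miss  = 120000;  // >= memstall_l3miss
    const double memstallL3miss  =  60000;

    // Exclusive buckets: stall cycles attributed to the level that served the load.
    double l1Bound  = (memstallAnyLoad - memstallL1miss) / numCycles; // served by L1
    double l2Bound  = (memstallL1miss  - memstallL2miss) / numCycles; // served by L2
    double l3Bound  = (memstallL2miss  - memstallL3miss) / numCycles; // served by L3
    double memBound =  memstallL3miss / numCycles;                    // served by memory

    std::printf("L1 %.3f  L2 %.3f  L3 %.3f  Mem %.3f\n",
                l1Bound, l2Bound, l3Bound, memBound);
    return 0;
}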
13 changes: 12 additions & 1 deletion src/cpu/o3/issue_queue.hh
@@ -204,6 +204,17 @@ class Scheduler : public SimObject

CPU* cpu;
MemDepUnit* memDepUnit;
LSQ* lsq;

struct SchedulerStats : public statistics::Group
{
SchedulerStats(statistics::Group* parent);
statistics::Scalar exec_stall_cycle;
statistics::Scalar memstall_any_load;
statistics::Scalar memstall_l1miss;
statistics::Scalar memstall_l2miss;
statistics::Scalar memstall_l3miss;
} stats;

struct disp_policy
{
@@ -246,7 +257,7 @@

public:
Scheduler(const SchedulerParams& params);
void setCPU(CPU* cpu);
void setCPU(CPU* cpu, LSQ* lsq);
void resetDepGraph(uint64_t numPhysRegs);
void setMemDepUnit(MemDepUnit* memDepUnit) { this->memDepUnit = memDepUnit; }

21 changes: 21 additions & 0 deletions src/cpu/o3/lsq.cc
@@ -381,6 +381,16 @@ int LSQ::getCount(ThreadID tid) { return thread.at(tid).getCount(); }

int LSQ::numLoads(ThreadID tid) { return thread.at(tid).numLoads(); }

bool LSQ::anyInflightLoadsNotComplete(int miss_level)
{
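// Reports whether any of thread 0's in-flight loads still has an outstanding
// request that has gone at least `miss_level` levels down the cache hierarchy;
// the default miss_level of -1 matches any outstanding load. This assumes
// mainReq()->depth records how deep the access has travelled.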
for (auto it : thread.at(0).inflightLoads) {
if (it->isAnyOutstandingRequest() && (it->mainReq()->depth >= miss_level)) {
return true;
}
}
return false;
}

int LSQ::numStores(ThreadID tid) { return thread.at(tid).numStores(); }

int
@@ -1352,6 +1362,12 @@ LSQ::SingleDataRequest::recvTimingResp(PacketPtr pkt)
// Dump inst num, request addr, and packet addr
DPRINTF(LSQ, "Single Req::recvTimingResp: inst: %llu, pkt: %#lx\n", pkt->req->getReqInstSeqNum(),
pkt->getAddr());
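// The load's response has arrived, so drop it from the in-flight list used by
// the scheduler's mem-stall accounting.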
if (isLoad()) {
auto it = std::find(lsqUnit()->inflightLoads.begin(), lsqUnit()->inflightLoads.end(), this);
if (it != lsqUnit()->inflightLoads.end()) {
lsqUnit()->inflightLoads.erase(it);
}
}
assert(_numOutstandingPackets == 1);
flags.set(Flag::Complete);
assert(pkt == _packets.front());
@@ -1520,6 +1536,11 @@ LSQ::SingleDataRequest::sendPacketToCache()
bool tag_read_fail = false;
bool success = lsqUnit()->trySendPacket(isLoad(), _packets.at(0), bank_conflict, tag_read_fail);
if (success) {
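// Track the load as in-flight once its packet is accepted by the cache; it is
// removed again in recvTimingResp() or when the load is squashed.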
if (isLoad()) {
assert(lsqUnit()->inflightLoads.size() < lsqUnit()->numLoads());
lsqUnit()->inflightLoads.emplace_back(this);
}

if (!bank_conflict) {
_numOutstandingPackets = 1;
}
38 changes: 20 additions & 18 deletions src/cpu/o3/lsq.hh
@@ -283,24 +283,6 @@ class LSQ
uint64_t* res=nullptr, AtomicOpFunctorPtr amo_op=nullptr,
bool stale_translation=false);

bool
isLoad() const
{
return flags.isSet(Flag::IsLoad);
}

bool
isHInst() const
{
return flags.isSet(Flag::IsHInst);
}

bool
isAtomic() const
{
return flags.isSet(Flag::IsAtomic);
}

/** Install the request in the LQ/SQ. */
void install();

@@ -344,6 +326,24 @@

public:

bool
isLoad() const
{
return flags.isSet(Flag::IsLoad);
}

bool
isHInst() const
{
return flags.isSet(Flag::IsHInst);
}

bool
isAtomic() const
{
return flags.isSet(Flag::IsAtomic);
}

void forward();

/** Convenience getters/setters. */
@@ -803,6 +803,8 @@
/** Returns the total number of loads for a single thread. */
int numLoads(ThreadID tid);

bool anyInflightLoadsNotComplete(int miss_level = -1);

/** Returns the total number of stores in the store queue. */
int numStores();
/** Returns the total number of stores for a single thread. */
6 changes: 6 additions & 0 deletions src/cpu/o3/lsq_unit.cc
@@ -1520,6 +1520,12 @@ LSQUnit::squash(const InstSeqNum &squashed_num)
DPRINTF(HtmCpu, ">> htmStarts (%d) : htmStops-- (%d)\n",
htmStarts, htmStops);
}
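// A squashed load must also be dropped from inflightLoads, otherwise its stale
// request would keep anyInflightLoadsNotComplete() returning true.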
auto request = loadQueue.back().request();
auto it = std::find(inflightLoads.begin(), inflightLoads.end(), request);
if (it != inflightLoads.end()) {
inflightLoads.erase(it);
}

// Clear the smart pointer to make sure it is decremented.
loadQueue.back().instruction()->setSquashed();
loadQueue.back().clear();
2 changes: 2 additions & 0 deletions src/cpu/o3/lsq_unit.hh
@@ -287,6 +287,8 @@ class LSQUnit
using LoadQueue = CircularQueue<LQEntry>;
using StoreQueue = CircularQueue<SQEntry>;

std::vector<LSQRequest*> inflightLoads;

public:
/** Constructs an LSQ unit. init() must be called prior to use. */
LSQUnit(uint32_t lqEntries, uint32_t sqEntries, uint32_t sbufferEntries,