Skip to content

Commit

Permalink
Merge PR ceph#55616 into main
Browse files Browse the repository at this point in the history
* refs/pull/55616/head:
	PendingReleaseNotes: add note for replay completion warning
	qa: test to verify `MDS_ESTIMATED_REPLAY_TIME` warning
	doc: add a note for `MDS_ESTIMATED_REPLAY_TIME` MDS warning
	mds: emit warning for estinated replay time

Reviewed-by: Patrick Donnelly <[email protected]>
Reviewed-by: Milind Changire <[email protected]>
  • Loading branch information
vshankar committed Dec 27, 2024
2 parents 44ee2c8 + 65acd39 commit 2215d55
Show file tree
Hide file tree
Showing 11 changed files with 154 additions and 1 deletion.
2 changes: 2 additions & 0 deletions PendingReleaseNotes
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,8 @@ CephFS: Disallow delegating preallocated inode ranges to clients. Config
* NFS: The export create/apply of CephFS based exports will now have a additional parameter `cmount_path` under the FSAL block,
which specifies the path within the CephFS to mount this export on. If this and the other
`EXPORT { FSAL {} }` options are the same between multiple exports, those exports will share a single CephFS client. If not specified, the default is `/`.
* CephFS: MDS emits a warning with estimated replay completion time when replay
runs for more than 30 seconds.

>=18.0.0

Expand Down
8 changes: 8 additions & 0 deletions doc/cephfs/health-messages.rst
Original file line number Diff line number Diff line change
Expand Up @@ -269,3 +269,11 @@ other daemons, please see :ref:`health-checks`.

To evict and permanently block broken clients from connecting to the
cluster, set the ``required_client_feature`` bit ``client_mds_auth_caps``.

``MDS_ESTIMATED_REPLAY_TIME``
-----------------------------
Message
"HEALTH_WARN Replay: x% complete. Estimated time remaining *x* seconds

Description
When an MDS journal replay takes more than 30 seconds, this message indicates the estimated time to completion.
1 change: 1 addition & 0 deletions qa/suites/fs/multifs/tasks/failover.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ overrides:
- \(MDS_DAMAGE\)
- \(FS_DEGRADED\)
- \(MDS_CACHE_OVERSIZED\)
- \(MDS_ESTIMATED_REPLAY_TIME\)
ceph-fuse:
disabled: true
tasks:
Expand Down
55 changes: 55 additions & 0 deletions qa/tasks/cephfs/test_failover.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import time
import signal
import logging
Expand Down Expand Up @@ -342,6 +343,60 @@ def test_thrash(self):

self.fs.wait_for_daemons(timeout=90)

class TestFailoverBeaconHealth(CephFSTestCase):
CLIENTS_REQUIRED = 1
MDSS_REQUIRED = 1

def initiate_journal_replay(self, num_files=100):
""" Initiate journal replay by creating files and restarting mds server."""

self.config_set("mds", "mds_delay_journal_replay_for_testing", "5000")
self.mounts[0].test_files = [str(x) for x in range(num_files)]
self.mounts[0].create_files()
self.fs.fail()
self.fs.set_joinable()

def test_replay_beacon_estimated_time(self):
"""
That beacon emits warning message with estimated time to complete replay
"""
self.initiate_journal_replay()
self.wait_for_health("MDS_ESTIMATED_REPLAY_TIME", 60)
# remove the config so that replay finishes and the cluster
# is HEALTH_OK
self.config_rm("mds", "mds_delay_journal_replay_for_testing")
self.wait_for_health_clear(timeout=60)

def test_replay_estimated_time_accuracy(self):
self.initiate_journal_replay(250)
def replay_complete():
health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True)
codes = [s for s in health['checks']]
return 'MDS_ESTIMATED_REPLAY_TIME' not in codes

def get_estimated_time():
completion_percentage = 0.0
time_duration = pending_duration = 0
with safe_while(sleep=5, tries=360) as proceed:
while proceed():
health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True)
codes = [s for s in health['checks']]
if 'MDS_ESTIMATED_REPLAY_TIME' in codes:
message = health['checks']['MDS_ESTIMATED_REPLAY_TIME']['detail'][0]['message']
### sample warning string: "mds.a(mds.0): replay: 50.0446% complete - elapsed time: 582s, estimated time remaining: 581s"
m = re.match(".* replay: (\d+(\.\d+)?)% complete - elapsed time: (\d+)s, estimated time remaining: (\d+)s", message)
if not m:
continue
completion_percentage = float(m.group(1))
time_duration = int(m.group(3))
pending_duration = int(m.group(4))
log.debug(f"MDS_ESTIMATED_REPLAY_TIME is present in health: {message}, duration: {time_duration}, completion_percentage: {completion_percentage}")
if completion_percentage >= 50:
return (completion_percentage, time_duration, pending_duration)
_, _, pending_duration = get_estimated_time()
# wait for 25% more time to avoid false negative failures
self.wait_until_true(replay_complete, timeout=pending_duration * 1.25)

class TestFailover(CephFSTestCase):
CLIENTS_REQUIRED = 1
MDSS_REQUIRED = 2
Expand Down
6 changes: 6 additions & 0 deletions src/common/options/mds.yaml.in
Original file line number Diff line number Diff line change
Expand Up @@ -1713,6 +1713,12 @@ options:
default: 1000
services:
- mds
- name: mds_delay_journal_replay_for_testing
type: millisecs
level: dev
desc: Delay the journal replay to verify the replay time estimate
long_desc: Jorunal replay warning is activated if the mds has been in replay state for more than 30 seconds. This config delays replay for validating the replay warning in tests.
default: 0
flags:
- runtime
- name: mds_server_dispatch_killpoint_random
Expand Down
14 changes: 14 additions & 0 deletions src/mds/Beacon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "mds/MDSRank.h"
#include "mds/MDSMap.h"
#include "mds/Locker.h"
#include "mds/mdstypes.h"

#include "Beacon.h"

Expand Down Expand Up @@ -550,6 +551,19 @@ void Beacon::notify_health(MDSRank const *mds)
}
}
}
if (mds->is_replay()) {
CachedStackStringStream css;
auto estimate = mds->mdlog->get_estimated_replay_finish_time();
// this probably should be configurable, however, its fine to report
// if replay is running for more than 30 seconds.
if (estimate.elapsed_time > std::chrono::seconds(30)) {
*css << "replay: " << estimate.percent_complete << "% complete - elapsed time: "
<< estimate.elapsed_time << ", estimated time remaining: "
<< estimate.estimated_time;
MDSHealthMetric m(MDS_HEALTH_ESTIMATED_REPLAY_TIME, HEALTH_WARN, css->strv());
health.metrics.push_back(m);
}
}
}

MDSMap::DaemonState Beacon::get_want_state() const
Expand Down
48 changes: 47 additions & 1 deletion src/mds/MDLog.cc
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,46 @@ uint64_t MDLog::get_safe_pos() const
return journaler->get_write_safe_pos();
}

// estimate the replay completion time based on mdlog journal pointers
EstimatedReplayTime MDLog::get_estimated_replay_finish_time() {
ceph_assert(mds->is_replay());

EstimatedReplayTime estimated_time{0, std::chrono::seconds::zero(), std::chrono::seconds::zero()};
if (!journaler) {
return estimated_time;
}

auto read_pos = journaler->get_read_pos();
auto write_pos = journaler->get_write_pos();
auto trimmed_pos = journaler->get_trimmed_pos();

dout(20) << __func__ << ": read_pos=" << read_pos << ", write_pos="
<< write_pos << ", trimmed_pos=" << trimmed_pos << dendl;

if (read_pos == trimmed_pos || write_pos == trimmed_pos) {
return estimated_time;
}

auto total_bytes = write_pos - trimmed_pos;
double percent_complete = ((double)(read_pos - trimmed_pos)) / (double)total_bytes;
auto elapsed_time = std::chrono::duration_cast<std::chrono::seconds>
(ceph::coarse_mono_clock::now() - replay_start_time);
auto time = ((1 - percent_complete) / percent_complete) * elapsed_time;

dout(20) << __func__ << "percent_complete=" << percent_complete
<< ", elapsed_time=" << elapsed_time
<< ", estimated_time=" << std::chrono::round<std::chrono::seconds>(time)
<< dendl;

estimated_time.percent_complete = percent_complete * 100;
estimated_time.elapsed_time = elapsed_time;
estimated_time.estimated_time = std::chrono::round<std::chrono::seconds>(time);
dout(20) << __func__ << "estimated_time.percent_complete=" << estimated_time.percent_complete
<< ", estimated_time.elapsed_time=" << estimated_time.elapsed_time
<< ", estimated_time.estimated_time=" << estimated_time.estimated_time
<< dendl;
return estimated_time;
}

void MDLog::create(MDSContext *c)
{
Expand Down Expand Up @@ -1137,6 +1176,7 @@ void MDLog::_recovery_thread(MDSContext *completion)
{
std::lock_guard l(mds->mds_lock);
journaler = front_journal;
replay_start_time = ceph::coarse_mono_clock::now();
}

C_SaferCond recover_wait;
Expand Down Expand Up @@ -1374,11 +1414,17 @@ void MDLog::_reformat_journal(JournalPointer const &jp_in, Journaler *old_journa
// i am a separate thread
void MDLog::_replay_thread()
{
dout(10) << "_replay_thread start" << dendl;
dout(10) << __func__ << ": start time: " << replay_start_time << ", now: "
<< ceph::coarse_mono_clock::now() << dendl;

// loop
int r = 0;
while (1) {
auto sleep_time = g_conf().get_val<std::chrono::milliseconds>("mds_delay_journal_replay_for_testing");
if (unlikely(sleep_time > 0ms)) {
dout(10) << __func__ << ": sleeping for " << sleep_time << "ms" << dendl;
std::this_thread::sleep_for(sleep_time);
}
// wait for read?
journaler->check_isreadable();
if (journaler->get_error()) {
Expand Down
4 changes: 4 additions & 0 deletions src/mds/MDLog.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ enum {
#include "LogSegment.h"
#include "MDSMap.h"
#include "SegmentBoundary.h"
#include "mdstypes.h"

#include <list>
#include <map>
Expand Down Expand Up @@ -162,6 +163,7 @@ class MDLog {
void reopen(MDSContext *onopen);
void append();
void replay(MDSContext *onfinish);
EstimatedReplayTime get_estimated_replay_finish_time();

void standby_trim_segments();

Expand Down Expand Up @@ -328,5 +330,7 @@ class MDLog {
std::atomic<bool> upkeep_log_trim_shutdown{false};

std::map<uint64_t, std::vector<Context*>> waiting_for_expire; // protected by mds_lock

ceph::coarse_mono_time replay_start_time = ceph::coarse_mono_clock::zero();
};
#endif
5 changes: 5 additions & 0 deletions src/mds/mdstypes.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1042,3 +1042,8 @@ void snaprealm_reconnect_t::generate_test_instances(std::list<snaprealm_reconnec
ls.back()->realm.seq = 2;
ls.back()->realm.parent = 1;
}

void EstimatedReplayTime::print(std::ostream& out) {
out << "replay: " << percent_complete << "% complete - elapsed time: "
<< elapsed_time << ", estimated time remaining: " << estimated_time;
}
8 changes: 8 additions & 0 deletions src/mds/mdstypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -1044,4 +1044,12 @@ inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r)
}
WRITE_CLASS_ENCODER(MDSCacheObjectInfo)

struct EstimatedReplayTime {
double percent_complete;
std::chrono::seconds estimated_time;
std::chrono::seconds elapsed_time;

void print(std::ostream& out);
};

#endif
4 changes: 4 additions & 0 deletions src/messages/MMDSBeacon.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ enum mds_metric_t {
MDS_HEALTH_CLIENTS_LAGGY,
MDS_HEALTH_CLIENTS_LAGGY_MANY,
MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH,
MDS_HEALTH_ESTIMATED_REPLAY_TIME,
MDS_HEALTH_DUMMY, // not a real health warning, for testing
};

Expand All @@ -69,6 +70,7 @@ inline const char *mds_metric_name(mds_metric_t m)
case MDS_HEALTH_CLIENTS_LAGGY: return "MDS_CLIENTS_LAGGY";
case MDS_HEALTH_CLIENTS_LAGGY_MANY: return "MDS_CLIENTS_LAGGY_MANY";
case MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH: return "MDS_CLIENTS_BROKEN_ROOTSQUASH";
case MDS_HEALTH_ESTIMATED_REPLAY_TIME: return "MDS_ESTIMATED_REPLAY_TIME";
case MDS_HEALTH_DUMMY: return "MDS_DUMMY";
default:
return "???";
Expand Down Expand Up @@ -107,6 +109,8 @@ inline const char *mds_metric_summary(mds_metric_t m)
return "%num% client(s) laggy due to laggy OSDs";
case MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH:
return "%num% MDS report clients with broken root_squash implementation";
case MDS_HEALTH_ESTIMATED_REPLAY_TIME:
return "%num% estimated journal replay time";
default:
return "???";
}
Expand Down

0 comments on commit 2215d55

Please sign in to comment.