Skip to content

Commit

Permalink
Merge pull request ceph#60067 from VallariAg/wip-nvmeof-healthcheck
Browse files Browse the repository at this point in the history
mon: add nvmeof healthchecks
  • Loading branch information
VallariAg authored Nov 14, 2024
2 parents ce0d6fc + 73d5c01 commit 874ae37
Show file tree
Hide file tree
Showing 7 changed files with 81 additions and 0 deletions.
19 changes: 19 additions & 0 deletions doc/rados/operations/health-checks.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1640,6 +1640,25 @@ We encourage you to fix this by making the weights even on both dividing buckets
This can be done by making sure the combined weight of the OSDs on each dividing
bucket are the same.

NVMeoF Gateway
--------------

NVMEOF_SINGLE_GATEWAY
_____________________

One or more gateway groups have only one gateway. This is not ideal because
high availability (HA) is impossible with a single gateway in a group. This can lead to
problems with failover and failback operations for the NVMeoF gateway.

It's recommended to have multiple NVMeoF gateways in a group.

NVMEOF_GATEWAY_DOWN
___________________

Some of the gateways are in the GW_UNAVAILABLE state. If an NVMeoF daemon has crashed,
the daemon log file (found at ``/var/log/ceph/``) may contain troubleshooting information.


Miscellaneous
-------------

Expand Down
2 changes: 2 additions & 0 deletions qa/suites/nvmeof/basic/clusters/4-gateways-2-initiator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,5 @@ overrides:
mon:
# cephadm can take up to 5 minutes to bring up remaining mons
mon down mkfs grace: 300
log-ignorelist:
- NVMEOF_SINGLE_GATEWAY
3 changes: 3 additions & 0 deletions qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ overrides:
- out of quorum
# nvmeof daemon thrashing
- CEPHADM_FAILED_DAEMON
- NVMEOF_SINGLE_GATEWAY
- NVMEOF_GATEWAY_DOWN
- are in unavailable state
- is in error state
- failed cephadm daemon

Expand Down
3 changes: 3 additions & 0 deletions qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ overrides:
log-ignorelist:
# nvmeof daemon thrashing
- CEPHADM_FAILED_DAEMON
- NVMEOF_SINGLE_GATEWAY
- NVMEOF_GATEWAY_DOWN
- are in unavailable state
- is in error state
- failed cephadm daemon

Expand Down
43 changes: 43 additions & 0 deletions src/mon/NVMeofGwMap.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
#include "NVMeofGwMon.h"
#include "NVMeofGwMap.h"
#include "OSDMonitor.h"
#include "mon/health_check.h"

using std::list;
using std::map;
using std::make_pair;
using std::ostream;
Expand Down Expand Up @@ -893,6 +895,47 @@ struct CMonRequestProposal : public Context {
}
};

/**
 * Fill @p checks with the NVMeoF gateway health warnings derived from
 * created_gws.
 *
 * Two warnings may be raised, both at HEALTH_WARN severity:
 *  - NVMEOF_SINGLE_GATEWAY: one detail line per group whose gateway map
 *    holds exactly one entry.
 *  - NVMEOF_GATEWAY_DOWN: one detail line per gateway whose availability
 *    is gw_availability_t::GW_UNAVAILABLE.
 *
 * A warning is only added when it has at least one detail line; its count
 * is the number of detail lines.
 *
 * @param checks  health check map that receives the warnings
 */
void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const
{
  list<string> lone_gw_groups;   // detail lines for NVMEOF_SINGLE_GATEWAY
  list<string> unavailable_gws;  // detail lines for NVMEOF_GATEWAY_DOWN

  for (const auto& [group_key, gw_states] : created_gws) {
    // Only the second component of the group key is reported as the
    // group name.
    if (gw_states.size() == 1) {
      ostringstream line;
      line << "NVMeoF Gateway Group '" << group_key.second
           << "' has 1 gateway.";
      lone_gw_groups.push_back(line.str());
    }

    for (const auto& [gw_id, gw_state] : gw_states) {
      if (gw_state.availability != gw_availability_t::GW_UNAVAILABLE) {
        continue;
      }
      ostringstream line;
      line << "NVMeoF Gateway '" << gw_id << "' is unavailable.";
      unavailable_gws.push_back(line.str());
    }
  }

  if (!lone_gw_groups.empty()) {
    // Capture the count before the detail list is swapped away.
    const auto affected = lone_gw_groups.size();
    ostringstream summary;
    summary << affected << " group(s) have only 1 nvmeof gateway"
            << "; HA is not possible with single gateway.";
    auto& check = checks->add("NVMEOF_SINGLE_GATEWAY", HEALTH_WARN,
                              summary.str(), affected);
    check.detail.swap(lone_gw_groups);
  }

  if (!unavailable_gws.empty()) {
    // Capture the count before the detail list is swapped away.
    const auto affected = unavailable_gws.size();
    ostringstream summary;
    summary << affected << " gateway(s) are in unavailable state"
            << "; gateway might be down, try to redeploy.";
    auto& check = checks->add("NVMEOF_GATEWAY_DOWN", HEALTH_WARN,
                              summary.str(), affected);
    check.detail.swap(unavailable_gws);
  }
}

int NVMeofGwMap::blocklist_gw(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
NvmeAnaGrpId grpid, epoch_t &epoch, bool failover)
Expand Down
5 changes: 5 additions & 0 deletions src/mon/NVMeofGwMap.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
#include "NVMeofGwTypes.h"

using ceph::coarse_mono_clock;

class health_check_map_t;

class Monitor;
/*-------------------*/
class NVMeofGwMap
Expand Down Expand Up @@ -140,6 +143,8 @@ class NVMeofGwMap
decode(fsm_timers, bl);
DECODE_FINISH(bl);
}

void get_health_checks(health_check_map_t *checks) const;
};

#include "NVMeofGwSerialize.h"
Expand Down
6 changes: 6 additions & 0 deletions src/mon/NVMeofGwMon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,11 @@ void NVMeofGwMon::encode_pending(MonitorDBStore::TransactionRef t)
<< HAVE_FEATURE(mon.get_quorum_con_features(), NVMEOFHA) << dendl;
put_version(t, pending_map.epoch, bl);
put_last_committed(t, pending_map.epoch);

//health
health_check_map_t checks;
pending_map.get_health_checks(&checks);
encode_health(checks, t);
}

void NVMeofGwMon::update_from_paxos(bool *need_bootstrap)
Expand All @@ -193,6 +198,7 @@ void NVMeofGwMon::update_from_paxos(bool *need_bootstrap)
bufferlist bl;
int err = get_version(version, bl);
ceph_assert(err == 0);
load_health();

auto p = bl.cbegin();
map.decode(p);
Expand Down

0 comments on commit 874ae37

Please sign in to comment.