Skip to content

Commit

Permalink
Merge pull request ceph#60777 from VallariAg/wip-nvmeof-prometheus-rbd-image-reused
Browse files Browse the repository at this point in the history

monitoring: Add prometheus alert NVMeoFMultipleNamespacesOfRBDImage

Reviewed-by: Afreen Misbah <[email protected]>
  • Loading branch information
afreen23 authored Dec 20, 2024
2 parents 0a515df + 61b3289 commit 919c2a6
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 0 deletions.
10 changes: 10 additions & 0 deletions monitoring/ceph-mixin/prometheus_alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -855,6 +855,16 @@
description: 'Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to {{ $labels.nqn }}',
},
},
{
// Warning alert: the same RBD image (pool_name/rbd_name) is referenced by
// more than one bdev that is attached to an NVMeoF subsystem namespace.
alert: 'NVMeoFMultipleNamespacesOfRBDImage',
// 'for' is quoted because it is a reserved word in Jsonnet. The expression
// must stay true for 1m before the alert fires.
'for': '1m',
// How the expression works, inside-out:
//   1. `ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata`
//      keeps only bdev series whose bdev_name also appears in the namespace
//      metadata, so bdevs without a namespace are ignored.
//   2. The inner `count by(bdev_name, pool_name, rbd_name)` collapses series
//      that differ only in other labels (e.g. `instance`) into one series
//      per bdev/image combination.
//   3. The outer `count by(pool_name, rbd_name)` counts distinct bdevs per
//      image; `> 1` keeps images used by more than one namespace-backed bdev.
expr: 'count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1',
labels: { severity: 'warning', type: 'ceph_default' },
annotations: {
// NOTE(review): the summary ends with a trailing space and uses singular
// "namespace". The identical string appears in prometheus_alerts.yml and in
// the unit test's exp_annotations, so any wording fix must be applied to all
// three places together.
summary: 'RBD image {{ $labels.pool_name }}/{{ $labels.rbd_name }} cannot be reused for multiple NVMeoF namespace ',
description: 'Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups.',
},
},
{
alert: 'NVMeoFTooManyGateways',
'for': '1m',
Expand Down
9 changes: 9 additions & 0 deletions monitoring/ceph-mixin/prometheus_alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,15 @@ groups:
labels:
severity: "warning"
type: "ceph_default"
# Warning alert: one RBD image (pool_name/rbd_name) is referenced by more
# than one bdev that is attached to an NVMeoF subsystem namespace.
# NOTE(review): this rule carries the same expr and annotations as the entry
# in prometheus_alerts.libsonnet; if this file is generated from that source,
# comments added here will not survive regeneration - confirm.
- alert: "NVMeoFMultipleNamespacesOfRBDImage"
annotations:
description: "Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups."
# NOTE(review): trailing space and singular "namespace" are matched verbatim
# by the unit test's exp_annotations; change all copies together.
summary: "RBD image {{ $labels.pool_name }}/{{ $labels.rbd_name }} cannot be reused for multiple NVMeoF namespace "
# `and on (bdev_name)` drops bdevs that have no namespace; the inner count
# collapses series that differ only in other labels (e.g. instance) into one
# per bdev/image; the outer count is the number of distinct bdevs per image.
expr: "count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1"
# The condition must hold for 1m before the alert fires.
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "NVMeoFTooManyGateways"
annotations:
description: "You may create many gateways, but 4 is the tested limit"
Expand Down
48 changes: 48 additions & 0 deletions monitoring/ceph-mixin/tests_alerts/test_alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2270,6 +2270,54 @@ tests:
summary: "wah subsystem has reached its maximum number of namespaces on cluster mycluster"
description: "Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to wah"

# NVMeoFMultipleNamespacesOfRBDImage
# Scenario: two gateway instances (ceph-nvme-vm1, ceph-nvme-vm2) report the
# same bdevs. bdev1 and bdev3 both point at mypool/myimage1 and both have a
# namespace, so that image must be flagged. bdev2 is the only bdev on
# mypool/myimage2, so that image must not be flagged.
- interval: 1m
input_series:
# bdev1 -> mypool/myimage1, reported by both instances.
- series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev1", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}'
values: '1x10'
- series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev1", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage1"}'
values: '1x10'
# bdev2 -> mypool/myimage2, the only bdev using that image.
- series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev2", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage2"}'
values: '1x10'
- series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev2", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage2"}'
values: '1x10'
# bdev3 -> mypool/myimage1 again: the second namespace-backed bdev on that image.
- series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev3", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}'
values: '1x10'
- series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev3", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage1"}'
values: '1x10'
# bdev4 also points at mypool/myimage1 but has no namespace series below, so
# the `and on (bdev_name)` match must exclude it from the count.
- series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev4", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}' # bdev with no ns
values: '1x10'
# Namespace metadata: bdev1, bdev2 and bdev3 each have a namespace on both
# instances; there is deliberately no entry for bdev4.
- series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="1", bdev_name="bdev1", instance="ceph-nvme-vm1", cluster="mycluster"}'
values: '1x10'
- series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="1", bdev_name="bdev1", instance="ceph-nvme-vm2", cluster="mycluster"}'
values: '1x10'
- series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="2", bdev_name="bdev2", instance="ceph-nvme-vm1", cluster="mycluster"}'
values: '1x10'
- series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="2", bdev_name="bdev2", instance="ceph-nvme-vm2", cluster="mycluster"}'
values: '1x10'
- series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn2", nsid="1", bdev_name="bdev3", instance="ceph-nvme-vm1", cluster="mycluster"}'
values: '1x10'
- series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn2", nsid="1", bdev_name="bdev3", instance="ceph-nvme-vm2", cluster="mycluster"}'
values: '1x10'
promql_expr_test:
# Same expression as the alert rule. Only mypool/myimage1 is expected, with
# value 2 (bdev1 and bdev3): per-instance duplicates are collapsed, bdev4 is
# excluded, and myimage2 (count 1) is filtered out by `> 1`.
- expr: count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1
eval_time: 1m
exp_samples:
- labels: '{pool_name="mypool", rbd_name="myimage1"}'
value: 2
alert_rule_test:
# Evaluated at 5m, well past the rule's 1m `for` duration, so the alert is
# expected to be firing.
- eval_time: 5m
alertname: NVMeoFMultipleNamespacesOfRBDImage
exp_alerts:
- exp_labels:
pool_name: mypool
rbd_name: myimage1
severity: warning
type: ceph_default
exp_annotations:
# Must match the rule's rendered summary exactly, including the trailing space.
summary: "RBD image mypool/myimage1 cannot be reused for multiple NVMeoF namespace "
description: "Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups."

# NVMeoFTooManyGateways
- interval: 1m
input_series:
Expand Down

0 comments on commit 919c2a6

Please sign in to comment.