From 28d9a405aca32f239c92f6e33eb9700eca4c46b5 Mon Sep 17 00:00:00 2001 From: dmaiocchi Date: Thu, 17 Oct 2019 11:08:34 +0200 Subject: [PATCH 1/2] Create skeletron for metrics specification --- README.md | 12 ++++++++++-- doc/metric_spec.md | 15 +++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 doc/metric_spec.md diff --git a/README.md b/README.md index ae71303..900ef87 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,9 @@ It should run inside a node of the cluster or both. ## Table of Contents: 1. [Usage](#Usage) 2. [Features](#Features) -3. [Devel](#Devel) -4. [Design](#Design) +4. [Metrics Specification](#Metrics-specifications) +5. [Devel](#Devel) +6. [Design](#Design) ## Usage: @@ -34,6 +35,13 @@ For a terraform deployment you can also read: https://github.com/SUSE/ha-sap-ter - show SBD disk health metrics +- show DRBD metrics (local and remote disks resource metrics) + + +## Metrics Specification + +We mantain a complete list of the [metric specification](doc/metric_spec.md), usage and possible values. + ## Devel: Build the binary with `make` and run it inside a node of the ha cluster, it will show the metrics on port `9002` by default. diff --git a/doc/metric_spec.md b/doc/metric_spec.md new file mode 100644 index 0000000..e1ef0ca --- /dev/null +++ b/doc/metric_spec.md @@ -0,0 +1,15 @@ +# Metrics Specification: + +This is a specification of metrics exposed by the ha_cluster exporter. + +All metrics from the exporter start with the prefix `ha_cluster` + +Below you have a complete specification, ordered by component. 
+ +# Pacemaker metrics + +# Corosync metrics + +# Drbd metrics + +# SBD metrics From 3b6e564f915382088bc9b9332940f975fd51d642 Mon Sep 17 00:00:00 2001 From: dmaiocchi Date: Thu, 17 Oct 2019 11:36:36 +0200 Subject: [PATCH 2/2] Add some doc for pacemaker metrics --- README.md | 8 +--- doc/metric_spec.md | 107 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 104 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 900ef87..80f4254 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,8 @@ It should run inside a node of the cluster or both. ## Table of Contents: 1. [Usage](#Usage) 2. [Features](#Features) -4. [Metrics Specification](#Metrics-specifications) -5. [Devel](#Devel) -6. [Design](#Design) +3. [Devel](#Devel) +4. [Design](#Design) ## Usage: @@ -37,9 +36,6 @@ For a terraform deployment you can also read: https://github.com/SUSE/ha-sap-ter - show DRBD metrics (local and remote disks resource metrics) - -## Metrics Specification - We mantain a complete list of the [metric specification](doc/metric_spec.md), usage and possible values. ## Devel: diff --git a/doc/metric_spec.md b/doc/metric_spec.md index e1ef0ca..e9bbb06 100644 --- a/doc/metric_spec.md +++ b/doc/metric_spec.md @@ -1,4 +1,4 @@ -# Metrics Specification: +# Metrics specification: This is a specification of metrics exposed by the ha_cluster exporter. @@ -6,10 +6,107 @@ All metrics from the exporter start with the prefix `ha_cluster` Below you have a complete specification, ordered by component. -# Pacemaker metrics +1. [pacemaker](#pacemaker) +2. [drbd](#drbd) +3. [sbd](#sbd) +4. [corosync](#corosync) -# Corosync metrics +# Pacemaker -# Drbd metrics +The Pacemaker cluster metrics are atomic metrics and represent an updated snapshot of the HA cluster, retrieved by fetching the XML CIB of pacemaker.
-# SBD metrics +Some of the pacemaker metrics with labels, like `ha_cluster_node_resources` and `ha_cluster_nodes`, share a common trait: + +they are either set to `1` or absent, because they track the real state of the cluster resources monitored. + +1. [ha_cluster_node_resources](#ha_cluster_node_resources) +2. [ha_cluster_nodes](#ha_cluster_nodes) +3. [ha_cluster_nodes_configured_total](#ha_cluster_nodes_configured_total) +4. [ha_cluster_resources_configured_total](#ha_cluster_resources_configured_total) + + + +## ha_cluster_node_resources + +This metric shows the current status of a cluster resource. + +A resource that previously was in the cluster but isn't anymore will not be monitored. Example: + +```ha_cluster_node_resources{managed="true",node_name="1b115",resource_name="cluster_md",role="started",status="active"} 1``` + +The metric will be absent, not `0`. + + +All the values are 1:1 with Pacemaker schema. + +- `managed`: `true` or `false`, indicating whether the resource is managed by the cluster +- `node_name`: name of node of cluster +- `resource_name`: resource id/name of the CIB pacemaker +- `role`: allowed values `Started/Stopped/Master/Slave` or pending state `Starting/Stopping/Migrating/Promoting/Demoting` which are the same as pacemaker roles for resources. +- `status`: allowed values `active/orphaned/blocked/failed/failureIgnored`, the status of the resource from pacemaker XML. + Additionally, the same resource can have a combination of statuses.
+ +Example: + +``` +ha_cluster_node_resources{managed="true",node_name="1b115",resource_name="cluster_md",role="started",status="active"} 1 +ha_cluster_node_resources{managed="true",node_name="1b115",resource_name="clvm",role="started",status="active"} 1 +ha_cluster_node_resources{managed="true",node_name="1b115",resource_name="dlm",role="started",status="active"} 1 +ha_cluster_node_resources{managed="true",node_name="1b115",resource_name="drbd_passive",role="master",status="active"} 1 +ha_cluster_node_resources{managed="true",node_name="1b115",resource_name="fs_cluster_md",role="started",status="active"} 1 +ha_cluster_node_resources{managed="true",node_name="1b115",resource_name="fs_drbd_passive",role="started",status="active"} 1 +ha_cluster_node_resources{managed="true",node_name="1b115",resource_name="stonith-sbd",role="started",status="active"} 1 +ha_cluster_node_resources{managed="true",node_name="1b115",resource_name="vg_cluster_md",role="started",status="active"} 1 +ha_cluster_node_resources{managed="true",node_name="1b211",resource_name="dlm",role="started",status="active"} 1 +ha_cluster_node_resources{managed="true",node_name="1b211",resource_name="fs_cluster_md",role="stopped",status="active"} 1 +ha_cluster_node_resources{managed="true",node_name="1b211",resource_name="vg_cluster_md",role="stopped",status="active"} 1 +``` + +## ha_cluster_nodes + +- `node_name`: name of cluster node +- `type`: allowed values `online/standby/standby_onfail/maintenance/pending/unclean/shutdown/expected_up/dc/member/ping/remote`. These are the possible node types of a pacemaker HA cluster. + +Again here, when a node or type is absent, it will not be shown. There is no `0` value, since it is a real snapshot from the HA cluster.
+Examples: +``` +ha_cluster_nodes{node_name="1b115",type="dc"} 1 +ha_cluster_nodes{node_name="1b115",type="expected_up"} 1 +ha_cluster_nodes{node_name="1b115",type="member"} 1 +ha_cluster_nodes{node_name="1b115",type="online"} 1 +ha_cluster_nodes{node_name="1b211",type="expected_up"} 1 +ha_cluster_nodes{node_name="1b211",type="member"} 1 +ha_cluster_nodes{node_name="1b211",type="online"} 1 +``` + +## ha_cluster_nodes_configured_total + +Shows the total number of configured nodes in the HA cluster. + +Example: + +``` +ha_cluster_nodes_configured_total 2 +``` + + +## ha_cluster_resources_configured_total + +Shows the total number of resources configured in the HA cluster. +Example: +``` +ha_cluster_resources_configured_total 14 +``` + + +# Corosync + +`TODO` + +# Drbd + +`TODO`@MalloZup + +# SBD + +`TODO`