Skip to content

Commit

Permalink
Merge pull request #80 from MalloZup/configuration-update
Browse files Browse the repository at this point in the history
implement resource_agent_changes metric
  • Loading branch information
MalloZup authored Oct 29, 2019
2 parents ad595ee + 3b4a0fc commit 67953ef
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 4 deletions.
7 changes: 5 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
default: clean static-checks test build
default: clean static-checks test build post-build

download:
go mod download
Expand Down Expand Up @@ -32,6 +32,9 @@ clean:
go clean
rm -f coverage.out

post-build:
go mod tidy

release:

.PHONY: default download install static-checks vet-check fmt-check test clean release
.PHONY: default download install static-checks vet-check fmt-check test clean release post-build
8 changes: 7 additions & 1 deletion doc/metric_spec.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ The Pacemaker subsystem collects an atomic snapshot of the HA cluster directly f
5. [`ha_cluster_pacemaker_stonith_enabled`](#ha_cluster_pacemaker_stonith_enabled)
6. [`ha_cluster_pacemaker_fail_count`](#ha_cluster_pacemaker_fail_count)
7. [`ha_cluster_pacemaker_migration_threshold`](#ha_cluster_pacemaker_migration_threshold)

8. [`ha_cluster_pacemaker_config_last_change`](#ha_cluster_pacemaker_config_last_change)

### `ha_cluster_pacemaker_nodes`

Expand Down Expand Up @@ -102,6 +102,12 @@ The value can vary from 0, 1 , 5 etc to `+Inf`, that correspond to the infinity
The migration threshold per node and resource ID, as set by a pacemaker cluster.
Possible values are positive numbers.

### `ha_cluster_pacemaker_config_last_change`

#### Description

The relevant part of this metric is its timestamp, which corresponds to the last time Pacemaker configuration changed.
The actual metric value will always be 1 and can be ignored.

## Corosync

Expand Down
16 changes: 16 additions & 0 deletions pacemaker_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"os/exec"
"strconv"
"strings"
"time"

"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
Expand Down Expand Up @@ -34,6 +35,9 @@ type summary struct {
Nodes struct {
Number int `xml:"number,attr"`
} `xml:"nodes_configured"`
LastChange struct {
Time string `xml:"time,attr"`
} `xml:"last_change"`
Resources struct {
Number int `xml:"number,attr"`
Disabled int `xml:"disabled,attr"`
Expand Down Expand Up @@ -89,6 +93,7 @@ var (
"stonith_enabled": NewMetricDesc("pacemaker", "stonith_enabled", "Whether or not stonith is enabled", nil),
"fail_count": NewMetricDesc("pacemaker", "fail_count", "The Fail count number per node and resource id", []string{"node", "resource"}),
"migration_threshold": NewMetricDesc("pacemaker", "migration_threshold", "The migration_threshold number per node and resource id", []string{"node", "resource"}),
"config_last_change": NewMetricDesc("pacemaker", "config_last_change", "Indicate if a configuration of resource has changed in cluster", []string{}),
}

crmMonPath = "/usr/sbin/crm_mon"
Expand Down Expand Up @@ -138,6 +143,7 @@ func (c *pacemakerCollector) Collect(ch chan<- prometheus.Metric) {
c.recordNodeMetrics(pacemakerStatus, ch)
c.recordFailCountMetrics(pacemakerStatus, ch)
c.recordMigrationThresholdMetrics(pacemakerStatus, ch)
c.recordResourceAgentsChanges(pacemakerStatus, ch)
}

func getPacemakerStatus() (pacemakerStatus, error) {
Expand Down Expand Up @@ -237,6 +243,16 @@ func (c *pacemakerCollector) recordFailCountMetrics(pacemakerStatus pacemakerSta
}
}

// recordResourceAgentsChanges emits the "config_last_change" metric.
//
// The sample value is always 1; the meaningful part is the timestamp attached
// to the sample, which is taken from the last_change time reported in the
// Pacemaker status summary. That time is parsed with the time.ANSIC layout,
// which carries no zone information, so time.Parse interprets it as UTC.
//
// If the time cannot be parsed, the error is logged as a warning and no
// metric is sent on ch.
func (c *pacemakerCollector) recordResourceAgentsChanges(pacemakerStatus pacemakerStatus, ch chan<- prometheus.Metric) {
	rawLastChange := pacemakerStatus.Summary.LastChange.Time

	lastChange, parseErr := time.Parse(time.ANSIC, rawLastChange)
	if parseErr != nil {
		log.Warnln(parseErr)
		return
	}

	// Build the constant sample first, then stamp it with the time of the
	// last configuration change instead of the scrape time.
	sample := prometheus.MustNewConstMetric(c.metrics["config_last_change"], prometheus.CounterValue, 1)
	ch <- prometheus.NewMetricWithTimestamp(lastChange, sample)
}

func (c *pacemakerCollector) recordMigrationThresholdMetrics(pacemakerStatus pacemakerStatus, ch chan<- prometheus.Metric) {
for _, node := range pacemakerStatus.NodeHistory.Node {
for _, resHistory := range node.ResourceHistory {
Expand Down
4 changes: 4 additions & 0 deletions pacemaker_metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,10 @@ func TestParsePacemakerXML(t *testing.T) {
t.Errorf("Blocked was incorrect, got: %d, expected: %d ", status.Summary.Resources.Blocked, expected)
}

if status.Summary.LastChange.Time != "Tue Jan 15 22:19:59 2019" {
t.Errorf("Blocked was incorrect, got: %s, expected: Tue Jan 15 22:19:59 2019", status.Summary.LastChange.Time)
}

expected = 2
if status.Summary.Nodes.Number != expected {
t.Errorf("sbdDevice was incorrect, got: %d, expected: %d ", status.Summary.Nodes.Number, expected)
Expand Down
3 changes: 2 additions & 1 deletion test/fake_crm_mon.sh
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ cat <<EOF
</node_history>
<tickets>
</tickets>
<bans>
<bans>
<ban id="cli-ban-msl_SAPHana_PRD_HDB00-on-damadog-hana01" resource="msl_SAPHana_PRD_HDB00" node="damadog-hana01" weight="-1000000" master_only="false" />
</bans>
</crm_mon>
EOF
3 changes: 3 additions & 0 deletions test/pacemaker.metrics
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# HELP ha_cluster_pacemaker_config_last_change Indicate if a configuration of resource has changed in cluster
# TYPE ha_cluster_pacemaker_config_last_change counter
ha_cluster_pacemaker_config_last_change 1 1571399302000
# HELP ha_cluster_pacemaker_fail_count The Fail count number per node and resource id
# TYPE ha_cluster_pacemaker_fail_count gauge
ha_cluster_pacemaker_fail_count{node="hana01",resource="rsc_SAPHanaTopology_PRD_HDB00"} 0 1234
Expand Down

0 comments on commit 67953ef

Please sign in to comment.