Skip to content

Commit

Permalink
Merge pull request #19 from ClusterLabs/sbd
Browse files Browse the repository at this point in the history
add SBD device status metrics
  • Loading branch information
MalloZup authored Oct 2, 2019
2 parents f8f9870 + 245fb3d commit 52785eb
Show file tree
Hide file tree
Showing 5 changed files with 325 additions and 7 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
module github.com/MalloZup/ha_cluster_exporter
module github.com/ClusterLabs/ha_cluster_exporter

go 1.12

Expand Down
48 changes: 44 additions & 4 deletions ha_cluster_exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ type resource struct {

var (
// corosync metrics

corosyncRingErrorsTotal = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "corosync_ring_errors_total",
Help: "Total number of ring errors in corosync",
Expand All @@ -93,6 +92,13 @@ var (

// metrics with labels. (prefer these always as guideline)

// sbd metrics
sbdDevStatus = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cluster_sbd_device_status",
Help: "cluster sbd status for each SBD device. 1 is healthy device, 0 is not",
}, []string{"device_name"})

// corosync quorum
corosyncQuorum = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Expand Down Expand Up @@ -122,6 +128,8 @@ func init() {
prometheus.MustRegister(corosyncRingErrorsTotal)
prometheus.MustRegister(corosyncQuorum)
prometheus.MustRegister(corosyncQuorate)
prometheus.MustRegister(sbdDevStatus)

}

// this function is for some cluster metrics which have resource as labels.
Expand Down Expand Up @@ -170,7 +178,39 @@ func main() {

// each group of metrics is handled in its own goroutine, and all of them use the same timeout.

// 1a) set corosync metrics: Ring errors
// set SBD device metrics: on every cycle re-read the SBD configuration,
// probe each configured device and publish 1 (healthy) or 0 (unhealthy)
// in the cluster_sbd_device_status gauge, labelled by device name.
go func() {
	for {
		log.Println("[INFO]: Reading cluster SBD configuration..")
		// read configuration of SBD
		sbdConfiguration, err := readSdbFile()
		if err != nil {
			// keep the underlying cause in the message so the failure can be diagnosed
			log.Panicf("couldn't read SBD /etc/sysconfig/sbd config file: %v", err)
		}
		// retrieve a list of sbd devices
		sbdDevices := getSbdDevices(sbdConfiguration)
		// build a map of sbd devices: true means healthy, false means not
		sbdStatus := setSbdDeviceHealth(sbdDevices)

		if len(sbdStatus) == 0 {
			// no "continue" here: skipping the sleep below would turn this
			// loop into a busy loop that floods the log. Ranging over the
			// empty map is a no-op anyway.
			log.Println("[WARN]: Could not retrieve any sbd device")
		}

		for sbdDev, healthy := range sbdStatus {
			value := float64(0)
			if healthy {
				value = 1
			}
			sbdDevStatus.WithLabelValues(sbdDev).Set(value)
		}

		time.Sleep(time.Duration(int64(*timeoutSeconds)) * time.Second)
	}
}()

// set corosync metrics: Ring errors
go func() {
for {
ringStatus := getCorosyncRingStatus()
Expand All @@ -184,7 +224,7 @@ func main() {
time.Sleep(time.Duration(int64(*timeoutSeconds)) * time.Second)
}
}()
// 1b) set corosync metrics: quorum metrics
// set corosync metrics: quorum metrics
go func() {
for {
quoromStatus := getQuoromClusterInfo()
Expand All @@ -209,7 +249,7 @@ func main() {
time.Sleep(time.Duration(int64(*timeoutSeconds)) * time.Second)
}
}()
// 2) set cluster pacemaker metrics
// set cluster pacemaker metrics
go func() {
for {

Expand Down
56 changes: 56 additions & 0 deletions sbd_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package main

import (
"fmt"
"io/ioutil"
"os"
"os/exec"
"regexp"
"strings"
)

// readSdbFile loads the raw contents of the SBD sysconfig file
// (/etc/sysconfig/sbd).
//
// On success it returns the file contents and a nil error. If the file
// cannot be opened or read it returns a nil slice and an error whose
// message says which of the two steps failed.
func readSdbFile() ([]byte, error) {
	const sbdConfigPath = "/etc/sysconfig/sbd"

	f, openErr := os.Open(sbdConfigPath)
	if openErr != nil {
		return nil, fmt.Errorf("[ERROR] Could not open sbd config file %s", openErr)
	}
	defer f.Close()

	raw, readErr := ioutil.ReadAll(f)
	if readErr == nil {
		return raw, nil
	}
	return nil, fmt.Errorf("[ERROR] Could not read sbd config file %s", readErr)
}

// sbdDeviceAssignment matches an active (not commented out) SBD_DEVICE
// assignment line and captures everything that follows the "=" sign.
// It is compiled once at package scope instead of on every call.
var sbdDeviceAssignment = regexp.MustCompile(`(?m)^[ \t]*SBD_DEVICE=(.*)$`)

// getSbdDevices returns the list of SBD devices configured in the given raw
// SBD configuration.
//
// Both the quoted and the unquoted form are accepted, e.g.
// SBD_DEVICE="/dev/foo" or SBD_DEVICE=/dev/foo;/dev/bro, with ";" separating
// multiple devices. Device paths may contain any characters (digits, "_",
// ".", ":" ...), so paths such as /dev/sda1 or /dev/disk/by-id/... are
// returned unmodified. Commented-out lines (#SBD_DEVICE=...) are ignored.
//
// If the configuration contains no active SBD_DEVICE assignment, or the
// assigned value is empty, an empty (nil) slice is returned.
func getSbdDevices(sbdConfigRaw []byte) []string {
	assignments := sbdDeviceAssignment.FindAllStringSubmatch(string(sbdConfigRaw), -1)
	if len(assignments) == 0 {
		return nil
	}

	// when the variable is assigned more than once the last assignment is
	// used, mirroring how shell-style KEY=VALUE files are evaluated
	value := assignments[len(assignments)-1][1]
	// drop surrounding whitespace (including a stray "\r") and quotes
	value = strings.Trim(strings.TrimSpace(value), `"'`)

	var sbdDevices []string
	for _, dev := range strings.Split(value, ";") {
		// skip empty entries produced by an empty value or a trailing ";"
		if dev = strings.TrimSpace(dev); dev != "" {
			sbdDevices = append(sbdDevices, dev)
		}
	}
	return sbdDevices
}

// setSbdDeviceHealth probes every device in sbdDevices by running
// "sbd -d <device> dump" and returns a map from device name to its health:
// true when the command succeeded, false when it returned an error.
func setSbdDeviceHealth(sbdDevices []string) map[string]bool {
	health := make(map[string]bool)

	for i := range sbdDevices {
		device := sbdDevices[i]
		// the output itself is not needed: only whether the dump succeeded
		_, dumpErr := exec.Command("sbd", "-d", device, "dump").Output()
		health[device] = dumpErr == nil
	}

	return health
}
223 changes: 223 additions & 0 deletions sbd_metrics_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
package main

import (
"fmt"
"testing"
)

// TestReadSbdConfFileError checks the error path of readSdbFile: it relies
// on the SBD config file being absent on the machine running the tests.
func TestReadSbdConfFileError(t *testing.T) {
	fmt.Println("=== Testing SBD : reading config file")
	sbdConfigRaw, err := readSdbFile()

	if sbdConfigRaw != nil {
		t.Errorf("SbdConfig file should be empty in case of error")
	}

	// we expect that in ci we fail to read the config file
	// since there shouldn't be any sbd config in ci
	if err == nil {
		// err is nil here, so there is nothing meaningful to format into the message
		t.Errorf("Error should be not nil, got nil")
	}
}

// TestGetSbdDevicesWithoutDoubleQuotes checks that getSbdDevices extracts
// all devices from an unquoted, ";"-separated SBD_DEVICE assignment placed
// at the end of a realistic config file.
func TestGetSbdDevicesWithoutDoubleQuotes(t *testing.T) {
	fmt.Println("=== Testing SBD devices retrival from config without quotes")

	// this is more or less a full config file; the other tests use shortened versions
	sbdConfig := `
# SBD_DEVICE specifies the devices to use for exchanging sbd messages
# and to monitor. If specifying more than one path, use ";" as
# separator.
#
#SBD_DEVICE=""
## Type: yesno
## Default: yes
#
# Whether to enable the pacemaker integration.
#
SBD_PACEMAKER=yes
## Type: list(always,clean)
## Default: always
#
# Specify the start mode for sbd. Setting this to "clean" will only
# allow sbd to start if it was not previously fenced. See the -S option
# in the man page.
#
SBD_STARTMODE=always
## Type: yesno / integer
## Default: no
#
# Whether to delay after starting sbd on boot for "msgwait" seconds.
# This may be necessary if your cluster nodes reboot so fast that the
# other nodes are still waiting in the fence acknowledgement phase.
# This is an occasional issue with virtual machines.
#
# This can also be enabled by being set to a specific delay value, in
# seconds. Sometimes a longer delay than the default, "msgwait", is
# needed, for example in the cases where it's considered to be safer to
# wait longer than:
# corosync token timeout + consensus timeout + pcmk_delay_max + msgwait
#
# Be aware that the special value "1" means "yes" rather than "1s".
#
# Consider that you might have to adapt the startup-timeout accordingly
# if the default isn't sufficient. (TimeoutStartSec for systemd)
#
# This option may be ignored at a later point, once pacemaker handles
# this case better.
#
SBD_DELAY_START=no
## Type: string
## Default: /dev/watchdog
#
# Watchdog device to use. If set to /dev/null, no watchdog device will
# be used.
#
SBD_WATCHDOG_DEV=/dev/watchdog
## Type: integer
## Default: 5
#
# How long, in seconds, the watchdog will wait before panicking the
# node if no-one tickles it.
#
# This depends mostly on your storage latency; the majority of devices
# must be successfully read within this time, or else the node will
# self-fence.
#
# If your sbd device(s) reside on a multipath setup or iSCSI, this
# should be the time required to detect a path failure.
#
# Be aware that watchdog timeout set in the on-disk metadata takes
# precedence.
#
SBD_WATCHDOG_TIMEOUT=5
## Type: string
## Default: "flush,reboot"
#
# Actions to be executed when the watchers don't timely report to the sbd
# master process or one of the watchers detects that the master process
# has died.
#
# Set timeout-action to comma-separated combination of
# noflush|flush plus reboot|crashdump|off.
# If just one of both is given the other stays at the default.
#
# This doesn't affect actions like off, crashdump, reboot explicitly
# triggered via message slots.
# And it does as well not configure the action a watchdog would
# trigger should it run off (there is no generic interface).
#
SBD_TIMEOUT_ACTION=flush,reboot
## Type: string
## Default: ""
#
# Additional options for starting sbd
#
SBD_OPTS=
SBD_DEVICE=/dev/vdc;/dev/brother;/dev/syster
`

	sbdDevices := getSbdDevices([]byte(sbdConfig))

	// we should have 3 devices; check the length first so that a short
	// result is reported as a test failure instead of an index panic
	expected := []string{"/dev/vdc", "/dev/brother", "/dev/syster"}
	if len(sbdDevices) != len(expected) {
		t.Fatalf("length of SbdDevice should be %d got %d", len(expected), len(sbdDevices))
	}

	for i, want := range expected {
		if sbdDevices[i] != want {
			t.Errorf("sbdDevice %d was incorrect, got: %s, expected: %s ", i, sbdDevices[i], want)
		}
	}
}

// TestGetSbdDevicesWithDoubleQuotes covers the double-quoted form of the
// SBD_DEVICE assignment, placed in the middle of the config file.
func TestGetSbdDevicesWithDoubleQuotes(t *testing.T) {
	fmt.Println("=== Testing SBD devices retrival from config with Double quotes")

	sbdConfig := `## Type: string
## Default: ""
#
# SBD_DEVICE specifies the devices to use for exchanging sbd messages
# and to monitor. If specifying more than one path, use ";" as
# separator.
#
#SBD_DEVICE=""
SBD_WATCHDOG_TIMEOUT=5
SBD_DEVICE="/dev/vdc;/dev/brother;/dev/syster"
SBD_TIMEOUT_ACTION=flush,reboot
## Type: string
## Default: ""
#
# Additional options for starting sbd
#
SBD_OPTS=`

	sbdDevices := getSbdDevices([]byte(sbdConfig))

	// we should have 3 devices; check the length first so that a short
	// result is reported as a test failure instead of an index panic
	expected := []string{"/dev/vdc", "/dev/brother", "/dev/syster"}
	if len(sbdDevices) != len(expected) {
		t.Fatalf("length of SbdDevice should be %d got %d", len(expected), len(sbdDevices))
	}

	for i, want := range expected {
		if sbdDevices[i] != want {
			t.Errorf("sbdDevice %d was incorrect, got: %s, expected: %s ", i, sbdDevices[i], want)
		}
	}
}

// TestOnlyOneDeviceSbd covers a config with a single, unquoted SBD device:
// the result must contain exactly that one device.
func TestOnlyOneDeviceSbd(t *testing.T) {
	fmt.Println("=== Testing Only 1 device")

	sbdConfig := `## Type: string
## Default: ""
SBD_DEVICE=/dev/vdc
## Type: string
## Default: "flush,reboot"
`

	sbdDevices := getSbdDevices([]byte(sbdConfig))

	// we should have 1 device; check the length first so that an empty
	// result is reported as a test failure instead of an index panic
	if len(sbdDevices) != 1 {
		t.Fatalf("length of SbdDevice should be 1 got %d", len(sbdDevices))
	}

	expected := "/dev/vdc"
	if sbdDevices[0] != expected {
		t.Errorf("sbdDevice was incorrect, got: %s, expected: %s ", sbdDevices[0], expected)
	}
}
3 changes: 1 addition & 2 deletions tools/deploy-to-cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,8 @@

# this script is just for deploying the binary to the cluster. Nothing else.

node="[email protected].29.106"
node="[email protected].31.221"

ssh $node "rm /root/ha_cluster_exporter"
echo "copying binary"
scp ha_cluster_exporter $node:

0 comments on commit 52785eb

Please sign in to comment.