From 68d88a7ff3e5efe4cc924e04c578faaa7c170526 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger
Date: Wed, 24 Aug 2016 16:57:09 +0200
Subject: [PATCH] avoid some probably unnecessary watchdog reboots with
 pacemaker_remote by using the knowledge of the CIB most recently received
 before a connection loss

---
 src/sbd-pacemaker.c | 81 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 78 insertions(+), 3 deletions(-)

diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c
index b6a8fb6..d506ab9 100644
--- a/src/sbd-pacemaker.c
+++ b/src/sbd-pacemaker.c
@@ -94,6 +94,7 @@ mon_timer_reconnect(gpointer data)
         cl_log(LOG_INFO, "CIB reconnect successful");
     }
 
+    notify_parent();
     return FALSE;
 }
 
@@ -101,11 +102,71 @@ static void
 mon_cib_connection_destroy(gpointer user_data)
 {
     if (cib) {
-        cib->cmds->signoff(cib);
-        set_servant_health(pcmk_health_transient, LOG_WARNING, "Disconnected from CIB");
-        timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL);
+        int new_servant_health = pcmk_health_transient;
+        xmlNode *cib_copy = NULL;
+        pe_working_set_t data_set;
+        node_t *node;
+
+        /* Use the last CIB we received to check whether we are
+         * running on a remote node without active resources.
+         */
+        if (current_cib) {
+            cib_copy = copy_xml(current_cib);
+            if (cli_config_update(&cib_copy, NULL, FALSE)) {
+                last_refresh = time(NULL);
+                set_working_set_defaults(&data_set);
+                data_set.input = cib_copy;
+                data_set.flags |= pe_flag_have_stonith_resource;
+                cluster_status(&data_set);
+                compute_status(&data_set);
+                node = pe_find_node(data_set.nodes, local_uname);
+                if ((node != NULL) &&
+                    (node->details->type == node_remote) &&
+                    (node->details->running_rsc == NULL)) {
+
+                    if (data_set.no_quorum_policy != no_quorum_suicide) {
+                        new_servant_health = pcmk_health_online;
+                        cl_log(LOG_INFO,
+                               "found remote node without remaining resources");
+                    } else {
+                        /* Assume no-quorum-policy was set to suicide
+                         * in the hope of regaining connectivity by
+                         * rebooting, or because something might be
+                         * wrong with this node itself.
+                         * An alternative would be to make this
+                         * configurable for remote nodes independently
+                         * in the sbd config, via a cluster property,
+                         * or per remote node as a resource attribute.
+                         */
+                        cl_log(LOG_INFO,
+                               "found remote node without remaining resources - "
+                               "still suiciding due to no-quorum-policy");
+                    }
+                } else {
+                    cl_log(LOG_INFO, "assuming that there are still "
+                                     "resources to be handled");
+                }
+                cleanup_calculations(&data_set);
+            } else {
+                cl_log(LOG_INFO, "cli_config_update failed");
+            }
+        } else {
+            cl_log(LOG_INFO, "found cib on destroy but current_cib is empty");
+        }
+        cib->cmds->signoff(cib);
+        set_servant_health(new_servant_health, LOG_WARNING,
+                           "Disconnected from CIB");
+        timer_id_reconnect = g_timeout_add(reconnect_msec,
+                                           mon_timer_reconnect, NULL);
     }
     cib_connected = 0;
+    if (current_cib) {
+        /* Clean up the copy and prevent the status from being
+         * evaluated again based on the stale data.
+         */
+        free_xml(current_cib);
+        current_cib = NULL;
+    }
     return;
 }
 
@@ -204,6 +265,18 @@ compute_status(pe_working_set_t * data_set)
 
     updates++;
 
+    if (node != NULL) {
+        if (node->details->type == node_remote) {
+            cl_log(LOG_INFO,
+                   "'%s' is a remote-node that has %sresources running",
+                   local_uname, (node->details->running_rsc == NULL)?"no ":"");
+        } else {
+            cl_log(LOG_INFO, "'%s' is a full cluster-node", local_uname);
+        }
+    } else {
+        cl_log(LOG_INFO, "didn't find ourselves (%s) in the cib", local_uname);
+    }
+
     if (data_set->dc_node == NULL) {
         set_servant_health(pcmk_health_transient, LOG_INFO, "We don't have a DC right now.");
         notify_parent();
@@ -406,8 +479,10 @@ servant_pcmk(const char *diskname, int mode, const void* argp)
         } while (exit_code == -ENOTCONN);
 
         if (exit_code != 0) {
+            cl_log(LOG_WARNING, "Failed to connect to the cib");
             clean_up(-exit_code);
         }
+        cl_log(LOG_INFO, "Connected to cib");
     }
 
     mainloop = g_main_new(FALSE);
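
Not part of the patch: a minimal standalone sketch of the decision the
mon_cib_connection_destroy() hunk makes, using only the pengine calls
already present above. The function name remote_node_idle_in_cached_cib
is made up for illustration.

    /* Sketch only: returns TRUE when the cached CIB says we are a
     * remote node with no resources running and no-quorum-policy is
     * not suicide, i.e. the case where the patch stays online instead
     * of letting the watchdog fire.
     */
    static gboolean
    remote_node_idle_in_cached_cib(xmlNode *cached_cib, const char *uname)
    {
        gboolean idle = FALSE;
        xmlNode *cib_copy = copy_xml(cached_cib);
        pe_working_set_t data_set;
        node_t *node;

        if (!cli_config_update(&cib_copy, NULL, FALSE)) {
            return FALSE;   /* cannot evaluate -> err on the safe side */
        }
        set_working_set_defaults(&data_set);
        data_set.input = cib_copy;
        data_set.flags |= pe_flag_have_stonith_resource;
        cluster_status(&data_set);
        node = pe_find_node(data_set.nodes, uname);
        idle = (node != NULL)
               && (node->details->type == node_remote)
               && (node->details->running_rsc == NULL)
               && (data_set.no_quorum_policy != no_quorum_suicide);
        cleanup_calculations(&data_set);   /* also frees data_set.input */
        return idle;
    }

With a helper along these lines, the destroy callback reduces to picking
pcmk_health_online versus pcmk_health_transient before signing off from
the CIB.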