Merge branch 'master' into migrating-to-modules

openark · Jul 27, 2021 · fd9918e · fd9918e
2 parents ae7fc1f + 03f032e
commit fd9918e
Show file tree

Hide file tree

Showing 15 changed files with 613 additions and 69 deletions.
diff --git a/docs/configuration-discovery-classifying.md b/docs/configuration-discovery-classifying.md
@@ -10,6 +10,7 @@
   "DataCenterPattern": "",
   "DetectDataCenterQuery": "select substring_index(substring_index(@@hostname, '-',3), '-', -1) as dc",
   "PhysicalEnvironmentPattern": "",
+  "DetectSemiSyncEnforcedQuery": ""
 }
 ```
 
@@ -60,3 +61,75 @@ You will configure data center awareness in one of two methods:
 ### Cluster domain
 
 To a lesser importance, and mostly for visibility, `DetectClusterDomainQuery` should return the VIP or CNAME or otherwise the address of the cluster's master
+
+### Semi-sync topology 
+
+In some environments, it is important to control not only the number of semi-sync replicas, but also whether a replica is a semi-sync or an async replica.
+`orchestrator` can detect an undesired semi-sync configuration and toggle the semi-sync flags 
+`rpl_semi_sync_slave_enabled` and `rpl_semi_sync_master_enabled` to correct the situation.
+
+#### Semi-sync master (`rpl_semi_sync_master_enabled`)
+
+`orchestrator` enables the semi-sync master flag during a master failover (e.g. `DeadMaster`) if `DetectSemiSyncEnforcedQuery` returns a value > 0
+for the new master. `orchestrator` does not trigger any recoveries if the master flag is otherwise changed or incorrectly set.
+
+A semi-sync master can enter two failure scenarios: [`LockedSemiSyncMaster`](failure-detection.md#lockedsemisyncmaster) and 
+[`MasterWithTooManySemiSyncReplicas`](failure-detection.md#masterwithtoomanysemisyncreplicas). `orchestrator` disables the 
+semi-sync master flag on semi-sync replicas during a recovery of either of these two conditions.
+
+#### Semi-sync replicas (`rpl_semi_sync_slave_enabled`)
+
+`orchestrator` can detect if there is an incorrect number of semi-sync replicas in the topology ([`LockedSemiSyncMaster`](failure-detection.md#lockedsemisyncmaster) and
+[`MasterWithTooManySemiSyncReplicas`](failure-detection.md#masterwithtoomanysemisyncreplicas)), and can then correct the situation by enabling/disabling
+the semi-sync replica flags accordingly.
+
+This behavior can be controlled by the following options:
+
+- `DetectSemiSyncEnforcedQuery`: query that returns the semi-sync priority (zero means async replica; higher number means higher priority)
+- `EnforceExactSemiSyncReplicas`: flag that decides whether to enforce a _strict_ semi-sync replica topology. If enabled, the recovery of `LockedSemiSyncMaster` 
+   and `MasterWithTooManySemiSyncReplicas` will enable _and disable_ semi-sync on the replicas to match the desired topology exactly based on the priority order.
+- `RecoverLockedSemiSyncMaster`: flag that decides whether to recover from a `LockedSemiSyncMaster` scenario. If enabled, the recovery of `LockedSemiSyncMaster`
+  will enable _(but never disable)_ semi-sync on the replicas in the priority order to match the master wait count. This option has no effect if 
+  `EnforceExactSemiSyncReplicas` is set. It is only useful if you'd like to only handle a situation in which there are too few semi-sync replicas, 
+  but not if there are too many.
+- `ReasonableLockedSemiSyncMasterSeconds`: number of seconds after which the `LockedSemiSyncMaster` condition is triggered; if not set, falls back to `ReasonableReplicationLagSeconds`
+
+The priority order is defined by `DetectSemiSyncEnforcedQuery` (zero means async replica; higher number is higher priority), the promotion rule (`DetectPromotionRuleQuery`)
+and the hostname (fallback). 
+
+**Example 1**: Enforcing a strict semi-sync replica topology with two replicas and `rpl_semi_sync_master_wait_for_slave_count=1`:
+
+```
+  "DetectSemiSyncEnforcedQuery": "select priority from meta.semi_sync where cluster_member = @@hostname",
+  "EnforceExactSemiSyncReplicas": true
+```
+
+Assuming this topology:
+
+```
+         ,- replica1 (priority = 10, rpl_semi_sync_slave_enabled = 1)
+  master 
+         `- replica2 (priority = 20, rpl_semi_sync_slave_enabled = 1)
+```
+
+`orchestrator` would detect a [`MasterWithTooManySemiSyncReplicas`](failure-detection.md#masterwithtoomanysemisyncreplicas) scenario
+and disable semi-sync on replica2.
+
+**Example 2**: Enforcing a weak semi-sync replica topology with two replicas and `rpl_semi_sync_master_wait_for_slave_count=1`:
+
+```
+  "DetectSemiSyncEnforcedQuery": "select 2586",
+  "DetectPromotionRuleQuery": "select promotion_rule from meta.promotion_rules where cluster_member = @@hostname",
+  "RecoverLockedSemiSyncMaster": true
+```
+
+Assuming this topology:
+
+```
+         ,- replica1 (priority = 2586, promotion rule = prefer, rpl_semi_sync_slave_enabled = 0)
+  master 
+         `- replica2 (priority = 2586, promotion rule = neutral, rpl_semi_sync_slave_enabled = 0)
+```
+
+`orchestrator` would detect a [`LockedSemiSyncMaster`](failure-detection.md#lockedsemisyncmaster) scenario
+and enable semi-sync on replica1.
diff --git a/docs/failure-detection.md b/docs/failure-detection.md
@@ -38,6 +38,7 @@ Observe the following list of potential failures:
 * UnreachableMasterWithLaggingReplicas
 * UnreachableMaster
 * LockedSemiSyncMaster
+* MasterWithTooManySemiSyncReplicas
 * AllMasterReplicasNotReplicating
 * AllMasterReplicasNotReplicatingOrDead
 * DeadCoMaster
@@ -96,15 +97,43 @@ This scenario can happen when the master is overloaded. Clients would see a "Too
 
 `orchestrator` responds to this scenario by restarting replication on all of master's immediate replicas. This will close the old client connections on those replicas and attempt to initiate new ones. These may now fail to connect, leading to a complete replication failure on all replicas. This will next lead `orchestrator` to analyze a `DeadMaster`.
 
-### LockedSemiSyncMaster
+#### `LockedSemiSyncMaster`
 
-1. Master is running with semi-sync enabled
+1. Master is running with semi-sync enabled (`rpl_semi_sync_master_enabled=1`)
 2. Number of connected semi-sync replicas falls short of expected `rpl_semi_sync_master_wait_for_slave_count`
 3. `rpl_semi_sync_master_timeout` is high enough such that master locks writes and does not fall back to asynchronous replication
 
-Remediation can be to disable semi-sync on the master, or to bring up (or enable) sufficient semi-sync replicas.
+This condition only triggers after `ReasonableLockedSemiSyncMasterSeconds` has passed. If `ReasonableLockedSemiSyncMasterSeconds` is not set, 
+it triggers after `ReasonableReplicationLagSeconds`.
 
-At this time `orchestrator` does not invoke processes for this type of analysis.
+Remediation of this condition can be to disable semi-sync on the master, or to bring up (or enable) sufficient semi-sync replicas.
+
+If `EnforceExactSemiSyncReplicas` is enabled, `orchestrator` will determine the desired semi-sync topology and enable/disable semi-sync on the replicas to match it.
+The desired topology is defined by the priority order (see below) and the master wait count.
+
+If `RecoverLockedSemiSyncMaster` is enabled, `orchestrator` will enable (but never disable) semi-sync on the replicas in priority order until
+the number of semi-sync replicas matches the master wait count. Please note that `RecoverLockedSemiSyncMaster` has no effect if `EnforceExactSemiSyncReplicas` is set.
+
+The priority order is defined by `DetectSemiSyncEnforcedQuery` (higher number is higher priority), the promotion rule (`DetectPromotionRuleQuery`) and the hostname (fallback).
+
+If `EnforceExactSemiSyncReplicas` and `RecoverLockedSemiSyncMaster` are both disabled (default), `orchestrator` does not invoke any recovery processes for this type of analysis.
+
+Please also consult the [semi-sync topology](configuration-discovery-classifying.md#semi-sync-topology) documentation for more details.
+
+#### `MasterWithTooManySemiSyncReplicas`
+
+1. Master is running with semi-sync enabled (`rpl_semi_sync_master_enabled=1`)
+2. Number of connected semi-sync replicas is higher than the expected `rpl_semi_sync_master_wait_for_slave_count`
+3. `EnforceExactSemiSyncReplicas` is enabled (this analysis is not triggered if this flag is not enabled)
+
+If `EnforceExactSemiSyncReplicas` is enabled, `orchestrator` will determine the desired semi-sync topology and enable/disable semi-sync on the replicas to match it.
+The desired topology is defined by the priority order and the master wait count.
+
+The priority order is defined by `DetectSemiSyncEnforcedQuery` (higher number is higher priority), the promotion rule (`DetectPromotionRuleQuery`) and the hostname (fallback).
+
+If `EnforceExactSemiSyncReplicas` is disabled (default), `orchestrator` does not invoke any recovery processes for this type of analysis.
+
+Please also consult the [semi-sync topology](configuration-discovery-classifying.md#semi-sync-topology) documentation for more details.
 
 ### Failures of no interest
 

diff --git a/etc/systemd/orchestrator.service b/etc/systemd/orchestrator.service
@@ -9,6 +9,7 @@ WorkingDirectory=/usr/local/orchestrator
 ExecStart=/usr/local/orchestrator/orchestrator http
 EnvironmentFile=-/etc/sysconfig/orchestrator
 ExecReload=/bin/kill -HUP $MAINPID
+LimitNOFILE=16384
 
 [Install]
 WantedBy=multi-user.target
diff --git a/go/app/cli.go b/go/app/cli.go
@@ -450,7 +450,7 @@ func Cli(command string, strict bool, instance string, destination string, owner
 			}
 			validateInstanceIsFound(instanceKey)
 
-			lostReplicas, movedReplicas, cannotReplicateReplicas, promotedReplica, err := inst.RegroupReplicasGTID(instanceKey, false, func(candidateReplica *inst.Instance) { fmt.Println(candidateReplica.Key.DisplayString()) }, postponedFunctionsContainer, nil)
+			lostReplicas, movedReplicas, cannotReplicateReplicas, promotedReplica, err := inst.RegroupReplicasGTID(instanceKey, false, true, func(candidateReplica *inst.Instance) { fmt.Println(candidateReplica.Key.DisplayString()) }, postponedFunctionsContainer, nil)
 			lostReplicas = append(lostReplicas, cannotReplicateReplicas...)
 
 			if promotedReplica == nil {

diff --git a/go/config/config.go b/go/config/config.go
@@ -136,10 +136,11 @@ type Configuration struct {
 	DefaultInstancePort                        int      // In case port was not specified on command line
 	SlaveLagQuery                              string   // Synonym to ReplicationLagQuery
 	ReplicationLagQuery                        string   // custom query to check on replica lg (e.g. heartbeat table). Must return a single row with a single numeric column, which is the lag.
-	ReplicationCredentialsQuery                string   // custom query to get replication credentials. Must return a single row, with two text columns: 1st is username, 2nd is password. This is optional, and can be used by orchestrator to configure replication after master takeover or setup of co-masters. You need to ensure the orchestrator user has the privileges to run this query
+	ReplicationCredentialsQuery                string   // custom query to get replication credentials. Must return a single row, with five text columns: 1st is username, 2nd is password, 3rd is SSLCaCert, 4th is SSLCert, 5th is SSLKey. This is optional, and can be used by orchestrator to configure replication after master takeover or setup of co-masters. You need to ensure the orchestrator user has the privileges to run this query
 	DiscoverByShowSlaveHosts                   bool     // Attempt SHOW SLAVE HOSTS before PROCESSLIST
 	UseSuperReadOnly                           bool     // Should orchestrator super_read_only any time it sets read_only
 	InstancePollSeconds                        uint     // Number of seconds between instance reads
+	ReasonableInstanceCheckSeconds             uint     // Number of seconds an instance read is allowed to take before it is considered invalid, i.e. before LastCheckValid will be false
 	InstanceWriteBufferSize                    int      // Instance write buffer size (max number of instances to flush in one INSERT ODKU)
 	BufferInstanceWrites                       bool     // Set to 'true' for write-optimization on backend table (compromise: writes can be stale and overwrite non stale data)
 	InstanceFlushIntervalMilliseconds          int      // Max interval between instance write buffer flushes
@@ -262,7 +263,7 @@ type Configuration struct {
 	GraphitePollSeconds                        int               // Graphite writes interval. 0 disables.
 	URLPrefix                                  string            // URL prefix to run orchestrator on non-root web path, e.g. /orchestrator to put it behind nginx.
 	DiscoveryIgnoreReplicaHostnameFilters      []string          // Regexp filters to apply to prevent auto-discovering new replicas. Usage: unreachable servers due to firewalls, applications which trigger binlog dumps
-	DiscoveryIgnoreMasterHostnameFilters       []string          // Regexp filters to apply to prevent auto-discovering a master. Usage: pointing your master temporarily to replicate seom data from external host
+	DiscoveryIgnoreMasterHostnameFilters       []string          // Regexp filters to apply to prevent auto-discovering a master. Usage: pointing your master temporarily to replicate some data from external host
 	DiscoveryIgnoreHostnameFilters             []string          // Regexp filters to apply to prevent discovering instances of any kind
 	ConsulAddress                              string            // Address where Consul HTTP api is found. Example: 127.0.0.1:8500
 	ConsulScheme                               string            // Scheme (http or https) for Consul
@@ -274,6 +275,9 @@ type Configuration struct {
 	KVClusterMasterPrefix                      string            // Prefix to use for clusters' masters entries in KV stores (internal, consul, ZK), default: "mysql/master"
 	WebMessage                                 string            // If provided, will be shown on all web pages below the title bar
 	MaxConcurrentReplicaOperations             int               // Maximum number of concurrent operations on replicas
+	EnforceExactSemiSyncReplicas               bool              // If true, semi-sync replicas will be enabled/disabled to match the wait count in the desired priority order; this applies to LockedSemiSyncMaster and MasterWithTooManySemiSyncReplicas
+	RecoverLockedSemiSyncMaster                bool              // If true, orchestrator will recover from a LockedSemiSyncMaster state by enabling semi-sync on replicas to match the wait count; this behavior can be overridden by EnforceExactSemiSyncReplicas
+	ReasonableLockedSemiSyncMasterSeconds      uint              // Time to evaluate the LockedSemiSyncMasterHypothesis before triggering the LockedSemiSyncMaster analysis; falls back to ReasonableReplicationLagSeconds if not set
 }
 
 // ToJSONString will marshal this configuration as JSON
@@ -320,6 +324,7 @@ func newConfiguration() *Configuration {
 		DefaultInstancePort:                        3306,
 		TLSCacheTTLFactor:                          100,
 		InstancePollSeconds:                        5,
+		ReasonableInstanceCheckSeconds:             1,
 		InstanceWriteBufferSize:                    100,
 		BufferInstanceWrites:                       false,
 		InstanceFlushIntervalMilliseconds:          100,
@@ -444,6 +449,9 @@ func newConfiguration() *Configuration {
 		KVClusterMasterPrefix:                      "mysql/master",
 		WebMessage:                                 "",
 		MaxConcurrentReplicaOperations:             5,
+		EnforceExactSemiSyncReplicas:               false,
+		RecoverLockedSemiSyncMaster:                false,
+		ReasonableLockedSemiSyncMasterSeconds:      0,
 	}
 }
 
@@ -606,6 +614,9 @@ func (this *Configuration) postReadAdjustments() error {
 	} else if this.ConsulMaxKVsPerTransaction > ConsulMaxTransactionOps {
 		this.ConsulMaxKVsPerTransaction = ConsulMaxTransactionOps
 	}
+	if this.ReasonableLockedSemiSyncMasterSeconds == 0 {
+		this.ReasonableLockedSemiSyncMasterSeconds = uint(this.ReasonableReplicationLagSeconds)
+	}
 
 	return nil
 }

diff --git a/go/http/api.go b/go/http/api.go
@@ -1157,7 +1157,7 @@ func (this *HttpAPI) RegroupReplicasGTID(params martini.Params, r render.Render,
 		return
 	}
 
-	lostReplicas, movedReplicas, cannotReplicateReplicas, promotedReplica, err := inst.RegroupReplicasGTID(&instanceKey, false, nil, nil, nil)
+	lostReplicas, movedReplicas, cannotReplicateReplicas, promotedReplica, err := inst.RegroupReplicasGTID(&instanceKey, false, true, nil, nil, nil)
 	lostReplicas = append(lostReplicas, cannotReplicateReplicas...)
 
 	if err != nil {

diff --git a/go/inst/analysis.go b/go/inst/analysis.go
@@ -40,6 +40,7 @@ const (
 	AllMasterReplicasNotReplicatingOrDead                                = "AllMasterReplicasNotReplicatingOrDead"
 	LockedSemiSyncMasterHypothesis                                       = "LockedSemiSyncMasterHypothesis"
 	LockedSemiSyncMaster                                                 = "LockedSemiSyncMaster"
+	MasterWithTooManySemiSyncReplicas                                    = "MasterWithTooManySemiSyncReplicas"
 	MasterWithoutReplicas                                                = "MasterWithoutReplicas"
 	DeadCoMaster                                                         = "DeadCoMaster"
 	DeadCoMasterAndSomeReplicas                                          = "DeadCoMasterAndSomeReplicas"
@@ -228,5 +229,5 @@ func (this *ReplicationAnalysis) GetAnalysisInstanceType() AnalysisInstanceType
 // ValidSecondsFromSeenToLastAttemptedCheck returns the maximum allowed elapsed time
 // between last_attempted_check to last_checked before we consider the instance as invalid.
 func ValidSecondsFromSeenToLastAttemptedCheck() uint {
-	return config.Config.InstancePollSeconds + 1
+	return config.Config.InstancePollSeconds + config.Config.ReasonableInstanceCheckSeconds
 }
diff --git a/go/inst/analysis_dao.go b/go/inst/analysis_dao.go
@@ -55,7 +55,7 @@ func initializeAnalysisDaoPostConfiguration() {
 func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) ([]ReplicationAnalysis, error) {
 	result := []ReplicationAnalysis{}
 
-	args := sqlutils.Args(config.Config.ReasonableReplicationLagSeconds, ValidSecondsFromSeenToLastAttemptedCheck(), config.Config.ReasonableReplicationLagSeconds, clusterName)
+	args := sqlutils.Args(config.Config.ReasonableLockedSemiSyncMasterSeconds, ValidSecondsFromSeenToLastAttemptedCheck(), config.Config.ReasonableReplicationLagSeconds, clusterName)
 	analysisQueryReductionClause := ``
 
 	if config.Config.ReduceReplicationAnalysisCount {
@@ -531,6 +531,10 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints)
 					a.Description = "Semi sync master seems to be locked, more samplings needed to validate"
 				}
 				//
+			} else if config.Config.EnforceExactSemiSyncReplicas && a.IsMaster && a.SemiSyncMasterEnabled && a.SemiSyncMasterStatus && a.SemiSyncMasterWaitForReplicaCount > 0 && a.SemiSyncMasterClients > a.SemiSyncMasterWaitForReplicaCount {
+				a.Analysis = MasterWithTooManySemiSyncReplicas
+				a.Description = "Semi sync master has more semi sync replicas than configured"
+				//
 			} else if a.IsMaster && a.LastCheckValid && a.IsReadOnly && a.CountValidReplicatingReplicas > 0 && config.Config.RecoverNonWriteableMaster {
 				a.Analysis = NoWriteableMasterStructureWarning
 				a.Description = "Master with replicas is read_only"

diff --git a/go/inst/instance.go b/go/inst/instance.go
@@ -94,7 +94,7 @@ type Instance struct {
 	HasReplicationCredentials         bool
 	ReplicationCredentialsAvailable   bool
 	SemiSyncAvailable                 bool // when both semi sync plugins (master & replica) are loaded
-	SemiSyncEnforced                  bool
+	SemiSyncPriority                  uint // higher value means higher priority, zero means async replica
 	SemiSyncMasterEnabled             bool
 	SemiSyncReplicaEnabled            bool
 	SemiSyncMasterTimeout             uint64