-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathrestore-etcd-single.sh
294 lines (256 loc) · 11.8 KB
/
restore-etcd-single.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
#!/bin/bash
# Colour escape sequences used by the status helpers in this script.
# They default to empty strings so messages still print cleanly when tput
# is not installed or cannot resolve the terminal type (for example when
# TERM is unset); tput's own error output is suppressed for the same reason.
red=''
green=''
reset=''
if hash tput 2>/dev/null; then
  red=$(tput setaf 1 2>/dev/null) || red=''
  green=$(tput setaf 2 2>/dev/null) || green=''
  reset=$(tput sgr0 2>/dev/null) || reset=''
fi
# Print a message wrapped in the ${green} and ${reset} escape sequences.
# Globals:   green, reset (read)
# Arguments: $1 - message text
# Outputs:   the decorated message on stdout
grecho() {
  local message=$1
  echo "${green}${message}${reset}"
}
# Print a message wrapped in the ${red} and ${reset} escape sequences.
# Globals:   red, reset (read)
# Arguments: $1 - message text
# Outputs:   the decorated message on stdout
recho() {
  local message=$1
  echo "${red}${message}${reset}"
}
# Timestamp used as the suffix of every backup directory, file and container
# name created during this run (format: YYYY-MM-DD--HHMMSS).
# The exit status of date is tested directly on the assignment; the message no
# longer carries a stray backslash before the question mark.
if ! ETCD_BACKUP_TIME=$(date +%Y-%m-%d--%H%M%S); then
  grecho "Setting timestamp failed, does the \"date\" command exist?"
  exit 1
fi
# Run a command, prefixing it with sudo when the script is not running as root.
# Globals:   EUID (read)
# Arguments:
#   Either a single string that is split on whitespace into the command and
#   its arguments (the form used throughout this script; glob characters in
#   it are expanded by the current shell), or the command and its arguments
#   as separate words, which keeps arguments containing spaces intact.
# Outputs:   a notice on stdout when sudo is used, plus the command's output
# Returns:   the exit status of the command (or of sudo)
rootcmd() {
  local -a cmd
  if (($# > 1)); then
    cmd=("$@")
  else
    # Legacy single-string form: word splitting and globbing are intentional.
    # shellcheck disable=SC2206
    cmd=($1)
  fi
  if [[ $EUID -ne 0 ]]; then
    grecho "Running as non root user, issuing command with sudo."
    sudo "${cmd[@]}"
  else
    "${cmd[@]}"
  fi
}
# Abort the script when the command or pipeline that ran immediately before
# this call failed. It reads PIPESTATUS, so it must be invoked directly after
# the command being checked with nothing in between.
# Globals:   PIPESTATUS, green, reset (read)
# Arguments:
#   $1 - message printed before exiting
#   $2 - optional index of the pipeline stage to check (default 0)
# Outputs:   the message on stdout, wrapped in ${green}/${reset}
# Exits:     with status 1 when the selected status is anything other than 0
#            (this includes an index beyond the end of the pipeline)
checkpipecmd() {
  # Snapshot PIPESTATUS first: the expansion is evaluated before `local`
  # itself runs, so it still describes the caller's previous command.
  local -a statuses=("${PIPESTATUS[@]}")
  local message=$1
  local stage=${2:-0}
  if [[ "${statuses[${stage}]}" != "0" ]]; then
    echo "${green}${message}${reset}"
    exit 1
  fi
}
# Pre-flight: validate the arguments, locate the host directories and clear
# leftovers from earlier runs.
#
# The usage check comes first so that asking for help, or forgetting the
# argument, exits before anything on the host is moved or deleted.
#Help menu
USAGE='To restore a snapshot: ./restore-etcd-single.sh </path/to/snapshot>
To restore lost quorum to a single node and remove other members without a snapshot: ./restore-etcd-single.sh FORCE_NEW_CLUSTER'
show_usage=no
if [[ $# -eq 0 || -z "$1" ]]; then
  show_usage=yes
fi
for arg in "$@"; do
  case "${arg}" in
    -h | --help) show_usage=yes ;;
  esac
done
if [[ "${show_usage}" == "yes" ]]; then
  grecho "${USAGE}"
  exit 1
fi

# Locate the etcd data directory; the /opt/rke location is checked first.
if [[ -d "/opt/rke/var/lib/etcd" ]]; then
  ETCD_DIR="/opt/rke/var/lib/etcd"
elif [[ -d "/var/lib/etcd" ]]; then
  ETCD_DIR="/var/lib/etcd"
else
  grecho "Unable to locate an etcd directory, exiting script!"
  exit 1
fi
grecho "Found ${ETCD_DIR}, setting ETCD_DIR to this value"

# Locate the kubernetes certificate directory the same way.
if [[ -d "/opt/rke/etc/kubernetes" ]]; then
  CERT_DIR="/opt/rke/etc/kubernetes"
elif [[ -d "/etc/kubernetes" ]]; then
  CERT_DIR="/etc/kubernetes"
else
  grecho "Unable to locate the kubernetes certificate directory, exiting script!"
  exit 1
fi
grecho "Found ${CERT_DIR}, setting CERT_DIR to this value"

# /opt/rke/etcd is the directory the snapshot restore writes into, so an
# existing one is moved aside under a timestamped name.
if [[ -d "/opt/rke/etcd" ]]; then
  grecho "/opt/rke/etcd exists, moving it to /opt/rke/etcd--${ETCD_BACKUP_TIME}."
  rootcmd "mv /opt/rke/etcd /opt/rke/etcd--${ETCD_BACKUP_TIME}"
fi

# The rest of the script is driven by the existing "etcd" container.
if [[ "$(docker ps -a --filter "name=^/etcd$" --format '{{.Names}}')" != "etcd" ]]; then
  grecho "etcd container does not exist, script cannot proceed. Check docker ps -a for old containers and rename one of them back to etcd."
  exit 1
fi

# Remove helper containers left behind by a previous run.
# checkpipecmd must stay directly after the docker command it checks.
if [[ "$(docker ps -a --filter "name=^/etcd-restore$" --format '{{.Names}}')" == "etcd-restore" ]]; then
  grecho "etcd-restore container exists, deleting container"
  docker rm -f etcd-restore
  checkpipecmd "Unable to delete etcd-restore, exiting script!"
fi
if [[ "$(docker ps -a --filter "name=^/etcd-reinit$" --format '{{.Names}}')" == "etcd-reinit" ]]; then
  grecho "etcd-reinit container exists, deleting container"
  docker rm -f etcd-reinit
  checkpipecmd "Unable to delete etcd-reinit, exiting script!"
fi
# Select the operating mode from the first argument. For a snapshot restore,
# stage the snapshot as ${CERT_DIR}/snapshot.db.
if [[ $1 == 'FORCE_NEW_CLUSTER' ]]; then
  FORCE_NEW_CLUSTER=yes
else
  RESTORE_SNAPSHOT=$1
  #check if image exists (the listing is also shown to the operator)
  if ! ls -lash "${RESTORE_SNAPSHOT}"; then
    grecho "Image ${RESTORE_SNAPSHOT} does not exist, aborting script!"
    exit 1
  fi
  #check if zip file and extract if it is
  # Only a trailing ".zip" counts; a ".zip" elsewhere in the path does not.
  if [[ "${RESTORE_SNAPSHOT}" == *.zip ]]; then
    if ! hash unzip 2>/dev/null; then
      grecho '!!!unzip was not found!!!'
      exit 1
    fi
    grecho "Zipped snapshot detected, unzipping ${RESTORE_SNAPSHOT}..."
    unzip -o "${RESTORE_SNAPSHOT}"
    RESULT="$?"
    # An unzip exit status of 1 is tolerated; anything higher aborts.
    if [[ "$RESULT" -gt "1" ]]; then
      grecho "Unzip returned exit code higher than 1 which indicates a failure. Exiting script!"
      exit 1
    else
      grecho "${RESTORE_SNAPSHOT} unzipped successfully!"
    fi
    # The archive is extracted into the current directory and is expected to
    # hold the snapshot as backup/<name>. Use only the file name so that a
    # snapshot given with a directory component (for example an absolute
    # path) is still found, and point RESTORE_SNAPSHOT at the moved file.
    snapshot_name=${RESTORE_SNAPSHOT##*/}
    snapshot_name=${snapshot_name%.zip}
    mv "./backup/${snapshot_name}" .
    checkpipecmd "Failed to move snapshot to current directory!"
    RESTORE_SNAPSHOT="./${snapshot_name}"
  fi
  #move stale snapshot out of way if it exists
  if [[ -f "${CERT_DIR}/snapshot.db" ]]; then
    recho "Found stale snapshot at ${CERT_DIR}/snapshot.db, moving it out of the way to ${CERT_DIR}/snapshot.db--${ETCD_BACKUP_TIME}"
    rootcmd "mv ${CERT_DIR}/snapshot.db ${CERT_DIR}/snapshot.db--${ETCD_BACKUP_TIME}"
  fi
  #copy snapshot into place
  # rootcmd splits its single string argument on whitespace, so snapshot
  # paths containing spaces are not supported here.
  recho "Copying ${RESTORE_SNAPSHOT} to ${CERT_DIR}/snapshot.db"
  rootcmd "cp ${RESTORE_SNAPSHOT} ${CERT_DIR}/snapshot.db"
  checkpipecmd "Failed to copy ${RESTORE_SNAPSHOT} to ${CERT_DIR}/snapshot.db, aborting script!"
fi
# Capture the output of the patrick0057/runlike image run against the "etcd"
# container. Later sections extract individual settings from this text with
# sed and eval an edited copy of it, so it is presumably the complete
# "docker run ..." command line for that container.
RUNLIKE=$(docker run --rm -v /var/run/docker.sock:/var/run/docker.sock patrick0057/runlike etcd)
# checkpipecmd reads PIPESTATUS, so it has to stay directly after the command it checks.
checkpipecmd "runlike container failed to run, aborting script!"
# Disable the restart policy before stopping the container (the result of this update is not checked).
recho "Setting etcd restart policy to never restart \"no\""
docker update --restart=no etcd
# Keep the original container around under a timestamped name instead of deleting it.
recho "Renaming original etcd container to etcd-old--${ETCD_BACKUP_TIME}"
docker rename etcd etcd-old--"${ETCD_BACKUP_TIME}"
checkpipecmd "Failed to rename etcd to etcd-old--${ETCD_BACKUP_TIME}, aborting script!"
recho "Stopping original etcd container"
docker stop etcd-old--${ETCD_BACKUP_TIME}
checkpipecmd "Failed to stop etcd-old--${ETCD_BACKUP_TIME}"
# Preserve the current etcd data directory under ${ETCD_DIR}-old--<timestamp>
# before the cluster is rebuilt.
if [[ "${FORCE_NEW_CLUSTER}" == "yes" ]]; then
# FORCE_NEW_CLUSTER mode: the data is copied, so the original files stay in ${ETCD_DIR}.
recho "Copying old etcd data directory ${ETCD_DIR} to ${ETCD_DIR}-old--${ETCD_BACKUP_TIME}"
rootcmd "cp -arfv ${ETCD_DIR} ${ETCD_DIR}-old--${ETCD_BACKUP_TIME}"
checkpipecmd "Failed to copy ${ETCD_DIR} to ${ETCD_DIR}-old--${ETCD_BACKUP_TIME}, aborting script!"
else
# Snapshot mode: the data is moved out so that ${ETCD_DIR} is empty for the restored files.
recho "Moving old etcd data from ${ETCD_DIR} to ${ETCD_DIR}-old--${ETCD_BACKUP_TIME}"
rootcmd "mkdir ${ETCD_DIR}-old--${ETCD_BACKUP_TIME}"
checkpipecmd "Failed to created backup etcd directory, exiting script!"
# NOTE(review): ls -A and the ${ETCD_DIR}/* glob are both evaluated by the
# invoking user (rootcmd expands the glob before handing the words to sudo),
# and the glob skips dot-files that ls -A counts - confirm this is acceptable
# when the script is not run as root.
if [[ "$(ls -A ${ETCD_DIR})" ]]; then
recho "${ETCD_DIR} is not empty, moving files out into ${ETCD_DIR}-old--${ETCD_BACKUP_TIME}"
rootcmd "mv ${ETCD_DIR}/* ${ETCD_DIR}-old--${ETCD_BACKUP_TIME}/"
checkpipecmd "Failed to move etcd data files to backup directory ${ETCD_DIR}/* -> ${ETCD_DIR}-old--${ETCD_BACKUP_TIME}/, exiting script!"
else
grecho "${ETCD_DIR} is empty, no need to move any files out."
fi
fi
# Extract individual settings from the captured RUNLIKE text.
# Each sed expression keeps the text that follows the LAST occurrence of its
# marker, up to the next space. When a marker is absent sed leaves its input
# unchanged, so the variable then holds the whole RUNLIKE text.
ETCD_HOSTNAME=$(sed 's,^.*--hostname=\([^ ]*\).*,\1,g' <<<${RUNLIKE})
# Fixed endpoint value passed to the restore container's environment.
ETCDCTL_ENDPOINT="https://0.0.0.0:2379"
ETCDCTL_CACERT=$(sed 's,^.*ETCDCTL_CACERT=\([^ ]*\).*,\1,g' <<<${RUNLIKE})
ETCDCTL_CERT=$(sed 's,^.*ETCDCTL_CERT=\([^ ]*\).*,\1,g' <<<${RUNLIKE})
ETCDCTL_KEY=$(sed 's,^.*ETCDCTL_KEY=\([^ ]*\).*,\1,g' <<<${RUNLIKE})
# NOTE(review): this only matches images named rancher/coreos-etcd:<tag>; for
# any other image name ETCD_VERSION would hold the whole RUNLIKE text, which
# the later --force-new-cluster insertion depends on - confirm.
ETCD_VERSION=$(sed 's,^.*rancher/coreos-etcd:\([^ ]*\).*,\1,g' <<<${RUNLIKE})
INITIAL_ADVERTISE_PEER_URL=$(sed 's,^.*initial-advertise-peer-urls=\([^ ]*\).*,\1,g' <<<${RUNLIKE})
# "name=" also occurs inside --hostname= and the container's --name=; the
# greedy match picks whichever occurrence comes last in the text.
# NOTE(review): presumably the etcd member's --name argument is last - verify.
ETCD_NAME=$(sed 's,^.*name=\([^ ]*\).*,\1,g' <<<${RUNLIKE})
# Reduce --initial-cluster to the single "<ETCD_NAME>=<url>" entry for this member.
INITIAL_CLUSTER=$(sed 's,^.*--initial-cluster=.*\('"${ETCD_NAME}"'\)=\([^,^ ]*\).*,\1=\2,g' <<<${RUNLIKE})
#ETCD_SNAPSHOT_LOCATION="snapshot.db"
INITIAL_CLUSTER_TOKEN=$(sed 's,^.*initial-cluster-token=\([^ ]*\).*,\1,g' <<<${RUNLIKE})
# The image is read from the renamed original container rather than parsed from RUNLIKE.
ETCD_IMAGE="$(docker inspect etcd-old--${ETCD_BACKUP_TIME} --format='{{.Config.Image}}')"
grecho "ETCD_IMAGE set to ${ETCD_IMAGE}"
# Snapshot mode only: run "etcdctl snapshot restore" in a temporary
# etcd-restore container and move the restored data into ${ETCD_DIR}.
if [[ "${FORCE_NEW_CLUSTER}" != "yes" ]]; then
# The command is assembled as one multi-line string: the single-quoted
# segments are literal text and the variables between them are expanded now.
# The snapshot is read from /etc/kubernetes/snapshot.db (mounted from
# ${CERT_DIR}) and restored into /opt/rke/etcd, which the /opt/rke volume
# maps onto the host.
RESTORE_RUNLIKE='docker run
--name=etcd-restore
--hostname='${ETCD_HOSTNAME}'
--env="ETCDCTL_API=3"
--env="ETCDCTL_ENDPOINT='${ETCDCTL_ENDPOINT}'"
--env="ETCDCTL_CACERT='${ETCDCTL_CACERT}'"
--env="ETCDCTL_CERT='${ETCDCTL_CERT}'"
--env="ETCDCTL_KEY='${ETCDCTL_KEY}'"
--env="PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
--volume="'${ETCD_DIR}':/var/lib/rancher/etcd/:z"
--volume="'${CERT_DIR}':/etc/kubernetes:z"
--volume="/opt/rke:/opt/rke:z"
--network=host
--label io.rancher.rke.container.name="etcd"
-ti '${ETCD_IMAGE}' /usr/local/bin/etcdctl snapshot restore /etc/kubernetes/snapshot.db
--initial-advertise-peer-urls='${INITIAL_ADVERTISE_PEER_URL}'
--initial-cluster='${INITIAL_CLUSTER}'
--initial-cluster-token='${INITIAL_CLUSTER_TOKEN}'
--data-dir=/opt/rke/etcd
--name='${ETCD_NAME}''
#RESTORE ETCD
recho "Restoring etcd snapshot with the following command:"
# The unquoted expansions below let word splitting turn the embedded newlines
# into single spaces, so the command is printed and evaluated as one line.
echo ${RESTORE_RUNLIKE}
eval ${RESTORE_RUNLIKE}
# Must directly follow the eval: checkpipecmd reads its exit status from PIPESTATUS.
checkpipecmd "Failed to restore etcd snapshot!"
#grecho "Sleeping for 10 seconds so etcd can do its restore"
#sleep 10
recho "Stopping etcd-restore container"
docker stop etcd-restore
# Move the restored files from the host-side /opt/rke/etcd into the real data
# directory, then remove the staging directory. Neither step is checked.
recho "Moving restored etcd directory in place"
rootcmd "mv /opt/rke/etcd/* ${ETCD_DIR}/"
rootcmd "rm -fr /opt/rke/etcd/"
recho "Deleting etcd-restore container"
docker rm -f etcd-restore
fi
# Rebuild etcd in two passes from an edited copy of the captured command:
# first as a temporary "etcd-reinit" container with --force-new-cluster, then
# as the final "etcd" container without that flag.
#INITIALIZE NEW RUNLIKE
NEW_RUNLIKE=${RUNLIKE}
#ADD --force-new-cluster
# Inserts " --force-new-cluster" after the first space-delimited word that
# follows the last "${ETCD_VERSION} " in the command (presumably the etcd
# binary path that comes right after the image reference - verify).
NEW_RUNLIKE=$(sed 's,^\(.*'${ETCD_VERSION}' \)\([^ ]*\)\(.*\),\1\2 --force-new-cluster\3,g' <<<${NEW_RUNLIKE})
#REMOVE OTHER ETCD NODES FROM --initial-cluster
# The original --initial-cluster value is replaced by the single-member
# INITIAL_CLUSTER entry; a backtick is the sed delimiter for these edits.
ORIG_INITIAL_CLUSTER=$(sed 's,^.*initial-cluster=\([^ ]*\).*,\1,g' <<<${RUNLIKE})
NEW_RUNLIKE=$(sed 's`'"${ORIG_INITIAL_CLUSTER}"'`'"${INITIAL_CLUSTER}"'`g' <<<${NEW_RUNLIKE})
#CHANGE NAME TO etcd-reinit
# NOTE(review): the g flag rewrites every "--name=etcd" in the text, so an etcd
# member argument such as --name=etcd-<host> would be renamed as well for the
# reinit pass - confirm this is intended.
NEW_RUNLIKE=$(sed 's`'--name=etcd'`'--name=etcd-reinit'`g' <<<${NEW_RUNLIKE})
#REINIT ETCD
recho "Running etcd-reinit with the following command:"
echo ${NEW_RUNLIKE}
eval ${NEW_RUNLIKE}
# Must directly follow the eval: checkpipecmd reads its exit status from PIPESTATUS.
checkpipecmd "Failed to run etcd-reinit!"
grecho "Sleeping for 10 seconds so etcd can do reinit things"
sleep 10
#echo ${green}Tailing last 40 lines of etcd-reinit${reset}
#docker logs etcd-reinit --tail 40
#STOP AND REMOVE etcd-reinit
recho "Stopping and removing etcd-reinit"
docker stop etcd-reinit
docker rm -f etcd-reinit
#CHANGE NAME BACK TO etcd
# Reverses the earlier rename edit (same pattern, same g flag).
NEW_RUNLIKE=$(sed 's`'--name=etcd-reinit'`'--name=etcd'`g' <<<${NEW_RUNLIKE})
#REMOVE --force-new-cluster
NEW_RUNLIKE=$(sed 's`--force-new-cluster ``g' <<<${NEW_RUNLIKE})
#FINALLY RUN NEW SHINY RESTORED ETCD
recho "Launching shiny new etcd"
echo ${NEW_RUNLIKE}
eval ${NEW_RUNLIKE}
checkpipecmd "Failed to launch shiny new etcd!"
# Give the new etcd container a moment, then bounce the components that talk to it.
grecho "Script sleeping for 5 seconds"
sleep 5
printf '\n'
recho "Restarting kubelet and kube-apiserver if they exist"
docker restart kubelet kube-apiserver

# A snapshot was only staged in ${CERT_DIR} for a snapshot restore, so the
# staged copy is only removed in that mode.
[[ "${FORCE_NEW_CLUSTER}" == "yes" ]] || {
  recho "Removing ${CERT_DIR}/snapshot.db"
  rootcmd "rm -f ${CERT_DIR}/snapshot.db"
}

# Re-enable the restart policy that was switched off before the old container was stopped.
recho "Setting etcd restart policy to always restart"
docker update --restart=always etcd
# Final smoke test: print the member list of the running etcd container.
# Images whose name contains v3.0, v3.1, v3.2 or v3.3 may need an explicit
# --endpoints value; every other image is called without one.
grecho "Running an 'etcdctl member list' as a final test."
case "${ETCD_IMAGE}" in
  *v3.0* | *v3.1* | *v3.2* | *v3.3*)
    # Ask netstat inside the container which address is listening on port 2379.
    REQUIRE_ENDPOINT=$(docker exec etcd netstat -lpna | grep ':2379' | grep tcp | grep LISTEN | tr -s ' ' | cut -d' ' -f4)
    if [[ ${REQUIRE_ENDPOINT} == *:::* ]]; then
      grecho "etcd is listening on ${REQUIRE_ENDPOINT}, no need to pass --endpoints"
      docker exec etcd etcdctl member list
    else
      grecho "etcd is only listening on ${REQUIRE_ENDPOINT}, we need to pass --endpoints"
      docker exec etcd etcdctl --endpoints ${REQUIRE_ENDPOINT} member list
    fi
    ;;
  *)
    grecho "We're running etcd 3.4 or newer, automatically omitting endpoints."
    docker exec etcd etcdctl member list
    ;;
esac
grecho "Single restore has completed, please be sure to restart kubelet and kube-apiserver on other nodes."
grecho "If you are planning to rejoin another node to this etcd cluster you'll want to use etcd-join.sh on that node"