-
Notifications
You must be signed in to change notification settings - Fork 0
/
healthcheck.sh
executable file
·39 lines (32 loc) · 1.32 KB
/
healthcheck.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/usr/bin/env bash
#
# Container healthcheck.
#
# Environment:
#   SERVICE_MODE  "http" selects the HTTP probe; any other value (or unset)
#                 selects the Celery worker probe.
#   SERVICE_NAME  Prefix of the worker destination given to
#                 `celery inspect ping -d` (<SERVICE_NAME>_worker@<HOSTNAME>).
#   HOSTNAME      Host part of that worker destination.
#
# Exit status: 0 when the service is considered healthy, non-zero otherwise.

# -e: abort on the first unhandled command failure.
# -x: trace every command so the log shows exactly which probe failed.
set -ex

# Keep the output of the latest healthcheck run in a file (persist this file if
# you need to analyse healthcheck failures, as the container is ephemeral).
# Without this redirection the healthcheck output is only available through
# `docker inspect <container_id>`.
exec > /tmp/healthcheck.log 2>&1

if [[ "${SERVICE_MODE:-}" == "http" ]]; then
  # HTTP mode healthcheck.
  curl --fail http://localhost:80/healthcheck || exit 1
else
  # Update last alive.
  python -c "from celery_app.register import register; register(False)"

  # Check if the Celery worker process is running.
  # pgrep exits non-zero when nothing matches; `|| true` keeps `set -e` from
  # aborting here so the explicit failure message below is actually written.
  worker_pids=$(pgrep -f "celeryapp worker" || true)
  if [[ -z "${worker_pids}" ]]; then
    echo "HealthCheck FAIL: Celery worker process not running"
    exit 1
  fi

  # Check if a GPU is in use: count the GPUs reporting a utilization above 0.
  # Only lines that are a positive integer are counted, so any non-numeric line
  # on nvidia-smi's stdout (for example diagnostic text) is not mistaken for
  # GPU load. grep -c exits non-zero when the count is 0, hence `|| true`.
  busy_gpus=$(
    nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits \
      | grep -Ec '^[[:space:]]*[1-9][0-9]*[[:space:]]*$' || true
  )
  if (( ${busy_gpus:-0} > 0 )); then
    echo "HealthCheck PASS: GPU is being utilized, marking service as healthy."
    exit 0
  fi

  # The GPU is idle: the worker must answer a ping to be considered healthy.
  if ! celery --app=celery_app.celeryapp inspect ping \
    -d "${SERVICE_NAME}_worker@${HOSTNAME}" --timeout=20; then
    echo "HealthCheck FAIL: Celery worker not responding in time and GPU is not being utilized"
    exit 1
  fi

  echo "HealthCheck PASS: Celery worker is responsive but idle, marking service as healthy."
  exit 0
fi