Merge branch 'fix/autoheal' into 'master'
Restart SNMP collector if BrokenProcessPool exception is caught

See merge request grafolean/grafolean-collector-snmp!10
grafolean committed Oct 9, 2019
2 parents a288369 + 8a2e75a commit b9e2536
Showing 4 changed files with 52 additions and 3 deletions.
1 change: 1 addition & 0 deletions Dockerfile
@@ -40,4 +40,5 @@ RUN \
echo "alias l='ls -altr'" >> /root/.bashrc
COPY --from=build-backend /snmpcollector/ /snmpcollector/
WORKDIR /snmpcollector
HEALTHCHECK --interval=10s --retries=1 CMD /bin/bash -c "[ ! -f /tmp/fail_health_check ]"
CMD ["python", "-m", "snmpcollector"]
26 changes: 26 additions & 0 deletions collector.py
@@ -154,6 +154,32 @@ def run_job(job, jobstore_alias, run_times, logger_name):

return events

    def _run_job_error(self, job_id, exc, traceback=None):
        """
            > Called by the executor with the exception if there is an error calling `run_job`.
            Sometimes we start getting this traceback, after which the collector no longer works:
            -----
            2019-10-04 19:45:38 | ERR | Error submitting job "SNMPCollector.do_snmp (trigger: <collector.MultipleIntervalsTrigger object at 0x7fd866b9aee8>, next run at: 2019-10-04 19:45:38 UTC)" to executor "iaexecutor"
            Traceback (most recent call last):
              File "/usr/local/lib/python3.6/site-packages/apscheduler/schedulers/base.py", line 974, in _process_jobs
                executor.submit_job(job, run_times)
              File "/usr/local/lib/python3.6/site-packages/apscheduler/executors/base.py", line 71, in submit_job
                self._do_submit_job(job, run_times)
              File "./collector.py", line 92, in _do_submit_job
              File "/usr/local/lib/python3.6/concurrent/futures/process.py", line 452, in submit
                raise BrokenProcessPool('A child process terminated '
            concurrent.futures.process.BrokenProcessPool: A child process terminated abruptly, the process pool is not usable anymore
            -----
            The idea is that we remember that we are in this state, so that we can make the Docker health check fail.
        """
        super()._run_job_error(job_id, exc, traceback)

        if 'BrokenProcessPool' in exc.__class__.__name__:
            # this file is checked by the Docker health check; if it exists, the container should be restarted:
            open('/tmp/fail_health_check', 'a').close()


class Collector(object):
    __slots__ = 'backend_url', 'bot_token', 'scheduler', 'known_jobs', 'jobs_refresh_interval'
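To make the mechanism concrete, here is a minimal, self-contained sketch (not the collector's actual code) of the same pattern outside apscheduler: work is submitted to a concurrent.futures process pool, and a BrokenProcessPool is translated into the fail-health-check marker that the Dockerfile HEALTHCHECK tests.

import os
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures.process import BrokenProcessPool

FAIL_MARKER = '/tmp/fail_health_check'  # same path the HEALTHCHECK looks for

def flag_unhealthy():
    # Touch the marker file; the health check then fails and autoheal restarts the container.
    open(FAIL_MARKER, 'a').close()

def submit_or_flag(pool, fn, *args):
    try:
        return pool.submit(fn, *args)
    except BrokenProcessPool:
        # The pool is unusable from now on, so mark the container unhealthy and re-raise.
        flag_unhealthy()
        raise

if __name__ == '__main__':
    with ProcessPoolExecutor(max_workers=2) as pool:
        submit_or_flag(pool, print, 'hello').result()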
16 changes: 13 additions & 3 deletions docker-compose.dev.yml
@@ -32,11 +32,21 @@ services:
# so that Docker networking is bypassed.
network_mode: "host"


redis:
image: redis:5-alpine
container_name: grafolean-collector-snmp-redis
ports:
- "127.0.0.1:6379:6379"
# We advise not to use `network_mode: "host"` in production, because it would expose Redis to host network
# (even if access is limited to 127.0.0.1).
- "6379:6379"
# We advise not to use `network_mode: "host"` in production, because it would expose Redis to the network.
network_mode: "host"


  autoheal:
    image: willfarrell/autoheal
    container_name: autoheal-snmp
    environment:
      - AUTOHEAL_CONTAINER_LABEL=all
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    restart: always
12 changes: 12 additions & 0 deletions docker-compose.yml
@@ -26,7 +26,19 @@ services:
- REDIS_HOST=redis
restart: always


  redis:
    image: redis:5-alpine
    container_name: grafolean-collector-snmp-redis
    restart: always


  autoheal:
    # This container automatically restarts any container that fails its health check. Not a bullet-proof solution, but better than nothing.
    image: willfarrell/autoheal
    container_name: autoheal-snmp
    environment:
      - AUTOHEAL_CONTAINER_LABEL=all
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    restart: always
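The restart loop closes outside the collector itself: per the comment above, the autoheal container restarts any container that fails its health check. A hypothetical helper (not part of this repository; it assumes the Docker CLI is on PATH and that the collector container is named grafolean-collector-snmp) to observe the health status autoheal reacts to:

import subprocess

def container_health(name: str) -> str:
    # `docker inspect` exposes the HEALTHCHECK result as "starting", "healthy" or "unhealthy".
    out = subprocess.check_output(
        ['docker', 'inspect', '--format', '{{.State.Health.Status}}', name]
    )
    return out.decode().strip()

if __name__ == '__main__':
    print(container_health('grafolean-collector-snmp'))  # hypothetical container name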
