forked from tin6150/bofhbot
-
Notifications
You must be signed in to change notification settings - Fork 2
/
bot_analyzer.py
48 lines (41 loc) · 1.71 KB
/
bot_analyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
"""
P(ssh = down | state = ok) = 0
P(ssh = up | state = ok) = 1
"""
""" Determine the state based on the indicators """
def analyze(status, use_reason=True):
    """Return the state label for a node given its collected indicators.

    Args:
        status: Mapping of indicator names to values; it is passed through
            unchanged to the selected analyzer.
        use_reason: When true (the default) delegate to ``analyze_reason``;
            otherwise delegate to ``analyze_no_reason``.

    Returns:
        Whatever the selected analyzer returns (a state label string).
    """
    if use_reason:
        return analyze_reason(status)
    return analyze_no_reason(status)
def analyze_no_reason(status):
    """Classify a node from its indicators without requiring a reason.

    Keys read from ``status``:
        ``SSH`` (required): truthy when SSH is working.
        ``POWER`` (optional): compared against ``'on'`` / ``'off'``; only
            consulted when SSH is not working.
        ``REASON`` (optional): only the value ``'Not responding'`` is
            significant here; a missing key is treated as "no match".
        ``USER_PROCESSES``: read only when the reason is
            ``'Not responding'``; any non-empty value counts as "alive".
        ``OVERALL``: read only when SSH works and the reason did not match.

    Returns:
        One of ``'NODE_KILLED_IPMI_ON'``, ``'NODE_KILLED_IPMI_OFF'``,
        ``'SLURM_FAILED_USER_PROCESSES_ALIVE'``,
        ``'SLURM_FAILED_NO_USER_PROCESSES'``, ``'NODE_WORKING'`` or
        ``'UNKNOWN'``.

    Raises:
        KeyError: if ``SSH`` is missing, or if ``USER_PROCESSES`` /
            ``OVERALL`` is missing on the path that needs it.
    """
    if not status['SSH']:
        # SSH is down: the power indicator (when present) is all we have.
        power = status.get('POWER')
        if power == 'on':
            return 'NODE_KILLED_IPMI_ON'
        if power == 'off':
            return 'NODE_KILLED_IPMI_OFF'
        return 'UNKNOWN'

    # All of these are when SSH is working
    if status.get('REASON') == 'Not responding':
        # Truthiness rather than len() so an empty or None value both
        # count as "no user processes".
        if status['USER_PROCESSES']:
            return 'SLURM_FAILED_USER_PROCESSES_ALIVE'
        return 'SLURM_FAILED_NO_USER_PROCESSES'
    if status['OVERALL']:
        return 'NODE_WORKING'
    return 'UNKNOWN'
def analyze_reason(status):
    """Classify a node from its indicators, taking the reason into account.

    Keys read from ``status``:
        ``SSH`` (required): truthy when SSH is working.
        ``REASON`` (required): compared against ``'Not responding'``,
            ``'Node unexpectedly rebooted'`` and
            ``'batch job complete failure'``.
        ``POWER`` (optional): compared against ``'on'`` / ``'off'``; only
            consulted when SSH is down and the reason is
            ``'Not responding'``.
        ``USER_PROCESSES``: read only when SSH works and the reason is
            ``'Not responding'``; any non-empty value counts as "alive".
        ``OVERALL``: read only when SSH works and the reason is one of the
            two reasons that can map to ``'NODE_WORKING'``.

    Returns:
        One of ``'NODE_KILLED_IPMI_ON'``, ``'NODE_KILLED_IPMI_OFF'``,
        ``'SLURM_FAILED_USER_PROCESSES_ALIVE'``,
        ``'SLURM_FAILED_NO_USER_PROCESSES'``, ``'NODE_WORKING'`` or
        ``'UNKNOWN'``.

    Raises:
        KeyError: if ``SSH`` or ``REASON`` is missing, or if
            ``USER_PROCESSES`` / ``OVERALL`` is missing on the path that
            needs it.
    """
    ssh_up = status['SSH']
    reason = status['REASON']

    if not ssh_up:
        # With SSH down, the power indicator is only trusted when the
        # reason is "Not responding".
        if reason == 'Not responding':
            power = status.get('POWER')
            if power == 'on':
                return 'NODE_KILLED_IPMI_ON'
            if power == 'off':
                return 'NODE_KILLED_IPMI_OFF'
        return 'UNKNOWN'

    # All of these are when SSH is working
    if reason == 'Not responding':
        if status['USER_PROCESSES']:
            return 'SLURM_FAILED_USER_PROCESSES_ALIVE'
        return 'SLURM_FAILED_NO_USER_PROCESSES'

    # Both of these reasons map to NODE_WORKING when OVERALL is truthy.
    working_reasons = ('Node unexpectedly rebooted', 'batch job complete failure')
    if reason in working_reasons and status['OVERALL']:
        return 'NODE_WORKING'
    return 'UNKNOWN'