-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathavailable_resources.py
117 lines (98 loc) · 4.59 KB
/
available_resources.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import subprocess
import re
import itertools
import pandas as pd
def get_node_names(partition_name):
    """Return the names of all nodes that ``sinfo`` lists for a partition.

    Runs ``sinfo -p <partition_name>``, skips the header row and blank lines,
    and expands the last whitespace-separated field of every remaining row
    with ``expand_nodelist``.

    Args:
        partition_name: Name of the partition passed to ``sinfo -p``.

    Returns:
        A sorted list of unique node names. An empty list is returned when
        ``sinfo`` cannot be started or exits with a non-zero status.
    """
    sinfo_cmd = ["sinfo", "-p", partition_name]
    try:
        result = subprocess.run(
            sinfo_cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except OSError as exc:
        # sinfo is missing or not executable: report it like any other
        # failed call instead of crashing the whole report.
        print("Error fetching nodes:", exc)
        return []
    if result.returncode != 0:
        print("Error fetching nodes:", result.stderr)
        return []

    node_names = set()
    for line in result.stdout.splitlines():
        if line.startswith("PARTITION") or not line.strip():
            continue
        parts = line.split()
        # Only rows with at least five fields are considered; the node list
        # (e.g. udc-an38-[1,9,13,17,25,29,33]) is taken from the last field.
        if len(parts) >= 5:
            node_names.update(expand_nodelist(parts[-1]))
    # Sorting makes the result deterministic; set order is arbitrary.
    return sorted(node_names)
def _split_outside_brackets(nodelist):
    """Split *nodelist* on commas that are not enclosed in square brackets."""
    entries = []
    current = []
    depth = 0
    for char in nodelist:
        if char == ',' and depth == 0:
            entries.append(''.join(current))
            current = []
            continue
        if char == '[':
            depth += 1
        elif char == ']' and depth > 0:
            depth -= 1
        current.append(char)
    entries.append(''.join(current))
    return [entry for entry in entries if entry]


def _expand_id_spec(spec):
    """Expand bracket contents such as ``1,3-5`` into id strings, or return None if malformed."""
    if not re.fullmatch(r'\d+(?:-\d+)?(?:,\d+(?:-\d+)?)*', spec):
        return None
    ids = []
    for part in spec.split(','):
        if '-' in part:
            low, high = part.split('-')
            # Keep the zero padding of the lower bound: 08-10 -> 08, 09, 10.
            width = len(low)
            ids.extend(
                f"{number:0{width}d}"
                for number in range(int(low), int(high) + 1)
            )
        else:
            ids.append(part)
    return ids


def expand_nodelist(nodelist):
    """Expand a compressed node list expression into individual node names.

    Handles bracketed id lists and ranges (``udc-an38-[1,9,13]``,
    ``node[1-3]``), comma-separated lists mixing bracketed and plain names
    (``udc-an38-[1,9],udc-aw34-1``), text following a bracket group, and
    several bracket groups in one name (all combinations are produced).
    Zero padding inside ranges is preserved.

    Args:
        nodelist: The node list expression as a string.

    Returns:
        A list of node names in the order they appear in the expression.
        Entries whose brackets do not contain a valid numeric id list are
        returned unchanged. An empty string yields an empty list.
    """
    expanded_nodes = []
    for entry in _split_outside_brackets(nodelist):
        # re.split with a capture group alternates literal text (even
        # indexes) with bracket contents (odd indexes).
        tokens = re.split(r'\[([^\[\]]*)\]', entry)
        choices = []
        for index, token in enumerate(tokens):
            if index % 2 == 0:
                choices.append([token])
                continue
            ids = _expand_id_spec(token)
            if ids is None:
                choices = None
                break
            choices.append(ids)
        if choices is None:
            # Not a numeric bracket expression: pass the entry through as-is.
            expanded_nodes.append(entry)
        else:
            expanded_nodes.extend(
                ''.join(combo) for combo in itertools.product(*choices)
            )
    return expanded_nodes
def _search_int(pattern, text):
    """Return the first capture group of *pattern* in *text* as an int, or None."""
    match = re.search(pattern, text)
    return int(match.group(1)) if match else None


def _parse_node_resources(node_name, scontrol_output):
    """Build the resource summary dict from ``scontrol show node`` text."""
    cpus_total = _search_int(r'CPUTot=(\d+)', scontrol_output)
    cpus_alloc = _search_int(r'CPUAlloc=(\d+)', scontrol_output)
    cpus_available = None
    if cpus_total is not None and cpus_alloc is not None:
        cpus_available = cpus_total - cpus_alloc

    memory_total = _search_int(r'RealMemory=(\d+)', scontrol_output)
    memory_alloc = _search_int(r'AllocMem=(\d+)', scontrol_output)
    memory_available_gb = None
    if memory_total is not None and memory_alloc is not None:
        memory_available = memory_total - memory_alloc
        # The values are treated as MB: flooring MB / 102.4 gives tenths of
        # a GB, so the result is GB rounded down to one decimal place.
        memory_available_gb = (memory_available // 102.4) / 10

    # Read the GPU count from the Gres field instead of assuming 8 per node.
    # Entries look like gpu:<type>:<count> or gpu:<count>.
    gpus_total = 0
    gres_match = re.search(r'\bGres=(\S+)', scontrol_output)
    if gres_match:
        gpu_counts = re.findall(
            r'(?:^|,)gpu:(?:[\w.\-]+:)?(\d+)', gres_match.group(1)
        )
        gpus_total = sum(int(count) for count in gpu_counts)
    gpus_alloc = _search_int(r'AllocTRES=.*gres/gpu=(\d+)', scontrol_output) or 0
    gpus_available = gpus_total - gpus_alloc if gpus_total else 0

    features_match = re.search(r'AvailableFeatures=([\w,]+)', scontrol_output)
    gpu_type = features_match.group(1).split(',') if features_match else []
    if len(gpu_type) > 1:
        # With several features, keep only those containing "gb".
        gpu_type = [tp for tp in gpu_type if "gb" in tp]
    gpu_type_str = ", ".join(gpu_type) if gpu_type else None

    return {
        'Node': node_name,
        'Available CPUs': cpus_available,
        'Available Memory (GB)': memory_available_gb,
        'Available GPUs': gpus_available,
        'GPU Type': gpu_type_str,
    }


def get_node_resources(node_name):
    """Query ``scontrol`` for one node and summarise its free resources.

    Runs ``scontrol show node <node_name>`` and derives the free CPU, memory
    and GPU figures from the allocated/total values in its output.

    Args:
        node_name: Name of the node passed to ``scontrol show node``.

    Returns:
        A dict with the keys ``'Node'``, ``'Available CPUs'``,
        ``'Available Memory (GB)'``, ``'Available GPUs'`` and ``'GPU Type'``.
        CPU and memory entries are ``None`` when the corresponding fields are
        missing from the output; ``'GPU Type'`` is ``None`` when no feature
        is found. Returns ``None`` when ``scontrol`` cannot be started or
        exits with a non-zero status.
    """
    scontrol_cmd = ["scontrol", "show", "node", node_name]
    try:
        result = subprocess.run(
            scontrol_cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except OSError as exc:
        # scontrol is missing or not executable: treat it as a failed query.
        print(f"Error fetching node {node_name} info:", exc)
        return None
    if result.returncode != 0:
        print(f"Error fetching node {node_name} info:", result.stderr)
        return None
    return _parse_node_resources(node_name, result.stdout)
def main(partitions=None):
    """Print the nodes that currently have free GPUs, CPUs and memory.

    For every partition, the node names are collected with
    ``get_node_names`` and each node is queried once with
    ``get_node_resources``. Nodes with more than zero available GPUs, more
    than zero available CPUs and at least 6 GB of available memory are
    printed as a pandas DataFrame.

    Args:
        partitions: Optional iterable of partition names to check. When
            omitted, the built-in list of GPU partitions is used.
    """
    if partitions is None:
        partitions = [
            "gpu-a6000",
            "gpu-a100-80",
            "gpu-a100-40",
            "gpu-a40",
            "gpu-v100",
            "interactive",
        ]

    all_resources = []
    # A node can belong to several partitions; query and list it only once.
    seen_nodes = set()
    for partition_name in partitions:
        print(f"Checking partition: {partition_name}")
        node_names = get_node_names(partition_name)
        if not node_names:
            print("No nodes found for partition:", partition_name)
            continue
        for node in node_names:
            if node in seen_nodes:
                continue
            seen_nodes.add(node)
            resources = get_node_resources(node)
            if resources:
                all_resources.append(resources)

    if all_resources:
        df = pd.DataFrame(all_resources)
        has_capacity = (
            (df['Available GPUs'] > 0)
            & (df['Available CPUs'] > 0)
            & (df['Available Memory (GB)'] >= 6)
        )
        print(df[has_capacity])
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()