Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/gpu system monitor #619

Merged
merged 11 commits into from
Nov 14, 2024
2 changes: 2 additions & 0 deletions bitbots_misc/system_monitor/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@ system_monitor:

# These settings are quick_switches to completely disable certain parts of statistic collection
do_cpu: true
do_gpu: true
do_memory: true
do_network: false

# these are the threshold values at which we start going into a warn state
cpu_load_percentage: 80.0
gpu_load_percentage: 95.0
memory_load_percentage: 80.0
network_rate_received_errors: 10.0
network_rate_send_errors: 10.0
5 changes: 3 additions & 2 deletions bitbots_misc/system_monitor/system_monitor/cpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ def collect_all():
def _get_cpu_stats():
"""
read and parse /proc/stat
:returns timings which contains accumulative busy and total cpu time

:returns: timings which contains accumulative busy and total cpu time
"""
timings = {}
with open("/proc/stat") as file_obj:
Expand All @@ -51,7 +52,7 @@ def _get_cpu_stats():

def _calculate_usage(cpu_num, total, busy):
"""
calculate usage percentage based on busy/total time
calculate usage percentage based on busy/total time(load, vram_used, vram_total, temperature)
HR05 marked this conversation as resolved.
Show resolved Hide resolved
"""
diff_total = total - _prev_total[cpu_num]
diff_busy = busy - _prev_busy[cpu_num]
Expand Down
19 changes: 19 additions & 0 deletions bitbots_misc/system_monitor/system_monitor/gpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import pyamdgpuinfo


def collect_all():
    """
    Query pyamdgpuinfo for the metrics of the first GPU (index 0).

    If pyamdgpuinfo reports that no GPU was detected, nothing is queried
    and every value in the returned tuple is 0.

    :return: (load, vram_used, vram_total, temperature)
    """
    gpu_count = pyamdgpuinfo.detect_gpus()
    if gpu_count == 0:
        # No device to query, so hand back an all-zero sample.
        return (0, 0, 0, 0)

    # Only the first detected device is inspected.
    device = pyamdgpuinfo.get_gpu(0)

    current_load = device.query_load()
    total_vram = device.memory_info["vram_size"]
    used_vram = device.query_vram_usage()
    current_temperature = device.query_temperature()

    return (current_load, used_vram, total_vram, current_temperature)
26 changes: 23 additions & 3 deletions bitbots_misc/system_monitor/system_monitor/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from rclpy.node import Node

from bitbots_msgs.msg import Workload as WorkloadMsg
from system_monitor import cpus, memory, network_interfaces
from system_monitor import cpus, gpu, memory, network_interfaces


def main():
Expand All @@ -23,31 +23,39 @@ def main():
# start all names with "SYSTEM" for diagnostic analyzer
diag_cpu.name = "SYSTEMCPU"
diag_cpu.hardware_id = "CPU"
diag_gpu = DiagnosticStatus()
diag_gpu.name = "SYSTEMGPU"
diag_gpu.hardware_id = "GPU"
diag_mem = DiagnosticStatus()
diag_mem.name = "SYSTEMMemory"
diag_mem.hardware_id = "Memory"

node.declare_parameter("update_frequency", 10.0)
node.declare_parameter("do_memory", True)
node.declare_parameter("do_cpu", True)
node.declare_parameter("do_gpu", True)
node.declare_parameter("do_memory", True)
node.declare_parameter("do_network", True)
node.declare_parameter("cpu_load_percentage", 80.0)
node.declare_parameter("gpu_load_percentage", 80.0)
Flova marked this conversation as resolved.
Show resolved Hide resolved
node.declare_parameter("memory_load_percentage", 80.0)
node.declare_parameter("network_rate_received_errors", 10.0)
node.declare_parameter("network_rate_send_errors", 10.0)

rate = node.get_parameter("update_frequency").get_parameter_value().double_value
do_memory = node.get_parameter("do_memory").get_parameter_value().bool_value
do_cpu = node.get_parameter("do_cpu").get_parameter_value().bool_value
do_gpu = node.get_parameter("do_gpu").get_parameter_value().bool_value
do_memory = node.get_parameter("do_memory").get_parameter_value().bool_value
do_network = node.get_parameter("do_network").get_parameter_value().bool_value
cpu_load_percentage = node.get_parameter("cpu_load_percentage").get_parameter_value().double_value
gpu_load_percentage = node.get_parameter("gpu_load_percentage").get_parameter_value().double_value
memory_load_percentage = node.get_parameter("memory_load_percentage").get_parameter_value().double_value
network_rate_received_errors = node.get_parameter("network_rate_received_errors").get_parameter_value().double_value
network_rate_send_errors = node.get_parameter("network_rate_send_errors").get_parameter_value().double_value

while rclpy.ok():
last_send_time = time.time()
running_processes, cpu_usages, overall_usage_percentage = cpus.collect_all() if do_cpu else (-1, [], 0)
gpu_load, gpu_vram_used, gpu_vram_total, gpu_temperature = gpu.collect_all() if do_gpu else (0, 0, 0, 0)
memory_available, memory_used, memory_total = memory.collect_all() if do_memory else (-1, -1, -1)
interfaces = network_interfaces.collect_all(node.get_clock()) if do_network else []

Expand All @@ -56,6 +64,10 @@ def main():
cpus=cpu_usages,
running_processes=running_processes,
cpu_usage_overall=overall_usage_percentage,
gpu_load=gpu_load,
gpu_vram_used=gpu_vram_used,
gpu_vram_total=gpu_vram_total,
gpu_temperature=gpu_temperature,
memory_available=memory_available,
memory_used=memory_used,
memory_total=memory_total,
Expand All @@ -73,6 +85,14 @@ def main():
diag_cpu.level = DiagnosticStatus.OK
diag_array.status.append(diag_cpu)

gpu_usage = gpu_load * 100
diag_gpu.message = str(gpu_usage) + "%"
if gpu_usage >= gpu_load_percentage:
diag_gpu.level = DiagnosticStatus.WARN
else:
diag_gpu.level = DiagnosticStatus.OK
diag_array.status.append(diag_gpu)

memory_usage = round((memory_used / memory_total) * 100, 2)
diag_mem.message = str(memory_usage) + "%"
if memory_usage >= memory_load_percentage:
Expand Down
5 changes: 5 additions & 0 deletions bitbots_msgs/msg/Workload.msg
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ Cpu[] cpus
int32 running_processes
float32 cpu_usage_overall

float32 gpu_load
int64 gpu_vram_used
int64 gpu_vram_total
float32 gpu_temperature

int64 memory_available
int64 memory_used
int64 memory_total
Expand Down
1 change: 1 addition & 0 deletions requirements/robot.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ mycroft-mimic3-tts
protobuf==3.20.3 # Required for mycroft-mimic3-tts, but we want to ensure that the version is compatible with binaries built using the system version, and it should also be compatible with all the Python dependencies
pyttsx3
playsound
pyamdgpuinfo
Loading