From 0134e885f8a5645aa081ac3a71818a79df2014ea Mon Sep 17 00:00:00 2001 From: Tom Stocker Date: Wed, 4 Sep 2024 13:43:33 +0200 Subject: [PATCH] added template_nvidia-smi_integration_v7_vGPU.yaml, added trigger desc, replaced uuids, prettyfied and updated --- .../7.0/README.md | 68 ++++++ .../template_nvidia-smi_integration_v7.yaml | 203 ++++++++++++++++++ ...mplate_nvidia-smi_integration_v7_vGPU.yaml | 106 +++++++++ 3 files changed, 377 insertions(+) create mode 100644 Server_Hardware/Other/template_nvidia-smi_integration/7.0/README.md create mode 100644 Server_Hardware/Other/template_nvidia-smi_integration/7.0/template_nvidia-smi_integration_v7.yaml create mode 100644 Server_Hardware/Other/template_nvidia-smi_integration/7.0/template_nvidia-smi_integration_v7_vGPU.yaml diff --git a/Server_Hardware/Other/template_nvidia-smi_integration/7.0/README.md b/Server_Hardware/Other/template_nvidia-smi_integration/7.0/README.md new file mode 100644 index 000000000..04c8406ce --- /dev/null +++ b/Server_Hardware/Other/template_nvidia-smi_integration/7.0/README.md @@ -0,0 +1,68 @@ +# NVidia Sensors + +## Overview + +This template integrates NVidia SMI for a single graphics card with Zabbix. + +The template adds monitoring of: + +* GPU Utilisation +* GPU Power Consumption +* GPU Memory (Used, Free, Total) +* GPU Temperature +* GPU Fan Speed + +The following agent parameters can be used to add the metrics into Zabbix. + +UserParameter=gpu.temp,nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader,nounits -i 0 +UserParameter=gpu.memtotal,nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits -i 0 +UserParameter=gpu.used,nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0 +UserParameter=gpu.free,nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits -i 0 +UserParameter=gpu.fanspeed,nvidia-smi --query-gpu=fan.speed --format=csv,noheader,nounits -i 0 +UserParameter=gpu.utilisation,nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits -i 0 +UserParameter=gpu.power,nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits -i 0 + +## Author + +Richard Kavanagh +Updated & prettyfied by Tom Stocker, tom.stocker@id.ethz.ch + +## Macros used + +There are no macros links in this template. + +## Template links + +There are no template links in this template. + +## Discovery rules + +There are no discovery rules in this template. + +## Items collected + +|Name|Description|Type|Key and additional info| +|----|-----------|----|----| +|GPU Power|

-

|`Zabbix agent`|gpu.power

Update: 30

| +|GPU Free Memory|

-

|`Zabbix agent`|gpu.free

Update: 30

| +|GPU Utilisation|

-

|`Zabbix agent`|gpu.utilisation

Update: 30

| +|GPU Total Memory|

-

|`Zabbix agent`|gpu.memtotal

Update: 30

| +|GPU Temperature|

-

|`Zabbix agent`|gpu.temp

Update: 30

| +|GPU Used Memory|

-

|`Zabbix agent`|gpu.used

Update: 30

| +|GPU Fan Speed|

-

|`Zabbix agent`|gpu.fanspeed

Update: 30

| + + +## Triggers + +GPU Temperature over 95c {HOSTNAME} +last(/NVidia Sensors/gpu.temp,#2)>95 + +## Notes about GRID driver for virtual GPUs, or directly import template_nvidia-smi_integration_v7_vGPU.yaml where those are already removed + +You may want to disable those items. as you won't see any output: + +|Name|Description|Type|Key and additional info| +|----|-----------|----|----| +|GPU Power|

-

|`Zabbix agent`|gpu.power

Update: 30

| +|GPU Temperature|

-

|`Zabbix agent`|gpu.temp

Update: 30

| +|GPU Fan Speed|

-

|`Zabbix agent`|gpu.fanspeed

Update: 30

| diff --git a/Server_Hardware/Other/template_nvidia-smi_integration/7.0/template_nvidia-smi_integration_v7.yaml b/Server_Hardware/Other/template_nvidia-smi_integration/7.0/template_nvidia-smi_integration_v7.yaml new file mode 100644 index 000000000..db62f7054 --- /dev/null +++ b/Server_Hardware/Other/template_nvidia-smi_integration/7.0/template_nvidia-smi_integration_v7.yaml @@ -0,0 +1,203 @@ +zabbix_export: + version: '7.0' + template_groups: + - uuid: 5bb2de64036c44f793c9f82c25ea9fdc + name: Templates + templates: + - + uuid: 499019c3dfce41dfa20c6052b49e6eea + template: 'NVidia Sensors' + name: 'NVidia Sensors' + description: | + ## Overview + + This template integrates NVidia SMI for a single graphics card with Zabbix. + + The template adds monitoring of: + + * GPU Utilisation + * GPU Power Consumption + * GPU Memory (Used, Free, Total) + * GPU Temperature + * GPU Fan Speed + + The following agent parameters can be used to add the metrics into Zabbix. + + UserParameter=gpu.temp,nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader,nounits -i 0 + UserParameter=gpu.memtotal,nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits -i 0 + UserParameter=gpu.used,nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0 + UserParameter=gpu.free,nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits -i 0 + UserParameter=gpu.fanspeed,nvidia-smi --query-gpu=fan.speed --format=csv,noheader,nounits -i 0 + UserParameter=gpu.utilisation,nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits -i 0 + UserParameter=gpu.power,nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits -i 0 + + ## Author + + Richard Kavanagh + Updated & prettyfied by Tom Stocker, tom.stocker@id.ethz.ch + + groups: + - + name: Templates + items: + - + uuid: 847765be9f1b45358653eb1561e69876 + name: 'GPU Fan Speed' + key: gpu.fanspeed + delay: '30' + value_type: FLOAT + units: '%' + tags: + - + tag: Application + value: Nvidia + - + uuid: f512bb1996184861a37119aa81c260aa + name: 'GPU Free Memory' + key: gpu.free + delay: '30' + value_type: FLOAT + units: B + preprocessing: + - + type: MULTIPLIER + parameters: + - '1048576' + tags: + - + tag: Application + value: Nvidia + - + uuid: 1d82cd528b15453ab7433b91f6dd1b29 + name: 'GPU Total Memory' + key: gpu.memtotal + delay: '30' + value_type: FLOAT + units: B + preprocessing: + - + type: MULTIPLIER + parameters: + - '1048576' + tags: + - + tag: Application + value: Nvidia + - + uuid: ef1723bd932a4efdb6bb98feabad8b9d + name: 'GPU Power' + key: gpu.power + delay: '30' + value_type: FLOAT + units: W + tags: + - + tag: Application + value: Nvidia + - + uuid: c866b39f748a471c97511940ae637db9 + name: 'GPU Temperature' + key: gpu.temp + delay: '30' + value_type: FLOAT + units: C + tags: + - + tag: Application + value: Nvidia + triggers: + - + uuid: ce8aabec9b4345ca9beeba9075901d78 + expression: 'last(/NVidia Sensors/gpu.temp,#2)>95' + name: 'GPU Temperature over 95c {HOSTNAME}' + priority: AVERAGE + - + uuid: 79c04191119a4390a98aa1a97f17bc21 + name: 'GPU Used Memory' + key: gpu.used + delay: '30' + value_type: FLOAT + units: B + preprocessing: + - + type: MULTIPLIER + parameters: + - '1048576' + tags: + - + tag: Application + value: Nvidia + - + uuid: d8cf86331a54458e8a9a09ffea7f295e + name: 'GPU Utilisation' + key: gpu.utilisation + delay: '30' + value_type: FLOAT + units: '%' + tags: + - + tag: Application + value: Nvidia + graphs: + - + uuid: 0b1890f24cff4e29b32ac6d6e94b4590 + name: 'GPU Memory' + graph_items: + - + color: C80000 + item: + host: 'NVidia Sensors' + key: gpu.free + - + sortorder: '1' + color: 00C800 + item: + host: 'NVidia Sensors' + key: gpu.memtotal + - + sortorder: '2' + color: 0000C8 + item: + host: 'NVidia Sensors' + key: gpu.used + - + uuid: fac5f402ae1345e2a9102a0d470167f7 + name: 'GPU Power' + graph_items: + - + color: C80000 + item: + host: 'NVidia Sensors' + key: gpu.power + - + uuid: 07f45984487a4d82a487fdba9b73f2d4 + name: 'GPU Temperature' + graph_items: + - + color: C80000 + item: + host: 'NVidia Sensors' + key: gpu.temp + - + sortorder: '1' + color: 0000EE + yaxisside: RIGHT + item: + host: 'NVidia Sensors' + key: gpu.fanspeed + - + uuid: ce35cca2d64f46bdaff181aad5bbd4d5 + name: 'GPU Utilisation' + graph_items: + - + color: C80000 + item: + host: 'NVidia Sensors' + key: gpu.utilisation + - + sortorder: '1' + color: 33FF33 + yaxisside: RIGHT + item: + host: 'NVidia Sensors' + key: gpu.power diff --git a/Server_Hardware/Other/template_nvidia-smi_integration/7.0/template_nvidia-smi_integration_v7_vGPU.yaml b/Server_Hardware/Other/template_nvidia-smi_integration/7.0/template_nvidia-smi_integration_v7_vGPU.yaml new file mode 100644 index 000000000..46cc6d3c1 --- /dev/null +++ b/Server_Hardware/Other/template_nvidia-smi_integration/7.0/template_nvidia-smi_integration_v7_vGPU.yaml @@ -0,0 +1,106 @@ +zabbix_export: + version: '7.0' + template_groups: + - uuid: 0fc75b67ced9466abe64c1a8c83d46e9 + name: Templates + templates: + - uuid: 0794f363201240ab92c4daf13c12f1b5 + template: 'NVidia Sensors vGPU' + name: 'NVidia Sensors vGPU' + description: | + ## Overview + + This template integrates NVidia SMI for a single virtual graphics card with Zabbix. + + The template adds monitoring of: + + * GPU Utilisation + * GPU Memory (Used, Free, Total) + + The following agent parameters can be used to add the metrics into Zabbix. + + UserParameter=gpu.memtotal,nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits -i 0 + UserParameter=gpu.used,nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0 + UserParameter=gpu.free,nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits -i 0 + UserParameter=gpu.utilisation,nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits -i 0 + + ## Author + + Richard Kavanagh + Updated & prettyfied by Tom Stocker, tom.stocker@id.ethz.ch + groups: + - name: Templates + items: + - uuid: 742df99113b44f149539ec5fb7eb1f2e + name: 'GPU Free Memory' + key: gpu.free + delay: '30' + value_type: FLOAT + units: B + preprocessing: + - type: MULTIPLIER + parameters: + - '1048576' + tags: + - tag: Application + value: Nvidia + - uuid: 9a8ed162aae342f1b63986b6a156bff1 + name: 'GPU Total Memory' + key: gpu.memtotal + delay: '30' + value_type: FLOAT + units: B + preprocessing: + - type: MULTIPLIER + parameters: + - '1048576' + tags: + - tag: Application + value: Nvidia + - uuid: 649f37f9b0fd4daea820a980ad5b1a54 + name: 'GPU Used Memory' + key: gpu.used + delay: '30' + value_type: FLOAT + units: B + preprocessing: + - type: MULTIPLIER + parameters: + - '1048576' + tags: + - tag: Application + value: Nvidia + - uuid: 244a7ef6b66e4e02b535914ebf78a96f + name: 'GPU Utilisation' + key: gpu.utilisation + delay: '30' + value_type: FLOAT + units: '%' + tags: + - tag: Application + value: Nvidia + graphs: + - uuid: 14a164bdb28648b7a9f57aa9c0b3d78c + name: 'GPU Memory' + graph_items: + - color: C80000 + item: + host: 'NVidia Sensors vGPU' + key: gpu.free + - sortorder: '1' + color: 00C800 + item: + host: 'NVidia Sensors vGPU' + key: gpu.memtotal + - sortorder: '2' + color: 0000C8 + item: + host: 'NVidia Sensors vGPU' + key: gpu.used + - uuid: d3f857f7b7444c4faff21b736bdca130 + name: 'GPU Utilisation' + graph_items: + - color: C80000 + item: + host: 'NVidia Sensors vGPU' + key: gpu.utilisation