Prometheus and Grafana with Custom Script for nvidia-smi
I'm learning Prometheus
and Grafana
basics.
Setting up windows_exporter
was easy and quick. (See the documentation: https://github.com/prometheus-community/windows_exporter)
Using Docker, I quickly set up Prometheus and Grafana.
Then, with JavaScript (Node.js), I created a simple web server that exposes the data returned by nvidia-smi.
(I wanted to learn and practice the custom exporter part to understand the logic behind it.)
Setup
Setup windows_exporter
Download the latest release, then run the following command in PowerShell as administrator (adjust the file name to match the version you downloaded):
msiexec /i windows_exporter-0.20.0-amd64.msi ENABLED_COLLECTORS=cpu,cpu_info,cs,logical_disk,logon,memory,net,os,process,service,system,time,thermalzone,tcp
Start Grafana
docker run -d -p 3000:3000 --name grafana --restart always -v grafana-storage:/var/lib/grafana grafana/grafana-oss
Prometheus
Setup Prometheus
Create two files (replace the contents with your own information):
prometheus.yml
global:
  scrape_interval: 15s
  external_labels:
    monitor: "prom-monitor"
scrape_configs:
  - job_name: "prometheus"
    scrape_interval: 5s
    static_configs:
      - targets: ["localhost:9090"]
  - job_name: "Dynamic"
    file_sd_configs:
      - files:
          - targets.json
targets.json
[
{
"targets": ["192.168.2.29:9182"],
"labels": {
"env": "prod",
"job": "with_gpu"
}
},
{
"targets": ["192.168.2.29:9999"],
"labels": {
"env": "prod",
"job": "with_gpu"
}
}
]
Start Prometheus
docker run \
-d \
--name prometheus \
--restart always \
-p 9090:9090 \
-v $(pwd)/prometheus.yml:/etc/prometheus/prometheus.yml \
-v $(pwd)/targets.json:/etc/prometheus/targets.json \
prom/prometheus
Custom NodeJS Script to expose the nvidia-smi metrics
I didn't spend much time on this; I was focusing on the learning part rather than on code quality or structure.
The code and project are available here: https://github.com/yet-another-tool/prometheus-nvidia-smi
At this point, you should adapt this script to your own needs and set it up to run as a service.
Grafana
My Dashboard
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 1,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": ""
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "%",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 18,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": ""
},
"editorMode": "code",
"expr": "utilization_gpu*100",
"hide": false,
"legendFormat": "__auto",
"range": true,
"refId": "B"
}
],
"title": "GPU Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": ""
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "GB",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 14,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "9.3.6",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": ""
},
"editorMode": "code",
"expr": "memory_used_mib/1024/1024/1024",
"hide": false,
"legendFormat": "__auto",
"range": true,
"refId": "B"
}
],
"title": "GPU Memory Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": ""
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"displayName": "°C",
"mappings": [],
"max": 80,
"min": 20,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "green",
"value": 0
},
{
"color": "#EAB839",
"value": 60
},
{
"color": "red",
"value": 70
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"id": 20,
"options": {
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "9.3.6",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": ""
},
"editorMode": "code",
"expr": "temperature_gpu",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "GPU Temperature",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": ""
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"displayName": "GB Available",
"mappings": [],
"max": 12,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 0
},
{
"color": "#EAB839",
"value": 4
},
{
"color": "green",
"value": 5
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"id": 22,
"options": {
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "9.3.6",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": ""
},
"editorMode": "code",
"expr": "memory_free_mib/1024/1024/1024",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "GPU Free Memory",
"type": "gauge"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 16
},
"id": 16,
"panels": [],
"title": "GPU",
"type": "row"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 17
},
"id": 2,
"panels": [],
"title": "High End Hardware",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": ""
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "GB",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineStyle": {
"fill": "solid"
},
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "dark-red",
"value": 2
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 18
},
"id": 4,
"options": {
"legend": {
"calcs": ["diffperc"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": ""
},
"editorMode": "code",
"expr": "(windows_cs_physical_memory_bytes{env=\"prod\"}-windows_os_physical_memory_free_bytes{env=\"prod\"})/1073741824",
"format": "time_series",
"hide": false,
"legendFormat": "__auto",
"range": true,
"refId": "Memory Usage"
}
],
"title": "Memory Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": ""
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "GB",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 4
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 18
},
"id": 8,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": ""
},
"editorMode": "code",
"expr": "(windows_os_physical_memory_free_bytes{env=\"prod\"})/1073741824",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Memory Available",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": ""
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "CPU",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 1
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "192.168.2.29:9182"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#73BF69",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 26
},
"id": 6,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": ""
},
"editorMode": "code",
"expr": "100-(avg by (instance) (irate(windows_cpu_time_total{mode=\"idle\",env=\"prod\"}[2m])) * 100)",
"legendFormat": "__auto",
"range": true,
"refId": "CPU Usage"
}
],
"title": "CPU",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": ""
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "Usage",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 26
},
"id": 10,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": ""
},
"editorMode": "code",
"expr": "rate(windows_net_bytes_total{env=\"prod\"}[2m]) / windows_net_current_bandwidth_bytes{env=\"prod\"} * 100",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Network Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": ""
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "IOPS",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 11,
"w": 24,
"x": 0,
"y": 34
},
"id": 12,
"options": {
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": ""
},
"editorMode": "code",
"expr": "rate(windows_logical_disk_reads_total{env=\"prod\",volume=~\".:\"}[2m]) + rate(windows_logical_disk_writes_total{env=\"prod\",volume=~\".:\"}[2m])",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "IOPS",
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 37,
"style": "dark",
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-30m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "My Computer",
"uid": "",
"version": 7,
"weekStart": ""
}