You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
823 lines
18 KiB
JSON
823 lines
18 KiB
JSON
{
|
|
"__requires": [
|
|
{
|
|
"type": "panel",
|
|
"id": "gauge",
|
|
"name": "Gauge",
|
|
"version": ""
|
|
},
|
|
{
|
|
"type": "grafana",
|
|
"id": "grafana",
|
|
"name": "Grafana",
|
|
"version": "6.7.3"
|
|
},
|
|
{
|
|
"type": "panel",
|
|
"id": "graph",
|
|
"name": "Graph",
|
|
"version": ""
|
|
},
|
|
{
|
|
"type": "datasource",
|
|
"id": "prometheus",
|
|
"name": "Prometheus",
|
|
"version": "1.0.0"
|
|
}
|
|
],
|
|
"annotations": {
|
|
"list": [
|
|
{
|
|
"$$hashKey": "object:192",
|
|
"builtIn": 1,
|
|
"datasource": "-- Grafana --",
|
|
"enable": true,
|
|
"hide": true,
|
|
"iconColor": "rgba(0, 211, 255, 1)",
|
|
"name": "Annotations & Alerts",
|
|
"type": "dashboard"
|
|
}
|
|
]
|
|
},
|
|
"description": "This dashboard is to display the metrics from DCGM Exporter on a Kubernetes (1.19+) cluster",
|
|
"editable": true,
|
|
"gnetId": 12239,
|
|
"graphTooltip": 0,
|
|
"id": null,
|
|
"iteration": 1588401887165,
|
|
"links": [],
|
|
"panels": [
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "$datasource",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 18,
|
|
"x": 0,
|
|
"y": 0
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 12,
|
|
"legend": {
|
|
"alignAsTable": true,
|
|
"avg": true,
|
|
"current": true,
|
|
"max": true,
|
|
"min": false,
|
|
"rightSide": true,
|
|
"show": true,
|
|
"total": false,
|
|
"values": true
|
|
},
|
|
"lines": true,
|
|
"linewidth": 2,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\", gpu=~\"$gpu\"}",
|
|
"instant": false,
|
|
"interval": "",
|
|
"legendFormat": "GPU {{gpu}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "GPU Temperature",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "celsius",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"datasource": "$datasource",
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 6,
|
|
"x": 18,
|
|
"y": 0
|
|
},
|
|
"id": 14,
|
|
"options": {
|
|
"fieldOptions": {
|
|
"calcs": [
|
|
"mean"
|
|
],
|
|
"defaults": {
|
|
"color": {
|
|
"mode": "thresholds"
|
|
},
|
|
"mappings": [],
|
|
"max": 100,
|
|
"min": 0,
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{
|
|
"color": "green",
|
|
"value": null
|
|
},
|
|
{
|
|
"color": "#EAB839",
|
|
"value": 83
|
|
},
|
|
{
|
|
"color": "red",
|
|
"value": 87
|
|
}
|
|
]
|
|
},
|
|
"unit": "celsius"
|
|
},
|
|
"overrides": [],
|
|
"values": false
|
|
},
|
|
"orientation": "auto",
|
|
"showThresholdLabels": false,
|
|
"showThresholdMarkers": true
|
|
},
|
|
"pluginVersion": "6.7.3",
|
|
"targets": [
|
|
{
|
|
"expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\", gpu=~\"$gpu\"})",
|
|
"interval": "",
|
|
"legendFormat": "",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"timeFrom": null,
|
|
"timeShift": null,
|
|
"title": "GPU Avg. Temp",
|
|
"type": "gauge"
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "$datasource",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 18,
|
|
"x": 0,
|
|
"y": 8
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 10,
|
|
"legend": {
|
|
"alignAsTable": true,
|
|
"avg": true,
|
|
"current": true,
|
|
"max": true,
|
|
"min": false,
|
|
"rightSide": true,
|
|
"show": true,
|
|
"total": false,
|
|
"values": true
|
|
},
|
|
"lines": true,
|
|
"linewidth": 2,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pluginVersion": "6.5.2",
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "DCGM_FI_DEV_POWER_USAGE{instance=~\"$instance\", gpu=~\"$gpu\"}",
|
|
"interval": "",
|
|
"legendFormat": "GPU {{gpu}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "GPU Power Usage",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "watt",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"cacheTimeout": null,
|
|
"datasource": "$datasource",
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 6,
|
|
"x": 18,
|
|
"y": 8
|
|
},
|
|
"id": 16,
|
|
"links": [],
|
|
"options": {
|
|
"fieldOptions": {
|
|
"calcs": [
|
|
"sum"
|
|
],
|
|
"defaults": {
|
|
"color": {
|
|
"mode": "thresholds"
|
|
},
|
|
"mappings": [],
|
|
"max": 2400,
|
|
"min": 0,
|
|
"nullValueMode": "connected",
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{
|
|
"color": "green",
|
|
"value": null
|
|
},
|
|
{
|
|
"color": "#EAB839",
|
|
"value": 1800
|
|
},
|
|
{
|
|
"color": "red",
|
|
"value": 2200
|
|
}
|
|
]
|
|
},
|
|
"unit": "watt"
|
|
},
|
|
"overrides": [],
|
|
"values": false
|
|
},
|
|
"orientation": "horizontal",
|
|
"showThresholdLabels": false,
|
|
"showThresholdMarkers": true
|
|
},
|
|
"pluginVersion": "6.7.3",
|
|
"targets": [
|
|
{
|
|
"expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance=~\"$instance\", gpu=~\"$gpu\"})",
|
|
"instant": true,
|
|
"interval": "",
|
|
"legendFormat": "",
|
|
"range": false,
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"timeFrom": null,
|
|
"timeShift": null,
|
|
"title": "GPU Power Total",
|
|
"type": "gauge"
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "$datasource",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 16
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 2,
|
|
"interval": "",
|
|
"legend": {
|
|
"alignAsTable": true,
|
|
"avg": true,
|
|
"current": true,
|
|
"max": true,
|
|
"min": false,
|
|
"rightSide": true,
|
|
"show": true,
|
|
"sideWidth": null,
|
|
"total": false,
|
|
"values": true
|
|
},
|
|
"lines": true,
|
|
"linewidth": 2,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "DCGM_FI_DEV_SM_CLOCK{instance=~\"$instance\", gpu=~\"$gpu\"} * 1000000",
|
|
"format": "time_series",
|
|
"interval": "",
|
|
"intervalFactor": 1,
|
|
"legendFormat": "GPU {{gpu}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "GPU SM Clocks",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"decimals": null,
|
|
"format": "hertz",
|
|
"label": "",
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "$datasource",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 24
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 6,
|
|
"legend": {
|
|
"alignAsTable": true,
|
|
"avg": true,
|
|
"current": true,
|
|
"max": true,
|
|
"min": false,
|
|
"rightSide": true,
|
|
"show": true,
|
|
"total": false,
|
|
"values": true
|
|
},
|
|
"lines": true,
|
|
"linewidth": 2,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"$instance\", gpu=~\"$gpu\"}",
|
|
"interval": "",
|
|
"legendFormat": "GPU {{gpu}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "GPU Utilization",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "cumulative"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "percent",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": "100",
|
|
"min": "0",
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "$datasource",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 32
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 18,
|
|
"legend": {
|
|
"alignAsTable": true,
|
|
"avg": true,
|
|
"current": true,
|
|
"max": true,
|
|
"min": false,
|
|
"rightSide": true,
|
|
"show": true,
|
|
"total": false,
|
|
"values": true
|
|
},
|
|
"lines": true,
|
|
"linewidth": 2,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "DCGM_FI_DEV_FB_USED{instance=~\"$instance\", gpu=~\"$gpu\"}",
|
|
"interval": "",
|
|
"legendFormat": "GPU {{gpu}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "GPU Framebuffer Mem Used",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "decmbytes",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "$datasource",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 24
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 4,
|
|
"legend": {
|
|
"alignAsTable": true,
|
|
"avg": true,
|
|
"current": true,
|
|
"max": true,
|
|
"min": false,
|
|
"rightSide": true,
|
|
"show": true,
|
|
"total": false,
|
|
"values": true
|
|
},
|
|
"lines": true,
|
|
"linewidth": 2,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{instance=~\"$instance\", gpu=~\"$gpu\"}",
|
|
"interval": "",
|
|
"legendFormat": "GPU {{gpu}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "Tensor Core Utilization",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "cumulative"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "percentunit",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": "1",
|
|
"min": "0",
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
}
|
|
],
|
|
"refresh": false,
|
|
"schemaVersion": 22,
|
|
"style": "dark",
|
|
"tags": [],
|
|
"templating": {
|
|
"list": [
|
|
{
|
|
"current": {
|
|
"selected": true,
|
|
"text": "Prometheus",
|
|
"value": "Prometheus"
|
|
},
|
|
"hide": 0,
|
|
"includeAll": false,
|
|
"multi": false,
|
|
"name": "datasource",
|
|
"options": [],
|
|
"query": "prometheus",
|
|
"queryValue": "",
|
|
"refresh": 1,
|
|
"regex": "",
|
|
"skipUrlSync": false,
|
|
"type": "datasource"
|
|
},
|
|
{
|
|
"allValue": null,
|
|
"current": {},
|
|
"datasource": "$datasource",
|
|
"definition": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)",
|
|
"hide": 0,
|
|
"includeAll": true,
|
|
"index": -1,
|
|
"label": null,
|
|
"multi": true,
|
|
"name": "instance",
|
|
"options": [],
|
|
"query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)",
|
|
"refresh": 1,
|
|
"regex": "",
|
|
"skipUrlSync": false,
|
|
"sort": 1,
|
|
"tagValuesQuery": "",
|
|
"tags": [],
|
|
"tagsQuery": "",
|
|
"type": "query",
|
|
"useTags": false
|
|
},
|
|
{
|
|
"allValue": null,
|
|
"current": {},
|
|
"datasource": "$datasource",
|
|
"definition": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)",
|
|
"hide": 0,
|
|
"includeAll": true,
|
|
"index": -1,
|
|
"label": null,
|
|
"multi": true,
|
|
"name": "gpu",
|
|
"options": [],
|
|
"query": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)",
|
|
"refresh": 1,
|
|
"regex": "",
|
|
"skipUrlSync": false,
|
|
"sort": 1,
|
|
"tagValuesQuery": "",
|
|
"tags": [],
|
|
"tagsQuery": "",
|
|
"type": "query",
|
|
"useTags": false
|
|
}
|
|
]
|
|
},
|
|
"time": {
|
|
"from": "now-15m",
|
|
"to": "now"
|
|
},
|
|
"timepicker": {
|
|
"refresh_intervals": [
|
|
"5s",
|
|
"10s",
|
|
"30s",
|
|
"1m",
|
|
"5m",
|
|
"15m",
|
|
"30m",
|
|
"1h",
|
|
"2h",
|
|
"1d"
|
|
]
|
|
},
|
|
"timezone": "",
|
|
"title": "NVIDIA DCGM Exporter Dashboard",
|
|
"uid": "Oxed_c6Wz",
|
|
"variables": {
|
|
"list": []
|
|
},
|
|
"version": 1
|
|
}
|