You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
90 lines
3.1 KiB
Go
90 lines
3.1 KiB
Go
1 year ago
|
/*
|
||
|
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||
|
*
|
||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
* you may not use this file except in compliance with the License.
|
||
|
* You may obtain a copy of the License at
|
||
|
*
|
||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||
|
*
|
||
|
* Unless required by applicable law or agreed to in writing, software
|
||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
* See the License for the specific language governing permissions and
|
||
|
* limitations under the License.
|
||
|
*/
|
||
|
|
||
|
package dcgmexporter
|
||
|
|
||
|
import (
|
||
|
"fmt"
|
||
|
"testing"
|
||
|
|
||
|
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
|
||
|
"github.com/stretchr/testify/require"
|
||
|
)
|
||
|
|
||
|
var sampleCounters = []Counter{
|
||
|
{dcgm.DCGM_FI_DEV_GPU_TEMP, "DCGM_FI_DEV_GPU_TEMP", "gauge", "Temperature Help info"},
|
||
|
{dcgm.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION", "gauge", "Energy help info"},
|
||
|
{dcgm.DCGM_FI_DEV_POWER_USAGE, "DCGM_FI_DEV_POWER_USAGE", "gauge", "Power help info"},
|
||
|
{dcgm.DCGM_FI_DRIVER_VERSION, "DCGM_FI_DRIVER_VERSION", "label", "Driver version"},
|
||
|
/* test that switch and link metrics are filtered out automatically when devices are not detected */
|
||
|
{dcgm.DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT, "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT", "gauge", "switch temperature"},
|
||
|
{dcgm.DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS, "DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS", "gauge", "per-link flit errors"},
|
||
|
/* test that vgpu metrics are not filtered out */
|
||
|
{dcgm.DCGM_FI_DEV_VGPU_LICENSE_STATUS, "DCGM_FI_DEV_VGPU_LICENSE_STATUS", "gauge", "vgpu license status"},
|
||
|
}
|
||
|
|
||
|
// expectedMetrics is the set of field names that testDCGMCollector requires
// every device to report. It holds the sampleCounters names minus the
// driver-version entry and the two NVSwitch entries.
var expectedMetrics = map[string]bool{
	"DCGM_FI_DEV_GPU_TEMP":                 true,
	"DCGM_FI_DEV_POWER_USAGE":              true,
	"DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": true,
	"DCGM_FI_DEV_VGPU_LICENSE_STATUS":      true,
}
|
||
|
|
||
|
func TestDCGMCollector(t *testing.T) {
|
||
|
cleanup, err := dcgm.Init(dcgm.Embedded)
|
||
|
require.NoError(t, err)
|
||
|
defer cleanup()
|
||
|
|
||
|
_, cleanup = testDCGMCollector(t, sampleCounters)
|
||
|
cleanup()
|
||
|
}
|
||
|
|
||
|
func testDCGMCollector(t *testing.T, counters []Counter) (*DCGMCollector, func()) {
|
||
|
dOpt := DeviceOptions{true, []int{-1}, []int{-1}}
|
||
|
cfg := Config{
|
||
|
GPUDevices: dOpt,
|
||
|
NoHostname: false,
|
||
|
UseOldNamespace: false,
|
||
|
UseFakeGpus: false,
|
||
|
}
|
||
|
c, cleanup, err := NewDCGMCollector(counters, &cfg, dcgm.FE_GPU)
|
||
|
require.NoError(t, err)
|
||
|
|
||
|
/* Test for error when no switches are available to monitor.
|
||
|
NOTE: This test will fail on a system with switches present. */
|
||
|
_, _, err = NewDCGMCollector(counters, &cfg, dcgm.FE_SWITCH)
|
||
|
require.Error(t, err)
|
||
|
|
||
|
out, err := c.GetMetrics()
|
||
|
require.NoError(t, err)
|
||
|
require.Greater(t, len(out), 0, "Check that you have a GPU on this node")
|
||
|
require.Len(t, out[0], len(expectedMetrics))
|
||
|
|
||
|
for i, dev := range out {
|
||
|
seenMetrics := map[string]bool{}
|
||
|
for _, metric := range dev {
|
||
|
seenMetrics[metric.Counter.FieldName] = true
|
||
|
require.Equal(t, metric.GPU, fmt.Sprintf("%d", i))
|
||
|
|
||
|
require.NotEmpty(t, metric.Value)
|
||
|
require.NotEqual(t, metric.Value, FailedToConvert)
|
||
|
}
|
||
|
require.Equal(t, seenMetrics, expectedMetrics)
|
||
|
}
|
||
|
|
||
|
return c, cleanup
|
||
|
}
|