You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

90 lines
3.1 KiB
Go

/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dcgmexporter
import (
"fmt"
"testing"
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
"github.com/stretchr/testify/require"
)
// sampleCounters is the counter list that TestDCGMCollector passes to
// testDCGMCollector. Each entry is a positional Counter literal: a DCGM field
// constant, the field name as a string, a type string ("gauge" or "label"),
// and a help string.
//
// Only some of these entries appear in expectedMetrics; the comments below
// note which ones the test expects to be present in the collector output.
var sampleCounters = []Counter{
{dcgm.DCGM_FI_DEV_GPU_TEMP, "DCGM_FI_DEV_GPU_TEMP", "gauge", "Temperature Help info"},
{dcgm.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION", "gauge", "Energy help info"},
{dcgm.DCGM_FI_DEV_POWER_USAGE, "DCGM_FI_DEV_POWER_USAGE", "gauge", "Power help info"},
/* The only "label" entry; it is not listed in expectedMetrics.
NOTE(review): presumably label counters are attached to other metrics rather
than emitted on their own - confirm against the collector implementation. */
{dcgm.DCGM_FI_DRIVER_VERSION, "DCGM_FI_DRIVER_VERSION", "label", "Driver version"},
/* Switch and link metrics: the test expects these to be filtered out
automatically when no such devices are detected, so they are not listed in
expectedMetrics. */
{dcgm.DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT, "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT", "gauge", "switch temperature"},
{dcgm.DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS, "DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS", "gauge", "per-link flit errors"},
/* vGPU metrics must NOT be filtered out; this one is listed in expectedMetrics. */
{dcgm.DCGM_FI_DEV_VGPU_LICENSE_STATUS, "DCGM_FI_DEV_VGPU_LICENSE_STATUS", "gauge", "vgpu license status"},
}
// expectedMetrics is the exact set of field names that testDCGMCollector
// requires to be seen for every device returned by GetMetrics. The set is
// compared for equality, so both missing and extra field names fail the test.
//
// Of the entries in sampleCounters, the driver-version label and the two
// NVSwitch entries are intentionally absent here.
var expectedMetrics = map[string]bool{
"DCGM_FI_DEV_GPU_TEMP": true,
"DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": true,
"DCGM_FI_DEV_POWER_USAGE": true,
"DCGM_FI_DEV_VGPU_LICENSE_STATUS": true,
}
// TestDCGMCollector initializes DCGM with dcgm.Embedded, runs the shared
// collector checks in testDCGMCollector against sampleCounters, and then
// releases the collector. DCGM itself is shut down when the test returns.
func TestDCGMCollector(t *testing.T) {
	shutdownDCGM, err := dcgm.Init(dcgm.Embedded)
	require.NoError(t, err)
	defer shutdownDCGM()

	// The collector is released right away; the deferred DCGM shutdown
	// runs afterwards, when this function returns.
	_, releaseCollector := testDCGMCollector(t, sampleCounters)
	releaseCollector()
}
// testDCGMCollector creates a GPU collector for counters, validates the
// metrics it returns, and hands the collector and its cleanup function back
// to the caller. The caller is expected to invoke the returned cleanup
// function when done (TestDCGMCollector does so).
//
// The test is failed immediately (via require) when:
//   - the GPU collector cannot be created;
//   - creating a switch collector does not return an error;
//   - GetMetrics fails or returns no devices;
//   - the first device does not carry len(expectedMetrics) metrics;
//   - a metric's GPU field is not the decimal index of its device, or its
//     value is empty or equal to FailedToConvert;
//   - the set of field names seen for a device differs from expectedMetrics.
func testDCGMCollector(t *testing.T, counters []Counter) (*DCGMCollector, func()) {
	// Mark this function as a helper so the testing package attributes
	// failures to the calling test.
	t.Helper()

	dOpt := DeviceOptions{true, []int{-1}, []int{-1}}
	cfg := Config{
		GPUDevices:      dOpt,
		NoHostname:      false,
		UseOldNamespace: false,
		UseFakeGpus:     false,
	}

	c, cleanup, err := NewDCGMCollector(counters, &cfg, dcgm.FE_GPU)
	require.NoError(t, err)

	// Test for error when no switches are available to monitor.
	// NOTE: This test will fail on a system with switches present.
	_, _, err = NewDCGMCollector(counters, &cfg, dcgm.FE_SWITCH)
	require.Error(t, err)

	out, err := c.GetMetrics()
	require.NoError(t, err)
	require.Greater(t, len(out), 0, "Check that you have a GPU on this node")
	require.Len(t, out[0], len(expectedMetrics))

	for i, dev := range out {
		seenMetrics := map[string]bool{}
		for _, metric := range dev {
			seenMetrics[metric.Counter.FieldName] = true
			// testify assertions take (expected, actual); keeping that order
			// makes the "expected"/"actual" labels in failure output correct.
			require.Equal(t, fmt.Sprintf("%d", i), metric.GPU)
			require.NotEmpty(t, metric.Value)
			require.NotEqual(t, FailedToConvert, metric.Value)
		}
		// Exact set comparison: missing and unexpected field names both fail.
		require.Equal(t, expectedMetrics, seenMetrics)
	}

	return c, cleanup
}