You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

191 lines
6.4 KiB
Go

1 year ago
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dcgmexporter
import (
"fmt"
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
"github.com/stretchr/testify/require"
"testing"
)
// fakeProfileName is the profile name given to the fake GPU instances built
// by SpoofSystemInfo and checked for in TestMonitoredEntities.
const fakeProfileName string = "2fake.4gb"
// SpoofSwitchSystemInfo builds a fake SystemInfo whose InfoType is
// dcgm.FE_SWITCH and which contains two switches (entity IDs 0 and 1).
//
// Each switch carries two NvLinks parented to it: link index 0 with State 2
// and link index 1 with State 3.
// NOTE(review): State 3 presumably means "up" and State 2 "down", judging by
// the expectations in TestMonitoredSwitches - confirm against go-dcgm.
func SpoofSwitchSystemInfo() SystemInfo {
	// First switch (entity 0) and its two links.
	firstSwitch := SwitchInfo{EntityId: 0}
	firstSwitch.NvLinks = append(firstSwitch.NvLinks,
		dcgm.NvLinkStatus{ParentId: 0, ParentType: dcgm.FE_SWITCH, State: 2, Index: 0},
		dcgm.NvLinkStatus{ParentId: 0, ParentType: dcgm.FE_SWITCH, State: 3, Index: 1},
	)

	// Second switch (entity 1) mirrors the first, with ParentId 1 on its links.
	secondSwitch := SwitchInfo{EntityId: 1}
	secondSwitch.NvLinks = append(secondSwitch.NvLinks,
		dcgm.NvLinkStatus{ParentId: 1, ParentType: dcgm.FE_SWITCH, State: 2, Index: 0},
		dcgm.NvLinkStatus{ParentId: 1, ParentType: dcgm.FE_SWITCH, State: 3, Index: 1},
	)

	var info SystemInfo
	info.InfoType = dcgm.FE_SWITCH
	info.Switches = append(info.Switches, firstSwitch, secondSwitch)
	return info
}
// SpoofSystemInfo builds a fake SystemInfo describing two GPUs (indices 0 and
// 1), each with exactly one GPU instance named fakeProfileName.
//
// The instance on GPU 0 has EntityId 0 and the instance on GPU 1 has
// EntityId 14. InfoType and every other field keep their zero values.
func SpoofSystemInfo() SystemInfo {
	var info SystemInfo
	info.GpuCount = 2

	// GPU 0 with a single instance (EntityId 0).
	firstGpu := &info.Gpus[0]
	firstGpu.DeviceInfo.GPU = 0
	firstGpu.GpuInstances = append(firstGpu.GpuInstances, GpuInstanceInfo{
		Info:        dcgm.MigEntityInfo{"fake", 0, 0, 0, 0, 3},
		ProfileName: fakeProfileName,
		EntityId:    0,
	})

	// GPU 1 with a single instance (EntityId 14); its MigEntityInfo differs
	// from GPU 0's only in the third positional field.
	secondGpu := &info.Gpus[1]
	secondGpu.DeviceInfo.GPU = 1
	secondGpu.GpuInstances = append(secondGpu.GpuInstances, GpuInstanceInfo{
		Info:        dcgm.MigEntityInfo{"fake", 0, 1, 0, 0, 3},
		ProfileName: fakeProfileName,
		EntityId:    14,
	})

	return info
}
// TestMonitoredEntities verifies what GetMonitoredEntities returns for the
// spoofed two-GPU system with the Flex option enabled.
//
// Phase 1: while each GPU has a GPU instance, the two monitored entities must
// both be GPU instances (dcgm.FE_GPU_I) carrying the fake profile name and
// entity IDs 0 and 14.
// Phase 2: once the GPU instances are removed, the two monitored entities must
// be the GPUs themselves (dcgm.FE_GPU), in index order, with no InstanceInfo.
func TestMonitoredEntities(t *testing.T) {
	sysInfo := SpoofSystemInfo()
	sysInfo.gOpt.Flex = true

	monitoring := GetMonitoredEntities(sysInfo)
	require.Equal(t, 2, len(monitoring), fmt.Sprintf("Should have 2 monitored entities but found %d", len(monitoring)))

	instanceCount := 0
	gpuCount := 0
	for _, mi := range monitoring {
		if mi.Entity.EntityGroupId != dcgm.FE_GPU_I {
			gpuCount++
			require.Nil(t, mi.InstanceInfo, "Expected InstanceInfo to be nil but it wasn't")
			continue
		}

		instanceCount++
		// NotNil inspects the pointer itself. Comparing a typed nil pointer
		// against an untyped nil with NotEqual always passes, which would let
		// the dereference below panic instead of failing the test cleanly.
		require.NotNil(t, mi.InstanceInfo, "Expected InstanceInfo to be populated but it wasn't")
		require.Equal(t, fakeProfileName, mi.InstanceInfo.ProfileName, "Expected profile named '%s' but found '%s'", fakeProfileName, mi.InstanceInfo.ProfileName)
		if mi.Entity.EntityId != uint(0) {
			// One of these should be 0, the other should be 14
			require.Equal(t, uint(14), mi.Entity.EntityId, "Expected 14 as EntityId but found %d", mi.Entity.EntityId)
		}
	}
	require.Equal(t, 2, instanceCount, "Expected 2 GPU instances but found %d", instanceCount)
	require.Equal(t, 0, gpuCount, "Expected 0 GPUs but found %d", gpuCount)

	// Drop the GPU instances so that the GPUs themselves become the monitored entities.
	sysInfo.Gpus[0].GpuInstances = sysInfo.Gpus[0].GpuInstances[:0]
	sysInfo.Gpus[1].GpuInstances = sysInfo.Gpus[1].GpuInstances[:0]

	monitoring = GetMonitoredEntities(sysInfo)
	require.Equal(t, 2, len(monitoring), fmt.Sprintf("Should have 2 monitored entities but found %d", len(monitoring)))
	for i, mi := range monitoring {
		require.Equal(t, dcgm.FE_GPU, mi.Entity.EntityGroupId, "Expected FE_GPU but found %d", mi.Entity.EntityGroupId)
		require.Equal(t, uint(i), mi.DeviceInfo.GPU, "Expected GPU %d but found %d", i, mi.DeviceInfo.GPU)
		require.Nil(t, mi.InstanceInfo, "Expected InstanceInfo not to be populated but it was")
	}
}
// TestVerifyDevicePresence exercises VerifyDevicePresence against the spoofed
// two-GPU system. It expects success with Flex enabled, success with -1 in
// both ranges, an error for a GPU instance or GPU that does not exist, and
// success when the ranges list the GPUs (0, 1) and instances (0, 14) that
// SpoofSystemInfo creates.
func TestVerifyDevicePresence(t *testing.T) {
	sysInfo := SpoofSystemInfo()
	var dOpt DeviceOptions

	dOpt.Flex = true
	err := VerifyDevicePresence(&sysInfo, dOpt)
	require.NoError(t, err, "Expected to have no error, but found %s", err)

	// NOTE(review): -1 presumably selects every GPU / GPU instance - confirm
	// against DeviceOptions.
	dOpt.Flex = false
	dOpt.MajorRange = append(dOpt.MajorRange, -1)
	dOpt.MinorRange = append(dOpt.MinorRange, -1)
	err = VerifyDevicePresence(&sysInfo, dOpt)
	require.NoError(t, err, "Expected to have no error, but found %s", err)

	dOpt.MinorRange[0] = 10 // this GPU instance doesn't exist
	err = VerifyDevicePresence(&sysInfo, dOpt)
	require.Error(t, err, "Expected to have an error for a non-existent GPU instance, but none found")

	dOpt.MajorRange[0] = 10 // this GPU doesn't exist
	dOpt.MinorRange[0] = -1
	err = VerifyDevicePresence(&sysInfo, dOpt)
	require.Error(t, err, "Expected to have an error for a non-existent GPU, but none found")

	// Add GPUs and instances that exist
	dOpt.MajorRange[0] = 0
	dOpt.MajorRange = append(dOpt.MajorRange, 1)
	dOpt.MinorRange[0] = 0
	dOpt.MinorRange = append(dOpt.MinorRange, 14)
	err = VerifyDevicePresence(&sysInfo, dOpt)
	require.NoError(t, err, "Expected to have no error, but found %s", err)
}
//func TestMigProfileNames(t *testing.T) {
// sysInfo := SpoofSystemInfo()
// SetMigProfileNames(sysInfo, values)
//}
// TestMonitoredSwitches verifies what GetMonitoredEntities returns for the
// spoofed switch topology from SpoofSwitchSystemInfo.
//
// With InfoType dcgm.FE_SWITCH it expects exactly the two switches. After
// InfoType is changed to dcgm.FE_LINK it expects exactly two links, one per
// switch, where the i-th returned link has ParentId i.
func TestMonitoredSwitches(t *testing.T) {
	sysInfo := SpoofSwitchSystemInfo()

	/* test that only switches are returned */
	monitoring := GetMonitoredEntities(sysInfo)
	require.Equal(t, 2, len(monitoring), fmt.Sprintf("Should have 2 monitored switches but found %d", len(monitoring)))
	for _, mi := range monitoring {
		require.Equal(t, dcgm.FE_SWITCH, mi.Entity.EntityGroupId, fmt.Sprintf("Should have only returned switches but returned %d", mi.Entity.EntityGroupId))
	}

	/* test that only "up" links are monitored and 1 from each switch */
	sysInfo.InfoType = dcgm.FE_LINK
	monitoring = GetMonitoredEntities(sysInfo)
	require.Equal(t, 2, len(monitoring), fmt.Sprintf("Should have 2 monitored links but found %d", len(monitoring)))
	for i, mi := range monitoring {
		require.Equal(t, dcgm.FE_LINK, mi.Entity.EntityGroupId, fmt.Sprintf("Should have only returned links but returned %d", mi.Entity.EntityGroupId))
		// Links are expected in switch order, so position i maps to switch i.
		require.Equal(t, uint(i), mi.ParentId, "Link should reference switch parent")
	}
}