You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

611 lines
15 KiB
Go

/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dcgmexporter
import (
"fmt"
"math/rand"
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
"github.com/sirupsen/logrus"
)
// PARENT_ID_IGNORED is the placeholder stored in MonitoringInfo.ParentId for
// entries built in this file for GPUs, GPU instances and switches. Link
// entries carry the link's own ParentId instead.
const PARENT_ID_IGNORED = 0
// GroupInfo pairs a DCGM group handle with an entity group type.
// NOTE(review): no use of this type is visible in this part of the file;
// confirm how callers populate it.
type GroupInfo struct {
// groupHandle is the DCGM group handle.
groupHandle dcgm.GroupHandle
// groupType is the entity group type associated with the handle.
groupType dcgm.Field_Entity_Group
}
// ComputeInstanceInfo describes one compute instance that belongs to a GPU
// instance. InitializeGpuInfo builds these values positionally from the DCGM
// GPU-instance hierarchy, so the field order must not change.
type ComputeInstanceInfo struct {
// InstanceInfo is the MIG entity info reported by the hierarchy entry.
InstanceInfo dcgm.MigEntityInfo
// ProfileName is created empty by InitializeGpuInfo; nothing in this file
// fills it in afterwards.
ProfileName string
// EntityId is the DCGM entity id of the compute instance.
EntityId uint
}
// GpuInstanceInfo describes one GPU instance and the compute instances
// recorded beneath it.
type GpuInstanceInfo struct {
// Info is the MIG entity info reported by the hierarchy entry.
Info dcgm.MigEntityInfo
// ProfileName starts empty and is set by SetGpuInstanceProfileName.
ProfileName string
// EntityId is the DCGM entity id of the GPU instance; lookups in this file
// match on it.
EntityId uint
// ComputeInstances holds the compute instances appended by InitializeGpuInfo.
ComputeInstances []ComputeInstanceInfo
}
// GpuInfo holds what is known about a single GPU slot in SystemInfo.Gpus.
type GpuInfo struct {
// DeviceInfo is the device description returned by dcgm.GetDeviceInfo, or a
// stub carrying only GPU and UUID when fake GPUs are enabled and the query
// failed.
DeviceInfo dcgm.Device
// GpuInstances lists the GPU instances whose hierarchy parent is this GPU.
GpuInstances []GpuInstanceInfo
// MigEnabled is set to true once at least one GPU instance has been
// recorded for this GPU; it defaults to false.
MigEnabled bool
}
// SwitchInfo describes one NvSwitch and its links. InitializeNvSwitchInfo
// builds these values positionally, so the field order must not change.
type SwitchInfo struct {
// EntityId is the DCGM entity id of the switch.
EntityId uint
// NvLinks holds the link statuses whose parent is this switch.
NvLinks []dcgm.NvLinkStatus
}
// SystemInfo is the snapshot of GPUs or NvSwitches discovered through DCGM,
// together with the device options that select which of them are monitored.
type SystemInfo struct {
// GpuCount is the device count reported by dcgm.GetAllDeviceCount; loops in
// this file visit Gpus[0] through Gpus[GpuCount-1].
GpuCount uint
// Gpus is a fixed-size array of per-GPU information.
Gpus [dcgm.MAX_NUM_DEVICES]GpuInfo
// gOpt holds the GPU selection options stored by InitializeGpuInfo.
gOpt DeviceOptions
// sOpt holds the switch selection options stored by InitializeNvSwitchInfo.
sOpt DeviceOptions
// InfoType is the entity group chosen in InitializeSystemInfo;
// GetMonitoredEntities branches on it.
InfoType dcgm.Field_Entity_Group
// Switches lists the switches appended by InitializeNvSwitchInfo.
Switches []SwitchInfo
}
// MonitoringInfo describes one entity selected for monitoring. Values are
// built positionally throughout this file, so the field order must not change.
type MonitoringInfo struct {
// Entity is the entity group / entity id pair that is added to a DCGM group.
Entity dcgm.GroupEntityPair
// DeviceInfo is the device description of the GPU (or of the GPU owning a
// GPU instance); it is zero-valued for switch and link entries.
DeviceInfo dcgm.Device
// InstanceInfo is non-nil only for GPU instance entries.
InstanceInfo *GpuInstanceInfo
// ParentId is the link's ParentId for link entries and PARENT_ID_IGNORED
// for every other entry built in this file.
ParentId uint
}
// SetGpuInstanceProfileName stores profileName on the first GPU instance whose
// EntityId equals entityId, searching the first GpuCount GPUs in order.
//
// It reports whether a matching GPU instance was found and updated.
func SetGpuInstanceProfileName(sysInfo *SystemInfo, entityId uint, profileName string) bool {
	for gpuIdx := uint(0); gpuIdx < sysInfo.GpuCount; gpuIdx++ {
		// The slice header is copied but still refers to the stored elements,
		// so the assignment below updates sysInfo in place.
		instances := sysInfo.Gpus[gpuIdx].GpuInstances
		for k := range instances {
			if instances[k].EntityId != entityId {
				continue
			}
			instances[k].ProfileName = profileName
			return true
		}
	}
	return false
}
// SetMigProfileNames applies each field value's string payload as the profile
// name of the GPU instance identified by the value's EntityId.
//
// It returns nil when every value matched a known GPU instance. Otherwise it
// returns one error whose message lists the group and id of every value that
// could not be matched; matching values are still applied.
func SetMigProfileNames(sysInfo *SystemInfo, values []dcgm.FieldValue_v2) error {
	var missing error
	for _, val := range values {
		if SetGpuInstanceProfileName(sysInfo, val.EntityId, dcgm.Fv2_String(val)) {
			continue
		}
		if missing == nil {
			// The message prefix is created lazily, on the first miss.
			missing = fmt.Errorf("Cannot find match for entities:")
		}
		missing = fmt.Errorf("%s group %d, id %d", missing, val.EntityGroupId, val.EntityId)
	}
	return missing
}
// PopulateMigProfileNames queries the latest live DCGM_FI_DEV_NAME value for
// each of the given entities and hands the results to SetMigProfileNames.
//
// It returns nil immediately when entities is empty, the DCGM error when the
// query fails, and otherwise whatever SetMigProfileNames returns.
func PopulateMigProfileNames(sysInfo *SystemInfo, entities []dcgm.GroupEntityPair) error {
	if len(entities) == 0 {
		// Nothing to look up.
		return nil
	}
	nameField := []dcgm.Short{dcgm.DCGM_FI_DEV_NAME}
	latest, err := dcgm.EntitiesGetLatestValues(entities, nameField, dcgm.DCGM_FV_FLAG_LIVE_DATA)
	if err != nil {
		return err
	}
	return SetMigProfileNames(sysInfo, latest)
}
// GpuIdExists reports whether one of the first GpuCount GPUs in sysInfo has a
// DeviceInfo.GPU value equal to gpuId (converted to uint).
func GpuIdExists(sysInfo *SystemInfo, gpuId int) bool {
	wanted := uint(gpuId)
	found := false
	for idx := uint(0); idx < sysInfo.GpuCount && !found; idx++ {
		found = sysInfo.Gpus[idx].DeviceInfo.GPU == wanted
	}
	return found
}
// SwitchIdExists reports whether sysInfo contains a switch whose EntityId
// equals switchId (converted to uint).
func SwitchIdExists(sysInfo *SystemInfo, switchId int) bool {
	wanted := uint(switchId)
	for idx := range sysInfo.Switches {
		if sysInfo.Switches[idx].EntityId == wanted {
			return true
		}
	}
	return false
}
// GpuInstanceIdExists reports whether any of the first GpuCount GPUs in
// sysInfo owns a GPU instance whose EntityId equals gpuInstanceId (converted
// to uint).
func GpuInstanceIdExists(sysInfo *SystemInfo, gpuInstanceId int) bool {
	wanted := uint(gpuInstanceId)
	for gpuIdx := uint(0); gpuIdx < sysInfo.GpuCount; gpuIdx++ {
		instances := sysInfo.Gpus[gpuIdx].GpuInstances
		for k := range instances {
			if instances[k].EntityId == wanted {
				return true
			}
		}
	}
	return false
}
// LinkIdExists reports whether any switch in sysInfo has an NvLink whose
// Index equals linkId (converted to uint). The owning switch is not taken
// into account.
func LinkIdExists(sysInfo *SystemInfo, linkId int) bool {
	wanted := uint(linkId)
	for s := range sysInfo.Switches {
		links := sysInfo.Switches[s].NvLinks
		for l := range links {
			if links[l].Index == wanted {
				return true
			}
		}
	}
	return false
}
// VerifySwitchDevicePresence checks that every switch id in sOpt.MajorRange
// and every link id in sOpt.MinorRange was discovered in sysInfo.
//
// Nothing is checked when sOpt.Flex is set. A range is also skipped when it is
// empty or when its first element is -1. The returned error names the first
// id that could not be found; nil means everything requested is present.
func VerifySwitchDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error {
	if sOpt.Flex {
		return nil
	}

	switchIDs := sOpt.MajorRange
	if len(switchIDs) != 0 && switchIDs[0] != -1 {
		// Every explicitly requested switch must be known.
		for idx := range switchIDs {
			if SwitchIdExists(sysInfo, switchIDs[idx]) {
				continue
			}
			return fmt.Errorf("couldn't find requested NvSwitch id %d", switchIDs[idx])
		}
	}

	linkIDs := sOpt.MinorRange
	if len(linkIDs) != 0 && linkIDs[0] != -1 {
		// Every explicitly requested link must be known.
		for idx := range linkIDs {
			if LinkIdExists(sysInfo, linkIDs[idx]) {
				continue
			}
			return fmt.Errorf("couldn't find requested NvLink %d", linkIDs[idx])
		}
	}

	return nil
}
// VerifyDevicePresence checks that every GPU id in gOpt.MajorRange and every
// GPU instance id in gOpt.MinorRange was discovered in sysInfo.
//
// Nothing is checked when gOpt.Flex is set. A range is also skipped when it is
// empty or when its first element is -1. The returned error names the first
// id that could not be found; nil means everything requested is present.
func VerifyDevicePresence(sysInfo *SystemInfo, gOpt DeviceOptions) error {
	if gOpt.Flex {
		return nil
	}

	gpuIDs := gOpt.MajorRange
	if len(gpuIDs) != 0 && gpuIDs[0] != -1 {
		// Every explicitly requested GPU must be known.
		for idx := range gpuIDs {
			if !GpuIdExists(sysInfo, gpuIDs[idx]) {
				return fmt.Errorf("Couldn't find requested GPU id %d", gpuIDs[idx])
			}
		}
	}

	instanceIDs := gOpt.MinorRange
	if len(instanceIDs) != 0 && instanceIDs[0] != -1 {
		// Every explicitly requested GPU instance must be known.
		for idx := range instanceIDs {
			if !GpuInstanceIdExists(sysInfo, instanceIDs[idx]) {
				return fmt.Errorf("Couldn't find requested GPU instance id %d", instanceIDs[idx])
			}
		}
	}

	return nil
}
// InitializeNvSwitchInfo discovers the NvSwitches known to DCGM together with
// their NvLinks and records them in sysInfo.
//
// Every switch returned by dcgm.GetEntityGroupEntities(dcgm.FE_SWITCH) becomes
// a SwitchInfo holding the links from dcgm.GetNvLinkLinkStatus whose parent is
// that switch. sOpt is stored on the result and then checked with
// VerifySwitchDevicePresence.
//
// It returns the populated SystemInfo and a nil error on success. An error is
// returned when a DCGM query fails, when no switches are reported, or when
// sOpt names a switch or link that was not discovered.
func InitializeNvSwitchInfo(sysInfo SystemInfo, sOpt DeviceOptions) (SystemInfo, error) {
	switches, err := dcgm.GetEntityGroupEntities(dcgm.FE_SWITCH)
	if err != nil {
		return sysInfo, err
	}
	if len(switches) == 0 {
		return sysInfo, fmt.Errorf("no switches to monitor")
	}

	links, err := dcgm.GetNvLinkLinkStatus()
	if err != nil {
		return sysInfo, err
	}

	for i := range switches {
		// Collect the links that hang off this switch.
		var matchingLinks []dcgm.NvLinkStatus
		for _, link := range links {
			if link.ParentType == dcgm.FE_SWITCH && link.ParentId == uint(switches[i]) {
				matchingLinks = append(matchingLinks, link)
			}
		}
		sysInfo.Switches = append(sysInfo.Switches, SwitchInfo{
			switches[i],
			matchingLinks,
		})
	}

	sysInfo.sOpt = sOpt

	// The verification result must reach the caller: previously it was
	// assigned and then discarded, so requests for missing switches or links
	// were silently accepted.
	if err := VerifySwitchDevicePresence(&sysInfo, sOpt); err != nil {
		return sysInfo, err
	}
	return sysInfo, nil
}
// InitializeGpuInfo fills sysInfo with the GPUs reported by DCGM and, when
// the GPU-instance hierarchy is non-empty, with their GPU instances and
// compute instances.
//
// For each of the dcgm.GetAllDeviceCount devices the device info is queried.
// If that query fails and useFakeGpus is true, a stub entry with GPU set to
// the index and UUID set to "fake<index>" is used instead of failing. GPU
// instance profile names are then resolved through PopulateMigProfileNames.
// Finally gOpt is stored on the result and checked with VerifyDevicePresence.
//
// It returns the populated SystemInfo and a nil error on success. An error is
// returned when a DCGM query fails (subject to useFakeGpus), when profile
// names cannot be matched, or when gOpt names a GPU or GPU instance that was
// not discovered.
func InitializeGpuInfo(sysInfo SystemInfo, gOpt DeviceOptions, useFakeGpus bool) (SystemInfo, error) {
	gpuCount, err := dcgm.GetAllDeviceCount()
	if err != nil {
		return sysInfo, err
	}
	sysInfo.GpuCount = gpuCount

	for i := uint(0); i < sysInfo.GpuCount; i++ {
		// Default mig enabled to false
		sysInfo.Gpus[i].MigEnabled = false
		sysInfo.Gpus[i].DeviceInfo, err = dcgm.GetDeviceInfo(i)
		if err != nil {
			if !useFakeGpus {
				return sysInfo, err
			}
			// Substitute a recognisable stub so the device can still be listed.
			sysInfo.Gpus[i].DeviceInfo.GPU = i
			sysInfo.Gpus[i].DeviceInfo.UUID = fmt.Sprintf("fake%d", i)
		}
	}

	hierarchy, err := dcgm.GetGpuInstanceHierarchy()
	if err != nil {
		return sysInfo, err
	}

	if hierarchy.Count > 0 {
		var entities []dcgm.GroupEntityPair
		// gpuId and instanceIndex remember the most recently added GPU
		// instance so that following compute-instance entries attach to it.
		// NOTE(review): this assumes the hierarchy lists a GPU instance
		// before its compute instances - confirm against DCGM.
		gpuId := uint(0)
		instanceIndex := 0
		for i := uint(0); i < hierarchy.Count; i++ {
			entry := hierarchy.EntityList[i]
			switch entry.Parent.EntityGroupId {
			case dcgm.FE_GPU:
				// We are adding a GPU instance
				gpuId = entry.Parent.EntityId
				entityId := entry.Entity.EntityId
				instanceInfo := GpuInstanceInfo{
					Info:        entry.Info,
					ProfileName: "",
					EntityId:    entityId,
				}
				sysInfo.Gpus[gpuId].MigEnabled = true
				sysInfo.Gpus[gpuId].GpuInstances = append(sysInfo.Gpus[gpuId].GpuInstances, instanceInfo)
				entities = append(entities, dcgm.GroupEntityPair{dcgm.FE_GPU_I, entityId})
				instanceIndex = len(sysInfo.Gpus[gpuId].GpuInstances) - 1
			case dcgm.FE_GPU_I:
				// Add the compute instance, gpuId is recorded previously
				entityId := entry.Entity.EntityId
				ciInfo := ComputeInstanceInfo{entry.Info, "", entityId}
				parent := &sysInfo.Gpus[gpuId].GpuInstances[instanceIndex]
				parent.ComputeInstances = append(parent.ComputeInstances, ciInfo)
			}
		}

		err = PopulateMigProfileNames(&sysInfo, entities)
		if err != nil {
			return sysInfo, err
		}
	}

	sysInfo.gOpt = gOpt

	// The verification result must reach the caller: previously it was
	// assigned and then discarded, which let GetMonitoredEntities dereference
	// a nil lookup result for ids that do not exist.
	if err := VerifyDevicePresence(&sysInfo, gOpt); err != nil {
		return sysInfo, err
	}
	return sysInfo, nil
}
// InitializeSystemInfo builds a fresh SystemInfo for the requested entity
// type.
//
// FE_LINK and FE_SWITCH are both served by InitializeNvSwitchInfo with sOpt;
// FE_GPU is served by InitializeGpuInfo with gOpt and useFakeGpus. In each of
// those cases InfoType is set to entityType before delegating. Any other
// entity type yields an empty SystemInfo and an "unhandled entity type" error.
func InitializeSystemInfo(gOpt DeviceOptions, sOpt DeviceOptions, useFakeGpus bool, entityType dcgm.Field_Entity_Group) (SystemInfo, error) {
	var info SystemInfo
	logrus.Info("Initializing system entities of type: ", entityType)

	switch entityType {
	case dcgm.FE_LINK, dcgm.FE_SWITCH:
		info.InfoType = entityType
		return InitializeNvSwitchInfo(info, sOpt)
	case dcgm.FE_GPU:
		info.InfoType = entityType
		return InitializeGpuInfo(info, gOpt, useFakeGpus)
	}

	return info, fmt.Errorf("unhandled entity type: %d", entityType)
}
// CreateLinkGroupsFromSystemInfo creates one DCGM group per watched switch
// and adds to it every link of that switch that is in state LS_UP and is
// watched according to IsLinkWatched.
//
// It returns the created group handles and one cleanup function per created
// group; calling a cleanup function destroys its group. On a group-creation
// error the returned handle slice is nil; on a link-add error the handles
// created so far are returned. In both cases the cleanups for every group
// that was created are returned alongside the error so the caller can release
// them.
func CreateLinkGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []func(), error) {
	var groups []dcgm.GroupHandle
	var cleanups []func()

	/* Create per-switch link groups */
	for _, sw := range sysInfo.Switches {
		if !IsSwitchWatched(sw.EntityId, sysInfo) {
			continue
		}

		groupId, err := dcgm.CreateGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64()))
		if err != nil {
			return nil, cleanups, err
		}
		groups = append(groups, groupId)
		// Register exactly one cleanup per group, as soon as the group
		// exists. Registering it per added link destroyed the same group
		// repeatedly and leaked groups that ended up with no links.
		cleanups = append(cleanups, func() { dcgm.DestroyGroup(groupId) })

		for _, link := range sw.NvLinks {
			if link.State != dcgm.LS_UP {
				continue
			}
			if !IsLinkWatched(link.Index, sw.EntityId, sysInfo) {
				continue
			}
			err = dcgm.AddLinkEntityToGroup(groupId, link.Index, link.ParentId)
			if err != nil {
				return groups, cleanups, err
			}
		}
	}

	return groups, cleanups, nil
}
// CreateGroupFromSystemInfo creates a DCGM group with a randomised name and
// adds every entity returned by GetMonitoredEntities to it.
//
// It returns the group handle, a cleanup function and an error. When group
// creation fails the handle is the zero value and the cleanup is a no-op.
// When adding an entity fails the handle and a cleanup that destroys the
// group are returned together with the error. On success the same cleanup is
// returned with a nil error.
func CreateGroupFromSystemInfo(sysInfo SystemInfo) (dcgm.GroupHandle, func(), error) {
	entities := GetMonitoredEntities(sysInfo)

	groupName := fmt.Sprintf("gpu-collector-group-%d", rand.Uint64())
	groupId, err := dcgm.CreateGroup(groupName)
	if err != nil {
		return dcgm.GroupHandle{}, func() {}, err
	}

	destroy := func() { dcgm.DestroyGroup(groupId) }
	for idx := range entities {
		pair := entities[idx].Entity
		if addErr := dcgm.AddEntityToGroup(groupId, pair.EntityGroupId, pair.EntityId); addErr != nil {
			return groupId, destroy, addErr
		}
	}
	return groupId, destroy, nil
}
// AddAllGpus returns one FE_GPU MonitoringInfo for each of the first GpuCount
// GPUs in sysInfo, in index order. The result is nil when GpuCount is zero.
func AddAllGpus(sysInfo SystemInfo) []MonitoringInfo {
	var monitoring []MonitoringInfo
	for idx := uint(0); idx < sysInfo.GpuCount; idx++ {
		device := sysInfo.Gpus[idx].DeviceInfo
		monitoring = append(monitoring, MonitoringInfo{
			Entity:       dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: device.GPU},
			DeviceInfo:   device,
			InstanceInfo: nil,
			ParentId:     PARENT_ID_IGNORED,
		})
	}
	return monitoring
}
// AddAllSwitches returns one FE_SWITCH MonitoringInfo for every switch in
// sysInfo that IsSwitchWatched accepts. Each entry carries a zero-valued
// DeviceInfo, a nil InstanceInfo and PARENT_ID_IGNORED as its parent.
func AddAllSwitches(sysInfo SystemInfo) []MonitoringInfo {
	var monitoring []MonitoringInfo
	for idx := range sysInfo.Switches {
		switchID := sysInfo.Switches[idx].EntityId
		if IsSwitchWatched(switchID, sysInfo) {
			monitoring = append(monitoring, MonitoringInfo{
				Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_SWITCH, EntityId: switchID},
				// Switch entries carry no device description.
				DeviceInfo:   dcgm.Device{},
				InstanceInfo: nil,
				ParentId:     PARENT_ID_IGNORED,
			})
		}
	}
	return monitoring
}
// AddAllLinks returns one FE_LINK MonitoringInfo for every NvLink that is in
// state LS_UP, belongs to a switch accepted by IsSwitchWatched, and is itself
// accepted by IsLinkWatched. Each entry carries a zero-valued DeviceInfo, a
// nil InstanceInfo and the link's ParentId.
func AddAllLinks(sysInfo SystemInfo) []MonitoringInfo {
	var monitoring []MonitoringInfo
	for s := range sysInfo.Switches {
		switchID := sysInfo.Switches[s].EntityId
		links := sysInfo.Switches[s].NvLinks
		for l := range links {
			link := links[l]
			selected := link.State == dcgm.LS_UP &&
				IsSwitchWatched(switchID, sysInfo) &&
				IsLinkWatched(link.Index, switchID, sysInfo)
			if !selected {
				continue
			}
			monitoring = append(monitoring, MonitoringInfo{
				Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_LINK, EntityId: link.Index},
				// Link entries carry no device description.
				DeviceInfo:   dcgm.Device{},
				InstanceInfo: nil,
				ParentId:     link.ParentId,
			})
		}
	}
	return monitoring
}
// IsSwitchWatched reports whether the switch with entity id switchId is
// selected by the switch options stored in sysInfo.
//
// Every switch is watched when the options are flexible, when MajorRange is
// empty, or when MajorRange starts with -1. The -1 wildcard is the same
// convention VerifySwitchDevicePresence uses to skip verification; without
// handling it here, a wildcard range would never match any switch because -1
// converted to uint equals no real entity id. Otherwise the switch is watched
// only if its id appears in MajorRange.
func IsSwitchWatched(switchId uint, sysInfo SystemInfo) bool {
	if sysInfo.sOpt.Flex {
		return true
	}

	requested := sysInfo.sOpt.MajorRange
	if len(requested) == 0 || requested[0] == -1 {
		return true
	}

	for _, id := range requested {
		// Negative values are never valid entity ids; skip them instead of
		// letting them wrap around in the uint conversion.
		if id >= 0 && uint(id) == switchId {
			return true
		}
	}
	return false
}
// IsLinkWatched reports whether the link with index linkId on the switch with
// entity id switchId is selected by the switch options stored in sysInfo.
//
// Every link is watched when the options are flexible. Otherwise the link is
// watched only if its switch is present in sysInfo.Switches, that switch is
// accepted by IsSwitchWatched, and either MinorRange is empty, MinorRange
// starts with -1 (the same wildcard VerifySwitchDevicePresence honours), or
// linkId appears in MinorRange.
//
// Earlier behaviour ignored switchId and let the first discovered switch
// decide the result for every link.
func IsLinkWatched(linkId uint, switchId uint, sysInfo SystemInfo) bool {
	if sysInfo.sOpt.Flex {
		return true
	}

	// The link can only be watched if its parent switch is known.
	known := false
	for idx := range sysInfo.Switches {
		if sysInfo.Switches[idx].EntityId == switchId {
			known = true
			break
		}
	}
	if !known || !IsSwitchWatched(switchId, sysInfo) {
		return false
	}

	requested := sysInfo.sOpt.MinorRange
	if len(requested) == 0 || requested[0] == -1 {
		return true
	}

	for _, id := range requested {
		// Negative values are never valid link indexes; skip them instead of
		// letting them wrap around in the uint conversion.
		if id >= 0 && uint(id) == linkId {
			return true
		}
	}
	return false
}
// AddAllGpuInstances returns one FE_GPU_I MonitoringInfo for every GPU
// instance on the first GpuCount GPUs in sysInfo. Each entry's InstanceInfo
// points at the stored GpuInstanceInfo element.
//
// When addFlexibly is true, a GPU that has no GPU instances contributes a
// single FE_GPU entry for the GPU itself; when it is false such a GPU
// contributes nothing.
func AddAllGpuInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo {
	var monitoring []MonitoringInfo
	for idx := uint(0); idx < sysInfo.GpuCount; idx++ {
		device := sysInfo.Gpus[idx].DeviceInfo
		instances := sysInfo.Gpus[idx].GpuInstances

		if addFlexibly && len(instances) == 0 {
			// No instances: fall back to monitoring the whole GPU.
			monitoring = append(monitoring, MonitoringInfo{
				Entity:       dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: device.GPU},
				DeviceInfo:   device,
				InstanceInfo: nil,
				ParentId:     PARENT_ID_IGNORED,
			})
			continue
		}

		for k := range instances {
			monitoring = append(monitoring, MonitoringInfo{
				Entity:       dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: instances[k].EntityId},
				DeviceInfo:   device,
				InstanceInfo: &instances[k],
				ParentId:     PARENT_ID_IGNORED,
			})
		}
	}
	return monitoring
}
// GetMonitoringInfoForGpu returns a newly allocated FE_GPU MonitoringInfo for
// the first of the GpuCount GPUs whose DeviceInfo.GPU equals gpuId (converted
// to uint), or nil when no such GPU exists.
func GetMonitoringInfoForGpu(sysInfo SystemInfo, gpuId int) *MonitoringInfo {
	wanted := uint(gpuId)
	for idx := uint(0); idx < sysInfo.GpuCount; idx++ {
		device := sysInfo.Gpus[idx].DeviceInfo
		if device.GPU != wanted {
			continue
		}
		return &MonitoringInfo{
			Entity:       dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: device.GPU},
			DeviceInfo:   device,
			InstanceInfo: nil,
			ParentId:     PARENT_ID_IGNORED,
		}
	}
	return nil
}
// GetMonitoringInfoForGpuInstance returns a newly allocated FE_GPU_I
// MonitoringInfo for the first GPU instance whose EntityId equals
// gpuInstanceId (converted to uint), or nil when no such instance exists.
//
// The returned InstanceInfo points at a copy of the matching GpuInstanceInfo,
// not at the element stored in sysInfo.
func GetMonitoringInfoForGpuInstance(sysInfo SystemInfo, gpuInstanceId int) *MonitoringInfo {
	wanted := uint(gpuInstanceId)
	for gpuIdx := uint(0); gpuIdx < sysInfo.GpuCount; gpuIdx++ {
		instances := sysInfo.Gpus[gpuIdx].GpuInstances
		for k := range instances {
			if instances[k].EntityId != wanted {
				continue
			}
			// Deliberately take a copy so the result does not alias sysInfo.
			match := instances[k]
			return &MonitoringInfo{
				Entity:       dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: wanted},
				DeviceInfo:   sysInfo.Gpus[gpuIdx].DeviceInfo,
				InstanceInfo: &match,
				ParentId:     PARENT_ID_IGNORED,
			}
		}
	}
	return nil
}
// GetMonitoredEntities returns the entities that should be monitored for
// sysInfo.
//
//   - InfoType FE_SWITCH: the result of AddAllSwitches.
//   - InfoType FE_LINK: the result of AddAllLinks.
//   - Flexible GPU options: the result of AddAllGpuInstances(sysInfo, true).
//   - Otherwise: the GPUs selected by gOpt.MajorRange followed by the GPU
//     instances selected by gOpt.MinorRange, where a leading -1 in a range
//     selects everything of that kind.
//
// Ids that cannot be found in sysInfo are skipped. They used to be
// dereferenced unconditionally, which panicked whenever the options named an
// entity that was not discovered.
func GetMonitoredEntities(sysInfo SystemInfo) []MonitoringInfo {
	switch {
	case sysInfo.InfoType == dcgm.FE_SWITCH:
		return AddAllSwitches(sysInfo)
	case sysInfo.InfoType == dcgm.FE_LINK:
		return AddAllLinks(sysInfo)
	case sysInfo.gOpt.Flex:
		return AddAllGpuInstances(sysInfo, true)
	}

	var monitoring []MonitoringInfo

	if gpuIDs := sysInfo.gOpt.MajorRange; len(gpuIDs) > 0 && gpuIDs[0] == -1 {
		monitoring = AddAllGpus(sysInfo)
	} else {
		for _, gpuId := range gpuIDs {
			if mi := GetMonitoringInfoForGpu(sysInfo, gpuId); mi != nil {
				monitoring = append(monitoring, *mi)
			}
		}
	}

	if instanceIDs := sysInfo.gOpt.MinorRange; len(instanceIDs) > 0 && instanceIDs[0] == -1 {
		// NOTE(review): this assignment replaces any GPU entries collected
		// above rather than appending to them. Kept as-is to preserve the
		// existing output; confirm whether that is intended.
		monitoring = AddAllGpuInstances(sysInfo, false)
	} else {
		for _, gpuInstanceId := range instanceIDs {
			if mi := GetMonitoringInfoForGpuInstance(sysInfo, gpuInstanceId); mi != nil {
				monitoring = append(monitoring, *mi)
			}
		}
	}

	return monitoring
}
// GetGpuInstanceIdentifier returns "<gpu>-<gpuInstanceId>" where <gpu> is the
// DeviceInfo.GPU value of the first of the GpuCount GPUs whose UUID equals
// gpuuuid. It returns the empty string when no GPU has that UUID.
func GetGpuInstanceIdentifier(sysInfo SystemInfo, gpuuuid string, gpuInstanceId uint) string {
	for idx := uint(0); idx < sysInfo.GpuCount; idx++ {
		device := &sysInfo.Gpus[idx].DeviceInfo
		if device.UUID != gpuuuid {
			continue
		}
		return fmt.Sprintf("%d-%d", device.GPU, gpuInstanceId)
	}
	return ""
}