You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

114 lines
3.1 KiB
Go

/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dcgmexporter
import (
"fmt"
"math/rand"
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
)
// NewGroup creates a group through dcgm.NewDefaultGroup, naming it
// "gpu-collector-group-<n>" where <n> is a random unsigned 64-bit number.
//
// It returns the group handle together with a cleanup function that calls
// dcgm.DestroyGroup on that handle. When creation fails, the zero
// GroupHandle, a no-op cleanup function and the error are returned, so the
// cleanup function is always safe to call.
func NewGroup() (dcgm.GroupHandle, func(), error) {
	groupName := fmt.Sprintf("gpu-collector-group-%d", rand.Uint64())

	handle, err := dcgm.NewDefaultGroup(groupName)
	if err != nil {
		noop := func() {}
		return dcgm.GroupHandle{}, noop, err
	}

	destroy := func() { dcgm.DestroyGroup(handle) }
	return handle, destroy, nil
}
// NewDeviceFields filters counters down to the field IDs that apply to
// entityType.
//
// A counter's field ID is kept when the entity level reported by
// dcgm.FieldGetById for that field is either entityType itself or
// dcgm.FE_NONE. In addition, when entityType is dcgm.FE_GPU, fields whose
// entity level is dcgm.FE_GPU_CI, dcgm.FE_GPU_I or dcgm.FE_VGPU are kept as
// well. The input order is preserved; the result is nil when nothing matches.
func NewDeviceFields(counters []Counter, entityType dcgm.Field_Entity_Group) []dcgm.Short {
	var selected []dcgm.Short

	for _, counter := range counters {
		level := dcgm.FieldGetById(counter.FieldID).EntityLevel

		// Fields declared at exactly the requested level, or at no level.
		direct := level == entityType || level == dcgm.FE_NONE
		// Levels that are additionally accepted when GPU fields are requested.
		gpuScoped := level == dcgm.FE_GPU_CI || level == dcgm.FE_GPU_I || level == dcgm.FE_VGPU

		if direct || (entityType == dcgm.FE_GPU && gpuScoped) {
			selected = append(selected, counter.FieldID)
		}
	}

	return selected
}
// NewFieldGroup creates a field group containing deviceFields through
// dcgm.FieldGroupCreate, naming it "gpu-collector-fieldgroup-<n>" where <n>
// is a random unsigned 64-bit number.
//
// It returns the field group handle together with a cleanup function that
// calls dcgm.FieldGroupDestroy on that handle. When creation fails, the zero
// FieldHandle, a no-op cleanup function and the error are returned, so the
// cleanup function is always safe to call.
func NewFieldGroup(deviceFields []dcgm.Short) (dcgm.FieldHandle, func(), error) {
	handle, err := dcgm.FieldGroupCreate(
		fmt.Sprintf("gpu-collector-fieldgroup-%d", rand.Uint64()),
		deviceFields,
	)
	if err != nil {
		noop := func() {}
		return dcgm.FieldHandle{}, noop, err
	}

	destroy := func() { dcgm.FieldGroupDestroy(handle) }
	return handle, destroy, nil
}
// WatchFieldGroup forwards to dcgm.WatchFieldsWithGroupEx for the given field
// group and entity group, passing updateFreq, maxKeepAge and maxKeepSamples
// through unchanged. It returns whatever error that call reports, or nil on
// success.
func WatchFieldGroup(group dcgm.GroupHandle, field dcgm.FieldHandle, updateFreq int64, maxKeepAge float64, maxKeepSamples int32) error {
	return dcgm.WatchFieldsWithGroupEx(field, group, updateFreq, maxKeepAge, maxKeepSamples)
}
// SetupDcgmFieldsWatch creates the entity group(s) described by sysInfo and,
// for each of them, a field group over deviceFields that is then watched via
// WatchFieldGroup at collectIntervalUsec.
//
// When sysInfo.InfoType is dcgm.FE_LINK the groups come from
// CreateLinkGroupsFromSystemInfo; otherwise a single group is obtained from
// CreateGroupFromSystemInfo.
//
// On success it returns the cleanup functions of every group and field group
// that was created, in creation order. On any failure, the cleanup functions
// collected so far are invoked and (nil, err) is returned.
func SetupDcgmFieldsWatch(deviceFields []dcgm.Short, sysInfo SystemInfo, collectIntervalUsec int64) ([]func(), error) {
	var (
		groups   []dcgm.GroupHandle
		cleanups []func()
	)

	// abort rolls back everything registered so far and reports cause.
	abort := func(cause error) ([]func(), error) {
		for _, undo := range cleanups {
			undo()
		}
		return nil, cause
	}

	if sysInfo.InfoType == dcgm.FE_LINK {
		/* one group per-nvswitch is created for nvlinks */
		var err error
		groups, cleanups, err = CreateLinkGroupsFromSystemInfo(sysInfo)
		if err != nil {
			return abort(err)
		}
	} else {
		// The error is handled right here, inside the branch. Previously a
		// := in this branch shadowed the function-level err, so a failure
		// was never seen by the check that followed and the function
		// returned (nil, nil).
		group, cleanup, err := CreateGroupFromSystemInfo(sysInfo)
		if err != nil {
			return abort(err)
		}
		groups = append(groups, group)
		cleanups = append(cleanups, cleanup)
	}

	for _, group := range groups {
		fieldGroup, cleanup, err := NewFieldGroup(deviceFields)
		if err != nil {
			return abort(err)
		}
		// Register before watching so a watch failure also destroys this
		// field group.
		cleanups = append(cleanups, cleanup)

		// maxKeepAge is 0.0 and maxKeepSamples is 1.
		if err := WatchFieldGroup(group, fieldGroup, collectIntervalUsec, 0.0, 1); err != nil {
			return abort(err)
		}
	}

	return cleanups, nil
}