You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
270 lines
6.7 KiB
Go
270 lines
6.7 KiB
Go
1 year ago
|
/*
 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
||
|
|
||
|
package dcgmexporter
|
||
|
|
||
|
import (
|
||
|
"fmt"
|
||
|
"os"
|
||
|
|
||
|
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
|
||
|
"github.com/sirupsen/logrus"
|
||
|
)
|
||
|
|
||
|
func NewDCGMCollector(c []Counter, config *Config, entityType dcgm.Field_Entity_Group) (*DCGMCollector, func(), error) {
|
||
|
sysInfo, err := InitializeSystemInfo(config.GPUDevices, config.SwitchDevices, config.UseFakeGpus, entityType)
|
||
|
if err != nil {
|
||
|
return nil, func() {}, err
|
||
|
}
|
||
|
|
||
|
hostname := ""
|
||
|
if config.NoHostname == false {
|
||
|
if nodeName := os.Getenv("NODE_NAME"); nodeName != "" {
|
||
|
hostname = nodeName
|
||
|
} else {
|
||
|
hostname, err = os.Hostname()
|
||
|
if err != nil {
|
||
|
return nil, func() {}, err
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
var deviceFields = NewDeviceFields(c, entityType)
|
||
|
|
||
|
if len(deviceFields) <= 0 {
|
||
|
return nil, func() {}, fmt.Errorf("No fields to watch for device type: %d", entityType)
|
||
|
}
|
||
|
|
||
|
collector := &DCGMCollector{
|
||
|
Counters: c,
|
||
|
DeviceFields: deviceFields,
|
||
|
UseOldNamespace: config.UseOldNamespace,
|
||
|
SysInfo: sysInfo,
|
||
|
Hostname: hostname,
|
||
|
}
|
||
|
|
||
|
cleanups, err := SetupDcgmFieldsWatch(collector.DeviceFields, sysInfo, int64(config.CollectInterval)*1000)
|
||
|
if err != nil {
|
||
|
logrus.Fatal("Failed to watch metrics: ", err)
|
||
|
}
|
||
|
|
||
|
collector.Cleanups = cleanups
|
||
|
|
||
|
return collector, func() { collector.Cleanup() }, nil
|
||
|
}
|
||
|
|
||
|
func (c *DCGMCollector) Cleanup() {
|
||
|
for _, c := range c.Cleanups {
|
||
|
c()
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (c *DCGMCollector) GetMetrics() ([][]Metric, error) {
|
||
|
monitoringInfo := GetMonitoredEntities(c.SysInfo)
|
||
|
count := len(monitoringInfo)
|
||
|
|
||
|
metrics := make([][]Metric, count)
|
||
|
|
||
|
for i, mi := range monitoringInfo {
|
||
|
var vals []dcgm.FieldValue_v1
|
||
|
var err error
|
||
|
if mi.Entity.EntityGroupId == dcgm.FE_LINK {
|
||
|
vals, err = dcgm.LinkGetLatestValues(mi.Entity.EntityId, mi.ParentId, c.DeviceFields)
|
||
|
} else {
|
||
|
vals, err = dcgm.EntityGetLatestValues(mi.Entity.EntityGroupId, mi.Entity.EntityId, c.DeviceFields)
|
||
|
}
|
||
|
|
||
|
if err != nil {
|
||
|
if derr, ok := err.(*dcgm.DcgmError); ok {
|
||
|
if derr.Code == dcgm.DCGM_ST_CONNECTION_NOT_VALID {
|
||
|
logrus.Fatal("Could not retrieve metrics: ", err)
|
||
|
}
|
||
|
}
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
// InstanceInfo will be nil for GPUs
|
||
|
if c.SysInfo.InfoType == dcgm.FE_SWITCH || c.SysInfo.InfoType == dcgm.FE_LINK {
|
||
|
metrics[i] = ToSwitchMetric(vals, c.Counters, mi, c.UseOldNamespace, c.Hostname)
|
||
|
} else {
|
||
|
metrics[i] = ToMetric(vals, c.Counters, mi.DeviceInfo, mi.InstanceInfo, c.UseOldNamespace, c.Hostname)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return metrics, nil
|
||
|
}
|
||
|
|
||
|
func FindCounterField(c []Counter, fieldId uint) (*Counter, error) {
|
||
|
for i := 0; i < len(c); i++ {
|
||
|
if uint(c[i].FieldID) == fieldId {
|
||
|
return &c[i], nil
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return &c[0], fmt.Errorf("Could not find corresponding counter")
|
||
|
}
|
||
|
|
||
|
func ToSwitchMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) []Metric {
|
||
|
var metrics []Metric
|
||
|
var labels = map[string]string{}
|
||
|
|
||
|
for _, val := range values {
|
||
|
v := ToString(val)
|
||
|
// Filter out counters with no value and ignored fields for this entity
|
||
|
|
||
|
counter, err := FindCounterField(c, val.FieldId)
|
||
|
if err != nil {
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
if counter.PromType == "label" {
|
||
|
labels[counter.FieldName] = v
|
||
|
continue
|
||
|
}
|
||
|
uuid := "UUID"
|
||
|
if useOld {
|
||
|
uuid = "uuid"
|
||
|
}
|
||
|
var m Metric
|
||
|
if v == SkipDCGMValue {
|
||
|
continue
|
||
|
} else {
|
||
|
m = Metric{
|
||
|
Counter: counter,
|
||
|
Value: v,
|
||
|
UUID: uuid,
|
||
|
GPU: fmt.Sprintf("%d", mi.Entity.EntityId),
|
||
|
GPUUUID: "",
|
||
|
GPUDevice: fmt.Sprintf("nvswitch%d", mi.ParentId),
|
||
|
GPUModelName: "",
|
||
|
Hostname: hostname,
|
||
|
Labels: &labels,
|
||
|
Attributes: nil,
|
||
|
}
|
||
|
}
|
||
|
metrics = append(metrics, m)
|
||
|
}
|
||
|
|
||
|
return metrics
|
||
|
}
|
||
|
|
||
|
func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceInfo *GpuInstanceInfo, useOld bool, hostname string) []Metric {
|
||
|
var metrics []Metric
|
||
|
var labels = map[string]string{}
|
||
|
|
||
|
for _, val := range values {
|
||
|
v := ToString(val)
|
||
|
// Filter out counters with no value and ignored fields for this entity
|
||
|
if v == SkipDCGMValue {
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
counter, err := FindCounterField(c, val.FieldId)
|
||
|
if err != nil {
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
if counter.PromType == "label" {
|
||
|
labels[counter.FieldName] = v
|
||
|
continue
|
||
|
}
|
||
|
uuid := "UUID"
|
||
|
if useOld {
|
||
|
uuid = "uuid"
|
||
|
}
|
||
|
m := Metric{
|
||
|
Counter: counter,
|
||
|
Value: v,
|
||
|
|
||
|
UUID: uuid,
|
||
|
GPU: fmt.Sprintf("%d", d.GPU),
|
||
|
GPUUUID: d.UUID,
|
||
|
GPUDevice: fmt.Sprintf("nvidia%d", d.GPU),
|
||
|
GPUModelName: d.Identifiers.Model,
|
||
|
Hostname: hostname,
|
||
|
|
||
|
Labels: &labels,
|
||
|
Attributes: map[string]string{},
|
||
|
}
|
||
|
if instanceInfo != nil {
|
||
|
m.MigProfile = instanceInfo.ProfileName
|
||
|
m.GPUInstanceID = fmt.Sprintf("%d", instanceInfo.Info.NvmlInstanceId)
|
||
|
} else {
|
||
|
m.MigProfile = ""
|
||
|
m.GPUInstanceID = ""
|
||
|
}
|
||
|
metrics = append(metrics, m)
|
||
|
}
|
||
|
|
||
|
return metrics
|
||
|
}
|
||
|
|
||
|
func ToString(value dcgm.FieldValue_v1) string {
|
||
|
switch value.FieldType {
|
||
|
case dcgm.DCGM_FT_INT64:
|
||
|
switch v := value.Int64(); v {
|
||
|
case dcgm.DCGM_FT_INT32_BLANK:
|
||
|
return SkipDCGMValue
|
||
|
case dcgm.DCGM_FT_INT32_NOT_FOUND:
|
||
|
return SkipDCGMValue
|
||
|
case dcgm.DCGM_FT_INT32_NOT_SUPPORTED:
|
||
|
return SkipDCGMValue
|
||
|
case dcgm.DCGM_FT_INT32_NOT_PERMISSIONED:
|
||
|
return SkipDCGMValue
|
||
|
case dcgm.DCGM_FT_INT64_BLANK:
|
||
|
return SkipDCGMValue
|
||
|
case dcgm.DCGM_FT_INT64_NOT_FOUND:
|
||
|
return SkipDCGMValue
|
||
|
case dcgm.DCGM_FT_INT64_NOT_SUPPORTED:
|
||
|
return SkipDCGMValue
|
||
|
case dcgm.DCGM_FT_INT64_NOT_PERMISSIONED:
|
||
|
return SkipDCGMValue
|
||
|
default:
|
||
|
return fmt.Sprintf("%d", value.Int64())
|
||
|
}
|
||
|
case dcgm.DCGM_FT_DOUBLE:
|
||
|
switch v := value.Float64(); v {
|
||
|
case dcgm.DCGM_FT_FP64_BLANK:
|
||
|
return SkipDCGMValue
|
||
|
case dcgm.DCGM_FT_FP64_NOT_FOUND:
|
||
|
return SkipDCGMValue
|
||
|
case dcgm.DCGM_FT_FP64_NOT_SUPPORTED:
|
||
|
return SkipDCGMValue
|
||
|
case dcgm.DCGM_FT_FP64_NOT_PERMISSIONED:
|
||
|
return SkipDCGMValue
|
||
|
default:
|
||
|
return fmt.Sprintf("%f", value.Float64())
|
||
|
}
|
||
|
case dcgm.DCGM_FT_STRING:
|
||
|
switch v := value.String(); v {
|
||
|
case dcgm.DCGM_FT_STR_BLANK:
|
||
|
return SkipDCGMValue
|
||
|
case dcgm.DCGM_FT_STR_NOT_FOUND:
|
||
|
return SkipDCGMValue
|
||
|
case dcgm.DCGM_FT_STR_NOT_SUPPORTED:
|
||
|
return SkipDCGMValue
|
||
|
case dcgm.DCGM_FT_STR_NOT_PERMISSIONED:
|
||
|
return SkipDCGMValue
|
||
|
default:
|
||
|
return v
|
||
|
}
|
||
|
default:
|
||
|
return FailedToConvert
|
||
|
}
|
||
|
|
||
|
return FailedToConvert
|
||
|
}
|