fix ignore
parent
af85ed20a2
commit
4179e26f1c
@ -1,6 +1,5 @@
|
||||
*.swp
|
||||
*.swo
|
||||
dcgm-exporter
|
||||
!etc/
|
||||
!deployment/
|
||||
tags
|
||||
|
@ -0,0 +1,427 @@
|
||||
/*
 * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/signal"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"text/template"
|
||||
"time"
|
||||
|
||||
"github.com/NVIDIA/dcgm-exporter/pkg/dcgmexporter"
|
||||
|
||||
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
|
||||
"github.com/sirupsen/logrus"
|
||||
"github.com/urfave/cli/v2"
|
||||
)
|
||||
|
||||
// Keys understood by the device-selection flags, plus the sentinel used as the
// default value of the configmap-data flag.
const (
	FlexKey  = "f" // Monitor all GPUs if MIG is disabled or all GPU instances if MIG is enabled
	MajorKey = "g" // Monitor top-level entities: GPUs or NvSwitches
	MinorKey = "i" // Monitor sub-level entities: GPU instances/NvLinks - cannot be specified if MIG is disabled

	// undefinedConfigMapData is the default of the configmap-data flag.
	undefinedConfigMapData = "none"
)
|
||||
|
||||
var (
	// BuildVersion is reported as the application version. The value here is
	// only a placeholder; presumably the build system overrides it (confirm
	// against the build scripts).
	BuildVersion = "Filled by the build system"

	// Long names of the command-line flags registered in main and read back
	// in contextToConfig.
	CLIFieldsFile          = "collectors"
	CLIAddress             = "address"
	CLICollectInterval     = "collect-interval"
	CLIKubernetes          = "kubernetes"
	CLIKubernetesGPUIDType = "kubernetes-gpu-id-type"
	CLIUseOldNamespace     = "use-old-namespace"
	CLIRemoteHEInfo        = "remote-hostengine-info"
	CLIGPUDevices          = "devices"
	CLISwitchDevices       = "switch-devices"
	CLINoHostname          = "no-hostname"
	CLIUseFakeGpus         = "fake-gpus"
	CLIConfigMapData       = "configmap-data"
)
|
||||
|
||||
func main() {
|
||||
c := cli.NewApp()
|
||||
c.Name = "DCGM Exporter"
|
||||
c.Usage = "Generates GPU metrics in the prometheus format"
|
||||
c.Version = BuildVersion
|
||||
|
||||
deviceUsageTemplate := `Specify which devices dcgm-exporter monitors.
|
||||
Possible values: {{.FlexKey}} or
|
||||
{{.MajorKey}}[:id1[,-id2...] or
|
||||
{{.MinorKey}}[:id1[,-id2...].
|
||||
If an id list is used, then devices with match IDs must exist on the system. For example:
|
||||
(default) = monitor all GPU instances in MIG mode, all GPUs if MIG mode is disabled. (See {{.FlexKey}})
|
||||
{{.MajorKey}} = Monitor all GPUs
|
||||
{{.MinorKey}} = Monitor all GPU instances
|
||||
{{.FlexKey}} = Monitor all GPUs if MIG is disabled, or all GPU instances if MIG is enabled.
|
||||
Note: this rule will be applied to each GPU. If it has GPU instances, those
|
||||
will be monitored. If it doesn't, then the GPU will be monitored.
|
||||
This is our recommended option for single or mixed MIG Strategies.
|
||||
{{.MajorKey}}:0,1 = monitor GPUs 0 and 1
|
||||
{{.MinorKey}}:0,2-4 = monitor GPU instances 0, 2, 3, and 4.
|
||||
|
||||
NOTE 1: -i cannot be specified unless MIG mode is enabled.
|
||||
NOTE 2: Any time indices are specified, those indices must exist on the system.
|
||||
NOTE 3: In MIG mode, only -f or -i with a range can be specified. GPUs are not assigned to pods
|
||||
and therefore reporting must occur at the GPU instance level.`
|
||||
|
||||
var deviceUsageBuffer bytes.Buffer
|
||||
t := template.Must(template.New("").Parse(deviceUsageTemplate))
|
||||
_ = t.Execute(&deviceUsageBuffer, map[string]string{"FlexKey": FlexKey, "MajorKey": MajorKey, "MinorKey": MinorKey})
|
||||
DeviceUsageStr := deviceUsageBuffer.String()
|
||||
|
||||
c.Flags = []cli.Flag{
|
||||
&cli.StringFlag{
|
||||
Name: CLIFieldsFile,
|
||||
Aliases: []string{"f"},
|
||||
Usage: "Path to the file, that contains the DCGM fields to collect",
|
||||
Value: "/etc/dcgm-exporter/default-counters.csv",
|
||||
EnvVars: []string{"DCGM_EXPORTER_COLLECTORS"},
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: CLIAddress,
|
||||
Aliases: []string{"a"},
|
||||
Value: ":9400",
|
||||
Usage: "Address",
|
||||
EnvVars: []string{"DCGM_EXPORTER_LISTEN"},
|
||||
},
|
||||
&cli.IntFlag{
|
||||
Name: CLICollectInterval,
|
||||
Aliases: []string{"c"},
|
||||
Value: 30000,
|
||||
Usage: "Interval of time at which point metrics are collected. Unit is milliseconds (ms).",
|
||||
EnvVars: []string{"DCGM_EXPORTER_INTERVAL"},
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: CLIKubernetes,
|
||||
Aliases: []string{"k"},
|
||||
Value: false,
|
||||
Usage: "Enable kubernetes mapping metrics to kubernetes pods",
|
||||
EnvVars: []string{"DCGM_EXPORTER_KUBERNETES"},
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: CLIUseOldNamespace,
|
||||
Aliases: []string{"o"},
|
||||
Value: false,
|
||||
Usage: "Use old 1.x namespace",
|
||||
EnvVars: []string{"DCGM_EXPORTER_USE_OLD_NAMESPACE"},
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: CLIConfigMapData,
|
||||
Aliases: []string{"m"},
|
||||
Value: undefinedConfigMapData,
|
||||
Usage: "ConfigMap <NAMESPACE>:<NAME> for metric data",
|
||||
EnvVars: []string{"DCGM_EXPORTER_CONFIGMAP_DATA"},
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: CLIRemoteHEInfo,
|
||||
Aliases: []string{"r"},
|
||||
Value: "localhost:5555",
|
||||
Usage: "Connect to remote hostengine at <HOST>:<PORT>",
|
||||
EnvVars: []string{"DCGM_REMOTE_HOSTENGINE_INFO"},
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: CLIKubernetesGPUIDType,
|
||||
Value: string(dcgmexporter.GPUUID),
|
||||
Usage: fmt.Sprintf("Choose Type of GPU ID to use to map kubernetes resources to pods. Possible values: '%s', '%s'",
|
||||
dcgmexporter.GPUUID, dcgmexporter.DeviceName),
|
||||
EnvVars: []string{"DCGM_EXPORTER_KUBERNETES_GPU_ID_TYPE"},
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: CLIGPUDevices,
|
||||
Aliases: []string{"d"},
|
||||
Value: FlexKey,
|
||||
Usage: DeviceUsageStr,
|
||||
EnvVars: []string{"DCGM_EXPORTER_DEVICES_STR"},
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: CLINoHostname,
|
||||
Aliases: []string{"n"},
|
||||
Value: false,
|
||||
Usage: "Omit the hostname information from the output, matching older versions.",
|
||||
EnvVars: []string{"DCGM_EXPORTER_NO_HOSTNAME"},
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: CLISwitchDevices,
|
||||
Aliases: []string{"s"},
|
||||
Value: FlexKey,
|
||||
Usage: DeviceUsageStr,
|
||||
EnvVars: []string{"DCGM_EXPORTER_OTHER_DEVICES_STR"},
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: CLIUseFakeGpus,
|
||||
Value: false,
|
||||
Usage: "Accept GPUs that are fake, for testing purposes only",
|
||||
EnvVars: []string{"DCGM_EXPORTER_USE_FAKE_GPUS"},
|
||||
},
|
||||
}
|
||||
|
||||
c.Action = func(c *cli.Context) error {
|
||||
return Run(c)
|
||||
}
|
||||
|
||||
if err := c.Run(os.Args); err != nil {
|
||||
logrus.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
// newOSWatcher returns a channel on which the given OS signals are delivered.
// The channel has room for one pending signal, so a signal that arrives while
// nobody is receiving is not lost.
func newOSWatcher(sigs ...os.Signal) chan os.Signal {
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, sigs...)

	return sigChan
}
|
||||
|
||||
func Run(c *cli.Context) error {
|
||||
restart:
|
||||
|
||||
logrus.Info("Starting dcgm-exporter")
|
||||
config, err := contextToConfig(c)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if config.UseRemoteHE {
|
||||
logrus.Info("Attemping to connect to remote hostengine at ", config.RemoteHEInfo)
|
||||
cleanup, err := dcgm.Init(dcgm.Standalone, config.RemoteHEInfo, "0")
|
||||
defer cleanup()
|
||||
if err != nil {
|
||||
logrus.Fatal(err)
|
||||
}
|
||||
} else {
|
||||
cleanup, err := dcgm.Init(dcgm.Embedded)
|
||||
defer cleanup()
|
||||
if err != nil {
|
||||
logrus.Fatal(err)
|
||||
}
|
||||
}
|
||||
logrus.Info("DCGM successfully initialized!")
|
||||
|
||||
dcgm.FieldsInit()
|
||||
defer dcgm.FieldsTerm()
|
||||
|
||||
var groups []dcgm.MetricGroup
|
||||
groups, err = dcgm.GetSupportedMetricGroups(0)
|
||||
if err != nil {
|
||||
config.CollectDCP = false
|
||||
logrus.Info("Not collecting DCP metrics: ", err)
|
||||
} else {
|
||||
logrus.Info("Collecting DCP Metrics")
|
||||
config.MetricGroups = groups
|
||||
}
|
||||
|
||||
ch := make(chan string, 10)
|
||||
pipeline, cleanup, err := dcgmexporter.NewMetricsPipeline(config)
|
||||
defer cleanup()
|
||||
if err != nil {
|
||||
logrus.Fatal(err)
|
||||
}
|
||||
|
||||
server, cleanup, err := dcgmexporter.NewMetricsServer(config, ch)
|
||||
defer cleanup()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
stop := make(chan interface{})
|
||||
|
||||
wg.Add(1)
|
||||
go pipeline.Run(ch, stop, &wg)
|
||||
|
||||
wg.Add(1)
|
||||
go server.Run(stop, &wg)
|
||||
|
||||
sigs := newOSWatcher(syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT, syscall.SIGHUP)
|
||||
for {
|
||||
select {
|
||||
case sig := <-sigs:
|
||||
close(stop)
|
||||
err := dcgmexporter.WaitWithTimeout(&wg, time.Second*2)
|
||||
if err != nil {
|
||||
logrus.Fatal(err)
|
||||
}
|
||||
|
||||
if sig == syscall.SIGHUP {
|
||||
goto restart
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func parseDeviceOptionsToken(token string, dOpt *dcgmexporter.DeviceOptions) error {
|
||||
letterAndRange := strings.Split(token, ":")
|
||||
count := len(letterAndRange)
|
||||
if count > 2 {
|
||||
return fmt.Errorf("Invalid ranged device option '%s': there can only be one specified range", token)
|
||||
}
|
||||
|
||||
letter := letterAndRange[0]
|
||||
if letter == FlexKey {
|
||||
dOpt.Flex = true
|
||||
if count > 1 {
|
||||
return fmt.Errorf("No range can be specified with the flex option 'f'")
|
||||
}
|
||||
} else if letter == MajorKey || letter == MinorKey {
|
||||
var indices []int
|
||||
if count == 1 {
|
||||
// No range means all present devices of the type
|
||||
indices = append(indices, -1)
|
||||
} else {
|
||||
numbers := strings.Split(letterAndRange[1], ",")
|
||||
for _, numberOrRange := range numbers {
|
||||
rangeTokens := strings.Split(numberOrRange, "-")
|
||||
rangeTokenCount := len(rangeTokens)
|
||||
if rangeTokenCount > 2 {
|
||||
return fmt.Errorf("A range can only be '<number>-<number>', but found '%s'", numberOrRange)
|
||||
} else if rangeTokenCount == 1 {
|
||||
number, err := strconv.Atoi(rangeTokens[0])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
indices = append(indices, number)
|
||||
} else {
|
||||
start, err := strconv.Atoi(rangeTokens[0])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
end, err := strconv.Atoi(rangeTokens[1])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Add the range to the indices
|
||||
for i := start; i <= end; i++ {
|
||||
indices = append(indices, i)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if letter == MajorKey {
|
||||
dOpt.MajorRange = indices
|
||||
} else {
|
||||
dOpt.MinorRange = indices
|
||||
}
|
||||
} else {
|
||||
return fmt.Errorf("The only valid options preceding ':<range>' are 'g' or 'i', but found '%s'", letter)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func parseDeviceOptions(devices string) (dcgmexporter.DeviceOptions, error) {
|
||||
var dOpt dcgmexporter.DeviceOptions
|
||||
|
||||
letterAndRange := strings.Split(devices, ":")
|
||||
count := len(letterAndRange)
|
||||
if count > 2 {
|
||||
return dOpt, fmt.Errorf("Invalid ranged device option '%s': there can only be one specified range", devices)
|
||||
}
|
||||
|
||||
letter := letterAndRange[0]
|
||||
if letter == FlexKey {
|
||||
dOpt.Flex = true
|
||||
if count > 1 {
|
||||
return dOpt, fmt.Errorf("No range can be specified with the flex option 'f'")
|
||||
}
|
||||
} else if letter == MajorKey || letter == MinorKey {
|
||||
var indices []int
|
||||
if count == 1 {
|
||||
// No range means all present devices of the type
|
||||
indices = append(indices, -1)
|
||||
} else {
|
||||
numbers := strings.Split(letterAndRange[1], ",")
|
||||
for _, numberOrRange := range numbers {
|
||||
rangeTokens := strings.Split(numberOrRange, "-")
|
||||
rangeTokenCount := len(rangeTokens)
|
||||
if rangeTokenCount > 2 {
|
||||
return dOpt, fmt.Errorf("A range can only be '<number>-<number>', but found '%s'", numberOrRange)
|
||||
} else if rangeTokenCount == 1 {
|
||||
number, err := strconv.Atoi(rangeTokens[0])
|
||||
if err != nil {
|
||||
return dOpt, err
|
||||
}
|
||||
indices = append(indices, number)
|
||||
} else {
|
||||
start, err := strconv.Atoi(rangeTokens[0])
|
||||
if err != nil {
|
||||
return dOpt, err
|
||||
}
|
||||
end, err := strconv.Atoi(rangeTokens[1])
|
||||
if err != nil {
|
||||
return dOpt, err
|
||||
}
|
||||
|
||||
// Add the range to the indices
|
||||
for i := start; i <= end; i++ {
|
||||
indices = append(indices, i)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if letter == MajorKey {
|
||||
dOpt.MajorRange = indices
|
||||
} else {
|
||||
dOpt.MinorRange = indices
|
||||
}
|
||||
} else {
|
||||
return dOpt, fmt.Errorf("The only valid options preceding ':<range>' are 'g' or 'i', but found '%s'", letter)
|
||||
}
|
||||
|
||||
return dOpt, nil
|
||||
}
|
||||
|
||||
func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) {
|
||||
gOpt, err := parseDeviceOptions(c.String(CLIGPUDevices))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
sOpt, err := parseDeviceOptions(c.String(CLISwitchDevices))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &dcgmexporter.Config{
|
||||
CollectorsFile: c.String(CLIFieldsFile),
|
||||
Address: c.String(CLIAddress),
|
||||
CollectInterval: c.Int(CLICollectInterval),
|
||||
Kubernetes: c.Bool(CLIKubernetes),
|
||||
KubernetesGPUIdType: dcgmexporter.KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)),
|
||||
CollectDCP: true,
|
||||
UseOldNamespace: c.Bool(CLIUseOldNamespace),
|
||||
UseRemoteHE: c.IsSet(CLIRemoteHEInfo),
|
||||
RemoteHEInfo: c.String(CLIRemoteHEInfo),
|
||||
GPUDevices: gOpt,
|
||||
SwitchDevices: sOpt,
|
||||
NoHostname: c.Bool(CLINoHostname),
|
||||
UseFakeGpus: c.Bool(CLIUseFakeGpus),
|
||||
ConfigMapData: c.String(CLIConfigMapData),
|
||||
}, nil
|
||||
}
|
Loading…
Reference in New Issue