From 4179e26f1c2648aff7515b6e6badef5233e7f312 Mon Sep 17 00:00:00 2001
From: sunhong
Date: Tue, 5 Sep 2023 16:45:12 +0800
Subject: [PATCH] fix ignore

---
 .gitignore                |   1 -
 cmd/dcgm-exporter/main.go | 417 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 417 insertions(+), 1 deletion(-)
 create mode 100644 cmd/dcgm-exporter/main.go

diff --git a/.gitignore b/.gitignore
index 9c8dfe1..b954f55 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,5 @@
 *.swp
 *.swo
-dcgm-exporter
 !etc/
 !deployment/
 tags
diff --git a/cmd/dcgm-exporter/main.go b/cmd/dcgm-exporter/main.go
new file mode 100644
index 0000000..8a3eb05
--- /dev/null
+++ b/cmd/dcgm-exporter/main.go
@@ -0,0 +1,417 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"os"
+	"os/signal"
+	"strconv"
+	"strings"
+	"sync"
+	"syscall"
+	"text/template"
+	"time"
+
+	"github.com/NVIDIA/dcgm-exporter/pkg/dcgmexporter"
+
+	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
+	"github.com/sirupsen/logrus"
+	"github.com/urfave/cli/v2"
+)
+
+const (
+	FlexKey                = "f" // Monitor all GPUs if MIG is disabled or all GPU instances if MIG is enabled
+	MajorKey               = "g" // Monitor top-level entities: GPUs or NvSwitches
+	MinorKey               = "i" // Monitor sub-level entities: GPU instances/NvLinks - cannot be specified if MIG is disabled
+	undefinedConfigMapData = "none"
+)
+
+var (
+	// BuildVersion is used as the application version shown by the CLI.
+	BuildVersion = "Filled by the build system"
+
+	// Names of the command-line flags registered in main.
+	CLIFieldsFile          = "collectors"
+	CLIAddress             = "address"
+	CLICollectInterval     = "collect-interval"
+	CLIKubernetes          = "kubernetes"
+	CLIKubernetesGPUIDType = "kubernetes-gpu-id-type"
+	CLIUseOldNamespace     = "use-old-namespace"
+	CLIRemoteHEInfo        = "remote-hostengine-info"
+	CLIGPUDevices          = "devices"
+	CLISwitchDevices       = "switch-devices"
+	CLINoHostname          = "no-hostname"
+	CLIUseFakeGpus         = "fake-gpus"
+	CLIConfigMapData       = "configmap-data"
+)
+
+// main builds the command-line interface and hands control to Run.
+func main() {
+	c := cli.NewApp()
+	c.Name = "DCGM Exporter"
+	c.Usage = "Generates GPU metrics in the prometheus format"
+	c.Version = BuildVersion
+
+	deviceUsageTemplate := `Specify which devices dcgm-exporter monitors.
+	Possible values: {{.FlexKey}} or
+	                 {{.MajorKey}}[:id1[,-id2...] or
+	                 {{.MinorKey}}[:id1[,-id2...].
+	If an id list is used, then devices with match IDs must exist on the system. For example:
+		(default) = monitor all GPU instances in MIG mode, all GPUs if MIG mode is disabled. (See {{.FlexKey}})
+		{{.MajorKey}} = Monitor all GPUs
+		{{.MinorKey}} = Monitor all GPU instances
+		{{.FlexKey}} = Monitor all GPUs if MIG is disabled, or all GPU instances if MIG is enabled.
+			Note: this rule will be applied to each GPU. If it has GPU instances, those
+			      will be monitored. If it doesn't, then the GPU will be monitored.
+			      This is our recommended option for single or mixed MIG Strategies.
+		{{.MajorKey}}:0,1 = monitor GPUs 0 and 1
+		{{.MinorKey}}:0,2-4 = monitor GPU instances 0, 2, 3, and 4.
+
+	NOTE 1: -i cannot be specified unless MIG mode is enabled.
+	NOTE 2: Any time indices are specified, those indices must exist on the system.
+	NOTE 3: In MIG mode, only -f or -i with a range can be specified. GPUs are not assigned to pods
+	        and therefore reporting must occur at the GPU instance level.`
+
+	// The template is static, so an execution failure is a programming error;
+	// report it rather than silently showing empty help text.
+	var deviceUsageBuffer bytes.Buffer
+	t := template.Must(template.New("").Parse(deviceUsageTemplate))
+	usageKeys := map[string]string{"FlexKey": FlexKey, "MajorKey": MajorKey, "MinorKey": MinorKey}
+	if err := t.Execute(&deviceUsageBuffer, usageKeys); err != nil {
+		logrus.Fatal(err)
+	}
+	deviceUsageStr := deviceUsageBuffer.String()
+
+	c.Flags = []cli.Flag{
+		&cli.StringFlag{
+			Name:    CLIFieldsFile,
+			Aliases: []string{"f"},
+			Usage:   "Path to the file, that contains the DCGM fields to collect",
+			Value:   "/etc/dcgm-exporter/default-counters.csv",
+			EnvVars: []string{"DCGM_EXPORTER_COLLECTORS"},
+		},
+		&cli.StringFlag{
+			Name:    CLIAddress,
+			Aliases: []string{"a"},
+			Value:   ":9400",
+			Usage:   "Address",
+			EnvVars: []string{"DCGM_EXPORTER_LISTEN"},
+		},
+		&cli.IntFlag{
+			Name:    CLICollectInterval,
+			Aliases: []string{"c"},
+			Value:   30000,
+			Usage:   "Interval of time at which point metrics are collected. Unit is milliseconds (ms).",
+			EnvVars: []string{"DCGM_EXPORTER_INTERVAL"},
+		},
+		&cli.BoolFlag{
+			Name:    CLIKubernetes,
+			Aliases: []string{"k"},
+			Value:   false,
+			Usage:   "Enable kubernetes mapping metrics to kubernetes pods",
+			EnvVars: []string{"DCGM_EXPORTER_KUBERNETES"},
+		},
+		&cli.BoolFlag{
+			Name:    CLIUseOldNamespace,
+			Aliases: []string{"o"},
+			Value:   false,
+			Usage:   "Use old 1.x namespace",
+			EnvVars: []string{"DCGM_EXPORTER_USE_OLD_NAMESPACE"},
+		},
+		&cli.StringFlag{
+			Name:    CLIConfigMapData,
+			Aliases: []string{"m"},
+			Value:   undefinedConfigMapData,
+			Usage:   "ConfigMap <NAMESPACE>:<NAME> for metric data",
+			EnvVars: []string{"DCGM_EXPORTER_CONFIGMAP_DATA"},
+		},
+		&cli.StringFlag{
+			Name:    CLIRemoteHEInfo,
+			Aliases: []string{"r"},
+			Value:   "localhost:5555",
+			Usage:   "Connect to remote hostengine at <HOST>:<PORT>",
+			EnvVars: []string{"DCGM_REMOTE_HOSTENGINE_INFO"},
+		},
+		&cli.StringFlag{
+			Name:  CLIKubernetesGPUIDType,
+			Value: string(dcgmexporter.GPUUID),
+			Usage: fmt.Sprintf("Choose Type of GPU ID to use to map kubernetes resources to pods. Possible values: '%s', '%s'",
+				dcgmexporter.GPUUID, dcgmexporter.DeviceName),
+			EnvVars: []string{"DCGM_EXPORTER_KUBERNETES_GPU_ID_TYPE"},
+		},
+		&cli.StringFlag{
+			Name:    CLIGPUDevices,
+			Aliases: []string{"d"},
+			Value:   FlexKey,
+			Usage:   deviceUsageStr,
+			EnvVars: []string{"DCGM_EXPORTER_DEVICES_STR"},
+		},
+		&cli.BoolFlag{
+			Name:    CLINoHostname,
+			Aliases: []string{"n"},
+			Value:   false,
+			Usage:   "Omit the hostname information from the output, matching older versions.",
+			EnvVars: []string{"DCGM_EXPORTER_NO_HOSTNAME"},
+		},
+		&cli.StringFlag{
+			Name:    CLISwitchDevices,
+			Aliases: []string{"s"},
+			Value:   FlexKey,
+			Usage:   deviceUsageStr,
+			EnvVars: []string{"DCGM_EXPORTER_OTHER_DEVICES_STR"},
+		},
+		&cli.BoolFlag{
+			Name:    CLIUseFakeGpus,
+			Value:   false,
+			Usage:   "Accept GPUs that are fake, for testing purposes only",
+			EnvVars: []string{"DCGM_EXPORTER_USE_FAKE_GPUS"},
+		},
+	}
+
+	c.Action = Run
+
+	if err := c.Run(os.Args); err != nil {
+		logrus.Fatal(err)
+	}
+}
+
+// newOSWatcher returns a channel with a buffer of one on which the given
+// signals are delivered.
+func newOSWatcher(sigs ...os.Signal) chan os.Signal {
+	sigChan := make(chan os.Signal, 1)
+	signal.Notify(sigChan, sigs...)
+
+	return sigChan
+}
+
+// Run starts the exporter and blocks until it is told to stop. SIGHUP restarts
+// the exporter from scratch; SIGINT, SIGTERM and SIGQUIT shut it down.
+//
+// Every (re)start happens in its own runOnce call so that the deferred
+// cleanups of one cycle have run before the next cycle initializes DCGM
+// again.
+func Run(c *cli.Context) error {
+	// Register once for the whole lifetime of Run: a signal that arrives while
+	// a restart is in progress stays queued instead of being lost.
+	sigs := newOSWatcher(syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT, syscall.SIGHUP)
+	defer signal.Stop(sigs)
+
+	for {
+		restart, err := runOnce(c, sigs)
+		if err != nil || !restart {
+			return err
+		}
+	}
+}
+
+// runOnce performs a single start-to-stop cycle of the exporter: it builds the
+// configuration from c, initializes DCGM, runs the metrics pipeline and the
+// metrics server, and waits for a signal on sigs. It reports whether that
+// signal asked for a restart (SIGHUP). Every cleanup function obtained during
+// the cycle has run by the time runOnce returns.
+func runOnce(c *cli.Context, sigs <-chan os.Signal) (bool, error) {
+	logrus.Info("Starting dcgm-exporter")
+	config, err := contextToConfig(c)
+	if err != nil {
+		return false, err
+	}
+
+	var cleanupDCGM func()
+	if config.UseRemoteHE {
+		logrus.Info("Attemping to connect to remote hostengine at ", config.RemoteHEInfo)
+		cleanupDCGM, err = dcgm.Init(dcgm.Standalone, config.RemoteHEInfo, "0")
+	} else {
+		cleanupDCGM, err = dcgm.Init(dcgm.Embedded)
+	}
+	// Guard against a nil cleanup function being returned alongside an error.
+	if cleanupDCGM != nil {
+		defer cleanupDCGM()
+	}
+	if err != nil {
+		return false, err
+	}
+	logrus.Info("DCGM successfully initialized!")
+
+	dcgm.FieldsInit()
+	defer dcgm.FieldsTerm()
+
+	groups, err := dcgm.GetSupportedMetricGroups(0)
+	if err != nil {
+		config.CollectDCP = false
+		logrus.Info("Not collecting DCP metrics: ", err)
+	} else {
+		logrus.Info("Collecting DCP Metrics")
+		config.MetricGroups = groups
+	}
+
+	ch := make(chan string, 10)
+	pipeline, cleanupPipeline, err := dcgmexporter.NewMetricsPipeline(config)
+	if cleanupPipeline != nil {
+		defer cleanupPipeline()
+	}
+	if err != nil {
+		return false, err
+	}
+
+	server, cleanupServer, err := dcgmexporter.NewMetricsServer(config, ch)
+	if cleanupServer != nil {
+		defer cleanupServer()
+	}
+	if err != nil {
+		return false, err
+	}
+
+	var wg sync.WaitGroup
+	stop := make(chan interface{})
+
+	wg.Add(1)
+	go pipeline.Run(ch, stop, &wg)
+
+	wg.Add(1)
+	go server.Run(stop, &wg)
+
+	sig := <-sigs
+	close(stop)
+	if err := dcgmexporter.WaitWithTimeout(&wg, time.Second*2); err != nil {
+		// The workers did not stop in time and may still be running, so exit
+		// immediately instead of running the deferred cleanups under them.
+		logrus.Fatal(err)
+	}
+
+	return sig == syscall.SIGHUP, nil
+}
+
+// parseDeviceIndices expands a comma-separated list of indices and inclusive
+// ranges, such as "0,2-4", into the individual indices (0, 2, 3, 4).
+func parseDeviceIndices(list string) ([]int, error) {
+	var indices []int
+	for _, numberOrRange := range strings.Split(list, ",") {
+		rangeTokens := strings.Split(numberOrRange, "-")
+		switch len(rangeTokens) {
+		case 1:
+			number, err := strconv.Atoi(rangeTokens[0])
+			if err != nil {
+				return nil, err
+			}
+			indices = append(indices, number)
+		case 2:
+			start, err := strconv.Atoi(rangeTokens[0])
+			if err != nil {
+				return nil, err
+			}
+			end, err := strconv.Atoi(rangeTokens[1])
+			if err != nil {
+				return nil, err
+			}
+			if start > end {
+				return nil, fmt.Errorf("Invalid range '%s': the start must not be greater than the end", numberOrRange)
+			}
+
+			// Add the range to the indices
+			for i := start; i <= end; i++ {
+				indices = append(indices, i)
+			}
+		default:
+			return nil, fmt.Errorf("A range can only be '<number>-<number>', but found '%s'", numberOrRange)
+		}
+	}
+
+	return indices, nil
+}
+
+// parseDeviceOptionsToken parses one device option of the form "f", "g",
+// "i", "g:<indices>" or "i:<indices>" and stores the result in dOpt. A major
+// or minor key without indices selects every device of that type, which is
+// recorded as the single index -1.
+func parseDeviceOptionsToken(token string, dOpt *dcgmexporter.DeviceOptions) error {
+	letterAndRange := strings.Split(token, ":")
+	count := len(letterAndRange)
+	if count > 2 {
+		return fmt.Errorf("Invalid ranged device option '%s': there can only be one specified range", token)
+	}
+
+	letter := letterAndRange[0]
+	switch letter {
+	case FlexKey:
+		dOpt.Flex = true
+		if count > 1 {
+			return fmt.Errorf("No range can be specified with the flex option 'f'")
+		}
+	case MajorKey, MinorKey:
+		// No range means all present devices of the type
+		indices := []int{-1}
+		if count > 1 {
+			var err error
+			indices, err = parseDeviceIndices(letterAndRange[1])
+			if err != nil {
+				return err
+			}
+		}
+
+		if letter == MajorKey {
+			dOpt.MajorRange = indices
+		} else {
+			dOpt.MinorRange = indices
+		}
+	default:
+		return fmt.Errorf("The only valid options preceding ':' are 'g' or 'i', but found '%s'", letter)
+	}
+
+	return nil
+}
+
+// parseDeviceOptions parses the value of a device selection flag into a
+// DeviceOptions value. See parseDeviceOptionsToken for the accepted syntax.
+func parseDeviceOptions(devices string) (dcgmexporter.DeviceOptions, error) {
+	var dOpt dcgmexporter.DeviceOptions
+	err := parseDeviceOptionsToken(devices, &dOpt)
+
+	return dOpt, err
+}
+
+// contextToConfig translates the parsed command-line flags into the exporter
+// configuration.
+func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) {
+	gOpt, err := parseDeviceOptions(c.String(CLIGPUDevices))
+	if err != nil {
+		return nil, err
+	}
+
+	sOpt, err := parseDeviceOptions(c.String(CLISwitchDevices))
+	if err != nil {
+		return nil, err
+	}
+
+	return &dcgmexporter.Config{
+		CollectorsFile:      c.String(CLIFieldsFile),
+		Address:             c.String(CLIAddress),
+		CollectInterval:     c.Int(CLICollectInterval),
+		Kubernetes:          c.Bool(CLIKubernetes),
+		KubernetesGPUIdType: dcgmexporter.KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)),
+		CollectDCP:          true,
+		UseOldNamespace:     c.Bool(CLIUseOldNamespace),
+		UseRemoteHE:         c.IsSet(CLIRemoteHEInfo),
+		RemoteHEInfo:        c.String(CLIRemoteHEInfo),
+		GPUDevices:          gOpt,
+		SwitchDevices:       sOpt,
+		NoHostname:          c.Bool(CLINoHostname),
+		UseFakeGpus:         c.Bool(CLIUseFakeGpus),
+		ConfigMapData:       c.String(CLIConfigMapData),
+	}, nil
+}