/*
 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package dcgmexporter

import (
	"bytes"
	"fmt"
	"sync"
	"text/template"
	"time"

	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
	"github.com/sirupsen/logrus"
)

func NewMetricsPipeline(c *Config) (*MetricsPipeline, func(), error) {
	counters, err := ExtractCounters(c)
	if err != nil {
		return nil, func() {}, err
	}

	gpuCollector, gpuCleanup, err := NewDCGMCollector(counters, c, dcgm.FE_GPU)
	if err != nil {
		return nil, func() {}, err
	}

	// The switch and link collectors are optional: not every node has
	// NVSwitches, so a failure here only disables those metrics.
	switchCollector, switchCleanup, err := NewDCGMCollector(counters, c, dcgm.FE_SWITCH)
	if err != nil {
		logrus.Info("Not collecting switch metrics: ", err)
	}

	linkCollector, linkCleanup, err := NewDCGMCollector(counters, c, dcgm.FE_LINK)
	if err != nil {
		logrus.Info("Not collecting link metrics: ", err)
	}

	// Tear down every collector that was successfully created.
	cleanup := func() {
		gpuCleanup()
		if switchCleanup != nil {
			switchCleanup()
		}
		if linkCleanup != nil {
			linkCleanup()
		}
	}

	transformations := []Transform{}
	if c.Kubernetes {
		podMapper, err := NewPodMapper(c)
		if err != nil {
			logrus.Warnf("Could not enable kubernetes metric collection: %v", err)
		} else {
			transformations = append(transformations, podMapper)
		}
	}

	return &MetricsPipeline{
		config: c,

		migMetricsFormat:    template.Must(template.New("migMetrics").Parse(migMetricsFormat)),
		switchMetricsFormat: template.Must(template.New("switchMetrics").Parse(switchMetricsFormat)),
		linkMetricsFormat:   template.Must(template.New("linkMetrics").Parse(linkMetricsFormat)),

		counters:        counters,
		gpuCollector:    gpuCollector,
		switchCollector: switchCollector,
		linkCollector:   linkCollector,
		transformations: transformations,
	}, cleanup, nil
}

// Primarily for testing; the caller is expected to clean up the collector.
func NewMetricsPipelineWithGPUCollector(c *Config, collector *DCGMCollector) (*MetricsPipeline, func(), error) {
	return &MetricsPipeline{
		config: c,

		migMetricsFormat:    template.Must(template.New("migMetrics").Parse(migMetricsFormat)),
		switchMetricsFormat: template.Must(template.New("switchMetrics").Parse(switchMetricsFormat)),
		linkMetricsFormat:   template.Must(template.New("linkMetrics").Parse(linkMetricsFormat)),

		counters:     collector.Counters,
		gpuCollector: collector,
	}, func() {}, nil
}
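// Typical wiring from the exporter's entry point, shown as an illustrative
// sketch (the channel capacity and error handling are the caller's choice,
// not mandated by this package):
//
//	pipeline, cleanup, err := NewMetricsPipeline(config)
//	if err != nil {
//		logrus.Fatal(err)
//	}
//	defer cleanup()
//
//	out := make(chan string, 10)
//	stop := make(chan interface{})
//	var wg sync.WaitGroup
//	wg.Add(1)
//	go pipeline.Run(out, stop, &wg)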
func (m *MetricsPipeline) Run(out chan string, stop chan interface{}, wg *sync.WaitGroup) {
	defer wg.Done()

	logrus.Info("Pipeline starting")

	// Note: we use a ticker so that we stick as close as possible to the
	// collect interval. For example, if CollectInterval is 10s and the
	// transformation pipeline takes 5s, the ticker ensures we still collect
	// metrics every 10s by firing 5s after the run function completes.
	t := time.NewTicker(time.Millisecond * time.Duration(m.config.CollectInterval))
	defer t.Stop()

	for {
		select {
		case <-stop:
			return
		case <-t.C:
			o, err := m.run()
			if err != nil {
				logrus.Errorf("Failed to collect metrics with error: %v", err)
				/* Flush the output rather than serving stale data. */
				out <- ""
				continue
			}

			if len(out) == cap(out) {
				logrus.Error("Channel is full, skipping")
			} else {
				out <- o
			}
		}
	}
}

func (m *MetricsPipeline) run() (string, error) {
	/* Collect GPU metrics. */
	metrics, err := m.gpuCollector.GetMetrics()
	if err != nil {
		return "", fmt.Errorf("failed to collect GPU metrics: %v", err)
	}

	for _, transform := range m.transformations {
		err := transform.Process(metrics, m.gpuCollector.SysInfo)
		if err != nil {
			return "", fmt.Errorf("failed to apply transform %s: %v", transform.Name(), err)
		}
	}

	formatted, err := FormatMetrics(m.migMetricsFormat, metrics)
	if err != nil {
		return "", fmt.Errorf("failed to format metrics: %v", err)
	}

	if m.switchCollector != nil {
		/* Collect switch metrics. */
		metrics, err = m.switchCollector.GetMetrics()
		if err != nil {
			return "", fmt.Errorf("failed to collect switch metrics: %v", err)
		}

		if len(metrics) > 0 {
			switchFormatted, err := FormatMetrics(m.switchMetricsFormat, metrics)
			if err != nil {
				logrus.Warnf("Failed to format switch metrics with error: %v", err)
			}
			formatted += switchFormatted
		}
	}

	if m.linkCollector != nil {
		/* Collect link metrics. */
		metrics, err = m.linkCollector.GetMetrics()
		if err != nil {
			return "", fmt.Errorf("failed to collect link metrics: %v", err)
		}

		if len(metrics) > 0 {
			linkFormatted, err := FormatMetrics(m.linkMetricsFormat, metrics)
			if err != nil {
				logrus.Warnf("Failed to format link metrics with error: %v", err)
			}
			formatted += linkFormatted
		}
	}

	return formatted, nil
}

/*
 * The goal here is to get to the following format:
 * ```
 * # HELP FIELD_ID HELP_MSG
 * # TYPE FIELD_ID PROM_TYPE
 * FIELD_ID{gpu="GPU_INDEX_0",uuid="GPU_UUID", attr...} VALUE
 * FIELD_ID{gpu="GPU_INDEX_N",uuid="GPU_UUID", attr...} VALUE
 * ...
 * ```
 */
var migMetricsFormat = `
{{- range $counter, $metrics := . -}}
# HELP {{ $counter.FieldName }} {{ $counter.Help }}
# TYPE {{ $counter.FieldName }} {{ $counter.PromType }}
{{- range $metric := $metrics }}
{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}}
{{- range $k, $v := $metric.Labels -}}
,{{ $k }}="{{ $v }}"
{{- end -}}
{{- range $k, $v := $metric.Attributes -}}
,{{ $k }}="{{ $v }}"
{{- end -}}
} {{ $metric.Value -}}
{{- end }}
{{ end }}`

var switchMetricsFormat = `
{{- range $counter, $metrics := . -}}
# HELP {{ $counter.FieldName }} {{ $counter.Help }}
# TYPE {{ $counter.FieldName }} {{ $counter.PromType }}
{{- range $metric := $metrics }}
{{ $counter.FieldName }}{nvswitch="{{ $metric.GPU }}"{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}}
{{- range $k, $v := $metric.Labels -}}
,{{ $k }}="{{ $v }}"
{{- end -}}
} {{ $metric.Value -}}
{{- end }}
{{ end }}`
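// The switch template above and the link template below follow the same
// shape as the GPU format, but key each series on the NVSwitch/NVLink
// identity labels instead, e.g. (illustrative):
//
//	FIELD_ID{nvswitch="SWITCH_INDEX", attr...} VALUE
//	FIELD_ID{nvlink="LINK_INDEX",nvswitch="SWITCH_INDEX", attr...} VALUE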
var linkMetricsFormat = `
{{- range $counter, $metrics := . -}}
# HELP {{ $counter.FieldName }} {{ $counter.Help }}
# TYPE {{ $counter.FieldName }} {{ $counter.PromType }}
{{- range $metric := $metrics }}
{{ $counter.FieldName }}{nvlink="{{ $metric.GPU }}",nvswitch="{{ $metric.GPUDevice }}"{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}}
{{- range $k, $v := $metric.Labels -}}
,{{ $k }}="{{ $v }}"
{{- end -}}
} {{ $metric.Value -}}
{{- end }}
{{ end }}`

// The template is passed in so that it isn't recompiled at each iteration.
func FormatMetrics(t *template.Template, m [][]Metric) (string, error) {
	// Group metrics by counter instead of by device.
	groupedMetrics := make(map[*Counter][]Metric)
	for _, deviceMetrics := range m {
		for _, deviceMetric := range deviceMetrics {
			groupedMetrics[deviceMetric.Counter] = append(groupedMetrics[deviceMetric.Counter], deviceMetric)
		}
	}

	// Format metrics.
	var res bytes.Buffer
	if err := t.Execute(&res, groupedMetrics); err != nil {
		return "", err
	}

	return res.String(), nil
}
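// For reference, an illustrative direct call to FormatMetrics. The Counter
// and Metric values below are made up, and only fields referenced by the
// templates are shown (Metric.UUID holds the label name used for the UUID
// attribute):
//
//	t := template.Must(template.New("migMetrics").Parse(migMetricsFormat))
//	metrics := [][]Metric{{{
//		Counter:      &Counter{FieldName: "DCGM_FI_DEV_GPU_TEMP", Help: "GPU temperature (C).", PromType: "gauge"},
//		GPU:          "0",
//		UUID:         "UUID",
//		GPUUUID:      "GPU-00000000-0000-0000-0000-000000000000",
//		GPUDevice:    "nvidia0",
//		GPUModelName: "NVIDIA A100",
//		Value:        "41",
//	}}}
//	out, err := FormatMetrics(t, metrics)
//	// out now holds one HELP/TYPE header followed by a single sample line.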