/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dcgmexporter
import (
"bytes"
"fmt"
"sync"
"text/template"
"time"
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
"github.com/sirupsen/logrus"
)
// NewMetricsPipeline builds the full metrics pipeline: it extracts the
// configured counters and creates GPU, NvSwitch and NvLink collectors.
// Switch/link collector failures are non-fatal (logged and skipped) since
// not all systems have NvSwitch hardware. It returns the pipeline, a cleanup
// function that tears down every collector that was successfully created,
// and an error if counter extraction or GPU collector creation fails.
func NewMetricsPipeline(c *Config) (*MetricsPipeline, func(), error) {
	counters, err := ExtractCounters(c)
	if err != nil {
		return nil, func() {}, err
	}

	// Accumulate one cleanup per successfully created collector so the
	// returned closure releases all of them, not just the last one.
	cleanups := []func(){}

	gpuCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_GPU)
	if err != nil {
		return nil, func() {}, err
	}
	cleanups = append(cleanups, cleanup)

	switchCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_SWITCH)
	if err != nil {
		logrus.Info("Not collecting switch metrics: ", err)
	} else {
		cleanups = append(cleanups, cleanup)
	}

	linkCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_LINK)
	if err != nil {
		logrus.Info("Not collecting link metrics: ", err)
	} else {
		cleanups = append(cleanups, cleanup)
	}

	transformations := []Transform{}
	if c.Kubernetes {
		podMapper, err := NewPodMapper(c)
		if err != nil {
			logrus.Warnf("Could not enable kubernetes metric collection: %v", err)
		} else {
			transformations = append(transformations, podMapper)
		}
	}

	return &MetricsPipeline{
		config: c,
		migMetricsFormat:    template.Must(template.New("migMetrics").Parse(migMetricsFormat)),
		switchMetricsFormat: template.Must(template.New("switchMetrics").Parse(switchMetricsFormat)),
		linkMetricsFormat:   template.Must(template.New("linkMetrics").Parse(linkMetricsFormat)),
		counters:        counters,
		gpuCollector:    gpuCollector,
		switchCollector: switchCollector,
		linkCollector:   linkCollector,
		transformations: transformations,
	}, func() {
		for _, cleanup := range cleanups {
			cleanup()
		}
	}, nil
}
// NewMetricsPipelineWithGPUCollector builds a pipeline around an existing GPU
// collector. Primarily for testing; the caller is expected to clean up the
// collector (the returned cleanup function is a no-op).
func NewMetricsPipelineWithGPUCollector(c *Config, collector *DCGMCollector) (*MetricsPipeline, func(), error) {
	return &MetricsPipeline{
		config:              c,
		migMetricsFormat:    template.Must(template.New("migMetrics").Parse(migMetricsFormat)),
		switchMetricsFormat: template.Must(template.New("switchMetrics").Parse(switchMetricsFormat)),
		// Fixed: previously this template was also named "switchMetrics".
		linkMetricsFormat: template.Must(template.New("linkMetrics").Parse(linkMetricsFormat)),
		counters:          collector.Counters,
		gpuCollector:      collector,
	}, func() {}, nil
}
// Run drives the collection loop until stop is closed, pushing formatted
// metrics onto out once per configured interval. It signals wg on exit.
func (m *MetricsPipeline) Run(out chan string, stop chan interface{}, wg *sync.WaitGroup) {
	defer wg.Done()
	logrus.Info("Pipeline starting")

	// A ticker keeps collections aligned to the configured interval even when
	// a collection pass itself takes time: e.g. with a 10s interval and a 5s
	// pipeline, the next tick still fires 10s after the previous one.
	ticker := time.NewTicker(time.Millisecond * time.Duration(m.config.CollectInterval))
	defer ticker.Stop()

	for {
		select {
		case <-stop:
			return
		case <-ticker.C:
			formatted, err := m.run()
			if err != nil {
				logrus.Errorf("Failed to collect metrics with error: %v", err)
				/* flush output rather than output stale data */
				out <- ""
				continue
			}
			// Drop the sample instead of blocking when the consumer lags.
			if len(out) == cap(out) {
				logrus.Errorf("Channel is full skipping")
				continue
			}
			out <- formatted
		}
	}
}
// run performs one collection pass: gather GPU metrics, apply transforms
// (e.g. Kubernetes pod mapping), format them, then append switch and link
// metrics when those collectors are available. Switch/link formatting errors
// are logged rather than returned, matching the best-effort handling of
// those collectors at construction time.
func (m *MetricsPipeline) run() (string, error) {
	/* Collect GPU Metrics */
	metrics, err := m.gpuCollector.GetMetrics()
	if err != nil {
		return "", fmt.Errorf("Failed to collect gpu metrics with error: %v", err)
	}

	for _, transform := range m.transformations {
		err := transform.Process(metrics, m.gpuCollector.SysInfo)
		if err != nil {
			// Fixed: arguments were previously swapped (err fed into %s,
			// the transform name into %v), garbling the message.
			return "", fmt.Errorf("Failed to transform metrics for transform %s: %v", transform.Name(), err)
		}
	}

	formated, err := FormatMetrics(m.migMetricsFormat, metrics)
	if err != nil {
		return "", fmt.Errorf("Failed to format metrics with error: %v", err)
	}

	if m.switchCollector != nil {
		/* Collect Switch Metrics */
		metrics, err = m.switchCollector.GetMetrics()
		if err != nil {
			return "", fmt.Errorf("Failed to collect switch metrics with error: %v", err)
		}
		if len(metrics) > 0 {
			switchFormated, err := FormatMetrics(m.switchMetricsFormat, metrics)
			if err != nil {
				logrus.Warnf("Failed to format switch metrics with error: %v", err)
			}
			formated = formated + switchFormated
		}
	}

	if m.linkCollector != nil {
		/* Collect Link Metrics */
		metrics, err = m.linkCollector.GetMetrics()
		if err != nil {
			return "", fmt.Errorf("Failed to collect link metrics with error: %v", err)
		}
		if len(metrics) > 0 {
			linkFormated, err := FormatMetrics(m.linkMetricsFormat, metrics)
			if err != nil {
				logrus.Warnf("Failed to format link metrics with error: %v", err)
			}
			formated = formated + linkFormated
		}
	}

	return formated, nil
}
/*
* The goal here is to get to the following format:
* ```
* # HELP FIELD_ID HELP_MSG
* # TYPE FIELD_ID PROM_TYPE
* FIELD_ID{gpu="GPU_INDEX_0",uuid="GPU_UUID", attr...} VALUE
* FIELD_ID{gpu="GPU_INDEX_N",uuid="GPU_UUID", attr...} VALUE
* ...
* ```
*/
// migMetricsFormat renders GPU metrics in Prometheus exposition format: a
// HELP/TYPE header per counter, then one sample line per device carrying
// gpu/uuid/device/modelName labels, optional MIG profile/instance labels,
// an optional Hostname label, and any user-defined labels and attributes.
var migMetricsFormat = `
{{- range $counter, $metrics := . -}}
# HELP {{ $counter.FieldName }} {{ $counter.Help }}
# TYPE {{ $counter.FieldName }} {{ $counter.PromType }}
{{- range $metric := $metrics }}
{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}}
{{- range $k, $v := $metric.Labels -}}
	,{{ $k }}="{{ $v }}"
{{- end -}}
{{- range $k, $v := $metric.Attributes -}}
	,{{ $k }}="{{ $v }}"
{{- end -}}
} {{ $metric.Value -}}
{{- end }}
{{ end }}`
// switchMetricsFormat renders NvSwitch metrics: same HELP/TYPE structure as
// migMetricsFormat, but samples are keyed by an nvswitch label (the switch
// index is carried in the metric's GPU field) plus optional Hostname and
// user-defined labels.
var switchMetricsFormat = `
{{- range $counter, $metrics := . -}}
# HELP {{ $counter.FieldName }} {{ $counter.Help }}
# TYPE {{ $counter.FieldName }} {{ $counter.PromType }}
{{- range $metric := $metrics }}
{{ $counter.FieldName }}{nvswitch="{{ $metric.GPU }}"{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}}
{{- range $k, $v := $metric.Labels -}}
	,{{ $k }}="{{ $v }}"
{{- end -}}
} {{ $metric.Value -}}
{{- end }}
{{ end }}`
// linkMetricsFormat renders NvLink metrics: samples are keyed by an nvlink
// label (link index in the GPU field) and an nvswitch label (parent switch in
// the GPUDevice field), plus optional Hostname and user-defined labels.
var linkMetricsFormat = `
{{- range $counter, $metrics := . -}}
# HELP {{ $counter.FieldName }} {{ $counter.Help }}
# TYPE {{ $counter.FieldName }} {{ $counter.PromType }}
{{- range $metric := $metrics }}
{{ $counter.FieldName }}{nvlink="{{ $metric.GPU }}",nvswitch="{{ $metric.GPUDevice }}"{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}}
{{- range $k, $v := $metric.Labels -}}
	,{{ $k }}="{{ $v }}"
{{- end -}}
} {{ $metric.Value -}}
{{- end }}
{{ end }}`
// Template is passed here so that it isn't recompiled at each iteration
func FormatMetrics(t *template.Template, m [][]Metric) (string, error) {
// Group metrics by counter instead of by device
groupedMetrics := make(map[*Counter][]Metric)
for _, deviceMetrics := range m {
for _, deviceMetric := range deviceMetrics {
groupedMetrics[deviceMetric.Counter] = append(groupedMetrics[deviceMetric.Counter], deviceMetric)
}
}
// Format metrics
var res bytes.Buffer
if err := t.Execute(&res, groupedMetrics); err != nil {
return "", err
}
return res.String(), nil
}