/*
 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package dcgmexporter

import (
	"bytes"
	"fmt"
	"sync"
	"text/template"
	"time"

	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
	"github.com/sirupsen/logrus"
)

func NewMetricsPipeline(c *Config) (*MetricsPipeline, func(), error) {
	counters, err := ExtractCounters(c)
	if err != nil {
		return nil, func() {}, err
	}

	cleanups := []func(){}

	gpuCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_GPU)
	if err != nil {
		return nil, func() {}, err
	}
	cleanups = append(cleanups, cleanup)

	switchCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_SWITCH)
	if err != nil {
		logrus.Info("Not collecting switch metrics: ", err)
	} else {
		cleanups = append(cleanups, cleanup)
	}

	linkCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_LINK)
	if err != nil {
		logrus.Info("Not collecting link metrics: ", err)
	} else {
		cleanups = append(cleanups, cleanup)
	}

	transformations := []Transform{}
	if c.Kubernetes {
		podMapper, err := NewPodMapper(c)
		if err != nil {
			logrus.Warnf("Could not enable Kubernetes metric collection: %v", err)
		} else {
			transformations = append(transformations, podMapper)
		}
	}

	return &MetricsPipeline{
			config: c,

			migMetricsFormat:    template.Must(template.New("migMetrics").Parse(migMetricsFormat)),
			switchMetricsFormat: template.Must(template.New("switchMetrics").Parse(switchMetricsFormat)),
			linkMetricsFormat:   template.Must(template.New("linkMetrics").Parse(linkMetricsFormat)),

			counters:        counters,
			gpuCollector:    gpuCollector,
			switchCollector: switchCollector,
			linkCollector:   linkCollector,
			transformations: transformations,
		}, func() {
			// Release every collector that was successfully created, not just the last one.
			for _, cleanup := range cleanups {
				cleanup()
			}
		}, nil
}

// Primarily for testing; the caller is expected to clean up the collector.
func NewMetricsPipelineWithGPUCollector(c *Config, collector *DCGMCollector) (*MetricsPipeline, func(), error) {
	return &MetricsPipeline{
			config: c,

			migMetricsFormat:    template.Must(template.New("migMetrics").Parse(migMetricsFormat)),
			switchMetricsFormat: template.Must(template.New("switchMetrics").Parse(switchMetricsFormat)),
			linkMetricsFormat:   template.Must(template.New("linkMetrics").Parse(linkMetricsFormat)),

			counters:     collector.Counters,
			gpuCollector: collector,
		}, func() {}, nil
}

func (m *MetricsPipeline) Run(out chan string, stop chan interface{}, wg *sync.WaitGroup) {
	defer wg.Done()

	logrus.Info("Pipeline starting")

	// A ticker is used so that collection sticks as close as possible to the
	// configured interval. For example, if CollectInterval is 10s and the
	// transformation pipeline takes 5s, the ticker still fires 5s after the
	// run function completes, so metrics really are collected every 10s.
	t := time.NewTicker(time.Millisecond * time.Duration(m.config.CollectInterval))
	defer t.Stop()

	for {
		select {
		case <-stop:
			return
		case <-t.C:
			o, err := m.run()
			if err != nil {
				logrus.Errorf("Failed to collect metrics with error: %v", err)
				/* Flush the output rather than publish stale data. */
				out <- ""
				continue
			}

			if len(out) == cap(out) {
				logrus.Error("Output channel is full, skipping")
			} else {
				out <- o
			}
		}
	}
}
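
// Usage sketch (illustrative only): how a caller might wire the pipeline
// together. It assumes the caller already holds a valid *Config named cfg;
// the channel size and the consumer are hypothetical.
//
//	out := make(chan string, 10)
//	stop := make(chan interface{})
//
//	pipeline, cleanup, err := NewMetricsPipeline(cfg)
//	if err != nil {
//		logrus.Fatal(err)
//	}
//	defer cleanup()
//
//	var wg sync.WaitGroup
//	wg.Add(1)
//	go pipeline.Run(out, stop, &wg)
//
//	// ... serve values received from out to scrapers, then shut down:
//	close(stop)
//	wg.Wait()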

func (m *MetricsPipeline) run() (string, error) {
	/* Collect GPU metrics */
	metrics, err := m.gpuCollector.GetMetrics()
	if err != nil {
		return "", fmt.Errorf("Failed to collect gpu metrics with error: %v", err)
	}

	for _, transform := range m.transformations {
		err := transform.Process(metrics, m.gpuCollector.SysInfo)
		if err != nil {
			return "", fmt.Errorf("Failed to transform metrics for transform %s: %v", transform.Name(), err)
		}
	}

	formatted, err := FormatMetrics(m.migMetricsFormat, metrics)
	if err != nil {
		return "", fmt.Errorf("Failed to format metrics with error: %v", err)
	}

	if m.switchCollector != nil {
		/* Collect switch metrics */
		metrics, err = m.switchCollector.GetMetrics()
		if err != nil {
			return "", fmt.Errorf("Failed to collect switch metrics with error: %v", err)
		}

		if len(metrics) > 0 {
			switchFormatted, err := FormatMetrics(m.switchMetricsFormat, metrics)
			if err != nil {
				logrus.Warnf("Failed to format switch metrics with error: %v", err)
			}
			formatted = formatted + switchFormatted
		}
	}

	if m.linkCollector != nil {
		/* Collect link metrics */
		metrics, err = m.linkCollector.GetMetrics()
		if err != nil {
			return "", fmt.Errorf("Failed to collect link metrics with error: %v", err)
		}

		if len(metrics) > 0 {
			linkFormatted, err := FormatMetrics(m.linkMetricsFormat, metrics)
			if err != nil {
				logrus.Warnf("Failed to format link metrics with error: %v", err)
			}
			formatted = formatted + linkFormatted
		}
	}

	return formatted, nil
}

/*
 * The goal here is to get to the following format:
 * ```
 * # HELP FIELD_ID HELP_MSG
 * # TYPE FIELD_ID PROM_TYPE
 * FIELD_ID{gpu="GPU_INDEX_0",uuid="GPU_UUID", attr...} VALUE
 * FIELD_ID{gpu="GPU_INDEX_N",uuid="GPU_UUID", attr...} VALUE
 * ...
 * ```
 */
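
/*
 * For a single counter, the rendered output would look roughly like the
 * following (field name, help text, and values are illustrative):
 *
 * # HELP DCGM_FI_DEV_GPU_UTIL GPU utilization (in %).
 * # TYPE DCGM_FI_DEV_GPU_UTIL gauge
 * DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-xxxxxxxx",device="nvidia0",modelName="NVIDIA A100"} 93
 */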

var migMetricsFormat = `
{{- range $counter, $metrics := . -}}
# HELP {{ $counter.FieldName }} {{ $counter.Help }}
# TYPE {{ $counter.FieldName }} {{ $counter.PromType }}
{{- range $metric := $metrics }}
{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}}

{{- range $k, $v := $metric.Labels -}}
,{{ $k }}="{{ $v }}"
{{- end -}}
{{- range $k, $v := $metric.Attributes -}}
,{{ $k }}="{{ $v }}"
{{- end -}}

} {{ $metric.Value -}}
{{- end }}
{{ end }}`

var switchMetricsFormat = `
{{- range $counter, $metrics := . -}}
# HELP {{ $counter.FieldName }} {{ $counter.Help }}
# TYPE {{ $counter.FieldName }} {{ $counter.PromType }}
{{- range $metric := $metrics }}
{{ $counter.FieldName }}{nvswitch="{{ $metric.GPU }}"{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}}

{{- range $k, $v := $metric.Labels -}}
,{{ $k }}="{{ $v }}"
{{- end -}}
} {{ $metric.Value -}}
{{- end }}
{{ end }}`

var linkMetricsFormat = `
{{- range $counter, $metrics := . -}}
# HELP {{ $counter.FieldName }} {{ $counter.Help }}
# TYPE {{ $counter.FieldName }} {{ $counter.PromType }}
{{- range $metric := $metrics }}
{{ $counter.FieldName }}{nvlink="{{ $metric.GPU }}",nvswitch="{{ $metric.GPUDevice }}"{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}}

{{- range $k, $v := $metric.Labels -}}
,{{ $k }}="{{ $v }}"
{{- end -}}
} {{ $metric.Value -}}
{{- end }}
{{ end }}`
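
// For reference, the switch and link templates key each metric by NVSwitch or
// NVLink index rather than by GPU. With hypothetical counters named
// SWITCH_FIELD and LINK_FIELD, the rendered lines would look roughly like
// (values illustrative):
//
//	SWITCH_FIELD{nvswitch="0",Hostname="node-1"} 42
//	LINK_FIELD{nvlink="3",nvswitch="0",Hostname="node-1"} 7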

// The template is passed in so that it isn't recompiled on every iteration.
func FormatMetrics(t *template.Template, m [][]Metric) (string, error) {
	// Group metrics by counter instead of by device.
	groupedMetrics := make(map[*Counter][]Metric)
	for _, deviceMetrics := range m {
		for _, deviceMetric := range deviceMetrics {
			groupedMetrics[deviceMetric.Counter] = append(groupedMetrics[deviceMetric.Counter], deviceMetric)
		}
	}

	// Format metrics
	var res bytes.Buffer
	if err := t.Execute(&res, groupedMetrics); err != nil {
		return "", err
	}

	return res.String(), nil
}
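
// Illustrative sketch (not part of the exporter): exercising FormatMetrics
// directly, assuming two hypothetical Metric values m0 and m1 that reference
// the same *Counter. Both end up under a single # HELP/# TYPE block because
// metrics are grouped by counter before the template runs.
//
//	tmpl := template.Must(template.New("migMetrics").Parse(migMetricsFormat))
//	out, err := FormatMetrics(tmpl, [][]Metric{{m0}, {m1}})
//	if err != nil {
//		logrus.Fatal(err)
//	}
//	fmt.Print(out)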