add all
parent
24bdefa8e6
commit
af85ed20a2
@ -0,0 +1,9 @@
|
||||
**Please open your pull requests on the [GitLab repository](https://gitlab.com/nvidia/dcgm-exporter.git).**
|
||||
|
||||
_Make sure to complete the following items:_
|
||||
|
||||
- _A reference to a related issue._
|
||||
- _A small description of the changes proposed in the pull request._
|
||||
- _One commit per change and descriptive commit messages._
|
||||
- _Sign off your work following these [guidelines](https://gitlab.com/nvidia/dcgm-exporter/blob/master/CONTRIBUTING.md)._
|
||||
- _Test run of your changes._
|
@ -0,0 +1,25 @@
|
||||
# CI workflow: builds the exporter binary and checks gofmt formatting
# on every push to, and pull request against, the main branch.
name: Go

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:

  build:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v2

    - name: Set up Go
      uses: actions/setup-go@v2
      with:
        # Quoted on purpose: an unquoted version is a YAML float, so a later
        # bump to e.g. 1.20 would silently be read as 1.2.
        # NOTE(review): the Makefile sets GOLANG_VERSION to 1.18 for the
        # container builds; confirm whether CI should track that version.
        go-version: '1.17'

    - name: Build
      run: make binary

    - name: Lint
      run: make check-format
|
@ -0,0 +1,6 @@
|
||||
*.swp
|
||||
*.swo
|
||||
dcgm-exporter
|
||||
!etc/
|
||||
!deployment/
|
||||
tags
|
@ -0,0 +1,60 @@
|
||||
# Contribute to the DCGM-Exporter Project
|
||||
|
||||
Want to hack on the NVIDIA DCGM-Exporter Project? Awesome!
|
||||
We only require you to sign your work; the section below describes how.
|
||||
|
||||
## Sign your work
|
||||
|
||||
The sign-off is a simple line at the end of the explanation for the patch. Your
|
||||
signature certifies that you wrote the patch or otherwise have the right to pass
|
||||
it on as an open-source patch. The rules are pretty simple: if you can certify
|
||||
the below (from [developercertificate.org](http://developercertificate.org/)):
|
||||
|
||||
```
|
||||
Developer Certificate of Origin
|
||||
Version 1.1
|
||||
|
||||
Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
|
||||
1 Letterman Drive
|
||||
Suite D4700
|
||||
San Francisco, CA, 94129
|
||||
|
||||
Everyone is permitted to copy and distribute verbatim copies of this
|
||||
license document, but changing it is not allowed.
|
||||
|
||||
Developer's Certificate of Origin 1.1
|
||||
|
||||
By making a contribution to this project, I certify that:
|
||||
|
||||
(a) The contribution was created in whole or in part by me and I
|
||||
have the right to submit it under the open source license
|
||||
indicated in the file; or
|
||||
|
||||
(b) The contribution is based upon previous work that, to the best
|
||||
of my knowledge, is covered under an appropriate open source
|
||||
license and I have the right under that license to submit that
|
||||
work with modifications, whether created in whole or in part
|
||||
by me, under the same open source license (unless I am
|
||||
permitted to submit under a different license), as indicated
|
||||
in the file; or
|
||||
|
||||
(c) The contribution was provided directly to me by some other
|
||||
person who certified (a), (b) or (c) and I have not modified
|
||||
it.
|
||||
|
||||
(d) I understand and agree that this project and the contribution
|
||||
are public and that a record of the contribution (including all
|
||||
personal information I submit with it, including my sign-off) is
|
||||
maintained indefinitely and may be redistributed consistent with
|
||||
this project or the open source license(s) involved.
|
||||
```
|
||||
|
||||
Then you just add a line to every git commit message:
|
||||
|
||||
Signed-off-by: Joe Smith <joe.smith@email.com>
|
||||
|
||||
Use your real name (sorry, no pseudonyms or anonymous contributions.)
|
||||
|
||||
If you set your `user.name` and `user.email` git configs, you can sign your
|
||||
commit automatically with `git commit -s`.
|
||||
|
@ -0,0 +1,18 @@
|
||||
# Build image: CUDA base plus a Go toolchain and the DCGM packages.
FROM nvidia/cuda:11.4.1-base-ubuntu20.04

# NOTE(review): DCGM_VERSION is declared but not referenced anywhere below, so
# the datacenter-gpu-manager package is installed unpinned. Confirm whether the
# package should be pinned to this version.
ARG DCGM_VERSION
ENV DEBIAN_FRONTEND=noninteractive

# Register the NVIDIA apt repositories and unpack Go into /usr/local.
# curl is only needed for this step, so it is purged again afterwards.
# -f makes curl fail on an HTTP error instead of piping an error page into tar.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gnupg2 curl ca-certificates build-essential && \
    curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub | apt-key add - && \
    curl -fsSL https://storage.googleapis.com/golang/go1.16.5.linux-amd64.tar.gz | tar -v -C /usr/local -xz && \
    echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \
    echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \
    apt-get purge --autoremove -y curl \
    && rm -rf /var/lib/apt/lists/*

# Install DCGM from the repositories registered above; drop the apt lists in
# the same layer so they do not end up in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    datacenter-gpu-manager \
    && rm -rf /var/lib/apt/lists/*

# Make the Go toolchain unpacked above available on PATH.
ENV PATH=$PATH:/usr/local/go/bin
|
@ -0,0 +1,64 @@
|
||||
@Library(['shared-libs']) _
|
||||
|
||||
pipeline {
|
||||
|
||||
agent {
|
||||
dockerfile {
|
||||
label 'docker'
|
||||
filename 'Dockerfile'
|
||||
args '-v /etc/passwd:/etc/passwd:ro -v /var/run/docker.sock:/var/run/docker.sock:rw'
|
||||
}
|
||||
}
|
||||
|
||||
options {
|
||||
ansiColor('xterm')
|
||||
timestamps()
|
||||
timeout(time: 1, unit: 'HOURS')
|
||||
gitLabConnection('GitLab Master')
|
||||
buildDiscarder(logRotator(numToKeepStr: '100', artifactNumToKeepStr: '10'))
|
||||
}
|
||||
|
||||
environment {
|
||||
HOME="${WORKSPACE}"
|
||||
PYTHONUNBUFFERED=1
|
||||
}
|
||||
|
||||
parameters {
|
||||
string(name: 'REF', defaultValue: '\${gitlabBranch}', description: 'Commit to build')
|
||||
}
|
||||
|
||||
stages {
|
||||
stage('Prep') {
|
||||
steps {
|
||||
script {
|
||||
updateGitlabCommitStatus(name: 'Jenkins CI', state: 'running')
|
||||
}
|
||||
}
|
||||
}
|
||||
stage('Compile') {
|
||||
steps {
|
||||
echo "building"
|
||||
sh "make binary"
|
||||
}
|
||||
}
|
||||
stage('Test') {
|
||||
steps {
|
||||
echo "Running tests"
|
||||
// Tests require supported GPU
|
||||
// make test-main
|
||||
sh "make check-format"
|
||||
}
|
||||
}
|
||||
}
|
||||
post {
|
||||
always {
|
||||
script{
|
||||
String status = (currentBuild.currentResult == "SUCCESS") ? "success" : "failed"
|
||||
updateGitlabCommitStatus(name: 'Jenkins CI', state: status)
|
||||
}
|
||||
}
|
||||
cleanup {
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,202 @@
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
@ -0,0 +1,78 @@
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
MKDIR ?= mkdir
|
||||
REGISTRY ?= nvidia
|
||||
|
||||
DCGM_VERSION := 3.2.5
|
||||
GOLANG_VERSION := 1.18
|
||||
VERSION := 3.1.7
|
||||
FULL_VERSION := $(DCGM_VERSION)-$(VERSION)
|
||||
OUTPUT := type=oci,dest=/tmp/dcgm-exporter.tar
|
||||
PLATFORMS := linux/amd64,linux/arm64
|
||||
DOCKERCMD := docker buildx build
|
||||
|
||||
NON_TEST_FILES := pkg/dcgmexporter/dcgm.go pkg/dcgmexporter/gpu_collector.go pkg/dcgmexporter/parser.go
|
||||
NON_TEST_FILES += pkg/dcgmexporter/pipeline.go pkg/dcgmexporter/server.go pkg/dcgmexporter/system_info.go
|
||||
NON_TEST_FILES += pkg/dcgmexporter/types.go pkg/dcgmexporter/utils.go pkg/dcgmexporter/kubernetes.go
|
||||
NON_TEST_FILES += cmd/dcgm-exporter/main.go
|
||||
MAIN_TEST_FILES := pkg/dcgmexporter/system_info_test.go
|
||||
|
||||
# None of these targets produces a file of the same name, so declare all of
# them phony; otherwise a stray file called e.g. "push" or "ubi8" would make
# the target look up to date.
.PHONY: all binary test-main install check-format push local ubuntu20.04 ubi8
|
||||
all: ubuntu20.04 ubi8
|
||||
|
||||
# Build the exporter in cmd/dcgm-exporter and stamp main.BuildVersion with
# <DCGM version>-<exporter version>. "&&" stops the recipe if the cd fails
# instead of running go build in the wrong directory.
binary:
	cd cmd/dcgm-exporter && go build -ldflags "-X main.BuildVersion=$(FULL_VERSION)"
|
||||
|
||||
test-main: $(NON_TEST_FILES) $(MAIN_TEST_FILES)
|
||||
go test ./...
|
||||
|
||||
# Install the binary and the default metric lists on the local machine.
# Modes: 755 for the executable, 644 for the CSV data files. The previous mode
# 557 made all three files writable by every user while denying the owner
# write access.
install: binary
	install -m 755 cmd/dcgm-exporter/dcgm-exporter /usr/bin/dcgm-exporter
	install -m 644 -D ./etc/default-counters.csv /etc/dcgm-exporter/default-counters.csv
	install -m 644 -D ./etc/dcp-metrics-included.csv /etc/dcgm-exporter/dcp-metrics-included.csv
|
||||
|
||||
check-format:
|
||||
test $$(gofmt -l pkg | tee /dev/stderr | wc -l) -eq 0
|
||||
test $$(gofmt -l cmd | tee /dev/stderr | wc -l) -eq 0
|
||||
|
||||
push:
|
||||
$(MAKE) ubuntu20.04 OUTPUT=type=registry
|
||||
$(MAKE) ubi8 OUTPUT=type=registry
|
||||
|
||||
# Build the images for the host architecture only and load them into the local
# Docker daemon. The machine name from "uname -m" is used because "uname -p"
# prints "unknown" on many Linux distributions, which would send arm64 hosts
# down the amd64 branch.
local:
ifeq ($(shell uname -m),aarch64)
	$(MAKE) PLATFORMS=linux/arm64 OUTPUT=type=docker DOCKERCMD='docker build'
else
	$(MAKE) PLATFORMS=linux/amd64 OUTPUT=type=docker DOCKERCMD='docker build'
endif
|
||||
|
||||
ubuntu20.04:
|
||||
$(DOCKERCMD) --pull \
|
||||
--output $(OUTPUT) \
|
||||
--platform $(PLATFORMS) \
|
||||
--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \
|
||||
--build-arg "DCGM_VERSION=$(DCGM_VERSION)" \
|
||||
--tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu20.04" \
|
||||
--file docker/Dockerfile.ubuntu20.04 .
|
||||
|
||||
ubi8:
|
||||
$(DOCKERCMD) --pull \
|
||||
--output $(OUTPUT) \
|
||||
--platform $(PLATFORMS) \
|
||||
--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \
|
||||
--build-arg "DCGM_VERSION=$(DCGM_VERSION)" \
|
||||
--build-arg "VERSION=$(FULL_VERSION)" \
|
||||
--tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubi8" \
|
||||
--file docker/Dockerfile.ubi8 .
|
@ -0,0 +1,27 @@
|
||||
# Release
|
||||
|
||||
This documents the release process as well as the versioning strategy for the DCGM exporter.
|
||||
|
||||
## Versioning
|
||||
|
||||
The version of the DCGM exporter container is built from three components:
|
||||
- The DCGM Version (e.g: 1.17.3)
|
||||
- The Exporter Version (e.g: 2.0.0)
|
||||
- The platform of the container (e.g: ubuntu20.04)
|
||||
|
||||
The overall version of the DCGM container has three forms:
|
||||
- The long form: `${DCGM_VERSION}-${EXPORTER_VERSION}-${PLATFORM}`
|
||||
- The short form: `${DCGM_VERSION}`
|
||||
- The latest tag: `latest`
|
||||
|
||||
The long form is a unique tag that once pushed will always refer to the same container.
|
||||
This means that no updates will be made to that tag and it will always point to the same container.
|
||||
|
||||
The short form refers to the latest EXPORTER_VERSION with the platform fixed to ubuntu20.04.
|
||||
The latest tag refers to the latest short form (i.e: latest DCGM_VERSION and EXPORTER_VERSION).
|
||||
|
||||
Note: We do not maintain multiple version branches.
|
||||
|
||||
## Releases
|
||||
|
||||
Newer versions are released on demand but tend to follow DCGM's release cadence.
|
@ -0,0 +1,74 @@
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Runs one dcgm-exporter pod per node and exposes its metrics on port 9400.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: "dcgm-exporter"
  labels:
    app.kubernetes.io/name: "dcgm-exporter"
    app.kubernetes.io/version: "3.1.7"
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    # The selector of an apps/v1 DaemonSet cannot be changed after creation,
    # so it must not contain the version label: otherwise applying the
    # manifest of the next release over this one is rejected.
    matchLabels:
      app.kubernetes.io/name: "dcgm-exporter"
  template:
    metadata:
      labels:
        app.kubernetes.io/name: "dcgm-exporter"
        app.kubernetes.io/version: "3.1.7"
      name: "dcgm-exporter"
    spec:
      containers:
      - image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.2.5-3.1.7-ubuntu20.04"
        env:
        - name: "DCGM_EXPORTER_LISTEN"
          value: ":9400"
        - name: "DCGM_EXPORTER_KUBERNETES"
          value: "true"
        name: "dcgm-exporter"
        ports:
        - name: "metrics"
          containerPort: 9400
        securityContext:
          runAsNonRoot: false
          runAsUser: 0
        volumeMounts:
        # Read-only view of the kubelet pod-resources directory on the host.
        - name: "pod-gpu-resources"
          readOnly: true
          mountPath: "/var/lib/kubelet/pod-resources"
      volumes:
      - name: "pod-gpu-resources"
        hostPath:
          path: "/var/lib/kubelet/pod-resources"
|
||||
|
||||
---
|
||||
|
||||
# Exposes the metrics port of the dcgm-exporter pods inside the cluster.
kind: Service
apiVersion: v1
metadata:
  name: "dcgm-exporter"
  labels:
    app.kubernetes.io/name: "dcgm-exporter"
    app.kubernetes.io/version: "3.1.7"
spec:
  # Select on the name label only, so the Service keeps matching the pods
  # when their version label changes during an upgrade.
  selector:
    app.kubernetes.io/name: "dcgm-exporter"
  ports:
  - name: "metrics"
    port: 9400
|
@ -0,0 +1,23 @@
|
||||
# Patterns to ignore when building packages.
|
||||
# This supports shell glob matching, relative path matching, and
|
||||
# negation (prefixed with !). Only one pattern per line.
|
||||
.DS_Store
|
||||
# Common VCS dirs
|
||||
.git/
|
||||
.gitignore
|
||||
.bzr/
|
||||
.bzrignore
|
||||
.hg/
|
||||
.hgignore
|
||||
.svn/
|
||||
# Common backup files
|
||||
*.swp
|
||||
*.bak
|
||||
*.tmp
|
||||
*.orig
|
||||
*~
|
||||
# Various IDEs
|
||||
.project
|
||||
.idea/
|
||||
*.tmproj
|
||||
.vscode/
|
@ -0,0 +1,17 @@
|
||||
apiVersion: v2
|
||||
name: dcgm-exporter
|
||||
description: A Helm chart for DCGM exporter
|
||||
version: "3.1.7"
|
||||
kubeVersion: ">= 1.19.0-0"
|
||||
appVersion: "3.1.7"
|
||||
sources:
|
||||
- https://github.com/nvidia/dcgm-exporter
|
||||
home: https://github.com/nvidia/dcgm-exporter/
|
||||
icon: https://assets.nvidiagrid.net/ngc/logos/DCGM.png
|
||||
keywords:
|
||||
- gpu
|
||||
- cuda
|
||||
- compute
|
||||
- monitoring
|
||||
- telemetry
|
||||
- tesla
|
@ -0,0 +1,15 @@
|
||||
1. Get the application URL by running these commands:
|
||||
{{- if contains "NodePort" .Values.service.type }}
|
||||
export NODE_PORT=$(kubectl get --namespace {{ include "dcgm-exporter.namespace" . }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "dcgm-exporter.fullname" . }})
|
||||
export NODE_IP=$(kubectl get nodes --namespace {{ include "dcgm-exporter.namespace" . }} -o jsonpath="{.items[0].status.addresses[0].address}")
|
||||
echo http://$NODE_IP:$NODE_PORT/metrics
|
||||
{{- else if contains "LoadBalancer" .Values.service.type }}
|
||||
NOTE: It may take a few minutes for the LoadBalancer IP to be available.
|
||||
You can watch its status by running 'kubectl get --namespace {{ include "dcgm-exporter.namespace" . }} svc -w {{ include "dcgm-exporter.fullname" . }}'
|
||||
export SERVICE_IP=$(kubectl get svc --namespace {{ include "dcgm-exporter.namespace" . }} {{ include "dcgm-exporter.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}")
|
||||
echo http://$SERVICE_IP:{{ .Values.service.port }}
|
||||
{{- else if contains "ClusterIP" .Values.service.type }}
|
||||
export POD_NAME=$(kubectl get pods -n {{ include "dcgm-exporter.namespace" . }} -l "app.kubernetes.io/name={{ include "dcgm-exporter.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
|
||||
kubectl -n {{ include "dcgm-exporter.namespace" . }} port-forward $POD_NAME 8080:{{ .Values.service.port }} &
|
||||
echo "Visit http://127.0.0.1:8080/metrics to use your application"
|
||||
{{- end }}
|
@ -0,0 +1,75 @@
|
||||
{{/* vim: set filetype=mustache: */}}
|
||||
{{/*
|
||||
Expand the name of the chart.
|
||||
*/}}
|
||||
{{- define "dcgm-exporter.name" -}}
|
||||
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
|
||||
{{- end -}}
|
||||
|
||||
{{/*
|
||||
Create a default fully qualified app name.
|
||||
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
|
||||
If release name contains chart name it will be used as a full name.
|
||||
*/}}
|
||||
{{- define "dcgm-exporter.fullname" -}}
|
||||
{{- if .Values.fullnameOverride -}}
|
||||
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
|
||||
{{- else -}}
|
||||
{{- $name := default .Chart.Name .Values.nameOverride -}}
|
||||
{{- if contains $name .Release.Name -}}
|
||||
{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
|
||||
{{- else -}}
|
||||
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
|
||||
{{- end -}}
|
||||
{{- end -}}
|
||||
{{- end -}}
|
||||
|
||||
|
||||
{{/*
|
||||
Allow the release namespace to be overridden for multi-namespace deployments in combined charts
|
||||
*/}}
|
||||
{{- define "dcgm-exporter.namespace" -}}
|
||||
{{- if .Values.namespaceOverride -}}
|
||||
{{- .Values.namespaceOverride -}}
|
||||
{{- else -}}
|
||||
{{- .Release.Namespace -}}
|
||||
{{- end -}}
|
||||
{{- end -}}
|
||||
|
||||
{{/*
|
||||
Create chart name and version as used by the chart label.
|
||||
*/}}
|
||||
{{- define "dcgm-exporter.chart" -}}
|
||||
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
|
||||
{{- end -}}
|
||||
|
||||
{{/*
|
||||
Common labels
|
||||
*/}}
|
||||
{{- define "dcgm-exporter.labels" -}}
|
||||
helm.sh/chart: {{ include "dcgm-exporter.chart" . }}
|
||||
{{ include "dcgm-exporter.selectorLabels" . }}
|
||||
{{- if .Chart.AppVersion }}
|
||||
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
|
||||
{{- end }}
|
||||
app.kubernetes.io/managed-by: {{ .Release.Service }}
|
||||
{{- end -}}
|
||||
|
||||
{{/*
|
||||
Selector labels
|
||||
*/}}
|
||||
{{- define "dcgm-exporter.selectorLabels" -}}
|
||||
app.kubernetes.io/name: {{ include "dcgm-exporter.name" . }}
|
||||
app.kubernetes.io/instance: {{ .Release.Name }}
|
||||
{{- end -}}
|
||||
|
||||
{{/*
|
||||
Create the name of the service account to use
|
||||
*/}}
|
||||
{{- define "dcgm-exporter.serviceAccountName" -}}
|
||||
{{- if .Values.serviceAccount.create -}}
|
||||
{{ default (include "dcgm-exporter.fullname" .) .Values.serviceAccount.name }}
|
||||
{{- else -}}
|
||||
{{ default "default" .Values.serviceAccount.name }}
|
||||
{{- end -}}
|
||||
{{- end -}}
|
@ -0,0 +1,136 @@
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: {{ include "dcgm-exporter.fullname" . }}
|
||||
namespace: {{ include "dcgm-exporter.namespace" . }}
|
||||
labels:
|
||||
{{- include "dcgm-exporter.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: "dcgm-exporter"
|
||||
spec:
|
||||
updateStrategy:
|
||||
type: RollingUpdate
|
||||
{{- with .Values.rollingUpdate }}
|
||||
rollingUpdate:
|
||||
maxUnavailable: {{ .maxUnavailable }}
|
||||
maxSurge: {{ .maxSurge }}
|
||||
{{- end }}
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "dcgm-exporter.selectorLabels" . | nindent 6 }}
|
||||
app.kubernetes.io/component: "dcgm-exporter"
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
{{- include "dcgm-exporter.selectorLabels" . | nindent 8 }}
|
||||
app.kubernetes.io/component: "dcgm-exporter"
|
||||
{{- if .Values.podAnnotations }}
|
||||
annotations:
|
||||
{{- toYaml .Values.podAnnotations | nindent 8 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
{{- if .Values.runtimeClassName }}
|
||||
runtimeClassName: {{ .Values.runtimeClassName }}
|
||||
{{- end }}
|
||||
priorityClassName: {{ .Values.priorityClassName | default "system-node-critical" }}
|
||||
{{- if .Values.hostNetwork }}
|
||||
hostNetwork: {{ .Values.hostNetwork }}
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
{{- end }}
|
||||
{{- with .Values.imagePullSecrets }}
|
||||
imagePullSecrets:
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
serviceAccountName: {{ include "dcgm-exporter.serviceAccountName" . }}
|
||||
{{- if .Values.podSecurityContext }}
|
||||
securityContext:
|
||||
{{- toYaml .Values.podSecurityContext | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- if .Values.affinity }}
|
||||
affinity:
|
||||
{{- toYaml .Values.affinity | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- if .Values.nodeSelector }}
|
||||
nodeSelector:
|
||||
{{- toYaml .Values.nodeSelector | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- with .Values.tolerations }}
|
||||
tolerations:
|
||||
{{- toYaml . | nindent 6 }}
|
||||
{{- end }}
|
||||
volumes:
|
||||
- name: "pod-gpu-resources"
|
||||
hostPath:
|
||||
path: {{ .Values.kubeletPath }}
|
||||
{{- range .Values.extraHostVolumes }}
|
||||
- name: {{ .name | quote }}
|
||||
hostPath:
|
||||
path: {{ .hostPath | quote }}
|
||||
{{- end }}
|
||||
{{- with .Values.extraConfigMapVolumes }}
|
||||
{{- toYaml . | nindent 6 }}
|
||||
{{- end }}
|
||||
containers:
|
||||
- name: exporter
|
||||
securityContext:
|
||||
{{- toYaml .Values.securityContext | nindent 10 }}
|
||||
{{- if .Values.image.tag }}
|
||||
image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
|
||||
{{- else }}
|
||||
image: "{{ .Values.image.repository }}:{{ .Chart.AppVersion }}"
|
||||
{{- end }}
|
||||
imagePullPolicy: "{{ .Values.image.pullPolicy }}"
|
||||
args:
|
||||
{{- range $.Values.arguments }}
|
||||
{{- /* Quote every argument: container args must be strings, and bare values such as 5000 or true would otherwise render as non-string YAML scalars. */}}
- {{ . | quote }}
|
||||
{{- end }}
|
||||
env:
|
||||
- name: "DCGM_EXPORTER_KUBERNETES"
|
||||
value: "true"
|
||||
- name: "DCGM_EXPORTER_LISTEN"
|
||||
value: "{{ .Values.service.address }}"
|
||||
- name: NODE_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
{{- if .Values.extraEnv }}
|
||||
{{- toYaml .Values.extraEnv | nindent 8 }}
|
||||
{{- end }}
|
||||
ports:
|
||||
- name: "metrics"
|
||||
containerPort: {{ .Values.service.port }}
|
||||
volumeMounts:
|
||||
- name: "pod-gpu-resources"
|
||||
readOnly: true
|
||||
mountPath: "/var/lib/kubelet/pod-resources"
|
||||
{{- if .Values.extraVolumeMounts }}
|
||||
{{- toYaml .Values.extraVolumeMounts | nindent 8 }}
|
||||
{{- end }}
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: {{ .Values.service.port }}
|
||||
initialDelaySeconds: 45
|
||||
periodSeconds: 5
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: {{ .Values.service.port }}
|
||||
initialDelaySeconds: 45
|
||||
{{- if .Values.resources }}
|
||||
resources:
|
||||
{{- toYaml .Values.resources | nindent 10 }}
|
||||
{{- end }}
|
@ -0,0 +1,85 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: exporter-metrics-config-map
|
||||
namespace: {{ include "dcgm-exporter.namespace" . }}
|
||||
data:
|
||||
metrics: |
|
||||
# Format
|
||||
# If line starts with a '#' it is considered a comment
|
||||
# DCGM FIELD, Prometheus metric type, help message
|
||||
|
||||
# Clocks
|
||||
DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
|
||||
DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
|
||||
|
||||
# Temperature
|
||||
DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
|
||||
DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C).
|
||||
|
||||
# Power
|
||||
DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).
|
||||
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).
|
||||
|
||||
# PCIE
|
||||
# DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML.
|
||||
# DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, Total number of bytes received through PCIe RX (in KB) via NVML.
|
||||
DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.
|
||||
|
||||
# Utilization (the sample period varies depending on the product)
|
||||
DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
|
||||
DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
|
||||
DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %).
|
||||
DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %).
|
||||
|
||||
# Errors and violations
|
||||
DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.
|
||||
# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us).
|
||||
# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us).
|
||||
# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us).
|
||||
# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
|
||||
# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us).
|
||||
# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).
|
||||
|
||||
# Memory usage
|
||||
DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).
|
||||
DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).
|
||||
|
||||
# ECC
|
||||
# DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
|
||||
# DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors.
|
||||
# DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors.
|
||||
# DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors.
|
||||
|
||||
# Retired pages
|
||||
# DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors.
|
||||
# DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors.
|
||||
# DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement.
|
||||
|
||||
# NVLink
|
||||
# DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
|
||||
# DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
|
||||
# DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries.
|
||||
# DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.
|
||||
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes.
|
||||
# DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload.
|
||||
|
||||
# VGPU License status
|
||||
DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status
|
||||
|
||||
# Remapped rows
|
||||
DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
|
||||
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors
|
||||
DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed
|
||||
|
||||
# DCP metrics
|
||||
DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %).
|
||||
# DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %).
|
||||
# DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %).
|
||||
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %).
|
||||
DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %).
|
||||
# DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %).
|
||||
# DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %).
|
||||
# DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %).
|
||||
DCGM_FI_PROF_PCIE_TX_BYTES, counter, The number of bytes of active pcie tx data including both header and payload.
|
||||
DCGM_FI_PROF_PCIE_RX_BYTES, counter, The number of bytes of active pcie rx data including both header and payload.
|
@ -0,0 +1,13 @@
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: dcgm-exporter-read-cm
|
||||
namespace: {{ include "dcgm-exporter.namespace" . }}
|
||||
labels:
|
||||
{{- include "dcgm-exporter.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: "dcgm-exporter"
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["configmaps"]
|
||||
resourceNames: ["exporter-metrics-config-map"]
|
||||
verbs: ["get"]
|
@ -0,0 +1,16 @@
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: {{ include "dcgm-exporter.fullname" . }}
|
||||
namespace: {{ include "dcgm-exporter.namespace" . }}
|
||||
labels:
|
||||
{{- include "dcgm-exporter.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: "dcgm-exporter"
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: {{ include "dcgm-exporter.serviceAccountName" . }}
|
||||
namespace: {{ include "dcgm-exporter.namespace" . }}
|
||||
roleRef:
|
||||
kind: Role
|
||||
name: dcgm-exporter-read-cm
|
||||
apiGroup: rbac.authorization.k8s.io
|
@ -0,0 +1,42 @@
|
||||
{{- if .Values.serviceMonitor.enabled }}
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: {{ include "dcgm-exporter.fullname" . }}
|
||||
namespace: {{ include "dcgm-exporter.namespace" . }}
|
||||
labels:
|
||||
{{- include "dcgm-exporter.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: "dcgm-exporter"
|
||||
{{- if .Values.serviceMonitor.additionalLabels }}
|
||||
{{- toYaml .Values.serviceMonitor.additionalLabels | nindent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "dcgm-exporter.selectorLabels" . | nindent 6 }}
|
||||
app.kubernetes.io/component: "dcgm-exporter"
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ include "dcgm-exporter.namespace" . }}"
|
||||
endpoints:
|
||||
- port: "metrics"
|
||||
path: "/metrics"
|
||||
interval: "{{ .Values.serviceMonitor.interval }}"
|
||||
honorLabels: {{ .Values.serviceMonitor.honorLabels }}
|
||||
relabelings:
|
||||
{{ toYaml .Values.serviceMonitor.relabelings | nindent 6 }}
|
||||
{{- end -}}
|
@ -0,0 +1,37 @@
|
||||
{{- if .Values.service.enable }}
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: {{ include "dcgm-exporter.fullname" . }}
|
||||
namespace: {{ include "dcgm-exporter.namespace" . }}
|
||||
labels:
|
||||
{{- include "dcgm-exporter.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: "dcgm-exporter"
|
||||
{{- with .Values.service.annotations }}
|
||||
annotations:
|
||||
{{- toYaml . | nindent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
type: {{ .Values.service.type }}
|
||||
ports:
|
||||
- name: "metrics"
|
||||
port: {{ .Values.service.port }}
|
||||
targetPort: {{ .Values.service.port }}
|
||||
protocol: TCP
|
||||
selector:
|
||||
{{- include "dcgm-exporter.selectorLabels" . | nindent 4 }}
|
||||
{{- end }}
|
@ -0,0 +1,28 @@
|
||||
{{- if .Values.serviceAccount.create -}}
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: {{ include "dcgm-exporter.serviceAccountName" . }}
|
||||
namespace: {{ include "dcgm-exporter.namespace" . }}
|
||||
labels:
|
||||
{{- include "dcgm-exporter.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: "dcgm-exporter"
|
||||
{{- with .Values.serviceAccount.annotations }}
|
||||
annotations:
|
||||
{{- toYaml . | nindent 4 }}
|
||||
{{- end }}
|
||||
{{- end -}}
|
@ -0,0 +1,138 @@
|
||||
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
image:
|
||||
repository: nvcr.io/nvidia/k8s/dcgm-exporter
|
||||
pullPolicy: IfNotPresent
|
||||
# Image tag defaults to AppVersion, but you can use the tag key
|
||||
# for the image tag, e.g:
|
||||
tag: 3.2.5-3.1.7-ubuntu20.04
|
||||
|
||||
# Change the following reference to "/etc/dcgm-exporter/default-counters.csv"
|
||||
# to stop profiling metrics from DCGM
|
||||
arguments: ["-f", "/etc/dcgm-exporter/dcp-metrics-included.csv"]
|
||||
# NOTE: in general, add any command line arguments to arguments above
|
||||
# and they will be passed through.
|
||||
# Use "-r", "<HOST>:<PORT>" to connect to an already running hostengine
|
||||
# Example arguments: ["-r", "host123:5555"]
|
||||
# Use "-n" to remove the hostname tag from the output.
|
||||
# Example arguments: ["-n"]
|
||||
# Use "-d" to specify the devices to monitor. -d must be followed by a string
|
||||
# in the following format: [f] or [g[:numeric_range][+]][i[:numeric_range]]
|
||||
# Where a numeric range is something like 0-4 or 0,2,4, etc.
|
||||
# Example arguments: ["-d", "g+i"] to monitor all GPUs and GPU instances or
|
||||
# ["-d", "g:0-3"] to monitor GPUs 0-3.
|
||||
# Use "-m" to specify the namespace and name of a configmap containing
|
||||
# the watched exporter fields.
|
||||
# Example arguments: ["-m", "default:exporter-metrics-config-map"]
|
||||
|
||||
imagePullSecrets: []
|
||||
nameOverride: ""
|
||||
fullnameOverride: ""
|
||||
namespaceOverride: ""
|
||||
|
||||
runtimeClassName: ""
|
||||
|
||||
serviceAccount:
|
||||
# Specifies whether a service account should be created
|
||||
create: true
|
||||
# Annotations to add to the service account
|
||||
annotations: {}
|
||||
# The name of the service account to use.
|
||||
# If not set and create is true, a name is generated using the fullname template
|
||||
name:
|
||||
|
||||
rollingUpdate:
|
||||
# Specifies maximum number of DaemonSet pods that can be unavailable during the update
|
||||
maxUnavailable: 1
|
||||
# Specifies the maximum number of nodes with an existing available DaemonSet pod that can have an updated DaemonSet pod during an update
|
||||
maxSurge: 0
|
||||
|
||||
podAnnotations: {}
|
||||
# Use the following annotations, which are required for Prometheus scraping
|
||||
# prometheus.io/scrape: "true"
|
||||
# prometheus.io/port: "9400"
|
||||
|
||||
podSecurityContext: {}
|
||||
# fsGroup: 2000
|
||||
|
||||
securityContext:
|
||||
runAsNonRoot: false
|
||||
runAsUser: 0
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
# readOnlyRootFilesystem: true
|
||||
|
||||
service:
|
||||
enable: true
|
||||
type: ClusterIP
|
||||
port: 9400
|
||||
address: ":9400"
|
||||
# Annotations to add to the service
|
||||
annotations: {}
|
||||
|
||||
resources: {}
|
||||
# limits:
|
||||
# cpu: 100m
|
||||
# memory: 128Mi
|
||||
# requests:
|
||||
# cpu: 100m
|
||||
# memory: 128Mi
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
interval: 15s
|
||||
honorLabels: false
|
||||
additionalLabels: {}
|
||||
#monitoring: prometheus
|
||||
relabelings: []
|
||||
# - sourceLabels: [__meta_kubernetes_pod_node_name]
|
||||
# separator: ;
|
||||
# regex: ^(.*)$
|
||||
# targetLabel: nodename
|
||||
# replacement: $1
|
||||
# action: replace
|
||||
|
||||
nodeSelector: {}
|
||||
#node: gpu
|
||||
|
||||
tolerations: []
|
||||
#- operator: Exists
|
||||
|
||||
affinity: {}
|
||||
#nodeAffinity:
|
||||
# requiredDuringSchedulingIgnoredDuringExecution:
|
||||
# nodeSelectorTerms:
|
||||
# - matchExpressions:
|
||||
# - key: nvidia-gpu
|
||||
# operator: Exists
|
||||
|
||||
extraHostVolumes: []
|
||||
#- name: host-binaries
|
||||
# hostPath: /opt/bin
|
||||
|
||||
extraConfigMapVolumes: []
|
||||
#- name: exporter-metrics-volume
|
||||
# configMap:
|
||||
# name: exporter-metrics-config-map
|
||||
|
||||
extraVolumeMounts: []
|
||||
#- name: host-binaries
|
||||
# mountPath: /opt/bin
|
||||
# readOnly: true
|
||||
|
||||
extraEnv: []
|
||||
#- name: EXTRA_VAR
|
||||
# value: "TheStringValue"
|
||||
|
||||
kubeletPath: "/var/lib/kubelet/pod-resources"
|
@ -0,0 +1,41 @@
|
||||
ARG GOLANG_VERSION
|
||||
FROM golang:$GOLANG_VERSION AS builder
|
||||
WORKDIR /go/src/github.com/NVIDIA/dcgm-exporter
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN make binary check-format
|
||||
|
||||
FROM nvcr.io/nvidia/cuda:12.2.0-base-ubi8
|
||||
LABEL io.k8s.display-name="NVIDIA DCGM Exporter"
|
||||
|
||||
ARG DCGM_VERSION
|
||||
|
||||
# Install DCGM and libcap, then drop the dnf caches in the same layer so they are not shipped in the image.
RUN dnf clean expire-cache && dnf install -y datacenter-gpu-manager-${DCGM_VERSION} libcap && dnf clean all
|
||||
|
||||
COPY --from=builder /go/src/github.com/NVIDIA/dcgm-exporter/cmd/dcgm-exporter/dcgm-exporter /usr/bin/
|
||||
COPY etc /etc/dcgm-exporter
|
||||
|
||||
ENV NVIDIA_VISIBLE_DEVICES=all
|
||||
# disable all constraints on the configurations required by NVIDIA container toolkit
|
||||
ENV NVIDIA_DISABLE_REQUIRE="true"
|
||||
# Required for DCP metrics
|
||||
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32
|
||||
|
||||
ARG VERSION
|
||||
|
||||
# Display name label; must match the value already set right after the runtime FROM line.
LABEL io.k8s.display-name="NVIDIA DCGM Exporter"
|
||||
LABEL name="NVIDIA DCGM Exporter"
|
||||
LABEL vendor="NVIDIA"
|
||||
LABEL version="${VERSION}"
|
||||
LABEL release="N/A"
|
||||
LABEL summary="Exports GPU Metrics to Prometheus"
|
||||
LABEL description="See summary"
|
||||
|
||||
COPY ./LICENSE ./licenses/LICENSE
|
||||
|
||||
ENV NO_SETCAP=
|
||||
COPY docker/dcgm-exporter-entrypoint.sh /usr/local/dcgm/dcgm-exporter-entrypoint.sh
|
||||
RUN chmod +x /usr/local/dcgm/dcgm-exporter-entrypoint.sh
|
||||
|
||||
ENTRYPOINT ["/usr/local/dcgm/dcgm-exporter-entrypoint.sh"]
|
@ -0,0 +1,29 @@
|
||||
ARG GOLANG_VERSION
|
||||
FROM golang:$GOLANG_VERSION AS builder
|
||||
WORKDIR /go/src/github.com/NVIDIA/dcgm-exporter
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN make binary check-format
|
||||
|
||||
FROM nvcr.io/nvidia/cuda:12.2.0-base-ubuntu20.04
|
||||
LABEL io.k8s.display-name="NVIDIA DCGM Exporter"
|
||||
|
||||
COPY --from=builder /go/src/github.com/NVIDIA/dcgm-exporter/cmd/dcgm-exporter/dcgm-exporter /usr/bin/
|
||||
COPY etc /etc/dcgm-exporter
|
||||
|
||||
ARG DCGM_VERSION
|
||||
# Install DCGM and setcap tooling, purge openssl, and remove the apt package lists
# in the same layer so they do not end up in the final image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    datacenter-gpu-manager=1:${DCGM_VERSION} libcap2-bin && apt-get purge --autoremove -y openssl \
    && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Required for DCP metrics
|
||||
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32
|
||||
# disable all constraints on the configurations required by NVIDIA container toolkit
|
||||
ENV NVIDIA_DISABLE_REQUIRE="true"
|
||||
ENV NVIDIA_VISIBLE_DEVICES=all
|
||||
|
||||
ENV NO_SETCAP=
|
||||
COPY docker/dcgm-exporter-entrypoint.sh /usr/local/dcgm/dcgm-exporter-entrypoint.sh
|
||||
RUN chmod +x /usr/local/dcgm/dcgm-exporter-entrypoint.sh
|
||||
|
||||
ENTRYPOINT ["/usr/local/dcgm/dcgm-exporter-entrypoint.sh"]
|
@ -0,0 +1,20 @@
|
||||
#!/usr/bin/env bash
# Entrypoint wrapper for dcgm-exporter.
#
# Unless NO_SETCAP is set to a non-empty value, tries to grant cap_sys_admin to
# the dcgm-exporter binary (the warnings below tie this to profiling metrics),
# verifies the binary still runs, and finally execs it with the arguments that
# were passed to this script.
set -euo pipefail

# We want to setcap only when the container is started with the right caps
DCGM_EXPORTER=$(readlink -f "$(command -v dcgm-exporter)")

# ${NO_SETCAP:-} keeps `set -u` from aborting the script when the variable is
# not defined in the environment at all.
if [ -z "${NO_SETCAP:-}" ]; then
   if setcap 'cap_sys_admin=+ep' "$DCGM_EXPORTER"; then
      # Probe run: if the binary cannot start with the file capability applied,
      # warn and remove the capability again.
      if ! "$DCGM_EXPORTER" -v 1>/dev/null 2>/dev/null; then
         >&2 echo "Warning #2: dcgm-exporter doesn't have sufficient privileges to expose profiling metrics. To get profiling metrics with dcgm-exporter, use --cap-add SYS_ADMIN"
         setcap 'cap_sys_admin=-ep' "$DCGM_EXPORTER"
      fi
   else
      >&2 echo "Warning #1: dcgm-exporter doesn't have sufficient privileges to expose profiling metrics. To get profiling metrics with dcgm-exporter, use --cap-add SYS_ADMIN"
   fi

fi

# Pass the command line arguments to dcgm-exporter
set -- "$DCGM_EXPORTER" "$@"
exec "$@"
|
@ -0,0 +1,66 @@
|
||||
# Format
|
||||
# If line starts with a '#' it is considered a comment
|
||||
# DCGM FIELD, Prometheus metric type, help message
|
||||
|
||||
# Clocks
|
||||
dcgm_sm_clock, gauge, SM clock frequency (in MHz).
|
||||
dcgm_memory_clock, gauge, Memory clock frequency (in MHz).
|
||||
|
||||
# Temperature
|
||||
dcgm_memory_temp, gauge, Memory temperature (in C).
|
||||
dcgm_gpu_temp, gauge, GPU temperature (in C).
|
||||
|
||||
# Power
|
||||
dcgm_power_usage, gauge, Power draw (in W).
|
||||
dcgm_total_energy_consumption, counter, Total energy consumption since boot (in mJ).
|
||||
|
||||
# PCIe
|
||||
dcgm_pcie_tx_throughput, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML.
|
||||
dcgm_pcie_rx_throughput, counter, Total number of bytes received through PCIe RX (in KB) via NVML.
|
||||
dcgm_pcie_replay_counter, counter, Total number of PCIe retries.
|
||||
|
||||
# Utilization (the sample period varies depending on the product)
|
||||
dcgm_gpu_utilization, gauge, GPU utilization (in %).
|
||||
dcgm_mem_copy_utilization, gauge, Memory utilization (in %).
|
||||
dcgm_enc_utilization, gauge, Encoder utilization (in %).
|
||||
dcgm_dec_utilization, gauge, Decoder utilization (in %).
|
||||
|
||||
# Errors and violations
|
||||
dcgm_xid_errors, gauge, Value of the last XID error encountered.
|
||||
# dcgm_power_violation, counter, Throttling duration due to power constraints (in us).
|
||||
# dcgm_thermal_violation, counter, Throttling duration due to thermal constraints (in us).
|
||||
# dcgm_sync_boost_violation, counter, Throttling duration due to sync-boost constraints (in us).
|
||||
# dcgm_board_limit_violation, counter, Throttling duration due to board limit constraints (in us).
|
||||
# dcgm_low_util_violation, counter, Throttling duration due to low utilization (in us).
|
||||
# dcgm_reliability_violation, counter, Throttling duration due to reliability constraints (in us).
|
||||
|
||||
# Memory usage
|
||||
dcgm_fb_free, gauge, Framebuffer memory free (in MiB).
|
||||
dcgm_fb_used, gauge, Framebuffer memory used (in MiB).
|
||||
|
||||
# ECC
|
||||
# dcgm_ecc_sbe_volatile_total, counter, Total number of single-bit volatile ECC errors.
|
||||
# dcgm_ecc_dbe_volatile_total, counter, Total number of double-bit volatile ECC errors.
|
||||
# dcgm_ecc_sbe_aggregate_total, counter, Total number of single-bit persistent ECC errors.
|
||||
# dcgm_ecc_dbe_aggregate_total, counter, Total number of double-bit persistent ECC errors.
|
||||
|
||||
# Retired pages
|
||||
# dcgm_retired_pages_sbe, counter, Total number of retired pages due to single-bit errors.
|
||||
# dcgm_retired_pages_dbe, counter, Total number of retired pages due to double-bit errors.
|
||||
# dcgm_retired_pages_pending, counter, Total number of pages pending retirement.
|
||||
|
||||
# NVLink
|
||||
# dcgm_nvlink_flit_crc_error_count_total, counter, Total number of NVLink flow-control CRC errors.
|
||||
# dcgm_nvlink_data_crc_error_count_total, counter, Total number of NVLink data CRC errors.
|
||||
# dcgm_nvlink_replay_error_count_total, counter, Total number of NVLink retries.
|
||||
# dcgm_nvlink_recovery_error_count_total, counter, Total number of NVLink recovery errors.
|
||||
dcgm_nvlink_bandwidth_total, counter, Total number of NVLink bandwidth counters for all lanes
|
||||
|
||||
# Add DCP metrics
|
||||
dcgm_fi_prof_gr_engine_active, gauge, Ratio of time the graphics engine is active (in %).
|
||||
# dcgm_fi_prof_sm_active, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %).
|
||||
# dcgm_fi_prof_sm_occupancy, gauge, The ratio of number of warps resident on an SM (in %).
|
||||
dcgm_fi_prof_pipe_tensor_active, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %).
|
||||
dcgm_fi_prof_dram_active, gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %).
|
||||
dcgm_fi_prof_pcie_tx_bytes, counter, The number of bytes of active pcie tx data including both header and payload.
|
||||
dcgm_fi_prof_pcie_rx_bytes, counter, The number of bytes of active pcie rx data including both header and payload.
|
|
@ -0,0 +1,90 @@
|
||||
# Format
|
||||
# If line starts with a '#' it is considered a comment
|
||||
# DCGM FIELD, Prometheus metric type, help message
|
||||
|
||||
# Clocks
|
||||
DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
|
||||
DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
|
||||
|
||||
# Temperature
|
||||
DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
|
||||
DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C).
|
||||
|
||||
# Power
|
||||
DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).
|
||||
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).
|
||||
|
||||
# PCIE
|
||||
# DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML.
|
||||
# DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, Total number of bytes received through PCIe RX (in KB) via NVML.
|
||||
DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.
|
||||
|
||||
# Utilization (the sample period varies depending on the product)
|
||||
DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
|
||||
DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
|
||||
DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %).
|
||||
DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %).
|
||||
|
||||
# Errors and violations
|
||||
DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.
|
||||
# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us).
|
||||
# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us).
|
||||
# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us).
|
||||
# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
|
||||
# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us).
|
||||
# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).
|
||||
|
||||
# Memory usage
|
||||
DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).
|
||||
DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).
|
||||
|
||||
# ECC
|
||||
# DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
|
||||
# DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors.
|
||||
# DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors.
|
||||
# DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors.
|
||||
|
||||
# Retired pages
|
||||
# DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors.
|
||||
# DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors.
|
||||
# DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement.
|
||||
|
||||
# NVLink
|
||||
# DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
|
||||
# DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
|
||||
# DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries.
|
||||
# DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.
|
||||
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes.
|
||||
# DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload.
|
||||
|
||||
# VGPU License status
|
||||
DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status
|
||||
|
||||
# Remapped rows
|
||||
DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
|
||||
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors
|
||||
DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed
|
||||
|
||||
# Static configuration information. These appear as labels on the other metrics
|
||||
DCGM_FI_DRIVER_VERSION, label, Driver Version
|
||||
# DCGM_FI_NVML_VERSION, label, NVML Version
|
||||
# DCGM_FI_DEV_BRAND, label, Device Brand
|
||||
# DCGM_FI_DEV_SERIAL, label, Device Serial Number
|
||||
# DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version
|
||||
# DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version
|
||||
# DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version
|
||||
# DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version
|
||||
# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device
|
||||
|
||||
# DCP metrics
|
||||
DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %).
|
||||
# DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %).
|
||||
# DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %).
|
||||
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %).
|
||||
DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %).
|
||||
# DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %).
|
||||
# DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %).
|
||||
# DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %).
|
||||
DCGM_FI_PROF_PCIE_TX_BYTES, gauge, The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
|
||||
DCGM_FI_PROF_PCIE_RX_BYTES, gauge, The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
|
||||
|
|
@ -0,0 +1,76 @@
|
||||
# Format
|
||||
# If a line starts with a '#' it is considered a comment
|
||||
# DCGM FIELD, Prometheus metric type, help message
|
||||
|
||||
# Clocks
|
||||
DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
|
||||
DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
|
||||
|
||||
# Temperature
|
||||
DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
|
||||
DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C).
|
||||
|
||||
# Power
|
||||
DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).
|
||||
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).
|
||||
|
||||
# PCIE
|
||||
DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML.
|
||||
DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, Total number of bytes received through PCIe RX (in KB) via NVML.
|
||||
DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.
|
||||
|
||||
# Utilization (the sample period varies depending on the product)
|
||||
DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
|
||||
DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
|
||||
DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %).
|
||||
DCGM_FI_DEV_DEC_UTIL, gauge, Decoder utilization (in %).
|
||||
|
||||
# Errors and violations
|
||||
DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.
|
||||
# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us).
|
||||
# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us).
|
||||
# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us).
|
||||
# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
|
||||
# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us).
|
||||
# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).
|
||||
|
||||
# Memory usage
|
||||
DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB).
|
||||
DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB).
|
||||
|
||||
# ECC
|
||||
# DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
|
||||
# DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors.
|
||||
# DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors.
|
||||
# DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors.
|
||||
|
||||
# Retired pages
|
||||
# DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors.
|
||||
# DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors.
|
||||
# DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement.
|
||||
|
||||
# NVLink
|
||||
# DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
|
||||
# DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
|
||||
# DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries.
|
||||
# DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.
|
||||
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes
|
||||
|
||||
# VGPU License status
|
||||
DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status
|
||||
|
||||
# Remapped rows
|
||||
DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
|
||||
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors
|
||||
DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed
|
||||
|
||||
# Static configuration information. These appear as labels on the other metrics
|
||||
DCGM_FI_DRIVER_VERSION, label, Driver Version
|
||||
# DCGM_FI_NVML_VERSION, label, NVML Version
|
||||
# DCGM_FI_DEV_BRAND, label, Device Brand
|
||||
# DCGM_FI_DEV_SERIAL, label, Device Serial Number
|
||||
# DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version
|
||||
# DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version
|
||||
# DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version
|
||||
# DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version
|
||||
# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device
|
|
@ -0,0 +1,83 @@
|
||||
module github.com/NVIDIA/dcgm-exporter
|
||||
|
||||
go 1.17
|
||||
|
||||
replace (
|
||||
k8s.io/api => k8s.io/api v0.20.2
|
||||
k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.20.2
|
||||
k8s.io/apimachinery => k8s.io/apimachinery v0.20.2
|
||||
k8s.io/apiserver => k8s.io/apiserver v0.20.2
|
||||
k8s.io/cli-runtime => k8s.io/cli-runtime v0.20.2
|
||||
k8s.io/client-go => k8s.io/client-go v0.20.2
|
||||
k8s.io/cloud-provider => k8s.io/cloud-provider v0.20.2
|
||||
k8s.io/cluster-bootstrap => k8s.io/cluster-bootstrap v0.20.2
|
||||
k8s.io/code-generator => k8s.io/code-generator v0.20.2
|
||||
k8s.io/component-base => k8s.io/component-base v0.20.2
|
||||
k8s.io/cri-api => k8s.io/cri-api v0.20.2
|
||||
k8s.io/csi-translation-lib => k8s.io/csi-translation-lib v0.20.2
|
||||
k8s.io/kube-aggregator => k8s.io/kube-aggregator v0.20.2
|
||||
k8s.io/kube-controller-manager => k8s.io/kube-controller-manager v0.20.2
|
||||
k8s.io/kube-proxy => k8s.io/kube-proxy v0.20.2
|
||||
k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.20.2
|
||||
k8s.io/kubectl => k8s.io/kubectl v0.20.2
|
||||
k8s.io/kubelet => k8s.io/kubelet v0.20.2
|
||||
k8s.io/kubernetes => k8s.io/kubernetes v1.18.20
|
||||
k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.20.2
|
||||
k8s.io/metrics => k8s.io/metrics v0.20.2
|
||||
k8s.io/sample-apiserver => k8s.io/sample-apiserver v0.20.2
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/NVIDIA/go-dcgm v0.0.0-20230816170901-d898cc7820fe
|
||||
github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20211102125545-5a2c58442e48
|
||||
github.com/gorilla/mux v1.8.0
|
||||
github.com/sirupsen/logrus v1.9.0
|
||||
github.com/stretchr/testify v1.8.1
|
||||
github.com/urfave/cli/v2 v2.3.0
|
||||
google.golang.org/grpc v1.53.0
|
||||
k8s.io/api v0.20.2
|
||||
k8s.io/apimachinery v0.20.2
|
||||
k8s.io/client-go v0.20.2
|
||||
k8s.io/kubelet v0.20.2
|
||||
k8s.io/kubernetes v0.0.0-00010101000000-000000000000
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/Masterminds/semver v1.5.0 // indirect
|
||||
github.com/Microsoft/go-winio v0.4.14 // indirect
|
||||
github.com/bits-and-blooms/bitset v1.2.1 // indirect
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.0 // indirect
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/evanphx/json-patch v4.9.0+incompatible // indirect
|
||||
github.com/go-logr/logr v0.2.0 // indirect
|
||||
github.com/gogo/protobuf v1.3.2 // indirect
|
||||
github.com/golang/protobuf v1.5.2 // indirect
|
||||
github.com/google/gofuzz v1.1.0 // indirect
|
||||
github.com/googleapis/gnostic v0.4.1 // indirect
|
||||
github.com/json-iterator/go v1.1.10 // indirect
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
|
||||
github.com/modern-go/reflect2 v1.0.1 // indirect
|
||||
github.com/pkg/errors v0.9.1 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
github.com/russross/blackfriday/v2 v2.0.1 // indirect
|
||||
github.com/shurcooL/sanitized_anchor_name v1.0.0 // indirect
|
||||
golang.org/x/crypto v0.7.0 // indirect
|
||||
golang.org/x/net v0.8.0 // indirect
|
||||
golang.org/x/oauth2 v0.4.0 // indirect
|
||||
golang.org/x/sys v0.6.0 // indirect
|
||||
golang.org/x/term v0.6.0 // indirect
|
||||
golang.org/x/text v0.8.0 // indirect
|
||||
golang.org/x/time v0.1.0 // indirect
|
||||
google.golang.org/appengine v1.6.7 // indirect
|
||||
google.golang.org/genproto v0.0.0-20230110181048-76db0878b65f // indirect
|
||||
google.golang.org/protobuf v1.28.1 // indirect
|
||||
gopkg.in/inf.v0 v0.9.1 // indirect
|
||||
gopkg.in/yaml.v2 v2.2.8 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
k8s.io/klog v1.0.0 // indirect
|
||||
k8s.io/klog/v2 v2.4.0 // indirect
|
||||
k8s.io/kube-openapi v0.0.0-20201113171705-d219536bb9fd // indirect
|
||||
k8s.io/utils v0.0.0-20201110183641-67b214c5f920 // indirect
|
||||
sigs.k8s.io/structured-merge-diff/v4 v4.0.2 // indirect
|
||||
sigs.k8s.io/yaml v1.2.0 // indirect
|
||||
)
|
@ -0,0 +1,822 @@
|
||||
{
|
||||
"__requires": [
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "gauge",
|
||||
"name": "Gauge",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "grafana",
|
||||
"id": "grafana",
|
||||
"name": "Grafana",
|
||||
"version": "6.7.3"
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "graph",
|
||||
"name": "Graph",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "datasource",
|
||||
"id": "prometheus",
|
||||
"name": "Prometheus",
|
||||
"version": "1.0.0"
|
||||
}
|
||||
],
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"$$hashKey": "object:192",
|
||||
"builtIn": 1,
|
||||
"datasource": "-- Grafana --",
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "This dashboard is to display the metrics from DCGM Exporter on a Kubernetes (1.19+) cluster",
|
||||
"editable": true,
|
||||
"gnetId": 12239,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"iteration": 1588401887165,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 18,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 12,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 2,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"dataLinks": []
|
||||
},
|
||||
"percentage": false,
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\", gpu=~\"$gpu\"}",
|
||||
"instant": false,
|
||||
"interval": "",
|
||||
"legendFormat": "GPU {{gpu}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "GPU Temperature",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "celsius",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"datasource": "$datasource",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 0
|
||||
},
|
||||
"id": 14,
|
||||
"options": {
|
||||
"fieldOptions": {
|
||||
"calcs": [
|
||||
"mean"
|
||||
],
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "#EAB839",
|
||||
"value": 83
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 87
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "celsius"
|
||||
},
|
||||
"overrides": [],
|
||||
"values": false
|
||||
},
|
||||
"orientation": "auto",
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"pluginVersion": "6.7.3",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\", gpu=~\"$gpu\"})",
|
||||
"interval": "",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "GPU Avg. Temp",
|
||||
"type": "gauge"
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 18,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 10,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 2,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"dataLinks": []
|
||||
},
|
||||
"percentage": false,
|
||||
"pluginVersion": "6.5.2",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "DCGM_FI_DEV_POWER_USAGE{instance=~\"$instance\", gpu=~\"$gpu\"}",
|
||||
"interval": "",
|
||||
"legendFormat": "GPU {{gpu}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "GPU Power Usage",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "watt",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"cacheTimeout": null,
|
||||
"datasource": "$datasource",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 8
|
||||
},
|
||||
"id": 16,
|
||||
"links": [],
|
||||
"options": {
|
||||
"fieldOptions": {
|
||||
"calcs": [
|
||||
"sum"
|
||||
],
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 2400,
|
||||
"min": 0,
|
||||
"nullValueMode": "connected",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "#EAB839",
|
||||
"value": 1800
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 2200
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "watt"
|
||||
},
|
||||
"overrides": [],
|
||||
"values": false
|
||||
},
|
||||
"orientation": "horizontal",
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"pluginVersion": "6.7.3",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance=~\"$instance\", gpu=~\"$gpu\"})",
|
||||
"instant": true,
|
||||
"interval": "",
|
||||
"legendFormat": "",
|
||||
"range": false,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "GPU Power Total",
|
||||
"type": "gauge"
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 16
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 2,
|
||||
"interval": "",
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 2,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"dataLinks": []
|
||||
},
|
||||
"percentage": false,
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "DCGM_FI_DEV_SM_CLOCK{instance=~\"$instance\", gpu=~\"$gpu\"} * 1000000",
|
||||
"format": "time_series",
|
||||
"interval": "",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "GPU {{gpu}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "GPU SM Clocks",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"decimals": null,
|
||||
"format": "hertz",
|
||||
"label": "",
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 24
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 6,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 2,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"dataLinks": []
|
||||
},
|
||||
"percentage": false,
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"$instance\", gpu=~\"$gpu\"}",
|
||||
"interval": "",
|
||||
"legendFormat": "GPU {{gpu}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "GPU Utilization",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "cumulative"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percent",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": "100",
|
||||
"min": "0",
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 32
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 18,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 2,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"dataLinks": []
|
||||
},
|
||||
"percentage": false,
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "DCGM_FI_DEV_FB_USED{instance=~\"$instance\", gpu=~\"$gpu\"}",
|
||||
"interval": "",
|
||||
"legendFormat": "GPU {{gpu}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "GPU Framebuffer Mem Used",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "decmbytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 24
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 4,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 2,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"dataLinks": []
|
||||
},
|
||||
"percentage": false,
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{instance=~\"$instance\", gpu=~\"$gpu\"}",
|
||||
"interval": "",
|
||||
"legendFormat": "GPU {{gpu}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "Tensor Core Utilization",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "cumulative"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percentunit",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": "1",
|
||||
"min": "0",
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
}
|
||||
],
|
||||
"refresh": false,
|
||||
"schemaVersion": 22,
|
||||
"style": "dark",
|
||||
"tags": [],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"selected": true,
|
||||
"text": "Prometheus",
|
||||
"value": "Prometheus"
|
||||
},
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"multi": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"queryValue": "",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {},
|
||||
"datasource": "$datasource",
|
||||
"definition": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"index": -1,
|
||||
"label": null,
|
||||
"multi": true,
|
||||
"name": "instance",
|
||||
"options": [],
|
||||
"query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {},
|
||||
"datasource": "$datasource",
|
||||
"definition": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"index": -1,
|
||||
"label": null,
|
||||
"multi": true,
|
||||
"name": "gpu",
|
||||
"options": [],
|
||||
"query": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-15m",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
]
|
||||
},
|
||||
"timezone": "",
|
||||
"title": "NVIDIA DCGM Exporter Dashboard",
|
||||
"uid": "Oxed_c6Wz",
|
||||
"variables": {
|
||||
"list": []
|
||||
},
|
||||
"version": 1
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
FROM ubuntu:16.04
|
||||
|
||||
ARG DCGM_VERSION
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends libgomp1 \
|
||||
ca-certificates wget && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN wget https://developer.download.nvidia.com/compute/redist/dcgm/${DCGM_VERSION}/DEBS/datacenter-gpu-manager_${DCGM_VERSION}_amd64.deb && \
|
||||
dpkg -i datacenter-gpu-manager_*.deb && \
|
||||
rm -f datacenter-gpu-manager_*.deb
|
||||
|
||||
COPY dcgm-exporter /usr/local/bin
|
||||
|
||||
ENV NVIDIA_VISIBLE_DEVICES all
|
||||
ENV NVIDIA_DRIVER_CAPABILITIES all
|
||||
|
||||
VOLUME /run/prometheus
|
||||
|
||||
ENTRYPOINT [ "dcgm-exporter", "-e" ]
|
@ -0,0 +1,113 @@
|
||||
/*
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dcgmexporter
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math/rand"
|
||||
|
||||
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
|
||||
)
|
||||
|
||||
func NewGroup() (dcgm.GroupHandle, func(), error) {
|
||||
group, err := dcgm.NewDefaultGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64()))
|
||||
if err != nil {
|
||||
return dcgm.GroupHandle{}, func() {}, err
|
||||
}
|
||||
|
||||
return group, func() { dcgm.DestroyGroup(group) }, nil
|
||||
}
|
||||
|
||||
func NewDeviceFields(counters []Counter, entityType dcgm.Field_Entity_Group) []dcgm.Short {
|
||||
var deviceFields []dcgm.Short
|
||||
for _, f := range counters {
|
||||
meta := dcgm.FieldGetById(f.FieldID)
|
||||
|
||||
if meta.EntityLevel == entityType || meta.EntityLevel == dcgm.FE_NONE {
|
||||
deviceFields = append(deviceFields, f.FieldID)
|
||||
} else if entityType == dcgm.FE_GPU && (meta.EntityLevel == dcgm.FE_GPU_CI || meta.EntityLevel == dcgm.FE_GPU_I || meta.EntityLevel == dcgm.FE_VGPU) {
|
||||
deviceFields = append(deviceFields, f.FieldID)
|
||||
}
|
||||
}
|
||||
|
||||
return deviceFields
|
||||
}
|
||||
|
||||
func NewFieldGroup(deviceFields []dcgm.Short) (dcgm.FieldHandle, func(), error) {
|
||||
name := fmt.Sprintf("gpu-collector-fieldgroup-%d", rand.Uint64())
|
||||
fieldGroup, err := dcgm.FieldGroupCreate(name, deviceFields)
|
||||
if err != nil {
|
||||
return dcgm.FieldHandle{}, func() {}, err
|
||||
}
|
||||
|
||||
return fieldGroup, func() { dcgm.FieldGroupDestroy(fieldGroup) }, nil
|
||||
}
|
||||
|
||||
func WatchFieldGroup(group dcgm.GroupHandle, field dcgm.FieldHandle, updateFreq int64, maxKeepAge float64, maxKeepSamples int32) error {
|
||||
err := dcgm.WatchFieldsWithGroupEx(field, group, updateFreq, maxKeepAge, maxKeepSamples)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func SetupDcgmFieldsWatch(deviceFields []dcgm.Short, sysInfo SystemInfo, collectIntervalUsec int64) ([]func(), error) {
|
||||
var err error
|
||||
var cleanups []func()
|
||||
var cleanup func()
|
||||
var groups []dcgm.GroupHandle
|
||||
var fieldGroup dcgm.FieldHandle
|
||||
|
||||
if sysInfo.InfoType == dcgm.FE_LINK {
|
||||
/* one group per-nvswitch is created for nvlinks */
|
||||
groups, cleanups, err = CreateLinkGroupsFromSystemInfo(sysInfo)
|
||||
} else {
|
||||
group, cleanup, err := CreateGroupFromSystemInfo(sysInfo)
|
||||
if err == nil {
|
||||
groups = append(groups, group)
|
||||
cleanups = append(cleanups, cleanup)
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
goto fail
|
||||
}
|
||||
|
||||
for _, gr := range groups {
|
||||
fieldGroup, cleanup, err = NewFieldGroup(deviceFields)
|
||||
if err != nil {
|
||||
goto fail
|
||||
}
|
||||
|
||||
cleanups = append(cleanups, cleanup)
|
||||
|
||||
err = WatchFieldGroup(gr, fieldGroup, collectIntervalUsec, 0.0, 1)
|
||||
if err != nil {
|
||||
goto fail
|
||||
}
|
||||
}
|
||||
|
||||
return cleanups, nil
|
||||
|
||||
fail:
|
||||
for _, f := range cleanups {
|
||||
f()
|
||||
}
|
||||
|
||||
return nil, err
|
||||
}
|
@ -0,0 +1,269 @@
|
||||
/*
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dcgmexporter
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
func NewDCGMCollector(c []Counter, config *Config, entityType dcgm.Field_Entity_Group) (*DCGMCollector, func(), error) {
|
||||
sysInfo, err := InitializeSystemInfo(config.GPUDevices, config.SwitchDevices, config.UseFakeGpus, entityType)
|
||||
if err != nil {
|
||||
return nil, func() {}, err
|
||||
}
|
||||
|
||||
hostname := ""
|
||||
if config.NoHostname == false {
|
||||
if nodeName := os.Getenv("NODE_NAME"); nodeName != "" {
|
||||
hostname = nodeName
|
||||
} else {
|
||||
hostname, err = os.Hostname()
|
||||
if err != nil {
|
||||
return nil, func() {}, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var deviceFields = NewDeviceFields(c, entityType)
|
||||
|
||||
if len(deviceFields) <= 0 {
|
||||
return nil, func() {}, fmt.Errorf("No fields to watch for device type: %d", entityType)
|
||||
}
|
||||
|
||||
collector := &DCGMCollector{
|
||||
Counters: c,
|
||||
DeviceFields: deviceFields,
|
||||
UseOldNamespace: config.UseOldNamespace,
|
||||
SysInfo: sysInfo,
|
||||
Hostname: hostname,
|
||||
}
|
||||
|
||||
cleanups, err := SetupDcgmFieldsWatch(collector.DeviceFields, sysInfo, int64(config.CollectInterval)*1000)
|
||||
if err != nil {
|
||||
logrus.Fatal("Failed to watch metrics: ", err)
|
||||
}
|
||||
|
||||
collector.Cleanups = cleanups
|
||||
|
||||
return collector, func() { collector.Cleanup() }, nil
|
||||
}
|
||||
|
||||
func (c *DCGMCollector) Cleanup() {
|
||||
for _, c := range c.Cleanups {
|
||||
c()
|
||||
}
|
||||
}
|
||||
|
||||
func (c *DCGMCollector) GetMetrics() ([][]Metric, error) {
|
||||
monitoringInfo := GetMonitoredEntities(c.SysInfo)
|
||||
count := len(monitoringInfo)
|
||||
|
||||
metrics := make([][]Metric, count)
|
||||
|
||||
for i, mi := range monitoringInfo {
|
||||
var vals []dcgm.FieldValue_v1
|
||||
var err error
|
||||
if mi.Entity.EntityGroupId == dcgm.FE_LINK {
|
||||
vals, err = dcgm.LinkGetLatestValues(mi.Entity.EntityId, mi.ParentId, c.DeviceFields)
|
||||
} else {
|
||||
vals, err = dcgm.EntityGetLatestValues(mi.Entity.EntityGroupId, mi.Entity.EntityId, c.DeviceFields)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
if derr, ok := err.(*dcgm.DcgmError); ok {
|
||||
if derr.Code == dcgm.DCGM_ST_CONNECTION_NOT_VALID {
|
||||
logrus.Fatal("Could not retrieve metrics: ", err)
|
||||
}
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// InstanceInfo will be nil for GPUs
|
||||
if c.SysInfo.InfoType == dcgm.FE_SWITCH || c.SysInfo.InfoType == dcgm.FE_LINK {
|
||||
metrics[i] = ToSwitchMetric(vals, c.Counters, mi, c.UseOldNamespace, c.Hostname)
|
||||
} else {
|
||||
metrics[i] = ToMetric(vals, c.Counters, mi.DeviceInfo, mi.InstanceInfo, c.UseOldNamespace, c.Hostname)
|
||||
}
|
||||
}
|
||||
|
||||
return metrics, nil
|
||||
}
|
||||
|
||||
func FindCounterField(c []Counter, fieldId uint) (*Counter, error) {
|
||||
for i := 0; i < len(c); i++ {
|
||||
if uint(c[i].FieldID) == fieldId {
|
||||
return &c[i], nil
|
||||
}
|
||||
}
|
||||
|
||||
return &c[0], fmt.Errorf("Could not find corresponding counter")
|
||||
}
|
||||
|
||||
func ToSwitchMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) []Metric {
|
||||
var metrics []Metric
|
||||
var labels = map[string]string{}
|
||||
|
||||
for _, val := range values {
|
||||
v := ToString(val)
|
||||
// Filter out counters with no value and ignored fields for this entity
|
||||
|
||||
counter, err := FindCounterField(c, val.FieldId)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if counter.PromType == "label" {
|
||||
labels[counter.FieldName] = v
|
||||
continue
|
||||
}
|
||||
uuid := "UUID"
|
||||
if useOld {
|
||||
uuid = "uuid"
|
||||
}
|
||||
var m Metric
|
||||
if v == SkipDCGMValue {
|
||||
continue
|
||||
} else {
|
||||
m = Metric{
|
||||
Counter: counter,
|
||||
Value: v,
|
||||
UUID: uuid,
|
||||
GPU: fmt.Sprintf("%d", mi.Entity.EntityId),
|
||||
GPUUUID: "",
|
||||
GPUDevice: fmt.Sprintf("nvswitch%d", mi.ParentId),
|
||||
GPUModelName: "",
|
||||
Hostname: hostname,
|
||||
Labels: &labels,
|
||||
Attributes: nil,
|
||||
}
|
||||
}
|
||||
metrics = append(metrics, m)
|
||||
}
|
||||
|
||||
return metrics
|
||||
}
|
||||
|
||||
func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceInfo *GpuInstanceInfo, useOld bool, hostname string) []Metric {
|
||||
var metrics []Metric
|
||||
var labels = map[string]string{}
|
||||
|
||||
for _, val := range values {
|
||||
v := ToString(val)
|
||||
// Filter out counters with no value and ignored fields for this entity
|
||||
if v == SkipDCGMValue {
|
||||
continue
|
||||
}
|
||||
|
||||
counter, err := FindCounterField(c, val.FieldId)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if counter.PromType == "label" {
|
||||
labels[counter.FieldName] = v
|
||||
continue
|
||||
}
|
||||
uuid := "UUID"
|
||||
if useOld {
|
||||
uuid = "uuid"
|
||||
}
|
||||
m := Metric{
|
||||
Counter: counter,
|
||||
Value: v,
|
||||
|
||||
UUID: uuid,
|
||||
GPU: fmt.Sprintf("%d", d.GPU),
|
||||
GPUUUID: d.UUID,
|
||||
GPUDevice: fmt.Sprintf("nvidia%d", d.GPU),
|
||||
GPUModelName: d.Identifiers.Model,
|
||||
Hostname: hostname,
|
||||
|
||||
Labels: &labels,
|
||||
Attributes: map[string]string{},
|
||||
}
|
||||
if instanceInfo != nil {
|
||||
m.MigProfile = instanceInfo.ProfileName
|
||||
m.GPUInstanceID = fmt.Sprintf("%d", instanceInfo.Info.NvmlInstanceId)
|
||||
} else {
|
||||
m.MigProfile = ""
|
||||
m.GPUInstanceID = ""
|
||||
}
|
||||
metrics = append(metrics, m)
|
||||
}
|
||||
|
||||
return metrics
|
||||
}
|
||||
|
||||
func ToString(value dcgm.FieldValue_v1) string {
|
||||
switch value.FieldType {
|
||||
case dcgm.DCGM_FT_INT64:
|
||||
switch v := value.Int64(); v {
|
||||
case dcgm.DCGM_FT_INT32_BLANK:
|
||||
return SkipDCGMValue
|
||||
case dcgm.DCGM_FT_INT32_NOT_FOUND:
|
||||
return SkipDCGMValue
|
||||
case dcgm.DCGM_FT_INT32_NOT_SUPPORTED:
|
||||
return SkipDCGMValue
|
||||
case dcgm.DCGM_FT_INT32_NOT_PERMISSIONED:
|
||||
return SkipDCGMValue
|
||||
case dcgm.DCGM_FT_INT64_BLANK:
|
||||
return SkipDCGMValue
|
||||
case dcgm.DCGM_FT_INT64_NOT_FOUND:
|
||||
return SkipDCGMValue
|
||||
case dcgm.DCGM_FT_INT64_NOT_SUPPORTED:
|
||||
return SkipDCGMValue
|
||||
case dcgm.DCGM_FT_INT64_NOT_PERMISSIONED:
|
||||
return SkipDCGMValue
|
||||
default:
|
||||
return fmt.Sprintf("%d", value.Int64())
|
||||
}
|
||||
case dcgm.DCGM_FT_DOUBLE:
|
||||
switch v := value.Float64(); v {
|
||||
case dcgm.DCGM_FT_FP64_BLANK:
|
||||
return SkipDCGMValue
|
||||
case dcgm.DCGM_FT_FP64_NOT_FOUND:
|
||||
return SkipDCGMValue
|
||||
case dcgm.DCGM_FT_FP64_NOT_SUPPORTED:
|
||||
return SkipDCGMValue
|
||||
case dcgm.DCGM_FT_FP64_NOT_PERMISSIONED:
|
||||
return SkipDCGMValue
|
||||
default:
|
||||
return fmt.Sprintf("%f", value.Float64())
|
||||
}
|
||||
case dcgm.DCGM_FT_STRING:
|
||||
switch v := value.String(); v {
|
||||
case dcgm.DCGM_FT_STR_BLANK:
|
||||
return SkipDCGMValue
|
||||
case dcgm.DCGM_FT_STR_NOT_FOUND:
|
||||
return SkipDCGMValue
|
||||
case dcgm.DCGM_FT_STR_NOT_SUPPORTED:
|
||||
return SkipDCGMValue
|
||||
case dcgm.DCGM_FT_STR_NOT_PERMISSIONED:
|
||||
return SkipDCGMValue
|
||||
default:
|
||||
return v
|
||||
}
|
||||
default:
|
||||
return FailedToConvert
|
||||
}
|
||||
|
||||
return FailedToConvert
|
||||
}
|
@ -0,0 +1,89 @@
|
||||
/*
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dcgmexporter
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
var sampleCounters = []Counter{
|
||||
{dcgm.DCGM_FI_DEV_GPU_TEMP, "DCGM_FI_DEV_GPU_TEMP", "gauge", "Temperature Help info"},
|
||||
{dcgm.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION", "gauge", "Energy help info"},
|
||||
{dcgm.DCGM_FI_DEV_POWER_USAGE, "DCGM_FI_DEV_POWER_USAGE", "gauge", "Power help info"},
|
||||
{dcgm.DCGM_FI_DRIVER_VERSION, "DCGM_FI_DRIVER_VERSION", "label", "Driver version"},
|
||||
/* test that switch and link metrics are filtered out automatically when devices are not detected */
|
||||
{dcgm.DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT, "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT", "gauge", "switch temperature"},
|
||||
{dcgm.DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS, "DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS", "gauge", "per-link flit errors"},
|
||||
/* test that vgpu metrics are not filtered out */
|
||||
{dcgm.DCGM_FI_DEV_VGPU_LICENSE_STATUS, "DCGM_FI_DEV_VGPU_LICENSE_STATUS", "gauge", "vgpu license status"},
|
||||
}
|
||||
|
||||
// expectedMetrics is the set of field names the GPU collector test expects to
// see per device when fed sampleCounters: the three GPU gauges plus the vGPU
// gauge. The label counter and the switch/link counters are absent.
var expectedMetrics = map[string]bool{
	"DCGM_FI_DEV_GPU_TEMP":                 true,
	"DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": true,
	"DCGM_FI_DEV_POWER_USAGE":              true,
	"DCGM_FI_DEV_VGPU_LICENSE_STATUS":      true,
}
|
||||
|
||||
func TestDCGMCollector(t *testing.T) {
|
||||
cleanup, err := dcgm.Init(dcgm.Embedded)
|
||||
require.NoError(t, err)
|
||||
defer cleanup()
|
||||
|
||||
_, cleanup = testDCGMCollector(t, sampleCounters)
|
||||
cleanup()
|
||||
}
|
||||
|
||||
func testDCGMCollector(t *testing.T, counters []Counter) (*DCGMCollector, func()) {
|
||||
dOpt := DeviceOptions{true, []int{-1}, []int{-1}}
|
||||
cfg := Config{
|
||||
GPUDevices: dOpt,
|
||||
NoHostname: false,
|
||||
UseOldNamespace: false,
|
||||
UseFakeGpus: false,
|
||||
}
|
||||
c, cleanup, err := NewDCGMCollector(counters, &cfg, dcgm.FE_GPU)
|
||||
require.NoError(t, err)
|
||||
|
||||
/* Test for error when no switches are available to monitor.
|
||||
NOTE: This test will fail on a system with switches present. */
|
||||
_, _, err = NewDCGMCollector(counters, &cfg, dcgm.FE_SWITCH)
|
||||
require.Error(t, err)
|
||||
|
||||
out, err := c.GetMetrics()
|
||||
require.NoError(t, err)
|
||||
require.Greater(t, len(out), 0, "Check that you have a GPU on this node")
|
||||
require.Len(t, out[0], len(expectedMetrics))
|
||||
|
||||
for i, dev := range out {
|
||||
seenMetrics := map[string]bool{}
|
||||
for _, metric := range dev {
|
||||
seenMetrics[metric.Counter.FieldName] = true
|
||||
require.Equal(t, metric.GPU, fmt.Sprintf("%d", i))
|
||||
|
||||
require.NotEmpty(t, metric.Value)
|
||||
require.NotEqual(t, metric.Value, FailedToConvert)
|
||||
}
|
||||
require.Equal(t, seenMetrics, expectedMetrics)
|
||||
}
|
||||
|
||||
return c, cleanup
|
||||
}
|
@ -0,0 +1,195 @@
|
||||
/*
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dcgmexporter
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
|
||||
"github.com/sirupsen/logrus"
|
||||
"google.golang.org/grpc"
|
||||
podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1alpha1"
|
||||
)
|
||||
|
||||
var (
	// socketDir and socketPath locate the kubelet pod-resources gRPC socket.
	// socketPath is a variable (not a constant) because the tests redirect it
	// to a mock server.
	socketDir  = "/var/lib/kubelet/pod-resources"
	socketPath = socketDir + "/kubelet.sock"

	// connectionTimeout bounds both dialing the kubelet and listing pods.
	connectionTimeout = 10 * time.Second

	// gkeMigDeviceIdRegex matches device ids of the form "nvidia<N>/gi<M>";
	// group 1 is the GPU index and group 2 the GPU instance id.
	gkeMigDeviceIdRegex = regexp.MustCompile(`^nvidia([0-9]+)/gi([0-9]+)$`)

	// gkeVirtualGPUDeviceIdSeparator separates the device id from the
	// virtual GPU suffix; only the part before it is used as the map key.
	gkeVirtualGPUDeviceIdSeparator = "/vgpu"
)
|
||||
|
||||
func NewPodMapper(c *Config) (*PodMapper, error) {
|
||||
logrus.Infof("Kubernetes metrics collection enabled!")
|
||||
|
||||
ret := nvml.Init()
|
||||
|
||||
if ret != nil {
|
||||
return nil, ret
|
||||
}
|
||||
|
||||
return &PodMapper{
|
||||
Config: c,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (p *PodMapper) Name() string {
|
||||
return "podMapper"
|
||||
}
|
||||
|
||||
func (p *PodMapper) Process(metrics [][]Metric, sysInfo SystemInfo) error {
|
||||
_, err := os.Stat(socketPath)
|
||||
if os.IsNotExist(err) {
|
||||
logrus.Infof("No Kubelet socket, ignoring")
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO: This needs to be moved out of the critical path.
|
||||
c, cleanup, err := connectToServer(socketPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer cleanup()
|
||||
|
||||
pods, err := ListPods(c)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
deviceToPod := ToDeviceToPod(pods, sysInfo)
|
||||
|
||||
// Note: for loop are copies the value, if we want to change the value
|
||||
// and not the copy, we need to use the indexes
|
||||
for i, device := range metrics {
|
||||
for j, val := range device {
|
||||
deviceId, err := val.getIDOfType(p.Config.KubernetesGPUIdType)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !p.Config.UseOldNamespace {
|
||||
metrics[i][j].Attributes[podAttribute] = deviceToPod[deviceId].Name
|
||||
metrics[i][j].Attributes[namespaceAttribute] = deviceToPod[deviceId].Namespace
|
||||
metrics[i][j].Attributes[containerAttribute] = deviceToPod[deviceId].Container
|
||||
} else {
|
||||
metrics[i][j].Attributes[oldPodAttribute] = deviceToPod[deviceId].Name
|
||||
metrics[i][j].Attributes[oldNamespaceAttribute] = deviceToPod[deviceId].Namespace
|
||||
metrics[i][j].Attributes[oldContainerAttribute] = deviceToPod[deviceId].Container
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func connectToServer(socket string) (*grpc.ClientConn, func(), error) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), connectionTimeout)
|
||||
defer cancel()
|
||||
|
||||
conn, err := grpc.DialContext(ctx, socket, grpc.WithInsecure(), grpc.WithBlock(),
|
||||
grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
|
||||
return net.DialTimeout("unix", addr, timeout)
|
||||
}),
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
return nil, func() {}, fmt.Errorf("failure connecting to %s: %v", socket, err)
|
||||
}
|
||||
|
||||
return conn, func() { conn.Close() }, nil
|
||||
}
|
||||
|
||||
func ListPods(conn *grpc.ClientConn) (*podresourcesapi.ListPodResourcesResponse, error) {
|
||||
client := podresourcesapi.NewPodResourcesListerClient(conn)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), connectionTimeout)
|
||||
defer cancel()
|
||||
|
||||
resp, err := client.List(ctx, &podresourcesapi.ListPodResourcesRequest{})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failure getting pod resources %v", err)
|
||||
}
|
||||
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
func ToDeviceToPod(devicePods *podresourcesapi.ListPodResourcesResponse, sysInfo SystemInfo) map[string]PodInfo {
|
||||
deviceToPodMap := make(map[string]PodInfo)
|
||||
|
||||
for _, pod := range devicePods.GetPodResources() {
|
||||
for _, container := range pod.GetContainers() {
|
||||
for _, device := range container.GetDevices() {
|
||||
|
||||
resourceName := device.GetResourceName()
|
||||
if resourceName != nvidiaResourceName {
|
||||
// Mig resources appear differently than GPU resources
|
||||
if strings.HasPrefix(resourceName, nvidiaMigResourcePrefix) == false {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
podInfo := PodInfo{
|
||||
Name: pod.GetName(),
|
||||
Namespace: pod.GetNamespace(),
|
||||
Container: container.GetName(),
|
||||
}
|
||||
|
||||
for _, deviceid := range device.GetDeviceIds() {
|
||||
if strings.HasPrefix(deviceid, MIG_UUID_PREFIX) {
|
||||
gpuUuid, gi, _, err := nvml.ParseMigDeviceUUID(deviceid)
|
||||
if err == nil {
|
||||
giIdentifier := GetGpuInstanceIdentifier(sysInfo, gpuUuid, gi)
|
||||
deviceToPodMap[giIdentifier] = podInfo
|
||||
} else {
|
||||
gpuUuid = deviceid[len(MIG_UUID_PREFIX):]
|
||||
}
|
||||
deviceToPodMap[gpuUuid] = podInfo
|
||||
} else if gkeMigDeviceIdMatches := gkeMigDeviceIdRegex.FindStringSubmatch(deviceid); gkeMigDeviceIdMatches != nil {
|
||||
var gpuIndex string
|
||||
var gpuInstanceId string
|
||||
for groupIdx, group := range gkeMigDeviceIdMatches {
|
||||
switch groupIdx {
|
||||
case 1:
|
||||
gpuIndex = group
|
||||
case 2:
|
||||
gpuInstanceId = group
|
||||
}
|
||||
}
|
||||
giIdentifier := fmt.Sprintf("%s-%s", gpuIndex, gpuInstanceId)
|
||||
deviceToPodMap[giIdentifier] = podInfo
|
||||
} else if strings.Contains(deviceid, gkeVirtualGPUDeviceIdSeparator) {
|
||||
deviceToPodMap[strings.Split(deviceid, gkeVirtualGPUDeviceIdSeparator)[0]] = podInfo
|
||||
} else if strings.Contains(deviceid, "::") {
|
||||
gpuInstanceId := strings.Split(deviceid, "::")[0]
|
||||
deviceToPodMap[gpuInstanceId] = podInfo
|
||||
} else {
|
||||
deviceToPodMap[deviceid] = podInfo
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return deviceToPodMap
|
||||
}
|
@ -0,0 +1,159 @@
|
||||
/*
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dcgmexporter
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
|
||||
"github.com/stretchr/testify/require"
|
||||
"google.golang.org/grpc"
|
||||
podresourcesapi "k8s.io/kubernetes/pkg/kubelet/apis/podresources/v1alpha1"
|
||||
"k8s.io/kubernetes/pkg/kubelet/util"
|
||||
)
|
||||
|
||||
// tmpDir holds the scratch directory created by CreateTmpDir; TestProcessPodMapper places the mock kubelet socket inside it.
var tmpDir string
|
||||
|
||||
func TestProcessPodMapper(t *testing.T) {
|
||||
cleanup := CreateTmpDir(t)
|
||||
defer cleanup()
|
||||
|
||||
cleanup, err := dcgm.Init(dcgm.Embedded)
|
||||
require.NoError(t, err)
|
||||
defer cleanup()
|
||||
|
||||
c, cleanup := testDCGMCollector(t, sampleCounters)
|
||||
defer cleanup()
|
||||
|
||||
out, err := c.GetMetrics()
|
||||
require.NoError(t, err)
|
||||
original := append(out[:0:0], out...)
|
||||
|
||||
socketPath = tmpDir + "/kubelet.sock"
|
||||
server := grpc.NewServer()
|
||||
gpus := GetGPUUUIDs(original)
|
||||
podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(gpus))
|
||||
|
||||
cleanup = StartMockServer(t, server, socketPath)
|
||||
defer cleanup()
|
||||
|
||||
podMapper, err := NewPodMapper(&Config{KubernetesGPUIdType: GPUUID})
|
||||
require.NoError(t, err)
|
||||
var sysInfo SystemInfo
|
||||
err = podMapper.Process(out, sysInfo)
|
||||
require.NoError(t, err)
|
||||
|
||||
require.Len(t, out, len(original))
|
||||
for i, dev := range out {
|
||||
for _, metric := range dev {
|
||||
require.Contains(t, metric.Attributes, podAttribute)
|
||||
require.Contains(t, metric.Attributes, namespaceAttribute)
|
||||
require.Contains(t, metric.Attributes, containerAttribute)
|
||||
|
||||
// TODO currently we rely on ordering and implicit expectations of the mock implementation
|
||||
// This should be a table comparison
|
||||
require.Equal(t, metric.Attributes[podAttribute], fmt.Sprintf("gpu-pod-%d", i))
|
||||
require.Equal(t, metric.Attributes[namespaceAttribute], "default")
|
||||
require.Equal(t, metric.Attributes[containerAttribute], "default")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func GetGPUUUIDs(metrics [][]Metric) []string {
|
||||
gpus := make([]string, len(metrics))
|
||||
for i, dev := range metrics {
|
||||
gpus[i] = dev[0].GPUUUID
|
||||
}
|
||||
|
||||
return gpus
|
||||
}
|
||||
|
||||
func StartMockServer(t *testing.T, server *grpc.Server, socket string) func() {
|
||||
l, err := util.CreateListener("unix://" + socket)
|
||||
require.NoError(t, err)
|
||||
|
||||
stopped := make(chan interface{})
|
||||
|
||||
go func() {
|
||||
server.Serve(l)
|
||||
close(stopped)
|
||||
}()
|
||||
|
||||
return func() {
|
||||
server.Stop()
|
||||
select {
|
||||
case <-stopped:
|
||||
return
|
||||
case <-time.After(1 * time.Second):
|
||||
t.Fatal("Failed waiting for gRPC server to stop")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func CreateTmpDir(t *testing.T) func() {
|
||||
path, err := ioutil.TempDir("", "dcgm-exporter")
|
||||
require.NoError(t, err)
|
||||
|
||||
tmpDir = path
|
||||
|
||||
return func() {
|
||||
require.NoError(t, os.RemoveAll(tmpDir))
|
||||
}
|
||||
}
|
||||
|
||||
// PodResourcesMockServer is a pod-resources lister stub for tests. It holds
// the list of GPU UUIDs that its List method reports, one pod per UUID.
type PodResourcesMockServer struct {
	gpus []string
}
|
||||
|
||||
func NewPodResourcesMockServer(used []string) *PodResourcesMockServer {
|
||||
return &PodResourcesMockServer{
|
||||
gpus: used,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *PodResourcesMockServer) List(ctx context.Context, req *podresourcesapi.ListPodResourcesRequest) (*podresourcesapi.ListPodResourcesResponse, error) {
|
||||
podResources := make([]*podresourcesapi.PodResources, len(s.gpus))
|
||||
|
||||
for i, gpu := range s.gpus {
|
||||
podResources[i] = &podresourcesapi.PodResources{
|
||||
Name: fmt.Sprintf("gpu-pod-%d", i),
|
||||
Namespace: "default",
|
||||
Containers: []*podresourcesapi.ContainerResources{
|
||||
&podresourcesapi.ContainerResources{
|
||||
Name: "default",
|
||||
Devices: []*podresourcesapi.ContainerDevices{
|
||||
&podresourcesapi.ContainerDevices{
|
||||
ResourceName: nvidiaResourceName,
|
||||
DeviceIds: []string{gpu},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
return &podresourcesapi.ListPodResourcesResponse{
|
||||
PodResources: podResources,
|
||||
}, nil
|
||||
|
||||
}
|
@ -0,0 +1,201 @@
|
||||
/*
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dcgmexporter
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
|
||||
"github.com/sirupsen/logrus"
|
||||
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/client-go/kubernetes"
|
||||
"k8s.io/client-go/rest"
|
||||
)
|
||||
|
||||
func ExtractCounters(c *Config) ([]Counter, error) {
|
||||
var err error
|
||||
var records [][]string
|
||||
|
||||
if c.ConfigMapData != undefinedConfigMapData {
|
||||
var client kubernetes.Interface
|
||||
client, err = getKubeClient()
|
||||
if err != nil {
|
||||
logrus.Fatal(err)
|
||||
}
|
||||
records, err = readConfigMap(client, c)
|
||||
if err != nil {
|
||||
logrus.Fatal(err)
|
||||
}
|
||||
} else {
|
||||
err = fmt.Errorf("No configmap data specified")
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
logrus.Infof("%v, falling back to metric file %s", err, c.CollectorsFile)
|
||||
|
||||
records, err = ReadCSVFile(c.CollectorsFile)
|
||||
if err != nil {
|
||||
logrus.Errorf("Could not read metrics file '%s': %v\n", c.CollectorsFile, err)
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
counters, err := extractCounters(records, c)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return counters, err
|
||||
}
|
||||
|
||||
func ReadCSVFile(filename string) ([][]string, error) {
|
||||
file, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
defer file.Close()
|
||||
|
||||
r := csv.NewReader(file)
|
||||
r.Comment = '#'
|
||||
records, err := r.ReadAll()
|
||||
|
||||
return records, err
|
||||
}
|
||||
|
||||
func extractCounters(records [][]string, c *Config) ([]Counter, error) {
|
||||
f := make([]Counter, 0, len(records))
|
||||
|
||||
for i, record := range records {
|
||||
var useOld = false
|
||||
if len(record) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
for j, r := range record {
|
||||
record[j] = strings.Trim(r, " ")
|
||||
}
|
||||
|
||||
if len(record) != 3 {
|
||||
return nil, fmt.Errorf("Malformed CSV record, failed to parse line %d (`%v`), expected 3 fields", i, record)
|
||||
}
|
||||
|
||||
fieldID, ok := dcgm.DCGM_FI[record[0]]
|
||||
oldFieldID, oldOk := dcgm.OLD_DCGM_FI[record[0]]
|
||||
if !ok && !oldOk {
|
||||
return nil, fmt.Errorf("Could not find DCGM field %s", record[0])
|
||||
}
|
||||
|
||||
if !ok && oldOk {
|
||||
useOld = true
|
||||
}
|
||||
|
||||
if !useOld {
|
||||
if !fieldIsSupported(uint(fieldID), c) {
|
||||
logrus.Warnf("Skipping line %d ('%s'): metric not enabled", i, record[0])
|
||||
continue
|
||||
}
|
||||
|
||||
if _, ok := promMetricType[record[1]]; !ok {
|
||||
return nil, fmt.Errorf("Could not find Prometheus metric type %s", record[1])
|
||||
}
|
||||
|
||||
f = append(f, Counter{fieldID, record[0], record[1], record[2]})
|
||||
} else {
|
||||
if !fieldIsSupported(uint(oldFieldID), c) {
|
||||
logrus.Warnf("Skipping line %d ('%s'): metric not enabled", i, record[0])
|
||||
continue
|
||||
}
|
||||
|
||||
if _, ok := promMetricType[record[1]]; !ok {
|
||||
return nil, fmt.Errorf("Could not find Prometheus metric type %s", record[1])
|
||||
}
|
||||
|
||||
f = append(f, Counter{oldFieldID, record[0], record[1], record[2]})
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return f, nil
|
||||
}
|
||||
|
||||
func fieldIsSupported(fieldID uint, c *Config) bool {
|
||||
if fieldID < 1000 {
|
||||
return true
|
||||
}
|
||||
|
||||
if !c.CollectDCP {
|
||||
return false
|
||||
}
|
||||
|
||||
for i := int(0); i < len(c.MetricGroups); i++ {
|
||||
for j := int(0); j < len(c.MetricGroups[i].FieldIds); j++ {
|
||||
if fieldID == c.MetricGroups[i].FieldIds[j] {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func readConfigMap(kubeClient kubernetes.Interface, c *Config) ([][]string, error) {
|
||||
parts := strings.Split(c.ConfigMapData, ":")
|
||||
if len(parts) != 2 {
|
||||
return nil, fmt.Errorf("Malformed configmap-data: %s", c.ConfigMapData)
|
||||
}
|
||||
|
||||
var cm *corev1.ConfigMap
|
||||
cm, err := kubeClient.CoreV1().ConfigMaps(parts[0]).Get(context.TODO(), parts[1], metav1.GetOptions{})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Could not retrieve ConfigMap '%s': %v", c.ConfigMapData, err)
|
||||
}
|
||||
|
||||
if _, ok := cm.Data["metrics"]; !ok {
|
||||
return nil, fmt.Errorf("Malformed ConfigMap '%s': no 'metrics' key", c.ConfigMapData)
|
||||
}
|
||||
|
||||
r := csv.NewReader(strings.NewReader(cm.Data["metrics"]))
|
||||
r.Comment = '#'
|
||||
records, err := r.ReadAll()
|
||||
|
||||
if len(records) == 0 {
|
||||
return nil, fmt.Errorf("Malformed configmap contents. No metrics found")
|
||||
}
|
||||
|
||||
return records, err
|
||||
}
|
||||
|
||||
func getKubeClient() (kubernetes.Interface, error) {
|
||||
config, err := rest.InClusterConfig()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
client, err := kubernetes.NewForConfig(config)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return client, err
|
||||
}
|
@ -0,0 +1,129 @@
|
||||
package dcgmexporter
|
||||
|
||||
import (
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/client-go/kubernetes/fake"
|
||||
)
|
||||
|
||||
func TestEmptyConfigMap(t *testing.T) {
|
||||
// ConfigMap matches criteria but is empty
|
||||
clientset := fake.NewSimpleClientset(&v1.ConfigMap{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "configmap1",
|
||||
Namespace: "default",
|
||||
},
|
||||
Data: map[string]string{"metrics": ""},
|
||||
})
|
||||
|
||||
c := Config{
|
||||
ConfigMapData: "default:configmap1",
|
||||
}
|
||||
records, err := readConfigMap(clientset, &c)
|
||||
if len(records) != 0 || err == nil {
|
||||
t.Fatalf("Should have returned an error and no records")
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidConfigMap(t *testing.T) {
|
||||
clientset := fake.NewSimpleClientset(&v1.ConfigMap{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "configmap1",
|
||||
Namespace: "default",
|
||||
},
|
||||
Data: map[string]string{"metrics": "DCGM_FI_DEV_GPU_TEMP, gauge, temperature"},
|
||||
})
|
||||
|
||||
c := Config{
|
||||
ConfigMapData: "default:configmap1",
|
||||
}
|
||||
records, err := readConfigMap(clientset, &c)
|
||||
if len(records) != 1 || err != nil {
|
||||
t.Fatalf("Should have succeeded")
|
||||
}
|
||||
}
|
||||
|
||||
func TestInvalidConfigMapData(t *testing.T) {
|
||||
clientset := fake.NewSimpleClientset(&v1.ConfigMap{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "configmap1",
|
||||
Namespace: "default",
|
||||
},
|
||||
Data: map[string]string{"bad": "DCGM_FI_DEV_GPU_TEMP, gauge, temperature"},
|
||||
})
|
||||
|
||||
c := Config{
|
||||
ConfigMapData: "default:configmap1",
|
||||
}
|
||||
records, err := readConfigMap(clientset, &c)
|
||||
if len(records) != 0 || err == nil {
|
||||
t.Fatalf("Should have returned an error and no records")
|
||||
}
|
||||
}
|
||||
|
||||
func TestInvalidConfigMapName(t *testing.T) {
|
||||
clientset := fake.NewSimpleClientset(&v1.ConfigMap{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "configmap",
|
||||
Namespace: "default",
|
||||
},
|
||||
})
|
||||
|
||||
c := Config{
|
||||
ConfigMapData: "default:configmap1",
|
||||
}
|
||||
records, err := readConfigMap(clientset, &c)
|
||||
if len(records) != 0 || err == nil {
|
||||
t.Fatalf("Should have returned an error and no records")
|
||||
}
|
||||
}
|
||||
|
||||
func TestInvalidConfigMapNamespace(t *testing.T) {
|
||||
clientset := fake.NewSimpleClientset(&v1.ConfigMap{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "configmap",
|
||||
Namespace: "c1",
|
||||
},
|
||||
})
|
||||
|
||||
c := Config{
|
||||
ConfigMapData: "default:configmap1",
|
||||
}
|
||||
records, err := readConfigMap(clientset, &c)
|
||||
if len(records) != 0 || err == nil {
|
||||
t.Fatalf("Should have returned an error and no records")
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractCounters(t *testing.T) {
|
||||
tmpFile, err := ioutil.TempFile(os.TempDir(), "prefix-")
|
||||
if err != nil {
|
||||
t.Fatalf("Cannot create temporary file: %v", err)
|
||||
}
|
||||
|
||||
defer os.Remove(tmpFile.Name())
|
||||
|
||||
text := []byte("DCGM_FI_DEV_GPU_TEMP, gauge, temperature\n")
|
||||
if _, err = tmpFile.Write(text); err != nil {
|
||||
t.Fatalf("Failed to write to temporary file: %v", err)
|
||||
}
|
||||
|
||||
t.Logf("Using file: %s", tmpFile.Name())
|
||||
|
||||
if err := tmpFile.Close(); err != nil {
|
||||
t.Fatalf("Cannot close temp file: %v", err)
|
||||
}
|
||||
|
||||
c := Config{
|
||||
ConfigMapData: undefinedConfigMapData,
|
||||
CollectorsFile: tmpFile.Name(),
|
||||
}
|
||||
records, err := ExtractCounters(&c)
|
||||
if len(records) != 1 || err != nil {
|
||||
t.Fatalf("Should have succeeded: records (%d != 1) err=%v", len(records), err)
|
||||
}
|
||||
}
|
@ -0,0 +1,255 @@
|
||||
/*
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dcgmexporter
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"sync"
|
||||
"text/template"
|
||||
"time"
|
||||
|
||||
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
func NewMetricsPipeline(c *Config) (*MetricsPipeline, func(), error) {
|
||||
counters, err := ExtractCounters(c)
|
||||
if err != nil {
|
||||
return nil, func() {}, err
|
||||
}
|
||||
|
||||
gpuCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_GPU)
|
||||
if err != nil {
|
||||
return nil, func() {}, err
|
||||
}
|
||||
|
||||
switchCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_SWITCH)
|
||||
if err != nil {
|
||||
logrus.Info("Not collecting switch metrics: ", err)
|
||||
}
|
||||
|
||||
linkCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_LINK)
|
||||
if err != nil {
|
||||
logrus.Info("Not collecting link metrics: ", err)
|
||||
}
|
||||
|
||||
transformations := []Transform{}
|
||||
if c.Kubernetes {
|
||||
podMapper, err := NewPodMapper(c)
|
||||
if err != nil {
|
||||
logrus.Warnf("Could not enable kubernetes metric collection: %v", err)
|
||||
} else {
|
||||
transformations = append(transformations, podMapper)
|
||||
}
|
||||
}
|
||||
|
||||
return &MetricsPipeline{
|
||||
config: c,
|
||||
|
||||
migMetricsFormat: template.Must(template.New("migMetrics").Parse(migMetricsFormat)),
|
||||
switchMetricsFormat: template.Must(template.New("switchMetrics").Parse(switchMetricsFormat)),
|
||||
linkMetricsFormat: template.Must(template.New("switchMetrics").Parse(linkMetricsFormat)),
|
||||
|
||||
counters: counters,
|
||||
gpuCollector: gpuCollector,
|
||||
switchCollector: switchCollector,
|
||||
linkCollector: linkCollector,
|
||||
transformations: transformations,
|
||||
}, func() {
|
||||
cleanup()
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Primarely for testing, caller expected to cleanup the collector
|
||||
func NewMetricsPipelineWithGPUCollector(c *Config, collector *DCGMCollector) (*MetricsPipeline, func(), error) {
|
||||
return &MetricsPipeline{
|
||||
config: c,
|
||||
|
||||
migMetricsFormat: template.Must(template.New("migMetrics").Parse(migMetricsFormat)),
|
||||
switchMetricsFormat: template.Must(template.New("switchMetrics").Parse(switchMetricsFormat)),
|
||||
linkMetricsFormat: template.Must(template.New("switchMetrics").Parse(linkMetricsFormat)),
|
||||
|
||||
counters: collector.Counters,
|
||||
gpuCollector: collector,
|
||||
}, func() {}, nil
|
||||
}
|
||||
|
||||
func (m *MetricsPipeline) Run(out chan string, stop chan interface{}, wg *sync.WaitGroup) {
|
||||
defer wg.Done()
|
||||
|
||||
logrus.Info("Pipeline starting")
|
||||
|
||||
// Note we are using a ticker so that we can stick as close as possible to the collect interval.
|
||||
// e.g: The CollectInterval is 10s and the transformation pipeline takes 5s, the time will
|
||||
// ensure we really collect metrics every 10s by firing an event 5s after the run function completes.
|
||||
t := time.NewTicker(time.Millisecond * time.Duration(m.config.CollectInterval))
|
||||
defer t.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-stop:
|
||||
return
|
||||
case <-t.C:
|
||||
o, err := m.run()
|
||||
if err != nil {
|
||||
logrus.Errorf("Failed to collect metrics with error: %v", err)
|
||||
/* flush output rather than output stale data */
|
||||
out <- ""
|
||||
continue
|
||||
}
|
||||
|
||||
if len(out) == cap(out) {
|
||||
logrus.Errorf("Channel is full skipping")
|
||||
} else {
|
||||
out <- o
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *MetricsPipeline) run() (string, error) {
|
||||
/* Collect GPU Metrics */
|
||||
metrics, err := m.gpuCollector.GetMetrics()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("Failed to collect gpu metrics with error: %v", err)
|
||||
}
|
||||
|
||||
for _, transform := range m.transformations {
|
||||
err := transform.Process(metrics, m.gpuCollector.SysInfo)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("Failed to transform metrics for transform %s: %v", err, transform.Name())
|
||||
}
|
||||
}
|
||||
|
||||
formated, err := FormatMetrics(m.migMetricsFormat, metrics)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("Failed to format metrics with error: %v", err)
|
||||
}
|
||||
|
||||
if m.switchCollector != nil {
|
||||
/* Collect Switch Metrics */
|
||||
metrics, err = m.switchCollector.GetMetrics()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("Failed to collect switch metrics with error: %v", err)
|
||||
}
|
||||
|
||||
if len(metrics) > 0 {
|
||||
switchFormated, err := FormatMetrics(m.switchMetricsFormat, metrics)
|
||||
if err != nil {
|
||||
logrus.Warnf("Failed to format switch metrics with error: %v", err)
|
||||
}
|
||||
|
||||
formated = formated + switchFormated
|
||||
}
|
||||
}
|
||||
|
||||
if m.linkCollector != nil {
|
||||
/* Collect Link Metrics */
|
||||
metrics, err = m.linkCollector.GetMetrics()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("Failed to collect link metrics with error: %v", err)
|
||||
}
|
||||
|
||||
if len(metrics) > 0 {
|
||||
switchFormated, err := FormatMetrics(m.linkMetricsFormat, metrics)
|
||||
if err != nil {
|
||||
logrus.Warnf("Failed to format link metrics with error: %v", err)
|
||||
}
|
||||
|
||||
formated = formated + switchFormated
|
||||
}
|
||||
}
|
||||
|
||||
return formated, nil
|
||||
}
|
||||
|
||||
/*
|
||||
* The goal here is to get to the following format:
|
||||
* ```
|
||||
* # HELP FIELD_ID HELP_MSG
|
||||
* # TYPE FIELD_ID PROM_TYPE
|
||||
* FIELD_ID{gpu="GPU_INDEX_0",uuid="GPU_UUID", attr...} VALUE
|
||||
* FIELD_ID{gpu="GPU_INDEX_N",uuid="GPU_UUID", attr...} VALUE
|
||||
* ...
|
||||
* ```
|
||||
*/
|
||||
|
||||
var migMetricsFormat = `
|
||||
{{- range $counter, $metrics := . -}}
|
||||
# HELP {{ $counter.FieldName }} {{ $counter.Help }}
|
||||
# TYPE {{ $counter.FieldName }} {{ $counter.PromType }}
|
||||
{{- range $metric := $metrics }}
|
||||
{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}}
|
||||
|
||||
{{- range $k, $v := $metric.Labels -}}
|
||||
,{{ $k }}="{{ $v }}"
|
||||
{{- end -}}
|
||||
{{- range $k, $v := $metric.Attributes -}}
|
||||
,{{ $k }}="{{ $v }}"
|
||||
{{- end -}}
|
||||
|
||||
} {{ $metric.Value -}}
|
||||
{{- end }}
|
||||
{{ end }}`
|
||||
|
||||
var switchMetricsFormat = `
|
||||
{{- range $counter, $metrics := . -}}
|
||||
# HELP {{ $counter.FieldName }} {{ $counter.Help }}
|
||||
# TYPE {{ $counter.FieldName }} {{ $counter.PromType }}
|
||||
{{- range $metric := $metrics }}
|
||||
{{ $counter.FieldName }}{nvswitch="{{ $metric.GPU }}"{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}}
|
||||
|
||||
{{- range $k, $v := $metric.Labels -}}
|
||||
,{{ $k }}="{{ $v }}"
|
||||
{{- end -}}
|
||||
} {{ $metric.Value -}}
|
||||
{{- end }}
|
||||
{{ end }}`
|
||||
|
||||
var linkMetricsFormat = `
|
||||
{{- range $counter, $metrics := . -}}
|
||||
# HELP {{ $counter.FieldName }} {{ $counter.Help }}
|
||||
# TYPE {{ $counter.FieldName }} {{ $counter.PromType }}
|
||||
{{- range $metric := $metrics }}
|
||||
{{ $counter.FieldName }}{nvlink="{{ $metric.GPU }}",nvswitch="{{ $metric.GPUDevice }}"{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}}
|
||||
|
||||
{{- range $k, $v := $metric.Labels -}}
|
||||
,{{ $k }}="{{ $v }}"
|
||||
{{- end -}}
|
||||
} {{ $metric.Value -}}
|
||||
{{- end }}
|
||||
{{ end }}`
|
||||
|
||||
// Template is passed here so that it isn't recompiled at each iteration
|
||||
func FormatMetrics(t *template.Template, m [][]Metric) (string, error) {
|
||||
// Group metrics by counter instead of by device
|
||||
groupedMetrics := make(map[*Counter][]Metric)
|
||||
for _, deviceMetrics := range m {
|
||||
for _, deviceMetric := range deviceMetrics {
|
||||
groupedMetrics[deviceMetric.Counter] = append(groupedMetrics[deviceMetric.Counter], deviceMetric)
|
||||
}
|
||||
}
|
||||
|
||||
// Format metrics
|
||||
var res bytes.Buffer
|
||||
if err := t.Execute(&res, groupedMetrics); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return res.String(), nil
|
||||
}
|
@ -0,0 +1,45 @@
|
||||
/*
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dcgmexporter
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestRun(t *testing.T) {
|
||||
cleanup, err := dcgm.Init(dcgm.Embedded)
|
||||
require.NoError(t, err)
|
||||
defer cleanup()
|
||||
|
||||
c, cleanup := testDCGMCollector(t, sampleCounters)
|
||||
defer cleanup()
|
||||
|
||||
p, cleanup, err := NewMetricsPipelineWithGPUCollector(&Config{}, c)
|
||||
defer cleanup()
|
||||
|
||||
out, err := p.run()
|
||||
require.NoError(t, err)
|
||||
require.NotEmpty(t, out)
|
||||
|
||||
// Note it is pretty difficult to make non superficial tests without
|
||||
// writting a full blown parser, always look at the results
|
||||
// We'll be testing them more throughly in the e2e tests (e.g: by running prometheus).
|
||||
t.Logf("Pipeline result is:\n%v", out)
|
||||
}
|
@ -0,0 +1,126 @@
|
||||
/*
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dcgmexporter
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
func NewMetricsServer(c *Config, metrics chan string) (*MetricsServer, func(), error) {
|
||||
router := mux.NewRouter()
|
||||
serverv1 := &MetricsServer{
|
||||
server: http.Server{
|
||||
Addr: c.Address,
|
||||
Handler: router,
|
||||
ReadTimeout: 10 * time.Second,
|
||||
WriteTimeout: 10 * time.Second,
|
||||
},
|
||||
metricsChan: metrics,
|
||||
metrics: "",
|
||||
}
|
||||
|
||||
router.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("X-Content-Type-Options", "nosniff")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
w.Write([]byte(`<html>
|
||||
<head><title>GPU Exporter</title></head>
|
||||
<body>
|
||||
<h1>GPU Exporter</h1>
|
||||
<p><a href="./metrics">Metrics</a></p>
|
||||
</body>
|
||||
</html>`))
|
||||
})
|
||||
|
||||
router.HandleFunc("/health", serverv1.Health)
|
||||
router.HandleFunc("/metrics", serverv1.Metrics)
|
||||
|
||||
return serverv1, func() {}, nil
|
||||
}
|
||||
|
||||
func (s *MetricsServer) Run(stop chan interface{}, wg *sync.WaitGroup) {
|
||||
defer wg.Done()
|
||||
|
||||
var httpwg sync.WaitGroup
|
||||
httpwg.Add(1)
|
||||
go func() {
|
||||
defer httpwg.Done()
|
||||
logrus.Info("Starting webserver")
|
||||
if err := s.server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
|
||||
logrus.Fatalf("Failed to Listen and Server HTTP server with err: `%v`", err)
|
||||
}
|
||||
}()
|
||||
|
||||
httpwg.Add(1)
|
||||
go func() {
|
||||
defer httpwg.Done()
|
||||
for {
|
||||
select {
|
||||
case <-stop:
|
||||
return
|
||||
case m := <-s.metricsChan:
|
||||
s.updateMetrics(m)
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
<-stop
|
||||
if err := s.server.Shutdown(context.Background()); err != nil {
|
||||
logrus.Fatalf("Failed to shutdown HTTP server, with err: `%v`", err)
|
||||
}
|
||||
|
||||
if err := WaitWithTimeout(&httpwg, 3*time.Second); err != nil {
|
||||
logrus.Fatalf("Failed waiting for HTTP server to shutdown, with err: `%v`", err)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *MetricsServer) Metrics(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("X-Content-Type-Options", "nosniff")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
w.Write([]byte(s.getMetrics()))
|
||||
}
|
||||
|
||||
func (s *MetricsServer) Health(w http.ResponseWriter, r *http.Request) {
|
||||
if s.getMetrics() == "" {
|
||||
w.Header().Set("X-Content-Type-Options", "nosniff")
|
||||
w.WriteHeader(http.StatusServiceUnavailable)
|
||||
w.Write([]byte("KO"))
|
||||
} else {
|
||||
w.Header().Set("X-Content-Type-Options", "nosniff")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
w.Write([]byte("OK"))
|
||||
}
|
||||
}
|
||||
|
||||
func (s *MetricsServer) updateMetrics(m string) {
|
||||
s.Lock()
|
||||
defer s.Unlock()
|
||||
|
||||
s.metrics = m
|
||||
}
|
||||
|
||||
func (s *MetricsServer) getMetrics() string {
|
||||
s.Lock()
|
||||
defer s.Unlock()
|
||||
|
||||
return s.metrics
|
||||
}
|
@ -0,0 +1,610 @@
|
||||
/*
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dcgmexporter
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math/rand"
|
||||
|
||||
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
const PARENT_ID_IGNORED = 0
|
||||
|
||||
type GroupInfo struct {
|
||||
groupHandle dcgm.GroupHandle
|
||||
groupType dcgm.Field_Entity_Group
|
||||
}
|
||||
|
||||
type ComputeInstanceInfo struct {
|
||||
InstanceInfo dcgm.MigEntityInfo
|
||||
ProfileName string
|
||||
EntityId uint
|
||||
}
|
||||
|
||||
type GpuInstanceInfo struct {
|
||||
Info dcgm.MigEntityInfo
|
||||
ProfileName string
|
||||
EntityId uint
|
||||
ComputeInstances []ComputeInstanceInfo
|
||||
}
|
||||
|
||||
type GpuInfo struct {
|
||||
DeviceInfo dcgm.Device
|
||||
GpuInstances []GpuInstanceInfo
|
||||
MigEnabled bool
|
||||
}
|
||||
|
||||
type SwitchInfo struct {
|
||||
EntityId uint
|
||||
NvLinks []dcgm.NvLinkStatus
|
||||
}
|
||||
|
||||
type SystemInfo struct {
|
||||
GpuCount uint
|
||||
Gpus [dcgm.MAX_NUM_DEVICES]GpuInfo
|
||||
gOpt DeviceOptions
|
||||
sOpt DeviceOptions
|
||||
InfoType dcgm.Field_Entity_Group
|
||||
Switches []SwitchInfo
|
||||
}
|
||||
|
||||
type MonitoringInfo struct {
|
||||
Entity dcgm.GroupEntityPair
|
||||
DeviceInfo dcgm.Device
|
||||
InstanceInfo *GpuInstanceInfo
|
||||
ParentId uint
|
||||
}
|
||||
|
||||
func SetGpuInstanceProfileName(sysInfo *SystemInfo, entityId uint, profileName string) bool {
|
||||
for i := uint(0); i < sysInfo.GpuCount; i++ {
|
||||
for j := range sysInfo.Gpus[i].GpuInstances {
|
||||
if sysInfo.Gpus[i].GpuInstances[j].EntityId == entityId {
|
||||
sysInfo.Gpus[i].GpuInstances[j].ProfileName = profileName
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func SetMigProfileNames(sysInfo *SystemInfo, values []dcgm.FieldValue_v2) error {
|
||||
notFound := false
|
||||
err := fmt.Errorf("Cannot find match for entities:")
|
||||
for _, v := range values {
|
||||
found := SetGpuInstanceProfileName(sysInfo, v.EntityId, dcgm.Fv2_String(v))
|
||||
if found == false {
|
||||
err = fmt.Errorf("%s group %d, id %d", err, v.EntityGroupId, v.EntityId)
|
||||
notFound = true
|
||||
}
|
||||
}
|
||||
|
||||
if notFound {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func PopulateMigProfileNames(sysInfo *SystemInfo, entities []dcgm.GroupEntityPair) error {
|
||||
if len(entities) == 0 {
|
||||
// There are no entities to populate
|
||||
return nil
|
||||
}
|
||||
|
||||
var fields []dcgm.Short
|
||||
fields = append(fields, dcgm.DCGM_FI_DEV_NAME)
|
||||
flags := dcgm.DCGM_FV_FLAG_LIVE_DATA
|
||||
values, err := dcgm.EntitiesGetLatestValues(entities, fields, flags)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return SetMigProfileNames(sysInfo, values)
|
||||
}
|
||||
|
||||
func GpuIdExists(sysInfo *SystemInfo, gpuId int) bool {
|
||||
for i := uint(0); i < sysInfo.GpuCount; i++ {
|
||||
if sysInfo.Gpus[i].DeviceInfo.GPU == uint(gpuId) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func SwitchIdExists(sysInfo *SystemInfo, switchId int) bool {
|
||||
for _, sw := range sysInfo.Switches {
|
||||
if sw.EntityId == uint(switchId) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func GpuInstanceIdExists(sysInfo *SystemInfo, gpuInstanceId int) bool {
|
||||
for i := uint(0); i < sysInfo.GpuCount; i++ {
|
||||
for _, instance := range sysInfo.Gpus[i].GpuInstances {
|
||||
if instance.EntityId == uint(gpuInstanceId) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func LinkIdExists(sysInfo *SystemInfo, linkId int) bool {
|
||||
for _, sw := range sysInfo.Switches {
|
||||
for _, link := range sw.NvLinks {
|
||||
if link.Index == uint(linkId) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func VerifySwitchDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error {
|
||||
if sOpt.Flex {
|
||||
return nil
|
||||
}
|
||||
|
||||
if len(sOpt.MajorRange) > 0 && sOpt.MajorRange[0] != -1 {
|
||||
// Verify we can find all the specified Switches
|
||||
for _, swId := range sOpt.MajorRange {
|
||||
if !SwitchIdExists(sysInfo, swId) {
|
||||
return fmt.Errorf("couldn't find requested NvSwitch id %d", swId)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(sOpt.MinorRange) > 0 && sOpt.MinorRange[0] != -1 {
|
||||
for _, linkId := range sOpt.MinorRange {
|
||||
if !LinkIdExists(sysInfo, linkId) {
|
||||
return fmt.Errorf("couldn't find requested NvLink %d", linkId)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func VerifyDevicePresence(sysInfo *SystemInfo, gOpt DeviceOptions) error {
|
||||
if gOpt.Flex {
|
||||
return nil
|
||||
}
|
||||
|
||||
if len(gOpt.MajorRange) > 0 && gOpt.MajorRange[0] != -1 {
|
||||
// Verify we can find all the specified GPUs
|
||||
for _, gpuId := range gOpt.MajorRange {
|
||||
if GpuIdExists(sysInfo, gpuId) == false {
|
||||
return fmt.Errorf("Couldn't find requested GPU id %d", gpuId)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(gOpt.MinorRange) > 0 && gOpt.MinorRange[0] != -1 {
|
||||
for _, gpuInstanceId := range gOpt.MinorRange {
|
||||
if GpuInstanceIdExists(sysInfo, gpuInstanceId) == false {
|
||||
return fmt.Errorf("Couldn't find requested GPU instance id %d", gpuInstanceId)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func InitializeNvSwitchInfo(sysInfo SystemInfo, sOpt DeviceOptions) (SystemInfo, error) {
|
||||
switches, err := dcgm.GetEntityGroupEntities(dcgm.FE_SWITCH)
|
||||
if err != nil {
|
||||
return sysInfo, err
|
||||
}
|
||||
|
||||
if len(switches) <= 0 {
|
||||
return sysInfo, fmt.Errorf("no switches to monitor")
|
||||
}
|
||||
|
||||
links, err := dcgm.GetNvLinkLinkStatus()
|
||||
if err != nil {
|
||||
return sysInfo, err
|
||||
}
|
||||
|
||||
for i := 0; i < len(switches); i++ {
|
||||
var matchingLinks []dcgm.NvLinkStatus
|
||||
for _, link := range links {
|
||||
if link.ParentType == dcgm.FE_SWITCH && link.ParentId == uint(switches[i]) {
|
||||
matchingLinks = append(matchingLinks, link)
|
||||
}
|
||||
}
|
||||
|
||||
sw := SwitchInfo{
|
||||
switches[i],
|
||||
matchingLinks,
|
||||
}
|
||||
|
||||
sysInfo.Switches = append(sysInfo.Switches, sw)
|
||||
}
|
||||
|
||||
sysInfo.sOpt = sOpt
|
||||
err = VerifySwitchDevicePresence(&sysInfo, sOpt)
|
||||
|
||||
return sysInfo, nil
|
||||
}
|
||||
|
||||
func InitializeGpuInfo(sysInfo SystemInfo, gOpt DeviceOptions, useFakeGpus bool) (SystemInfo, error) {
|
||||
gpuCount, err := dcgm.GetAllDeviceCount()
|
||||
if err != nil {
|
||||
return sysInfo, err
|
||||
}
|
||||
sysInfo.GpuCount = gpuCount
|
||||
|
||||
for i := uint(0); i < sysInfo.GpuCount; i++ {
|
||||
// Default mig enabled to false
|
||||
sysInfo.Gpus[i].MigEnabled = false
|
||||
sysInfo.Gpus[i].DeviceInfo, err = dcgm.GetDeviceInfo(i)
|
||||
if err != nil {
|
||||
if useFakeGpus {
|
||||
sysInfo.Gpus[i].DeviceInfo.GPU = i
|
||||
sysInfo.Gpus[i].DeviceInfo.UUID = fmt.Sprintf("fake%d", i)
|
||||
} else {
|
||||
return sysInfo, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
hierarchy, err := dcgm.GetGpuInstanceHierarchy()
|
||||
if err != nil {
|
||||
return sysInfo, err
|
||||
}
|
||||
|
||||
if hierarchy.Count > 0 {
|
||||
var entities []dcgm.GroupEntityPair
|
||||
|
||||
gpuId := uint(0)
|
||||
instanceIndex := 0
|
||||
for i := uint(0); i < hierarchy.Count; i++ {
|
||||
if hierarchy.EntityList[i].Parent.EntityGroupId == dcgm.FE_GPU {
|
||||
// We are adding a GPU instance
|
||||
gpuId = hierarchy.EntityList[i].Parent.EntityId
|
||||
entityId := hierarchy.EntityList[i].Entity.EntityId
|
||||
instanceInfo := GpuInstanceInfo{
|
||||
Info: hierarchy.EntityList[i].Info,
|
||||
ProfileName: "",
|
||||
EntityId: entityId,
|
||||
}
|
||||
sysInfo.Gpus[gpuId].MigEnabled = true
|
||||
sysInfo.Gpus[gpuId].GpuInstances = append(sysInfo.Gpus[gpuId].GpuInstances, instanceInfo)
|
||||
entities = append(entities, dcgm.GroupEntityPair{dcgm.FE_GPU_I, entityId})
|
||||
instanceIndex = len(sysInfo.Gpus[gpuId].GpuInstances) - 1
|
||||
} else if hierarchy.EntityList[i].Parent.EntityGroupId == dcgm.FE_GPU_I {
|
||||
// Add the compute instance, gpuId is recorded previously
|
||||
entityId := hierarchy.EntityList[i].Entity.EntityId
|
||||
ciInfo := ComputeInstanceInfo{hierarchy.EntityList[i].Info, "", entityId}
|
||||
sysInfo.Gpus[gpuId].GpuInstances[instanceIndex].ComputeInstances = append(sysInfo.Gpus[gpuId].GpuInstances[instanceIndex].ComputeInstances, ciInfo)
|
||||
}
|
||||
}
|
||||
|
||||
err = PopulateMigProfileNames(&sysInfo, entities)
|
||||
if err != nil {
|
||||
return sysInfo, err
|
||||
}
|
||||
}
|
||||
|
||||
sysInfo.gOpt = gOpt
|
||||
err = VerifyDevicePresence(&sysInfo, gOpt)
|
||||
|
||||
return sysInfo, nil
|
||||
}
|
||||
|
||||
func InitializeSystemInfo(gOpt DeviceOptions, sOpt DeviceOptions, useFakeGpus bool, entityType dcgm.Field_Entity_Group) (SystemInfo, error) {
|
||||
sysInfo := SystemInfo{}
|
||||
|
||||
logrus.Info("Initializing system entities of type: ", entityType)
|
||||
switch entityType {
|
||||
case dcgm.FE_LINK:
|
||||
sysInfo.InfoType = dcgm.FE_LINK
|
||||
return InitializeNvSwitchInfo(sysInfo, sOpt)
|
||||
case dcgm.FE_SWITCH:
|
||||
sysInfo.InfoType = dcgm.FE_SWITCH
|
||||
return InitializeNvSwitchInfo(sysInfo, sOpt)
|
||||
case dcgm.FE_GPU:
|
||||
sysInfo.InfoType = dcgm.FE_GPU
|
||||
return InitializeGpuInfo(sysInfo, gOpt, useFakeGpus)
|
||||
}
|
||||
|
||||
return sysInfo, fmt.Errorf("unhandled entity type: %d", entityType)
|
||||
}
|
||||
|
||||
func CreateLinkGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []func(), error) {
|
||||
var groups []dcgm.GroupHandle
|
||||
var cleanups []func()
|
||||
|
||||
/* Create per-switch link groups */
|
||||
for _, sw := range sysInfo.Switches {
|
||||
if !IsSwitchWatched(sw.EntityId, sysInfo) {
|
||||
continue
|
||||
}
|
||||
|
||||
groupId, err := dcgm.CreateGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64()))
|
||||
if err != nil {
|
||||
return nil, cleanups, err
|
||||
}
|
||||
|
||||
groups = append(groups, groupId)
|
||||
|
||||
for _, link := range sw.NvLinks {
|
||||
if link.State != dcgm.LS_UP {
|
||||
continue
|
||||
}
|
||||
|
||||
if !IsLinkWatched(link.Index, sw.EntityId, sysInfo) {
|
||||
continue
|
||||
}
|
||||
|
||||
err = dcgm.AddLinkEntityToGroup(groupId, link.Index, link.ParentId)
|
||||
|
||||
if err != nil {
|
||||
return groups, cleanups, err
|
||||
}
|
||||
|
||||
cleanups = append(cleanups, func() { dcgm.DestroyGroup(groupId) })
|
||||
}
|
||||
}
|
||||
|
||||
return groups, cleanups, nil
|
||||
}
|
||||
|
||||
func CreateGroupFromSystemInfo(sysInfo SystemInfo) (dcgm.GroupHandle, func(), error) {
|
||||
monitoringInfo := GetMonitoredEntities(sysInfo)
|
||||
groupId, err := dcgm.CreateGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64()))
|
||||
if err != nil {
|
||||
return dcgm.GroupHandle{}, func() {}, err
|
||||
}
|
||||
|
||||
for _, mi := range monitoringInfo {
|
||||
err := dcgm.AddEntityToGroup(groupId, mi.Entity.EntityGroupId, mi.Entity.EntityId)
|
||||
if err != nil {
|
||||
return groupId, func() { dcgm.DestroyGroup(groupId) }, err
|
||||
}
|
||||
}
|
||||
|
||||
return groupId, func() { dcgm.DestroyGroup(groupId) }, nil
|
||||
}
|
||||
|
||||
func AddAllGpus(sysInfo SystemInfo) []MonitoringInfo {
|
||||
var monitoring []MonitoringInfo
|
||||
|
||||
for i := uint(0); i < sysInfo.GpuCount; i++ {
|
||||
mi := MonitoringInfo{
|
||||
dcgm.GroupEntityPair{dcgm.FE_GPU, sysInfo.Gpus[i].DeviceInfo.GPU},
|
||||
sysInfo.Gpus[i].DeviceInfo,
|
||||
nil,
|
||||
PARENT_ID_IGNORED,
|
||||
}
|
||||
monitoring = append(monitoring, mi)
|
||||
}
|
||||
|
||||
return monitoring
|
||||
}
|
||||
|
||||
func AddAllSwitches(sysInfo SystemInfo) []MonitoringInfo {
|
||||
var monitoring []MonitoringInfo
|
||||
|
||||
for _, sw := range sysInfo.Switches {
|
||||
if !IsSwitchWatched(sw.EntityId, sysInfo) {
|
||||
continue
|
||||
}
|
||||
|
||||
mi := MonitoringInfo{
|
||||
dcgm.GroupEntityPair{dcgm.FE_SWITCH, sw.EntityId},
|
||||
dcgm.Device{
|
||||
0, "", "", 0,
|
||||
dcgm.PCIInfo{"", 0, 0, 0},
|
||||
dcgm.DeviceIdentifiers{"", "", "", "", "", ""},
|
||||
nil, "",
|
||||
},
|
||||
nil,
|
||||
PARENT_ID_IGNORED,
|
||||
}
|
||||
monitoring = append(monitoring, mi)
|
||||
}
|
||||
|
||||
return monitoring
|
||||
}
|
||||
|
||||
func AddAllLinks(sysInfo SystemInfo) []MonitoringInfo {
|
||||
var monitoring []MonitoringInfo
|
||||
|
||||
for _, sw := range sysInfo.Switches {
|
||||
for _, link := range sw.NvLinks {
|
||||
if link.State != dcgm.LS_UP {
|
||||
continue
|
||||
}
|
||||
|
||||
if !IsSwitchWatched(sw.EntityId, sysInfo) {
|
||||
continue
|
||||
}
|
||||
|
||||
if !IsLinkWatched(link.Index, sw.EntityId, sysInfo) {
|
||||
continue
|
||||
}
|
||||
|
||||
mi := MonitoringInfo{
|
||||
dcgm.GroupEntityPair{dcgm.FE_LINK, link.Index},
|
||||
dcgm.Device{
|
||||
0, "", "", 0,
|
||||
dcgm.PCIInfo{"", 0, 0, 0},
|
||||
dcgm.DeviceIdentifiers{"", "", "", "", "", ""},
|
||||
nil, "",
|
||||
},
|
||||
nil,
|
||||
link.ParentId,
|
||||
}
|
||||
monitoring = append(monitoring, mi)
|
||||
}
|
||||
}
|
||||
|
||||
return monitoring
|
||||
}
|
||||
|
||||
func IsSwitchWatched(switchId uint, sysInfo SystemInfo) bool {
|
||||
if sysInfo.sOpt.Flex {
|
||||
return true
|
||||
}
|
||||
|
||||
if len(sysInfo.sOpt.MajorRange) <= 0 {
|
||||
return true
|
||||
}
|
||||
|
||||
for _, sw := range sysInfo.sOpt.MajorRange {
|
||||
if uint(sw) == switchId {
|
||||
return true
|
||||
}
|
||||
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func IsLinkWatched(linkId uint, switchId uint, sysInfo SystemInfo) bool {
|
||||
if sysInfo.sOpt.Flex {
|
||||
return true
|
||||
}
|
||||
|
||||
for _, sw := range sysInfo.Switches {
|
||||
if !IsSwitchWatched(sw.EntityId, sysInfo) {
|
||||
return false
|
||||
}
|
||||
|
||||
if len(sysInfo.sOpt.MinorRange) <= 0 {
|
||||
return true
|
||||
}
|
||||
|
||||
for _, link := range sysInfo.sOpt.MinorRange {
|
||||
if uint(link) == linkId {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func AddAllGpuInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo {
|
||||
var monitoring []MonitoringInfo
|
||||
|
||||
for i := uint(0); i < sysInfo.GpuCount; i++ {
|
||||
if addFlexibly == true && len(sysInfo.Gpus[i].GpuInstances) == 0 {
|
||||
mi := MonitoringInfo{
|
||||
dcgm.GroupEntityPair{dcgm.FE_GPU, sysInfo.Gpus[i].DeviceInfo.GPU},
|
||||
sysInfo.Gpus[i].DeviceInfo,
|
||||
nil,
|
||||
PARENT_ID_IGNORED,
|
||||
}
|
||||
monitoring = append(monitoring, mi)
|
||||
} else {
|
||||
for j := 0; j < len(sysInfo.Gpus[i].GpuInstances); j++ {
|
||||
mi := MonitoringInfo{
|
||||
dcgm.GroupEntityPair{dcgm.FE_GPU_I, sysInfo.Gpus[i].GpuInstances[j].EntityId},
|
||||
sysInfo.Gpus[i].DeviceInfo,
|
||||
&sysInfo.Gpus[i].GpuInstances[j],
|
||||
PARENT_ID_IGNORED,
|
||||
}
|
||||
monitoring = append(monitoring, mi)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return monitoring
|
||||
}
|
||||
|
||||
func GetMonitoringInfoForGpu(sysInfo SystemInfo, gpuId int) *MonitoringInfo {
|
||||
for i := uint(0); i < sysInfo.GpuCount; i++ {
|
||||
if sysInfo.Gpus[i].DeviceInfo.GPU == uint(gpuId) {
|
||||
return &MonitoringInfo{
|
||||
dcgm.GroupEntityPair{dcgm.FE_GPU, sysInfo.Gpus[i].DeviceInfo.GPU},
|
||||
sysInfo.Gpus[i].DeviceInfo,
|
||||
nil,
|
||||
PARENT_ID_IGNORED,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetMonitoringInfoForGpuInstance(sysInfo SystemInfo, gpuInstanceId int) *MonitoringInfo {
|
||||
for i := uint(0); i < sysInfo.GpuCount; i++ {
|
||||
for _, instance := range sysInfo.Gpus[i].GpuInstances {
|
||||
if instance.EntityId == uint(gpuInstanceId) {
|
||||
return &MonitoringInfo{
|
||||
dcgm.GroupEntityPair{dcgm.FE_GPU_I, uint(gpuInstanceId)},
|
||||
sysInfo.Gpus[i].DeviceInfo,
|
||||
&instance,
|
||||
PARENT_ID_IGNORED,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetMonitoredEntities(sysInfo SystemInfo) []MonitoringInfo {
|
||||
var monitoring []MonitoringInfo
|
||||
|
||||
if sysInfo.InfoType == dcgm.FE_SWITCH {
|
||||
monitoring = AddAllSwitches(sysInfo)
|
||||
} else if sysInfo.InfoType == dcgm.FE_LINK {
|
||||
monitoring = AddAllLinks(sysInfo)
|
||||
} else if sysInfo.gOpt.Flex == true {
|
||||
monitoring = AddAllGpuInstances(sysInfo, true)
|
||||
} else {
|
||||
if len(sysInfo.gOpt.MajorRange) > 0 && sysInfo.gOpt.MajorRange[0] == -1 {
|
||||
monitoring = AddAllGpus(sysInfo)
|
||||
} else {
|
||||
for _, gpuId := range sysInfo.gOpt.MajorRange {
|
||||
// We've already verified that everything in the options list exists
|
||||
monitoring = append(monitoring, *GetMonitoringInfoForGpu(sysInfo, gpuId))
|
||||
}
|
||||
}
|
||||
|
||||
if len(sysInfo.gOpt.MinorRange) > 0 && sysInfo.gOpt.MinorRange[0] == -1 {
|
||||
monitoring = AddAllGpuInstances(sysInfo, false)
|
||||
} else {
|
||||
for _, gpuInstanceId := range sysInfo.gOpt.MinorRange {
|
||||
// We've already verified that everything in the options list exists
|
||||
monitoring = append(monitoring, *GetMonitoringInfoForGpuInstance(sysInfo, gpuInstanceId))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return monitoring
|
||||
}
|
||||
|
||||
func GetGpuInstanceIdentifier(sysInfo SystemInfo, gpuuuid string, gpuInstanceId uint) string {
|
||||
for i := uint(0); i < sysInfo.GpuCount; i++ {
|
||||
if sysInfo.Gpus[i].DeviceInfo.UUID == gpuuuid {
|
||||
identifier := fmt.Sprintf("%d-%d", sysInfo.Gpus[i].DeviceInfo.GPU, gpuInstanceId)
|
||||
return identifier
|
||||
}
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
@ -0,0 +1,190 @@
|
||||
/*
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dcgmexporter
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
|
||||
"github.com/stretchr/testify/require"
|
||||
"testing"
|
||||
)
|
||||
|
||||
const (
|
||||
fakeProfileName string = "2fake.4gb"
|
||||
)
|
||||
|
||||
func SpoofSwitchSystemInfo() SystemInfo {
|
||||
var sysInfo SystemInfo
|
||||
sysInfo.InfoType = dcgm.FE_SWITCH
|
||||
sw1 := SwitchInfo{
|
||||
EntityId: 0,
|
||||
}
|
||||
sw2 := SwitchInfo{
|
||||
EntityId: 1,
|
||||
}
|
||||
|
||||
l1 := dcgm.NvLinkStatus{
|
||||
ParentId: 0,
|
||||
ParentType: dcgm.FE_SWITCH,
|
||||
State: 2,
|
||||
Index: 0,
|
||||
}
|
||||
|
||||
l2 := dcgm.NvLinkStatus{
|
||||
ParentId: 0,
|
||||
ParentType: dcgm.FE_SWITCH,
|
||||
State: 3,
|
||||
Index: 1,
|
||||
}
|
||||
|
||||
l3 := dcgm.NvLinkStatus{
|
||||
ParentId: 1,
|
||||
ParentType: dcgm.FE_SWITCH,
|
||||
State: 2,
|
||||
Index: 0,
|
||||
}
|
||||
|
||||
l4 := dcgm.NvLinkStatus{
|
||||
ParentId: 1,
|
||||
ParentType: dcgm.FE_SWITCH,
|
||||
State: 3,
|
||||
Index: 1,
|
||||
}
|
||||
|
||||
sw1.NvLinks = append(sw1.NvLinks, l1)
|
||||
sw1.NvLinks = append(sw1.NvLinks, l2)
|
||||
sw2.NvLinks = append(sw2.NvLinks, l3)
|
||||
sw2.NvLinks = append(sw2.NvLinks, l4)
|
||||
|
||||
sysInfo.Switches = append(sysInfo.Switches, sw1)
|
||||
sysInfo.Switches = append(sysInfo.Switches, sw2)
|
||||
|
||||
return sysInfo
|
||||
}
|
||||
|
||||
func SpoofSystemInfo() SystemInfo {
|
||||
var sysInfo SystemInfo
|
||||
sysInfo.GpuCount = 2
|
||||
sysInfo.Gpus[0].DeviceInfo.GPU = 0
|
||||
gi := GpuInstanceInfo{
|
||||
Info: dcgm.MigEntityInfo{"fake", 0, 0, 0, 0, 3},
|
||||
ProfileName: fakeProfileName,
|
||||
EntityId: 0,
|
||||
}
|
||||
sysInfo.Gpus[0].GpuInstances = append(sysInfo.Gpus[0].GpuInstances, gi)
|
||||
gi2 := GpuInstanceInfo{
|
||||
Info: dcgm.MigEntityInfo{"fake", 0, 1, 0, 0, 3},
|
||||
ProfileName: fakeProfileName,
|
||||
EntityId: 14,
|
||||
}
|
||||
sysInfo.Gpus[1].GpuInstances = append(sysInfo.Gpus[1].GpuInstances, gi2)
|
||||
sysInfo.Gpus[1].DeviceInfo.GPU = 1
|
||||
|
||||
return sysInfo
|
||||
}
|
||||
|
||||
func TestMonitoredEntities(t *testing.T) {
|
||||
sysInfo := SpoofSystemInfo()
|
||||
sysInfo.gOpt.Flex = true
|
||||
|
||||
monitoring := GetMonitoredEntities(sysInfo)
|
||||
require.Equal(t, len(monitoring), 2, fmt.Sprintf("Should have 2 monitored entities but found %d", len(monitoring)))
|
||||
instanceCount := 0
|
||||
gpuCount := 0
|
||||
for _, mi := range monitoring {
|
||||
if mi.Entity.EntityGroupId == dcgm.FE_GPU_I {
|
||||
instanceCount = instanceCount + 1
|
||||
require.NotEqual(t, mi.InstanceInfo, nil, "Expected InstanceInfo to be populated but it wasn't")
|
||||
require.Equal(t, mi.InstanceInfo.ProfileName, fakeProfileName, "Expected profile named '%s' but found '%s'", fakeProfileName, mi.InstanceInfo.ProfileName)
|
||||
if mi.Entity.EntityId != uint(0) {
|
||||
// One of these should be 0, the other should be 14
|
||||
require.Equal(t, mi.Entity.EntityId, uint(14), "Expected 14 as EntityId but found %s", monitoring[1].Entity.EntityId)
|
||||
}
|
||||
} else {
|
||||
gpuCount = gpuCount + 1
|
||||
require.Equal(t, mi.InstanceInfo, (*GpuInstanceInfo)(nil), "Expected InstanceInfo to be nil but it wasn't")
|
||||
}
|
||||
}
|
||||
require.Equal(t, instanceCount, 2, "Expected 2 GPU instances but found %d", instanceCount)
|
||||
require.Equal(t, gpuCount, 0, "Expected 0 GPUs but found %d", gpuCount)
|
||||
|
||||
sysInfo.Gpus[0].GpuInstances = sysInfo.Gpus[0].GpuInstances[:0]
|
||||
sysInfo.Gpus[1].GpuInstances = sysInfo.Gpus[1].GpuInstances[:0]
|
||||
monitoring = GetMonitoredEntities(sysInfo)
|
||||
require.Equal(t, 2, len(monitoring), fmt.Sprintf("Should have 2 monitored entities but found %d", len(monitoring)))
|
||||
for i, mi := range monitoring {
|
||||
require.Equal(t, mi.Entity.EntityGroupId, dcgm.FE_GPU, "Expected FE_GPU but found %d", mi.Entity.EntityGroupId)
|
||||
require.Equal(t, uint(i), mi.DeviceInfo.GPU, "Expected GPU %d but found %d", i, mi.DeviceInfo.GPU)
|
||||
require.Equal(t, (*GpuInstanceInfo)(nil), mi.InstanceInfo, "Expected InstanceInfo not to be populated but it was")
|
||||
}
|
||||
}
|
||||
|
||||
func TestVerifyDevicePresence(t *testing.T) {
|
||||
sysInfo := SpoofSystemInfo()
|
||||
var dOpt DeviceOptions
|
||||
dOpt.Flex = true
|
||||
err := VerifyDevicePresence(&sysInfo, dOpt)
|
||||
require.Equal(t, err, nil, "Expected to have no error, but found %s", err)
|
||||
|
||||
dOpt.Flex = false
|
||||
dOpt.MajorRange = append(dOpt.MajorRange, -1)
|
||||
dOpt.MinorRange = append(dOpt.MinorRange, -1)
|
||||
err = VerifyDevicePresence(&sysInfo, dOpt)
|
||||
require.Equal(t, err, nil, "Expected to have no error, but found %s", err)
|
||||
|
||||
dOpt.MinorRange[0] = 10 // this GPU instance doesn't exist
|
||||
err = VerifyDevicePresence(&sysInfo, dOpt)
|
||||
require.NotEqual(t, err, nil, "Expected to have an error for a non-existent GPU instance, but none found")
|
||||
|
||||
dOpt.MajorRange[0] = 10 // this GPU doesn't exist
|
||||
dOpt.MinorRange[0] = -1
|
||||
err = VerifyDevicePresence(&sysInfo, dOpt)
|
||||
require.NotEqual(t, err, nil, "Expected to have an error for a non-existent GPU, but none found")
|
||||
|
||||
// Add GPUs and instances that exist
|
||||
dOpt.MajorRange[0] = 0
|
||||
dOpt.MajorRange = append(dOpt.MajorRange, 1)
|
||||
dOpt.MinorRange[0] = 0
|
||||
dOpt.MinorRange = append(dOpt.MinorRange, 14)
|
||||
err = VerifyDevicePresence(&sysInfo, dOpt)
|
||||
require.Equal(t, err, nil, "Expected to have no error, but found %s", err)
|
||||
}
|
||||
|
||||
//func TestMigProfileNames(t *testing.T) {
|
||||
// sysInfo := SpoofSystemInfo()
|
||||
// SetMigProfileNames(sysInfo, values)
|
||||
//}
|
||||
|
||||
func TestMonitoredSwitches(t *testing.T) {
|
||||
sysInfo := SpoofSwitchSystemInfo()
|
||||
|
||||
/* test that only switches are returned */
|
||||
monitoring := GetMonitoredEntities(sysInfo)
|
||||
require.Equal(t, len(monitoring), 2, fmt.Sprintf("Should have 2 monitored switches but found %d", len(monitoring)))
|
||||
for _, mi := range monitoring {
|
||||
require.Equal(t, mi.Entity.EntityGroupId, dcgm.FE_SWITCH, fmt.Sprintf("Should have only returned switches but returned %d", mi.Entity.EntityGroupId))
|
||||
}
|
||||
|
||||
/* test that only "up" links are monitored and 1 from each switch */
|
||||
sysInfo.InfoType = dcgm.FE_LINK
|
||||
monitoring = GetMonitoredEntities(sysInfo)
|
||||
require.Equal(t, len(monitoring), 2, fmt.Sprintf("Should have 2 monitored links but found %d", len(monitoring)))
|
||||
for i, mi := range monitoring {
|
||||
require.Equal(t, mi.Entity.EntityGroupId, dcgm.FE_LINK, fmt.Sprintf("Should have only returned links but returned %d", mi.Entity.EntityGroupId))
|
||||
require.Equal(t, mi.ParentId, uint(i), fmt.Sprint("Link should reference switch parent"))
|
||||
}
|
||||
}
|
@ -0,0 +1,171 @@
|
||||
/*
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dcgmexporter
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"sync"
|
||||
"text/template"
|
||||
|
||||
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
|
||||
)
|
||||
|
||||
// Package-level string values shared by the exporter.
var (
	// Marker strings; presumably substituted for values that are skipped or
	// cannot be converted - confirm against the code that uses them.
	SkipDCGMValue   = "SKIPPING DCGM VALUE"
	FailedToConvert = "ERROR - FAILED TO CONVERT TO STRING"

	// NVIDIA resource name, MIG resource prefix and MIG UUID prefix.
	nvidiaResourceName      = "nvidia.com/gpu"
	nvidiaMigResourcePrefix = "nvidia.com/mig-"
	MIG_UUID_PREFIX         = "MIG-"

	// Note standard resource attributes
	podAttribute       = "pod"
	namespaceAttribute = "namespace"
	containerAttribute = "container"

	// Older attribute names for the same three resources.
	oldPodAttribute       = "pod_name"
	oldNamespaceAttribute = "pod_namespace"
	oldContainerAttribute = "container_name"

	undefinedConfigMapData = "none"
)
|
||||
|
||||
// KubernetesGPUIDType names the kind of GPU identifier to use; see
// Metric.getIDOfType for how each value is resolved.
type KubernetesGPUIDType string

// Supported KubernetesGPUIDType values.
const (
	// GPUUID selects the GPU UUID.
	GPUUID KubernetesGPUIDType = "uid"
	// DeviceName selects the GPU device name.
	DeviceName KubernetesGPUIDType = "device-name"
)
|
||||
|
||||
// DeviceOptions selects which devices of one family (GPUs with their GPU
// instances, or NvSwitches with their NvLinks) are monitored.
type DeviceOptions struct {
	// Flex: if true, then monitor all GPUs if MIG mode is disabled or all GPU
	// instances if MIG is enabled.
	Flex bool
	// MajorRange: the indices of each GPU/NvSwitch to monitor, or -1 to
	// monitor all.
	MajorRange []int
	// MinorRange: the indices of each GPUInstance/NvLink to monitor, or -1 to
	// monitor all.
	MinorRange []int
}
|
||||
|
||||
type Config struct {
|
||||
CollectorsFile string
|
||||
Address string
|
||||
CollectInterval int
|
||||
Kubernetes bool
|
||||
KubernetesGPUIdType KubernetesGPUIDType
|
||||
CollectDCP bool
|
||||
UseOldNamespace bool
|
||||
UseRemoteHE bool
|
||||
RemoteHEInfo string
|
||||
GPUDevices DeviceOptions
|
||||
SwitchDevices DeviceOptions
|
||||
NoHostname bool
|
||||
UseFakeGpus bool
|
||||
ConfigMapData string
|
||||
MetricGroups []dcgm.MetricGroup
|
||||
}
|
||||
|
||||
type Transform interface {
|
||||
Process(metrics [][]Metric, sysInfo SystemInfo) error
|
||||
Name() string
|
||||
}
|
||||
|
||||
type MetricsPipeline struct {
|
||||
config *Config
|
||||
|
||||
transformations []Transform
|
||||
migMetricsFormat *template.Template
|
||||
switchMetricsFormat *template.Template
|
||||
linkMetricsFormat *template.Template
|
||||
|
||||
counters []Counter
|
||||
gpuCollector *DCGMCollector
|
||||
switchCollector *DCGMCollector
|
||||
linkCollector *DCGMCollector
|
||||
}
|
||||
|
||||
type DCGMCollector struct {
|
||||
Counters []Counter
|
||||
DeviceFields []dcgm.Short
|
||||
Cleanups []func()
|
||||
UseOldNamespace bool
|
||||
SysInfo SystemInfo
|
||||
Hostname string
|
||||
}
|
||||
|
||||
type Counter struct {
|
||||
FieldID dcgm.Short
|
||||
FieldName string
|
||||
PromType string
|
||||
Help string
|
||||
}
|
||||
|
||||
type Metric struct {
|
||||
Counter *Counter
|
||||
Value string
|
||||
|
||||
GPU string
|
||||
GPUUUID string
|
||||
GPUDevice string
|
||||
GPUModelName string
|
||||
|
||||
UUID string
|
||||
|
||||
MigProfile string
|
||||
GPUInstanceID string
|
||||
Hostname string
|
||||
|
||||
Labels *map[string]string
|
||||
Attributes map[string]string
|
||||
}
|
||||
|
||||
func (m Metric) getIDOfType(idType KubernetesGPUIDType) (string, error) {
|
||||
// For MIG devices, return the MIG profile instead of
|
||||
if m.MigProfile != "" {
|
||||
return fmt.Sprintf("%s-%s", m.GPU, m.GPUInstanceID), nil
|
||||
}
|
||||
switch idType {
|
||||
case GPUUID:
|
||||
return m.GPUUUID, nil
|
||||
case DeviceName:
|
||||
return m.GPUDevice, nil
|
||||
}
|
||||
return "", fmt.Errorf("unsupported KubernetesGPUIDType for MetricID '%s'", idType)
|
||||
}
|
||||
|
||||
// promMetricType is the set of metric type names; a name is a member when its
// entry is true.
var promMetricType = map[string]bool{
	"counter":   true,
	"gauge":     true,
	"histogram": true,
	"label":     true,
	"summary":   true,
}
|
||||
|
||||
// MetricsServer holds an HTTP server, the current metrics text and a channel
// of metrics strings. It embeds a sync.Mutex; presumably the lock guards
// metrics - confirm against the server code.
type MetricsServer struct {
	sync.Mutex

	server      http.Server
	metrics     string
	metricsChan chan string
}
|
||||
|
||||
type PodMapper struct {
|
||||
Config *Config
|
||||
}
|
||||
|
||||
// PodInfo identifies a container by pod name, namespace and container name.
type PodInfo struct {
	Name      string
	Namespace string
	Container string
}
|
@ -0,0 +1,37 @@
|
||||
/*
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dcgmexporter
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
func WaitWithTimeout(wg *sync.WaitGroup, timeout time.Duration) error {
|
||||
c := make(chan struct{})
|
||||
go func() {
|
||||
defer close(c)
|
||||
wg.Wait()
|
||||
}()
|
||||
select {
|
||||
case <-c:
|
||||
return nil
|
||||
case <-time.After(timeout):
|
||||
return fmt.Errorf("Timeout waiting for WaitGroup")
|
||||
}
|
||||
}
|
@ -0,0 +1,44 @@
|
||||
# SECURITY
|
||||
|
||||
## Security
|
||||
|
||||
NVIDIA takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations.
|
||||
|
||||
If you believe you have found a security vulnerability in any NVIDIA-owned repository that meets [NVIDIA's definition of a security vulnerability](https://www.nvidia.com/en-us/security/psirt-policies/), please report it to us as described below.
|
||||
|
||||
## Reporting Security Issues
|
||||
|
||||
**Please do not report security vulnerabilities through public GitHub issues.**
|
||||
|
||||
To report any security vulnerabilities, please contact the NVIDIA Product Security Incident Response Team (PSIRT) at:
|
||||
|
||||
1. [https://www.nvidia.com/en-us/security/report-vulnerability/](https://www.nvidia.com/en-us/security/report-vulnerability/).
|
||||
2. If you prefer to submit without logging in, please email [psirt@nvidia.com](mailto:psirt@nvidia.com). If you report a potential vulnerability via email, please encrypt your communication using NVIDIA's public PGP key ([see PGP Key page](https://www.nvidia.com/en-us/security/pgp-key/)).
|
||||
3. Alternatively, you can report a security issue through GitHub using the GitHub Security Advisories feature at [https://github.com/NVIDIA/dcgm-exporter/security/advisories/new](https://github.com/NVIDIA/dcgm-exporter/security/advisories/new).
|
||||
|
||||
Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
|
||||
|
||||
- Type of issue (e.g., buffer overflow, remote code execution, etc.)
|
||||
- Permanent link of the source file(s) related to the manifestation of the issue
|
||||
- Any special configuration required to reproduce the issue
|
||||
- Step-by-step instructions to reproduce the issue
|
||||
- Proof-of-concept or exploit code (if possible)
|
||||
- Impact of the issue, including how an attacker might exploit the issue
|
||||
|
||||
This information will help us triage your report more quickly.
|
||||
|
||||
NVIDIA reserves the right to delete vulnerability reports until they're fixed.
|
||||
|
||||
## Preferred Languages
|
||||
|
||||
We prefer all communications to be in English.
|
||||
|
||||
## **Coordinated Vulnerability Disclosure**
|
||||
|
||||
NVIDIA strives to follow Coordinated Vulnerability Disclosure (CVD). CVD is a process by which independent reporters who discover a vulnerability in our product contact NVIDIA directly and allow us the opportunity to investigate and remediate the vulnerability before the reporter discloses the information to the public.
|
||||
|
||||
NVIDIA PSIRT will coordinate with the reporter throughout the vulnerability investigation and provide the reporter with updates on progress as appropriate. With the reporter's agreement, NVIDIA PSIRT may recognize the reporter on our Acknowledgement page for finding a valid product vulnerability and privately reporting the issue. After an update or mitigation information is publicly released by NVIDIA, the reporter is welcome to discuss the vulnerability publicly.
|
||||
|
||||
Following NVIDIA's CVD allows us to protect our customers while coordinating public disclosures and appropriately acknowledging the reporter(s) for their findings.
|
||||
|
||||
Occasionally NVIDIA will discover security vulnerabilities in products from other vendors. NVIDIA will follow its standard Coordinated Vulnerability Disclosure process and communicate the identified issue to the affected vendor or a third-party coordination center if this occurs.
|
@ -0,0 +1,29 @@
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: "dcgm-exporter"
|
||||
labels:
|
||||
app.kubernetes.io/name: "dcgm-exporter"
|
||||
app.kubernetes.io/version: "3.1.7"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "dcgm-exporter"
|
||||
app.kubernetes.io/version: "3.1.7"
|
||||
endpoints:
|
||||
- port: "metrics"
|
||||
path: "/metrics"
|
@ -0,0 +1,85 @@
|
||||
#! /bin/bash -x
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
set -euxo pipefail
shopt -s lastpipe

# Directory containing this script. Assign first and mark readonly afterwards:
# `readonly var="$(cmd)"` returns the status of `readonly`, which would hide a
# failing realpath/dirname from `set -e`.
basedir="$(dirname "$(realpath "$0")")"
readonly basedir

# shellcheck source=tests/common.sh
source "${basedir}/common.sh"

# shellcheck source=tests/metrics.sh
source "${basedir}/metrics.sh"

# Image coordinates of the exporter under test; "undefined" when not provided.
CI_REGISTRY_IMAGE=${CI_REGISTRY_IMAGE:-"undefined"}
CI_COMMIT_SHORT_SHA=${CI_COMMIT_SHORT_SHA:-"undefined"}
|
||||
|
||||
# Install jq from the distribution repositories.
# apt-get is used instead of apt because apt's command-line interface is not
# meant for use in scripts.
install::jq() {
	apt-get update && apt-get install -y --no-install-recommends jq
}
|
||||
|
||||
# Install Helm 3 with the upstream installer script.
# -f makes curl fail on HTTP errors so that an error page is never piped into
# bash; -sS hides the progress meter but still prints errors; -L follows
# redirects.
install::helm() {
	curl -fsSL https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash
}
|
||||
|
||||
# Package the chart in ./deployment and install it as release "dcgm-exporter",
# using the CI image repository and tag, with the ServiceMonitor enabled.
install::dcgm::exporter() {
	helm package deployment
	helm install --wait dcgm-exporter ./*.tgz \
		--set "image.repository=${CI_REGISTRY_IMAGE}/dcgm-exporter" \
		--set "image.tag=${CI_COMMIT_SHORT_SHA}" \
		--set "serviceMonitor.enabled=true"
}
|
||||
|
||||
# Add the "stable" chart repository and install the prometheus-operator chart
# from it under a generated release name.
install::prom() {
	helm repo add stable https://charts.helm.sh/stable
	helm install --wait stable/prometheus-operator --generate-name --set "prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false"
}
|
||||
|
||||
# Run the query "$1" against the Prometheus service in the cluster and print
# the `.data.result` JSON on stdout.
# Returns 1 when the result is missing, null or an empty list.
query::prom() {
	# Declared separately from the assignments so that a failing command
	# substitution is not masked by `local`'s own exit status.
	local ip result

	ip="$(kubectl get svc -l app=prometheus-operator-prometheus -o jsonpath='{.items[0].spec.clusterIP}')"
	# `// empty` turns a null/missing result into no output instead of the
	# literal string "null", which would otherwise pass the checks below.
	result="$(curl -sL "http://${ip}:9090/api/v1/query?query=$1" | jq -r '.data.result // empty')"

	[ "${result}" != "" ] || return 1
	[ "${result}" != "[]" ] || return 1

	echo "${result}"
}
|
||||
|
||||
# Succeed when pod "$1" is in phase "$2" (compared with .status.phase),
# return 1 otherwise.
query::pod::phase() {
	# Kept local so that the caller's variables are not overwritten.
	local phase
	phase="$(kubectl get pods "$1" -o jsonpath='{.status.phase}')"
	[ "${phase}" = "$2" ] || return 1
}
|
||||
|
||||
# Print the state of the cluster: pods, services and service monitors,
# followed by the full YAML of the dcgm-exporter pods.
testing::log::kube() {
	local resource
	for resource in pods svc serviceMonitor; do
		kubectl get "${resource}"
	done

	kubectl get pods -l "app.kubernetes.io/component=dcgm-exporter" -o yaml
}
|
||||
|
||||
# Install the tooling, Prometheus and the exporter under test.
install::jq
install::helm
install::prom
install::dcgm::exporter

# From here on, print the cluster state whenever a command fails.
trap 'testing::log::kube' ERR

# Run the setup, main and cleanup phase of every test case in order.
for test_case in "metrics"; do
	log INFO "=================Testing ${test_case}================="
	for phase in setup main cleanup; do
		"testing::${test_case}::${phase}" "$@"
	done
done
|
||||
|
@ -0,0 +1,69 @@
|
||||
#! /bin/bash
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Record whether stderr is a terminal.
if [ -t 2 ]; then
	readonly LOG_TTY=1
else
	readonly LOG_NO_TTY=1
fi

# Define the formatting sequences only on a terminal that reports at least
# 15 colors; otherwise the FMT_* variables stay unset.
if [ "${LOG_TTY-0}" -eq 1 ] && [ "$(tput colors)" -ge 15 ]; then
	readonly FMT_BOLD="$(tput bold)"
	readonly FMT_RED="$(tput setaf 1)"
	readonly FMT_YELLOW="$(tput setaf 3)"
	readonly FMT_BLUE="$(tput setaf 12)"
	readonly FMT_CLEAR="$(tput sgr0)"
fi
|
||||
|
||||
# log LEVEL MESSAGE...
# Write "[LEVEL] MESSAGE" to stderr. The level tag is colored for INFO, WARN
# and ERROR when the FMT_* variables are set; backslash escapes in the message
# are interpreted (printf %b).
log() {
	local -r level="$1"
	shift
	local -r message="$*"

	local -r fmt_off="${FMT_CLEAR-}"
	local fmt_on="${FMT_CLEAR-}"

	if [ "${level}" = "INFO" ]; then
		fmt_on="${FMT_BLUE-}"
	elif [ "${level}" = "WARN" ]; then
		fmt_on="${FMT_YELLOW-}"
	elif [ "${level}" = "ERROR" ]; then
		fmt_on="${FMT_RED-}"
	fi

	printf "%s[%s]%s %b\n" "${fmt_on}" "${level}" "${fmt_off}" "${message}" >&2
}
|
||||
|
||||
# with_retry MAX_ATTEMPTS DELAY COMMAND [ARGS...]
# Run COMMAND until it succeeds, sleeping DELAY between attempts. At most
# MAX_ATTEMPTS attempts are made; a value <= 0 retries forever.
# Progress messages go to stdout. Returns 0 on success and 1 once the
# attempts are exhausted.
with_retry() {
	local -r max_attempts="$1" delay="$2"
	shift 2
	local count=0 rc

	while true; do
		# Running the command as an `if` condition keeps a failure from
		# aborting the script under `set -e` without touching the shell
		# options. The previous `set +e ... set -e` pair left errexit enabled
		# even for callers that never had it on.
		if "$@"; then
			rc=0
		else
			rc="$?"
		fi
		count="$((count+1))"

		if [[ "${rc}" -eq 0 ]]; then
			echo "'$*' SUCCEEDED in ${count} attempts !"
			return 0
		fi

		if [[ "${max_attempts}" -le 0 ]] || [[ "${count}" -lt "${max_attempts}" ]]; then
			echo "'$*' FAILED at attempt ${count}, will retry in ${delay} seconds ..."
			sleep "${delay}"
		else
			break
		fi
	done

	echo "'$*' FAILED in ${count} attempts !"
	return 1
}
|
@ -0,0 +1,14 @@
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: nbody-pod
|
||||
spec:
|
||||
restartPolicy: OnFailure
|
||||
containers:
|
||||
- name: nbody
|
||||
image: "nvcr.io/nvidia/k8s/cuda-sample:nbody"
|
||||
command: ["nbody"]
|
||||
args: ["-benchmark", "-i=10000000"]
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: 1
|
@ -0,0 +1,54 @@
|
||||
#! /bin/bash -x
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Setup phase of the metrics test case: nothing to prepare.
testing::metrics::setup() {
	true
}
|
||||
|
||||
# Cleanup phase of the metrics test case: delete the resources defined in
# tests/gpu-pod.yaml.
testing::metrics::cleanup() {
	kubectl delete --filename tests/gpu-pod.yaml
}
|
||||
|
||||
# Succeed when the last DCGM_FI_DEV_GPU_UTIL sample returned by Prometheus is
# an integer >= 0; return 1 otherwise.
testing::metrics::utilization::increase() {
	# Kept local so that the caller's variables are not overwritten.
	local util

	# For a short while we might have multiple values returned
	# In this case it seems like the first item is the oldest
	util="$(query::prom "DCGM_FI_DEV_GPU_UTIL" | jq -r '.[-1].value[1]')"
	[ "${util}" -ge 0 ] || return 1
}
|
||||
|
||||
# Succeed when the first DCGM_FI_DEV_GPU_UTIL series carries non-empty UUID
# and gpu labels and is attributed to pod "nbody-pod" in namespace "default";
# return 1 otherwise.
testing::metrics::ensure::kube::labels() {
	local result uuid gpu pod namespace

	result="$(query::prom "DCGM_FI_DEV_GPU_UTIL")"

	# `// empty` makes jq print nothing for a missing label. Without it
	# `jq -r` prints the string "null", so the non-empty checks below could
	# never fail.
	uuid="$(echo "${result}" | jq -r '.[0].metric.UUID // empty')"
	gpu="$(echo "${result}" | jq -r '.[0].metric.gpu // empty')"
	pod="$(echo "${result}" | jq -r '.[0].metric.exported_pod // empty')"
	namespace="$(echo "${result}" | jq -r '.[0].metric.exported_namespace // empty')"

	[ "${uuid}" != "" ] || return 1
	[ "${gpu}" != "" ] || return 1

	[ "${pod}" = "nbody-pod" ] || return 1
	[ "${namespace}" = "default" ] || return 1
}
|
||||
|
||||
# Main phase of the metrics test case: wait for exporter metrics to show up in
# Prometheus, start the GPU workload pod, then check utilization and labels.
testing::metrics::main() {
	local -r pod_manifest="tests/gpu-pod.yaml"

	# Prometheus can take a while to pickup the exporter
	with_retry 30 10s query::prom "DCGM_FI_DEV_MEMORY_TEMP"

	kubectl create -f "${pod_manifest}"
	with_retry 30 10s query::pod::phase "nbody-pod" "Running"

	with_retry 10 10s testing::metrics::utilization::increase
	with_retry 10 10s testing::metrics::ensure::kube::labels
}
|
@ -0,0 +1,3 @@
|
||||
instance_type = "p3.2xlarge"
|
||||
project_name = "dcgm-exporter"
|
||||
setup_params = "--driver --k8s-plugin --nvcr"
|
Loading…
Reference in New Issue