# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

image:
  repository: nvcr.io/nvidia/k8s/dcgm-exporter
  pullPolicy: IfNotPresent
  # Image tag defaults to AppVersion, but you can use the tag key
  # to pin a specific image tag, e.g.:
  tag: "3.2.5-3.1.7-ubuntu20.04"

# Command line arguments passed through to the exporter.
# Change the following reference to "/etc/dcgm-exporter/default-counters.csv"
# to stop profiling metrics from DCGM.
arguments: ["-f", "/etc/dcgm-exporter/dcp-metrics-included.csv"]
# NOTE: in general, add any command line arguments to `arguments` above
# and they will be passed through.
#
# Use "-r", "<HOST>:<PORT>" to connect to an already running hostengine.
# Example arguments: ["-r", "host123:5555"]
#
# Use "-n" to remove the hostname tag from the output.
# Example arguments: ["-n"]
#
# Use "-d" to specify the devices to monitor. -d must be followed by a string
# in the following format: [f] or [g[:numeric_range][+]][i[:numeric_range]]
# where a numeric range is something like 0-4 or 0,2,4, etc.
# Example arguments: ["-d", "g+i"] to monitor all GPUs and GPU instances, or
# ["-d", "g:0-3"] to monitor GPUs 0-3.
#
# Use "-m" to specify the namespace and name of a configmap containing
# the watched exporter fields.
# Example arguments: ["-m", "default:exporter-metrics-config-map"]

imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""
namespaceOverride: ""

runtimeClassName: ""

serviceAccount:
  # Specifies whether a service account should be created.
  create: true
  # Annotations to add to the service account.
  annotations: {}
  # The name of the service account to use.
  # If not set and create is true, a name is generated using the fullname
  # template.
  name: null

rollingUpdate:
  # Maximum number of DaemonSet pods that can be unavailable during the
  # update.
  maxUnavailable: 1
  # Maximum number of nodes with an existing available DaemonSet pod that
  # can have an updated DaemonSet pod during an update.
  maxSurge: 0

podAnnotations: {}
  # These annotations are required for Prometheus scraping:
  # prometheus.io/scrape: "true"
  # prometheus.io/port: "9400"

podSecurityContext: {}
  # fsGroup: 2000

# NOTE(review): the container runs as root (UID 0) with the SYS_ADMIN
# capability added. Presumably this is needed for the DCGM profiling metrics
# selected in `arguments` -- confirm before tightening these settings.
securityContext:
  runAsNonRoot: false
  runAsUser: 0
  capabilities:
    add: ["SYS_ADMIN"]
  # readOnlyRootFilesystem: true

service:
  # NOTE: this key is spelled `enable`, whereas serviceMonitor below uses
  # `enabled`.
  enable: true
  type: ClusterIP
  port: 9400
  # Quoted because the value starts with ":".
  address: ":9400"
  # Annotations to add to the service.
  annotations: {}

resources: {}
  # limits:
  #   cpu: 100m
  #   memory: 128Mi
  # requests:
  #   cpu: 100m
  #   memory: 128Mi

serviceMonitor:
  enabled: true
  interval: 15s
  honorLabels: false
  additionalLabels: {}
    # monitoring: prometheus
  relabelings: []
    # - sourceLabels: [__meta_kubernetes_pod_node_name]
    #   separator: ;
    #   regex: ^(.*)$
    #   targetLabel: nodename
    #   replacement: $1
    #   action: replace

nodeSelector: {}
  # node: gpu

tolerations: []
# - operator: Exists

affinity: {}
  # nodeAffinity:
  #   requiredDuringSchedulingIgnoredDuringExecution:
  #     nodeSelectorTerms:
  #       - matchExpressions:
  #           - key: nvidia-gpu
  #             operator: Exists

extraHostVolumes: []
# - name: host-binaries
#   hostPath: /opt/bin

extraConfigMapVolumes: []
# - name: exporter-metrics-volume
#   configMap:
#     name: exporter-metrics-config-map

extraVolumeMounts: []
# - name: host-binaries
#   mountPath: /opt/bin
#   readOnly: true

extraEnv: []
# - name: EXTRA_VAR
#   value: "TheStringValue"

kubeletPath: "/var/lib/kubelet/pod-resources"