From 11a5c7b0cd669c95fa48f21b2bc6d1525d8eb6f6 Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Fri, 3 Apr 2020 09:36:11 +0000 Subject: [PATCH 1/2] Bump version to 1.0.0-beta5 Signed-off-by: Kevin Klues --- README.md | 23 +++++++++++++++------ extensions-v1beta1-nvidia-device-plugin.yml | 2 +- nvidia-device-plugin.yml | 2 +- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 980843523..9ae96c052 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ Once you have enabled this option on *all* the GPU nodes you wish to use, you can then enable GPU support in your cluster by deploying the following Daemonset: ```shell -$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/1.0.0-beta4/nvidia-device-plugin.yml +$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/1.0.0-beta5/nvidia-device-plugin.yml ``` ### Running GPU Jobs @@ -115,24 +115,24 @@ The next sections are focused on building the device plugin and running it. #### Build Option 1, pull the prebuilt image from [Docker Hub](https://hub.docker.com/r/nvidia/k8s-device-plugin): ```shell -$ docker pull nvidia/k8s-device-plugin:1.0.0-beta4 +$ docker pull nvidia/k8s-device-plugin:1.0.0-beta5 ``` Option 2, build without cloning the repository: ```shell -$ docker build -t nvidia/k8s-device-plugin:1.0.0-beta4 https://github.com/NVIDIA/k8s-device-plugin.git#1.0.0-beta4 +$ docker build -t nvidia/k8s-device-plugin:1.0.0-beta5 https://github.com/NVIDIA/k8s-device-plugin.git#1.0.0-beta5 ``` Option 3, if you want to modify the code: ```shell $ git clone https://github.com/NVIDIA/k8s-device-plugin.git && cd k8s-device-plugin -$ git checkout 1.0.0-beta4 -$ docker build -t nvidia/k8s-device-plugin:1.0.0-beta4 . +$ git checkout 1.0.0-beta5 +$ docker build -t nvidia/k8s-device-plugin:1.0.0-beta5 . ``` #### Run locally ```shell -$ docker run --security-opt=no-new-privileges --cap-drop=ALL --network=none -it -v /var/lib/kubelet/device-plugins:/var/lib/kubelet/device-plugins nvidia/k8s-device-plugin:1.0.0-beta4 +$ docker run --security-opt=no-new-privileges --cap-drop=ALL --network=none -it -v /var/lib/kubelet/device-plugins:/var/lib/kubelet/device-plugins nvidia/k8s-device-plugin:1.0.0-beta5 ``` #### Deploy as Daemon Set: @@ -154,6 +154,17 @@ $ ./k8s-device-plugin ## Changelog +### Version 1.0.0-beta5 + +- Add a new plugin.yml variant that is compatible with the CPUManager +- Change CMD in Dockerfile to ENTRYPOINT +- Add flag to optionally return list of device nodes in Allocate() call +- Refactor device plugin to eventually handle multiple resource types +- Move plugin error retry to event loop so we can exit with a signal +- Update all vendored dependencies to their latest versions +- Fix bug that was inadvertently *always* disabling health checks +- Update minimal driver version to 384.81 + ### Version 1.0.0-beta4 - Fixes a bug with a nil pointer dereference around `getDevices:CPUAffinity` diff --git a/extensions-v1beta1-nvidia-device-plugin.yml b/extensions-v1beta1-nvidia-device-plugin.yml index a631b6c2a..f590016b5 100644 --- a/extensions-v1beta1-nvidia-device-plugin.yml +++ b/extensions-v1beta1-nvidia-device-plugin.yml @@ -43,7 +43,7 @@ spec: # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ priorityClassName: "system-node-critical" containers: - - image: nvidia/k8s-device-plugin:1.0.0-beta4 + - image: nvidia/k8s-device-plugin:1.0.0-beta5 name: nvidia-device-plugin-ctr securityContext: allowPrivilegeEscalation: false diff --git a/nvidia-device-plugin.yml b/nvidia-device-plugin.yml index 5eff0547a..4d11a40ee 100644 --- a/nvidia-device-plugin.yml +++ b/nvidia-device-plugin.yml @@ -46,7 +46,7 @@ spec: # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ priorityClassName: "system-node-critical" containers: - - image: nvidia/k8s-device-plugin:1.0.0-beta4 + - image: nvidia/k8s-device-plugin:1.0.0-beta5 name: nvidia-device-plugin-ctr securityContext: allowPrivilegeEscalation: false From c4be4be056863fb9fd169826f4619e25828706b2 Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Fri, 3 Apr 2020 09:36:51 +0000 Subject: [PATCH 2/2] Add a new plugin.yml variant that is compatible with the CPUManager Signed-off-by: Kevin Klues --- RELEASE.md | 1 + ...a-device-plugin-compat-with-cpumanager.yml | 60 +++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 nvidia-device-plugin-compat-with-cpumanager.yml diff --git a/RELEASE.md b/RELEASE.md index 75352460b..c760e5b95 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -11,6 +11,7 @@ Publishing the container is automated through gitlab-ci and only requires on to - [ ] Update the README changelog - [ ] Update the device plugin (1.16+) to use the new container version (nvidia-device-plugin.yml) +- [ ] Update the device plugin compatible with the CPUManager (1.16+) to use the new container version (nvidia-device-plugin-compat-with-cpumanager.yml) - [ ] Update the legacy device plugin (pre 1.16) to use the new container version (extensions-v1beta1-nvidia-device-plugin.yml) - [ ] Commit, Tag and Push to Gitlab - [ ] Trigger the [multi arch manifest CI](https://gitlab.com/nvidia/container-images/dockerhub-manifests) diff --git a/nvidia-device-plugin-compat-with-cpumanager.yml b/nvidia-device-plugin-compat-with-cpumanager.yml new file mode 100644 index 000000000..47124f392 --- /dev/null +++ b/nvidia-device-plugin-compat-with-cpumanager.yml @@ -0,0 +1,60 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-device-plugin-daemonset + namespace: kube-system +spec: + selector: + matchLabels: + name: nvidia-device-plugin-ds + updateStrategy: + type: RollingUpdate + template: + metadata: + # This annotation is deprecated. Kept here for backward compatibility + # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + annotations: + scheduler.alpha.kubernetes.io/critical-pod: "" + labels: + name: nvidia-device-plugin-ds + spec: + tolerations: + # This toleration is deprecated. Kept here for backward compatibility + # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + - key: CriticalAddonsOnly + operator: Exists + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + # Mark this pod as a critical add-on; when enabled, the critical add-on + # scheduler reserves resources for critical add-on pods so that they can + # be rescheduled after a failure. + # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + priorityClassName: "system-node-critical" + containers: + - image: nvidia/k8s-device-plugin:1.0.0-beta5 + name: nvidia-device-plugin-ctr + args: ["--pass-device-specs"] + securityContext: + privileged: true + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins