From ba7e97029da76ed30bd5e47325aa182b798f7044 Mon Sep 17 00:00:00 2001 From: chaitan1731 Date: Thu, 11 Apr 2024 03:12:15 -0400 Subject: [PATCH] ansible: Added inital playbook code Signed-off-by: chaitanya1731 --- one_click/README.md | 31 ++++ one_click/gpu_provisioning_playbook.yaml | 192 +++++++++++++++++++++++ 2 files changed, 223 insertions(+) create mode 100644 one_click/README.md create mode 100644 one_click/gpu_provisioning_playbook.yaml diff --git a/one_click/README.md b/one_click/README.md new file mode 100644 index 00000000..07b58b88 --- /dev/null +++ b/one_click/README.md @@ -0,0 +1,31 @@ +# Deploy Intel Technology Enabling Solutions with Red Hat OpenShift using “One-Click” + +## Overview +Red Hat [Ansible](https://www.ansible.com/) and Operator technologies are used for “One-Click Deployment” of Intel technology enabling solutions with Red Hat OpenShift Container Platform (RHOCP). Ansible technology automates the operator installation and configuration steps using a playbook, making deployment as simple as a single click. + +The referenced Ansible playbooks here can be used by the cluster administrators to customize their own playbooks. + +>[!NOTE] +> It is recommended to start from [Get started](/README.md#getting-started) to get familiar with the installation and configuration of the general operator before composing the first playbook. + +## Reference Playbook – Intel Data Center GPU Provisioning + +This playbook demonstrates the one-click provisioning of Intel Data Center GPU on an RHOCP cluster. The steps involved are installation and configuration of general Operators including Node Feature Discovery (NFD) operator, Kernel Module Management (KMM) operator, and the Intel Device Plugins Operator. + +### Prerequisite + +Before running the playbook, ensure the following prerequisites are met: +- Provisioned RHOCP Cluster +- Red Hat Enterprise Linux (RHEL) system with [Ansible](https://docs.ansible.com/ansible/2.9/installation_guide/intro_installation.html#installing-ansible-on-rhel-centos-or-fedora) installed and configured with a `kubeconfig` to connect to your RHOCP cluster. + +### Run the Playbook +To run the ansible playbook, clone this repository to your RHEL system. Navigate to the directory containing the playbook. +``` +$ git clone https://github.com/intel/intel-technology-enabling-for-openshift.git +$ cd intel-technology-enabling-for-openshift/one_click +``` + +Execute below single command to provision Intel Data Center GPU: +``` +$ ansible-playbook gpu_provisioning_playbook.yaml +``` \ No newline at end of file diff --git a/one_click/gpu_provisioning_playbook.yaml b/one_click/gpu_provisioning_playbook.yaml new file mode 100644 index 00000000..83870b30 --- /dev/null +++ b/one_click/gpu_provisioning_playbook.yaml @@ -0,0 +1,192 @@ +# Copyright (c) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +- hosts: localhost + vars: + kubeconfig_path: "~/.kube/config" + environment: + KUBECONFIG: "{{ kubeconfig_path }}" + tasks: + - name: Install Dependencies + tags: + - install_dependencies + block: + - name: NFD - Install Node Feature Discovery Operator + tags: + - nfd + block: + - name: NFD - Create openshift-nfd namespace + k8s: + name: openshift-nfd + api_version: v1 + kind: Namespace + state: present + wait: yes + - name: NFD - Create an nfd-operator group v1 + k8s: + definition: + apiVersion: operators.coreos.com/v1 + kind: OperatorGroup + metadata: + generateName: openshift-nfd- + name: openshift-nfd + namespace: openshift-nfd + spec: + targetNamespaces: + - openshift-nfd + wait: yes + - name: NFD - Create subscription for RH NFD operator + k8s: + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: Subscription + metadata: + name: nfd + namespace: openshift-nfd + spec: + channel: "stable" + installPlanApproval: Automatic + name: nfd + source: redhat-operators + sourceNamespace: openshift-marketplace + wait: yes + wait_condition: + reason: AllCatalogSourcesHealthy + type: CatalogSourcesUnhealthy + status: 'False' + - name: NFD - Wait until the nfd-operator-controller Deployment is available + k8s_info: + kind: Deployment + wait: yes + name: nfd-controller-manager + namespace: openshift-nfd + wait_condition: + type: Available + status: 'True' + reason: MinimumReplicasAvailable + - name: KMM - Install Kernel Module Management Operator + tags: + - kmm + block: + - name: KMM - Create openshift-kmm namespace + k8s: + name: openshift-kmm + api_version: v1 + kind: Namespace + state: present + wait: yes + - name: KMM - Create OperatorGroup v1 in openshift-kmm namespace + k8s: + definition: + apiVersion: operators.coreos.com/v1 + kind: OperatorGroup + metadata: + name: kernel-module-management + namespace: openshift-kmm + wait: yes + - name: KMM - Create Subscription for KMM Operator + k8s: + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: Subscription + metadata: + name: kernel-module-management + namespace: openshift-kmm + spec: + channel: stable + installPlanApproval: Automatic + name: kernel-module-management + source: redhat-operators + sourceNamespace: openshift-marketplace + startingCSV: kernel-module-management.v2.1.1 + wait: yes + wait_condition: + reason: AllCatalogSourcesHealthy + type: CatalogSourcesUnhealthy + status: 'False' + - name: KMM - Wait until the kmm-operator-controller Deployment is available + k8s_info: + kind: Deployment + wait: yes + name: kmm-operator-controller + namespace: openshift-kmm + wait_condition: + type: Available + status: 'True' + reason: MinimumReplicasAvailable + - name: KMM - Update KMM CM to set FW path + command: | + oc patch configmap kmm-operator-manager-config -n openshift-kmm --type='json' -p='[{"op": "add", "path": "/data/controller_config.yaml", "value": "healthProbeBindAddress: :8081\nmetricsBindAddress: 127.0.0.1:8080\nleaderElection:\n enabled: true\n resourceID: kmm.sigs.x-k8s.io\nwebhook:\n disableHTTP2: true\n port: 9443\nworker:\n runAsUser: 0\n seLinuxType: spc_t\n setFirmwareClassPath: /var/lib/firmware"}]' + - name: KMM - Delete the KMM operator controller pod for `ConfigMap` changes to take effect. + shell: oc get pods -n openshift-kmm | grep -i "kmm-operator-controller-" | awk '{print $1}' | xargs oc delete pod -n openshift-kmm + - name: KMM - wait 10 seconds until KMM operator pod is up and running + pause: + seconds: 10 + - name: IDPO - Install Intel Device Plugins Operator + tags: + - idpo + block: + - name: IDPO - Install Intel Device Plugins Operator + k8s: + state: present + src: "../device_plugins/install_operator.yaml" + wait: yes + - name: IDPO - Wait until the inteldeviceplugins-controller-manager Deployment is available + k8s_info: + kind: Deployment + wait: yes + name: inteldeviceplugins-operator-controller + namespace: openshift-operators + wait_condition: + type: Available + status: 'True' + reason: MinimumReplicasAvailable + - name: NFD - Install NFD CRs + block: + - name: NFD - Create NFD discovery CR + k8s: + state: present + src: "../nfd/node-feature-discovery-openshift.yaml" + wait: yes + - name: NFD - Create NFD rules instance CR + k8s: + state: present + src: "../nfd/node-feature-rules-openshift.yaml" + wait: yes + - name: KMM - Deploy Pre-built Out-of-Tree Intel Data Center GPU Driver Container for OpenShift using KMM + block: + - name: KMM - Install KMM pre-build Module CR + k8s: + state: present + src: "../kmmo/intel-dgpu.yaml" + wait: yes + - name: IDPO - Install GPU plugin + block: + - name: IDPO - Create Intel GPU device plugin + k8s: + state: present + src: "../device_plugins/gpu_device_plugin.yaml" + wait: yes + - name: GPU TEST + tags: + - gpu_test + block: + - name: GPU TEST - Get Node Resource Information + kubernetes.core.k8s_info: + api: v1 + kind: Node + label_selectors: + - "intel.feature.node.kubernetes.io/gpu=true" + - "kmm.node.kubernetes.io/openshift-kmm.intel-dgpu.ready" + wait: yes + wait_timeout: 120 + register: cluster_nodes_info + until: + - cluster_nodes_info.resources is defined + - name: Print cluster resources + debug: + msg: + - "Please verify Capacity and Allocatable Intel Data Center GPU Resources on the node - " + - "Capacity: { + 'gpu.intel.com/i915': {{ cluster_nodes_info.resources[0].status.capacity['gpu.intel.com/i915'] }}," + - "Allocatable Resources: { + 'gpu.intel.com/i915': {{ cluster_nodes_info.resources[0].status.allocatable['gpu.intel.com/i915'] }}," \ No newline at end of file