From acb9ea1fcbf7ed03c1a27b99b9af17a28cf17fed Mon Sep 17 00:00:00 2001 From: Pawel Palucki Date: Wed, 22 May 2024 10:47:14 -0100 Subject: [PATCH] New: support for PERFMON capability, silent mode and some extra env debug variables --- deployment/pcm/README.md | 6 +++--- deployment/pcm/templates/_helpers.tpl | 3 +-- deployment/pcm/templates/daemonset.yaml | 7 +++++++ deployment/pcm/values.yaml | 10 ++++++++++ 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/deployment/pcm/README.md b/deployment/pcm/README.md index 1bf8e838..cf874dc8 100644 --- a/deployment/pcm/README.md +++ b/deployment/pcm/README.md @@ -8,8 +8,9 @@ Helm chart instructions - Support for bare-metal and VM host configurations (files: [values-metal.yaml](values-metal.yaml), [values-vm.yaml](values-vm.yaml)), - Ability to deploy multiple releases alongside configured differently to handle different kinds of machines (bare-metal, VM) at the [same time](#heterogeneous-mixed-vmmetal-instances-cluster), - Linux Watchdog handling (controlled with `PCM_KEEP_NMI_WATCHDOG`, `PCM_NO_AWS_WORKAROUND`, `nmiWatchdogMount` values). -- Deploy to own namespace with "helm install ... **-n pcm --create-namespace**" -- Silent mode (value: `silent=false`, default) +- Deploy to own namespace with "helm install ... **-n pcm --create-namespace**". +- Silent mode (value: `silent=false`, default). +- Backward compatible with older Linux kernels (<5.8) - (value: `cap_perfmon=false`). 
Here are available methods in this chart of metrics collection w.r.t interfaces and required access: @@ -87,7 +88,6 @@ More information here: https://kubernetes.io/docs/tutorials/security/ns-level-ps #### 1) (Optionally) mount resctrl filesystem (for RDT metrics) to unload "msr" kernel module for validation ``` -echo 0 > /proc/sys/kernel/perf_event_paranoid mount -t resctrl resctrl /sys/fs/resctrl ``` diff --git a/deployment/pcm/templates/_helpers.tpl b/deployment/pcm/templates/_helpers.tpl index 446325cc..fffa7025 100644 --- a/deployment/pcm/templates/_helpers.tpl +++ b/deployment/pcm/templates/_helpers.tpl @@ -60,9 +60,8 @@ securityContext: */}} capabilities: add: - - SYS_ADMIN + - {{ if .Values.cap_perfmon }}PERFMON{{ else }}SYS_ADMIN{{ end }} - SYS_RAWIO - #- PERFMON {{- end }} {{- end }} diff --git a/deployment/pcm/templates/daemonset.yaml b/deployment/pcm/templates/daemonset.yaml index 3ed2b56e..1bf21217 100644 --- a/deployment/pcm/templates/daemonset.yaml +++ b/deployment/pcm/templates/daemonset.yaml @@ -110,6 +110,13 @@ spec: value: {{ .Values.PCM_KEEP_NMI_WATCHDOG | quote }} - name: PCM_NO_AWS_WORKAROUND value: {{ .Values.PCM_NO_AWS_WORKAROUND | quote }} + - name: PCM_NO_UNCORE_PMU_DISCOVERY + value: {{ .Values.PCM_NO_UNCORE_PMU_DISCOVERY | quote }} + - name: PCM_PRINT_UNCORE_PMU_DISCOVERY + value: {{ .Values.PCM_PRINT_UNCORE_PMU_DISCOVERY | quote }} + - name: PCM_PRINT_TOPOLOGY + value: {{ .Values.PCM_PRINT_TOPOLOGY | quote }} + {{- with .Values.probes }} livenessProbe: {{- include "pcm.probe" . 
| nindent 12 }} diff --git a/deployment/pcm/values.yaml b/deployment/pcm/values.yaml index 917cf243..73a5d553 100644 --- a/deployment/pcm/values.yaml +++ b/deployment/pcm/values.yaml @@ -18,6 +18,11 @@ imagePullSecrets: {} # Configures SecurityContext to not privileged (by default) so SYS_ADMIN/SYS_RAWIO capabilietes are required for running pod privileged: false +# Use new kernel 5.8+ PERFMON (least privileged) instead of generic SYS_ADMIN capability +# !Warning requires kernel 5.8+ +# more info here: https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html#perf-events-access-control +cap_perfmon: true + # Run pcm in silent mode (additional -silent argument to pcm-sensor-server binary) # Removes some of debug outputs (like warnings about unability to open some /sys... /proc... files) silent: false @@ -72,6 +77,11 @@ PCM_NO_AWS_WORKAROUND: 0 # mounting watchdog is recommened when PCM_KEEP_NMI_WATCHDOG=0 or we expect AWS workaround to be applied nmiWatchdogMount: true +### -------------- Other (Debugging options for uncore pmu discovery) +PCM_NO_UNCORE_PMU_DISCOVERY: 0 # skip 1: this is not required for direct privileged access and with 0 ends with WARNING enumeration failed +PCM_PRINT_UNCORE_PMU_DISCOVERY: 1 # show: discovered pmu +PCM_PRINT_TOPOLOGY: 0 # show individual CPU topology for each core (plenty of lines) + ### =============================== Optional POD fields no related to PCM =============================== # Pod level podAnnotations: {}