From e01dff8f1148c2d1ce614bf2c6f20a67ace4f9b9 Mon Sep 17 00:00:00 2001 From: "mergify[bot]" <37929162+mergify[bot]@users.noreply.github.com> Date: Wed, 15 Nov 2023 08:15:11 +0100 Subject: [PATCH 01/21] Changelog for 8.11.1 (#3751) (#3767) * Changelog for 8.11.1 * Update changelog/8.11.1.asciidoc Co-authored-by: David Kilfoyle <41695641+kilfoyle@users.noreply.github.com> --------- Co-authored-by: David Kilfoyle <41695641+kilfoyle@users.noreply.github.com> (cherry picked from commit 9f6cf349e44688ff26e47c2fbfbc934bf28fd3d2) Co-authored-by: Pierre HILBERT --- changelog/8.11.1.asciidoc | 33 +++++++++++++++++++ changelog/8.11.1.yaml | 14 ++++++++ ...35373-Add-component-fields-to-metrics.yaml | 32 ------------------ 3 files changed, 47 insertions(+), 32 deletions(-) create mode 100644 changelog/8.11.1.asciidoc create mode 100644 changelog/8.11.1.yaml delete mode 100644 changelog/fragments/1697735373-Add-component-fields-to-metrics.yaml diff --git a/changelog/8.11.1.asciidoc b/changelog/8.11.1.asciidoc new file mode 100644 index 00000000000..8fba60d7440 --- /dev/null +++ b/changelog/8.11.1.asciidoc @@ -0,0 +1,33 @@ +// begin 8.11.1 relnotes + +[[release-notes-8.11.1]] +== 8.11.1 + +Review important information about the 8.11.1 release. + + + + + + + + + +[discrete] +[[new-features-8.11.1]] +=== New features + +The 8.11.1 release adds the following new and notable features. + + +elastic-agent:: + +* Add component {id, binary} to monitoring metrics from {agent} and {beats}. {elastic-agent-pull}https://github.com/elastic/elastic-agent/pull/3626[#https://github.com/elastic/elastic-agent/pull/3626] {elastic-agent-issue}https://github.com/elastic/integrations/issues/7977[#https://github.com/elastic/integrations/issues/7977] + + + + + + + +// end 8.11.1 relnotes diff --git a/changelog/8.11.1.yaml b/changelog/8.11.1.yaml new file mode 100644 index 00000000000..73c0e4888c0 --- /dev/null +++ b/changelog/8.11.1.yaml @@ -0,0 +1,14 @@ +version: 8.11.1 +entries: + - kind: feature + summary: Add component {id, binary} to monitoring metrics from Elastic-Agent and Beats + description: "" + component: elastic-agent + pr: + - https://github.com/elastic/elastic-agent/pull/3626 + issue: + - https://github.com/elastic/integrations/issues/7977 + timestamp: 1697735373 + file: + name: 1697735373-Add-component-fields-to-metrics.yaml + checksum: e78d27aa33269834757dceee23e195ae6ea8ecd5 diff --git a/changelog/fragments/1697735373-Add-component-fields-to-metrics.yaml b/changelog/fragments/1697735373-Add-component-fields-to-metrics.yaml deleted file mode 100644 index 780093d1434..00000000000 --- a/changelog/fragments/1697735373-Add-component-fields-to-metrics.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Kind can be one of: -# - breaking-change: a change to previously-documented behavior -# - deprecation: functionality that is being removed in a later release -# - bug-fix: fixes a problem in a previous version -# - enhancement: extends functionality but does not break or fix existing behavior -# - feature: new functionality -# - known-issue: problems that we are aware of in a given version -# - security: impacts on the security of a product or a user’s deployment. -# - upgrade: important information for someone upgrading from a prior version -# - other: does not fit into any of the other categories -kind: feature - -# Change summary; a 80ish characters long description of the change. 
-summary: Add component.{id, binary} to monitoring metrics from Elastic-Agent and Beats
-
-# Long description; in case the summary is not enough to describe the change
-# this field accommodate a description without length limits.
-# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
-#description:
-
-# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
-component: elastic-agent
-
-# PR URL; optional; the PR number that added the changeset.
-# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
-# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
-# Please provide it if you are adding a fragment for a different PR.
-pr: https://github.com/elastic/elastic-agent/pull/3626
-
-# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
-# If not present is automatically filled by the tooling with the issue linked to the PR number.
-issue: https://github.com/elastic/integrations/issues/7977

From 26947871957102cd972210202f15217789ff5cb9 Mon Sep 17 00:00:00 2001
From: Lee E Hinman <57081003+leehinman@users.noreply.github.com>
Date: Wed, 15 Nov 2023 00:26:48 -0800
Subject: [PATCH 02/21] Update pgp-workaround.md (#3766)

* Update pgp-workaround.md

Add info on TLS

* Update docs/pgp-workaround.md

Co-authored-by: Craig MacKenzie

---------

Co-authored-by: Craig MacKenzie
---
 docs/pgp-workaround.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/pgp-workaround.md b/docs/pgp-workaround.md
index 1da21bde919..621a22514ce 100644
--- a/docs/pgp-workaround.md
+++ b/docs/pgp-workaround.md
@@ -56,3 +56,7 @@ host { 'elastic-artifacts':
   path: '/etc/hosts'
   line: ' artifacts.elastic.co'
 ```
+
+## TLS
+
+Because the connection uses `https`, the certificate returned by the host impersonating `https://artifacts.elastic.co` must include `artifacts.elastic.co` as one of its Subject Alternative Names.

From 91c03aa7170141c550d9c8c70626fb2f4f0a1a0f Mon Sep 17 00:00:00 2001
From: Dan Kortschak <90160302+efd6@users.noreply.github.com>
Date: Thu, 16 Nov 2023 05:54:47 +1030
Subject: [PATCH 03/21] internal/pkg/agent/application/coordinator: fix logging calls (#3679)

Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
---
 .../fragments/1698808222-orphan-verbs.yaml    | 32 +++++++++++++++++++
 .../application/coordinator/coordinator.go    |  4 +--
 2 files changed, 34 insertions(+), 2 deletions(-)
 create mode 100644 changelog/fragments/1698808222-orphan-verbs.yaml

diff --git a/changelog/fragments/1698808222-orphan-verbs.yaml b/changelog/fragments/1698808222-orphan-verbs.yaml
new file mode 100644
index 00000000000..a423b94619f
--- /dev/null
+++ b/changelog/fragments/1698808222-orphan-verbs.yaml
@@ -0,0 +1,32 @@
+# Kind can be one of:
+# - breaking-change: a change to previously-documented behavior
+# - deprecation: functionality that is being removed in a later release
+# - bug-fix: fixes a problem in a previous version
+# - enhancement: extends functionality but does not break or fix existing behavior
+# - feature: new functionality
+# - known-issue: problems that we are aware of in a given version
+# - security: impacts on the security of a product or a user’s deployment.
+# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: bug-fix + +# Change summary; a 80ish characters long description of the change. +summary: Fix logging calls incorrectly using non-f variants and missing args. + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: elastic-agent + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +#pr: https://github.com/owner/repo/1234 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +#issue: https://github.com/owner/repo/1234 diff --git a/internal/pkg/agent/application/coordinator/coordinator.go b/internal/pkg/agent/application/coordinator/coordinator.go index bce2188c0e7..1bbfde40a20 100644 --- a/internal/pkg/agent/application/coordinator/coordinator.go +++ b/internal/pkg/agent/application/coordinator/coordinator.go @@ -1206,11 +1206,11 @@ func (c *Coordinator) filterByCapabilities(comps []component.Component) []compon for _, component := range comps { // If this is an input component (not a shipper), make sure its type is allowed if component.InputSpec != nil && !c.caps.AllowInput(component.InputType) { - c.logger.Info("Component '%v' with input type '%v' filtered by capabilities.yml", component.InputType) + c.logger.Infof("Component '%v' with input type '%v' filtered by capabilities.yml", component.ID, component.InputType) continue } if !c.caps.AllowOutput(component.OutputType) { - c.logger.Info("Component '%v' with output type '%v' filtered by capabilities.yml", component.ID, component.OutputType) + c.logger.Infof("Component '%v' with output type '%v' filtered by capabilities.yml", component.ID, component.OutputType) continue } result = append(result, component) From a0d3dbdefeb7996f981fd6cbc22c5fe84c722b54 Mon Sep 17 00:00:00 2001 From: apmmachine <58790750+apmmachine@users.noreply.github.com> Date: Wed, 15 Nov 2023 15:49:54 -0500 Subject: [PATCH 04/21] [Automation] Bump Golang version to 1.20.11 (#3748) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: Update .golangci.yml Made with ❤️️ by updatecli * chore: Update .go-version Made with ❤️️ by updatecli * chore: Update from dockerfiles Made with ❤️️ by updatecli * chore: Update version.asciidoc Made with ❤️️ by updatecli * Add changelog fragment. 
--------- Co-authored-by: apmmachine Co-authored-by: Craig MacKenzie --- .go-version | 2 +- .golangci.yml | 8 ++--- Dockerfile | 2 +- Dockerfile.skaffold | 2 +- .../1700000391-Upgrade-to-Go-1.20.11.yaml | 32 +++++++++++++++++++ version/docs/version.asciidoc | 2 +- 6 files changed, 40 insertions(+), 8 deletions(-) create mode 100644 changelog/fragments/1700000391-Upgrade-to-Go-1.20.11.yaml diff --git a/.go-version b/.go-version index acdfc7930c8..4bb1a22f8ec 100644 --- a/.go-version +++ b/.go-version @@ -1 +1 @@ -1.20.10 +1.20.11 diff --git a/.golangci.yml b/.golangci.yml index c72fb52ef8d..b3bcd8666a7 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -116,7 +116,7 @@ linters-settings: gosimple: # Select the Go version to target. The default is '1.13'. - go: "1.20.10" + go: "1.20.11" nakedret: # make an issue if func has more lines of code than this setting and it has naked returns; default is 30 @@ -136,17 +136,17 @@ linters-settings: staticcheck: # Select the Go version to target. The default is '1.13'. - go: "1.20.10" + go: "1.20.11" checks: ["all"] stylecheck: # Select the Go version to target. The default is '1.13'. - go: "1.20.10" + go: "1.20.11" checks: ["all"] unused: # Select the Go version to target. The default is '1.13'. - go: "1.20.10" + go: "1.20.11" gosec: excludes: diff --git a/Dockerfile b/Dockerfile index f3d3475041b..3f727a50317 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -ARG GO_VERSION=1.20.10 +ARG GO_VERSION=1.20.11 FROM circleci/golang:${GO_VERSION} diff --git a/Dockerfile.skaffold b/Dockerfile.skaffold index 4122e146bf6..96d0a1ea800 100644 --- a/Dockerfile.skaffold +++ b/Dockerfile.skaffold @@ -1,4 +1,4 @@ -ARG GO_VERSION=1.20.10 +ARG GO_VERSION=1.20.11 ARG crossbuild_image="docker.elastic.co/beats-dev/golang-crossbuild" ARG AGENT_VERSION=8.9.0-SNAPSHOT ARG AGENT_IMAGE="docker.elastic.co/beats/elastic-agent" diff --git a/changelog/fragments/1700000391-Upgrade-to-Go-1.20.11.yaml b/changelog/fragments/1700000391-Upgrade-to-Go-1.20.11.yaml new file mode 100644 index 00000000000..e7b4c74c851 --- /dev/null +++ b/changelog/fragments/1700000391-Upgrade-to-Go-1.20.11.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: security + +# Change summary; a 80ish characters long description of the change. +summary: Upgrade to Go 1.20.11. + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: "elastic-agent" + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. 
+# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/elastic-agent/pull/3748 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +#issue: https://github.com/owner/repo/1234 diff --git a/version/docs/version.asciidoc b/version/docs/version.asciidoc index 87bce0ff1c3..f4443fa3f95 100644 --- a/version/docs/version.asciidoc +++ b/version/docs/version.asciidoc @@ -3,7 +3,7 @@ // FIXME: once elastic.co docs have been switched over to use `main`, remove // the `doc-site-branch` line below as well as any references to it in the code. :doc-site-branch: master -:go-version: 1.20.10 +:go-version: 1.20.11 :release-state: unreleased :python: 3.7 :docker: 1.12 From e0bac95c7c56e96d2210205f63b6a05303f06e5c Mon Sep 17 00:00:00 2001 From: Shaunak Kashyap Date: Thu, 16 Nov 2023 14:48:12 +0530 Subject: [PATCH 05/21] [Upgrade Details] Implement tracking of `UPG_SCHEDULED` state (#3757) * Report queued upgrades details as UPG_SCHEDULED * Add unit test * Fix linter error * Fix mock calls * Implement Equals to address FIXME * Check all errors from StartTime() call --- .../application/dispatcher/dispatcher.go | 52 +++++++- .../application/dispatcher/dispatcher_test.go | 119 ++++++++++++++++-- .../pkg/agent/application/managed_mode.go | 11 +- .../agent/application/managed_mode_test.go | 14 ++- .../application/upgrade/details/details.go | 26 ++++ 5 files changed, 199 insertions(+), 23 deletions(-) diff --git a/internal/pkg/agent/application/dispatcher/dispatcher.go b/internal/pkg/agent/application/dispatcher/dispatcher.go index 92ac050f9ab..eda450d6a0b 100644 --- a/internal/pkg/agent/application/dispatcher/dispatcher.go +++ b/internal/pkg/agent/application/dispatcher/dispatcher.go @@ -14,6 +14,7 @@ import ( "go.elastic.co/apm" "github.com/elastic/elastic-agent/internal/pkg/agent/application/actions" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/agent/errors" "github.com/elastic/elastic-agent/internal/pkg/fleetapi" "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker" @@ -31,7 +32,7 @@ type priorityQueue interface { // Dispatcher processes actions coming from fleet api. type Dispatcher interface { - Dispatch(context.Context, acker.Acker, ...fleetapi.Action) + Dispatch(context.Context, details.Observer, acker.Acker, ...fleetapi.Action) Errors() <-chan error } @@ -101,7 +102,7 @@ func (ad *ActionDispatcher) key(a fleetapi.Action) string { // Dispatch will handle action queue operations, and retries. // Any action that implements the ScheduledAction interface may be added/removed from the queue based on StartTime. // Any action that implements the RetryableAction interface will be rescheduled if the handler returns an error. 
-func (ad *ActionDispatcher) Dispatch(ctx context.Context, acker acker.Acker, actions ...fleetapi.Action) { +func (ad *ActionDispatcher) Dispatch(ctx context.Context, detailsSetter details.Observer, acker acker.Acker, actions ...fleetapi.Action) { var err error span, ctx := apm.StartSpan(ctx, "dispatch", "app.internal") defer func() { @@ -110,6 +111,7 @@ func (ad *ActionDispatcher) Dispatch(ctx context.Context, acker acker.Acker, act }() ad.removeQueuedUpgrades(actions) + reportNextScheduledUpgrade(actions, detailsSetter, ad.log) actions = ad.queueScheduledActions(actions) actions = ad.dispatchCancelActions(ctx, actions, acker) queued, expired := ad.gatherQueuedActions(time.Now().UTC()) @@ -277,3 +279,49 @@ func (ad *ActionDispatcher) scheduleRetry(ctx context.Context, action fleetapi.R ad.log.Errorf("Unable to commit action retry (id %s) to fleet-server: %v", action.ID(), err) } } + +func reportNextScheduledUpgrade(input []fleetapi.Action, detailsSetter details.Observer, log *logger.Logger) { + var nextUpgrade *fleetapi.ActionUpgrade + for _, action := range input { + sAction, ok := action.(fleetapi.ScheduledAction) + if !ok { + continue + } + + uAction, ok := sAction.(*fleetapi.ActionUpgrade) + if !ok { + continue + } + + start, err := uAction.StartTime() + if err != nil { + log.Errorf("failed to get start time for scheduled upgrade action [id = %s]", uAction.ID()) + continue + } + + if !start.After(time.Now()) { + continue + } + + if nextUpgrade == nil { + nextUpgrade = uAction + continue + } + + nextUpgradeStartTime, _ := nextUpgrade.StartTime() + if start.Before(nextUpgradeStartTime) { + nextUpgrade = uAction + } + } + + // If there no scheduled upgrade, there are no upgrade details to report. + if nextUpgrade == nil { + return + } + + upgradeDetails := details.NewDetails(nextUpgrade.Version, details.StateScheduled, nextUpgrade.ID()) + startTime, _ := nextUpgrade.StartTime() + upgradeDetails.Metadata.ScheduledAt = startTime + + detailsSetter(upgradeDetails) +} diff --git a/internal/pkg/agent/application/dispatcher/dispatcher_test.go b/internal/pkg/agent/application/dispatcher/dispatcher_test.go index c82214f2299..212c50a3068 100644 --- a/internal/pkg/agent/application/dispatcher/dispatcher_test.go +++ b/internal/pkg/agent/application/dispatcher/dispatcher_test.go @@ -9,13 +9,17 @@ import ( "testing" "time" + "go.uber.org/zap/zapcore" + "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/agent/errors" "github.com/elastic/elastic-agent/internal/pkg/fleetapi" "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker" "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker/noop" + "github.com/elastic/elastic-agent/pkg/core/logger" ) type mockHandler struct { @@ -107,6 +111,7 @@ func (m *mockQueue) Save() error { } func TestActionDispatcher(t *testing.T) { + detailsSetter := func(upgradeDetails *details.Details) {} ack := noop.New() t.Run("Success to dispatch multiples events", func(t *testing.T) { @@ -139,7 +144,7 @@ func TestActionDispatcher(t *testing.T) { dispatchCtx, cancelFn := context.WithCancel(ctx) defer cancelFn() - go d.Dispatch(dispatchCtx, ack, action1, action2) + go d.Dispatch(dispatchCtx, detailsSetter, ack, action1, action2) if err := <-d.Errors(); err != nil { t.Fatalf("Unexpected error: %v", err) } @@ -166,7 +171,7 @@ func TestActionDispatcher(t *testing.T) { dispatchCtx, cancelFn := context.WithCancel(ctx) defer 
cancelFn() - go d.Dispatch(dispatchCtx, ack, action) + go d.Dispatch(dispatchCtx, detailsSetter, ack, action) if err := <-d.Errors(); err != nil { t.Fatalf("Unexpected error: %v", err) } @@ -216,7 +221,7 @@ func TestActionDispatcher(t *testing.T) { dispatchCtx, cancelFn := context.WithCancel(context.Background()) defer cancelFn() - go d.Dispatch(dispatchCtx, ack, action1, action2) + go d.Dispatch(dispatchCtx, detailsSetter, ack, action1, action2) if err := <-d.Errors(); err != nil { t.Fatalf("Unexpected error: %v", err) } @@ -247,7 +252,7 @@ func TestActionDispatcher(t *testing.T) { dispatchCtx, cancelFn := context.WithCancel(context.Background()) defer cancelFn() - go d.Dispatch(dispatchCtx, ack, action) + go d.Dispatch(dispatchCtx, detailsSetter, ack, action) select { case err := <-d.Errors(): t.Fatalf("Unexpected error: %v", err) @@ -287,7 +292,7 @@ func TestActionDispatcher(t *testing.T) { dispatchCtx, cancelFn := context.WithCancel(context.Background()) defer cancelFn() - go d.Dispatch(dispatchCtx, ack, action2) + go d.Dispatch(dispatchCtx, detailsSetter, ack, action2) if err := <-d.Errors(); err != nil { t.Fatalf("Unexpected error: %v", err) } @@ -310,7 +315,7 @@ func TestActionDispatcher(t *testing.T) { dispatchCtx, cancelFn := context.WithCancel(context.Background()) defer cancelFn() - go d.Dispatch(dispatchCtx, ack) + go d.Dispatch(dispatchCtx, detailsSetter, ack) select { case err := <-d.Errors(): t.Fatalf("Unexpected error: %v", err) @@ -345,7 +350,7 @@ func TestActionDispatcher(t *testing.T) { dispatchCtx, cancelFn := context.WithCancel(context.Background()) defer cancelFn() - go d.Dispatch(dispatchCtx, ack, action) + go d.Dispatch(dispatchCtx, detailsSetter, ack, action) if err := <-d.Errors(); err != nil { t.Fatalf("Unexpected error: %v", err) } @@ -379,7 +384,7 @@ func TestActionDispatcher(t *testing.T) { // launch in another routing and sleep to check if an error is generated dispatchCtx, cancelFn := context.WithCancel(context.Background()) defer cancelFn() - go d.Dispatch(dispatchCtx, ack, action1, action2) + go d.Dispatch(dispatchCtx, detailsSetter, ack, action1, action2) time.Sleep(time.Millisecond * 200) select { case <-d.Errors(): @@ -422,7 +427,7 @@ func TestActionDispatcher(t *testing.T) { // launch in another routing and sleep to check if an error is generated dispatchCtx1, cancelFn1 := context.WithCancel(context.Background()) defer cancelFn1() - go d.Dispatch(dispatchCtx1, ack, action1) + go d.Dispatch(dispatchCtx1, detailsSetter, ack, action1) select { case err := <-d.Errors(): if err == nil { @@ -433,7 +438,7 @@ func TestActionDispatcher(t *testing.T) { dispatchCtx2, cancelFn2 := context.WithCancel(context.Background()) defer cancelFn2() - go d.Dispatch(dispatchCtx2, ack, action2) + go d.Dispatch(dispatchCtx2, detailsSetter, ack, action2) select { case err := <-d.Errors(): if err != nil { @@ -484,3 +489,97 @@ func Test_ActionDispatcher_scheduleRetry(t *testing.T) { action.AssertExpectations(t) }) } + +func TestReportNextScheduledUpgrade(t *testing.T) { + now := time.Now().UTC() + later := now.Add(3 * time.Hour) + muchLater := later.Add(3 * time.Hour) + + cases := map[string]struct { + actions []fleetapi.Action + expectedDetails *details.Details + expectedErrLogMsg string + }{ + "no_scheduled_upgrades": { + actions: []fleetapi.Action{ + &fleetapi.ActionUpgrade{ + ActionID: "action1", + Version: "8.12.3", + }, + }, + expectedErrLogMsg: "failed to get start time for scheduled upgrade action [id = action1]", + }, + "one_scheduled_upgrade": { + actions: 
[]fleetapi.Action{ + &fleetapi.ActionUpgrade{ + ActionID: "action2", + ActionStartTime: later.Format(time.RFC3339), + Version: "8.13.0", + }, + }, + expectedDetails: &details.Details{ + TargetVersion: "8.13.0", + State: details.StateScheduled, + ActionID: "action2", + Metadata: details.Metadata{ + ScheduledAt: later.Truncate(time.Second), + }, + }, + }, + "many_scheduled_upgrades": { + actions: []fleetapi.Action{ + &fleetapi.ActionUpgrade{ + ActionID: "action3", + ActionStartTime: muchLater.Format(time.RFC3339), + Version: "8.14.1", + }, + &fleetapi.ActionUpgrade{ + ActionID: "action4", + ActionStartTime: later.Format(time.RFC3339), + Version: "8.13.5", + }, + }, + expectedDetails: &details.Details{ + TargetVersion: "8.13.5", + State: details.StateScheduled, + ActionID: "action4", + Metadata: details.Metadata{ + ScheduledAt: later.Truncate(time.Second), + }, + }, + }, + "invalid_time_scheduled_upgrade": { + actions: []fleetapi.Action{ + &fleetapi.ActionUpgrade{ + ActionID: "action1", + Version: "8.13.2", + ActionStartTime: "invalid", + }, + }, + expectedErrLogMsg: "failed to get start time for scheduled upgrade action [id = action1]", + }, + } + + for name, test := range cases { + t.Run(name, func(t *testing.T) { + var actualDetails *details.Details + detailsSetter := func(upgradeDetails *details.Details) { + actualDetails = upgradeDetails + } + log, obs := logger.NewTesting("report_next_upgrade_details") + + reportNextScheduledUpgrade(test.actions, detailsSetter, log) + + require.True(t, test.expectedDetails.Equals(actualDetails)) + + logs := obs.TakeAll() + if test.expectedErrLogMsg != "" { + require.Len(t, logs, 1) + require.Equal(t, zapcore.ErrorLevel, logs[0].Level) + require.Equal(t, test.expectedErrLogMsg, logs[0].Message) + } else { + require.Empty(t, logs) + } + }) + } +} diff --git a/internal/pkg/agent/application/managed_mode.go b/internal/pkg/agent/application/managed_mode.go index 9ced07b678f..524500c661a 100644 --- a/internal/pkg/agent/application/managed_mode.go +++ b/internal/pkg/agent/application/managed_mode.go @@ -17,6 +17,7 @@ import ( fleetgateway "github.com/elastic/elastic-agent/internal/pkg/agent/application/gateway/fleet" "github.com/elastic/elastic-agent/internal/pkg/agent/application/info" "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" "github.com/elastic/elastic-agent/internal/pkg/agent/errors" "github.com/elastic/elastic-agent/internal/pkg/agent/storage" @@ -163,7 +164,7 @@ func (m *managedConfigManager) Run(ctx context.Context) error { // persisted action on disk we should be able to ask Fleet to get the latest configuration. // But at the moment this is not possible because the policy change was acked. m.log.Info("restoring current policy from disk") - m.dispatcher.Dispatch(ctx, actionAcker, actions...) + m.dispatcher.Dispatch(ctx, m.coord.SetUpgradeDetails, actionAcker, actions...) stateRestored = true } @@ -229,24 +230,24 @@ func (m *managedConfigManager) Run(ctx context.Context) error { return gateway.Run(ctx) }) - go runDispatcher(ctx, m.dispatcher, gateway, actionAcker, dispatchFlushInterval) + go runDispatcher(ctx, m.dispatcher, gateway, m.coord.SetUpgradeDetails, actionAcker, dispatchFlushInterval) <-ctx.Done() return gatewayRunner.Err() } // runDispatcher passes actions collected from gateway to dispatcher or calls Dispatch with no actions every flushInterval. 
-func runDispatcher(ctx context.Context, actionDispatcher dispatcher.Dispatcher, fleetGateway coordinator.FleetGateway, actionAcker acker.Acker, flushInterval time.Duration) { +func runDispatcher(ctx context.Context, actionDispatcher dispatcher.Dispatcher, fleetGateway coordinator.FleetGateway, detailsSetter details.Observer, actionAcker acker.Acker, flushInterval time.Duration) { t := time.NewTimer(flushInterval) for { select { case <-ctx.Done(): return case <-t.C: // periodically call the dispatcher to handle scheduled actions. - actionDispatcher.Dispatch(ctx, actionAcker) + actionDispatcher.Dispatch(ctx, detailsSetter, actionAcker) t.Reset(flushInterval) case actions := <-fleetGateway.Actions(): - actionDispatcher.Dispatch(ctx, actionAcker, actions...) + actionDispatcher.Dispatch(ctx, detailsSetter, actionAcker, actions...) t.Reset(flushInterval) } } diff --git a/internal/pkg/agent/application/managed_mode_test.go b/internal/pkg/agent/application/managed_mode_test.go index 25149896ab6..e7fca7ca2b5 100644 --- a/internal/pkg/agent/application/managed_mode_test.go +++ b/internal/pkg/agent/application/managed_mode_test.go @@ -10,6 +10,7 @@ import ( "testing" "time" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/fleetapi" "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker" "github.com/elastic/elastic-agent/internal/pkg/fleetapi/client" @@ -22,8 +23,8 @@ type mockDispatcher struct { mock.Mock } -func (m *mockDispatcher) Dispatch(ctx context.Context, ack acker.Acker, actions ...fleetapi.Action) { - m.Called(ctx, ack, actions) +func (m *mockDispatcher) Dispatch(ctx context.Context, detailsSetter details.Observer, ack acker.Acker, actions ...fleetapi.Action) { + m.Called(ctx, detailsSetter, ack, actions) } func (m *mockDispatcher) Errors() <-chan error { @@ -97,7 +98,7 @@ func Test_runDispatcher(t *testing.T) { }, mockDispatcher: func() *mockDispatcher { dispatcher := &mockDispatcher{} - dispatcher.On("Dispatch", mock.Anything, mock.Anything, mock.Anything).Once() + dispatcher.On("Dispatch", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Once() return dispatcher }, interval: time.Second, @@ -110,8 +111,8 @@ func Test_runDispatcher(t *testing.T) { }, mockDispatcher: func() *mockDispatcher { dispatcher := &mockDispatcher{} - dispatcher.On("Dispatch", mock.Anything, mock.Anything, mock.Anything).Once() - dispatcher.On("Dispatch", mock.Anything, mock.Anything, mock.Anything).Maybe() // allow a second call in case there are timing issues in the CI pipeline + dispatcher.On("Dispatch", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Once() + dispatcher.On("Dispatch", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Maybe() // allow a second call in case there are timing issues in the CI pipeline return dispatcher }, interval: time.Millisecond * 60, @@ -126,11 +127,12 @@ func Test_runDispatcher(t *testing.T) { ch := make(chan []fleetapi.Action, 1) gateway := tc.mockGateway(ch) dispatcher := tc.mockDispatcher() + detailsSetter := func(upgradeDetails *details.Details) {} acker := &mockAcker{} ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond*100) defer cancel() - runDispatcher(ctx, dispatcher, gateway, acker, tc.interval) + runDispatcher(ctx, dispatcher, gateway, detailsSetter, acker, tc.interval) assert.Empty(t, ch) gateway.AssertExpectations(t) diff --git a/internal/pkg/agent/application/upgrade/details/details.go 
b/internal/pkg/agent/application/upgrade/details/details.go index 5bc1e409cad..80d039882a8 100644 --- a/internal/pkg/agent/application/upgrade/details/details.go +++ b/internal/pkg/agent/application/upgrade/details/details.go @@ -127,6 +127,24 @@ func (d *Details) RegisterObserver(observer Observer) { d.notifyObserver(observer) } +// Equals compares the non-lock fields of two Details structs. +func (d *Details) Equals(otherD *Details) bool { + // If both addresses are equal or both are nil + if d == otherD { + return true + } + + // If only one is nil but the other is not + if d == nil || otherD == nil { + return false + } + + return d.State == otherD.State && + d.TargetVersion == otherD.TargetVersion && + d.ActionID == otherD.ActionID && + d.Metadata.Equals(otherD.Metadata) +} + func (d *Details) notifyObservers() { for _, observer := range d.observers { d.notifyObserver(observer) @@ -147,6 +165,14 @@ func (d *Details) notifyObserver(observer Observer) { } } +func (m Metadata) Equals(otherM Metadata) bool { + return m.ScheduledAt.Equal(otherM.ScheduledAt) && + m.FailedState == otherM.FailedState && + m.ErrorMsg == otherM.ErrorMsg && + m.DownloadPercent == otherM.DownloadPercent && + m.DownloadRate == otherM.DownloadRate +} + func (dr *downloadRate) MarshalJSON() ([]byte, error) { downloadRateBytesPerSecond := float64(*dr) if math.IsInf(downloadRateBytesPerSecond, 0) { From 234577d6d3269890201135e983439a7ccdcc8b3c Mon Sep 17 00:00:00 2001 From: Anderson Queiroz Date: Thu, 16 Nov 2023 07:47:13 -0300 Subject: [PATCH 06/21] add provisioner to Stack and Instance Provisioners (#3734) The Stack and Instance types now have a provisioner property indicating which provisioner provisioned them. Currently it's used when saving and retrieving instances and stack from the saved state --- magefile.go | 19 +++--- ...provision.go => serverless_provisioner.go} | 25 +++++--- pkg/testing/ess/serverless_test.go | 2 +- ...{provisioner.go => statful_provisioner.go} | 27 +++++--- pkg/testing/multipass/provisioner.go | 18 ++++-- pkg/testing/ogc/provisioner.go | 22 ++++--- pkg/testing/runner/provisioner.go | 13 ++++ pkg/testing/runner/runner.go | 62 ++++++++++--------- pkg/testing/runner/runner_test.go | 48 ++++++++------ 9 files changed, 147 insertions(+), 89 deletions(-) rename pkg/testing/ess/{serverless_provision.go => serverless_provisioner.go} (90%) rename pkg/testing/ess/{provisioner.go => statful_provisioner.go} (82%) diff --git a/magefile.go b/magefile.go index c6a73d147c3..6f29cd68991 100644 --- a/magefile.go +++ b/magefile.go @@ -1772,10 +1772,14 @@ func createTestRunner(matrix bool, singleTest string, goTestFlags string, batche fmt.Printf(">>>> Using %s instance provisioner\n", instanceProvisionerMode) stackProvisionerMode := os.Getenv("STACK_PROVISIONER") if stackProvisionerMode == "" { - stackProvisionerMode = "stateful" + stackProvisionerMode = ess.ProvisionerStateful } - if stackProvisionerMode != "stateful" && stackProvisionerMode != "serverless" { - return nil, fmt.Errorf("STACK_PROVISIONER environment variable must be one of 'serverless' or 'stateful', not %s", stackProvisionerMode) + if stackProvisionerMode != ess.ProvisionerStateful && + stackProvisionerMode != ess.ProvisionerServerless { + return nil, fmt.Errorf("STACK_PROVISIONER environment variable must be one of %q or %q, not %s", + ess.ProvisionerStateful, + ess.ProvisionerServerless, + stackProvisionerMode) } fmt.Printf(">>>> Using %s stack provisioner\n", stackProvisionerMode) @@ -1831,9 +1835,9 @@ func createTestRunner(matrix bool, 
singleTest string, goTestFlags string, batche } var instanceProvisioner runner.InstanceProvisioner - if instanceProvisionerMode == "multipass" { + if instanceProvisionerMode == multipass.Name { instanceProvisioner = multipass.NewProvisioner() - } else if instanceProvisionerMode == "ogc" { + } else if instanceProvisionerMode == ogc.Name { instanceProvisioner, err = ogc.NewProvisioner(ogcCfg) if err != nil { return nil, err @@ -1848,12 +1852,13 @@ func createTestRunner(matrix bool, singleTest string, goTestFlags string, batche Region: essRegion, } var stackProvisioner runner.StackProvisioner - if stackProvisionerMode == "stateful" { + if stackProvisionerMode == ess.ProvisionerStateful { stackProvisioner, err = ess.NewProvisioner(provisionCfg) if err != nil { return nil, err } - } else if stackProvisionerMode == "serverless" { + + } else if stackProvisionerMode == ess.ProvisionerServerless { stackProvisioner, err = ess.NewServerlessProvisioner(provisionCfg) if err != nil { return nil, err diff --git a/pkg/testing/ess/serverless_provision.go b/pkg/testing/ess/serverless_provisioner.go similarity index 90% rename from pkg/testing/ess/serverless_provision.go rename to pkg/testing/ess/serverless_provisioner.go index c9656f628af..08f01479b82 100644 --- a/pkg/testing/ess/serverless_provision.go +++ b/pkg/testing/ess/serverless_provisioner.go @@ -16,8 +16,10 @@ import ( "github.com/elastic/elastic-agent/pkg/testing/runner" ) -// ServerlessProvision contains -type ServerlessProvision struct { +const ProvisionerServerless = "serverless" + +// ServerlessProvisioner contains +type ServerlessProvisioner struct { cfg ProvisionerConfig log runner.Logger } @@ -46,7 +48,7 @@ type ServerlessRegions struct { // NewServerlessProvisioner creates a new StackProvisioner instance for serverless func NewServerlessProvisioner(cfg ProvisionerConfig) (runner.StackProvisioner, error) { - prov := &ServerlessProvision{ + prov := &ServerlessProvisioner{ cfg: cfg, log: &defaultLogger{wrapped: logp.L()}, } @@ -57,13 +59,17 @@ func NewServerlessProvisioner(cfg ProvisionerConfig) (runner.StackProvisioner, e return prov, nil } +func (prov *ServerlessProvisioner) Name() string { + return ProvisionerServerless +} + // SetLogger sets the logger for the -func (prov *ServerlessProvision) SetLogger(l runner.Logger) { +func (prov *ServerlessProvisioner) SetLogger(l runner.Logger) { prov.log = l } // Create creates a stack. -func (prov *ServerlessProvision) Create(ctx context.Context, request runner.StackRequest) (runner.Stack, error) { +func (prov *ServerlessProvisioner) Create(ctx context.Context, request runner.StackRequest) (runner.Stack, error) { // allow up to 4 minutes for requests createCtx, createCancel := context.WithTimeout(ctx, 4*time.Minute) defer createCancel() @@ -82,6 +88,7 @@ func (prov *ServerlessProvision) Create(ctx context.Context, request runner.Stac } stack := runner.Stack{ ID: request.ID, + Provisioner: prov.Name(), Version: request.Version, Elasticsearch: client.proj.Endpoints.Elasticsearch, Kibana: client.proj.Endpoints.Kibana, @@ -98,7 +105,7 @@ func (prov *ServerlessProvision) Create(ctx context.Context, request runner.Stac } // WaitForReady should block until the stack is ready or the context is cancelled. 
-func (prov *ServerlessProvision) WaitForReady(ctx context.Context, stack runner.Stack) (runner.Stack, error) { +func (prov *ServerlessProvisioner) WaitForReady(ctx context.Context, stack runner.Stack) (runner.Stack, error) { deploymentID, deploymentType, err := prov.getDeploymentInfo(stack) if err != nil { return stack, fmt.Errorf("failed to get deployment info from the stack: %w", err) @@ -155,7 +162,7 @@ func (prov *ServerlessProvision) WaitForReady(ctx context.Context, stack runner. } // Delete deletes a stack. -func (prov *ServerlessProvision) Delete(ctx context.Context, stack runner.Stack) error { +func (prov *ServerlessProvisioner) Delete(ctx context.Context, stack runner.Stack) error { deploymentID, deploymentType, err := prov.getDeploymentInfo(stack) if err != nil { return fmt.Errorf("failed to get deployment info from the stack: %w", err) @@ -181,7 +188,7 @@ func (prov *ServerlessProvision) Delete(ctx context.Context, stack runner.Stack) // CheckCloudRegion checks to see if the provided region is valid for the serverless // if we have an invalid region, overwrite with a valid one. // The "normal" and serverless ESS APIs have different regions, hence why we need this. -func (prov *ServerlessProvision) CheckCloudRegion() error { +func (prov *ServerlessProvisioner) CheckCloudRegion() error { urlPath := fmt.Sprintf("%s/api/v1/serverless/regions", serverlessURL) httpHandler, err := http.NewRequestWithContext(context.Background(), "GET", urlPath, nil) @@ -231,7 +238,7 @@ func (prov *ServerlessProvision) CheckCloudRegion() error { return nil } -func (prov *ServerlessProvision) getDeploymentInfo(stack runner.Stack) (string, string, error) { +func (prov *ServerlessProvisioner) getDeploymentInfo(stack runner.Stack) (string, string, error) { if stack.Internal == nil { return "", "", fmt.Errorf("missing internal information") } diff --git a/pkg/testing/ess/serverless_test.go b/pkg/testing/ess/serverless_test.go index 2fc8e0075b1..f19bd91b7e2 100644 --- a/pkg/testing/ess/serverless_test.go +++ b/pkg/testing/ess/serverless_test.go @@ -25,7 +25,7 @@ func TestProvisionGetRegions(t *testing.T) { require.True(t, found) cfg := ProvisionerConfig{Region: "bad-region-ID", APIKey: key} - prov := &ServerlessProvision{ + prov := &ServerlessProvisioner{ cfg: cfg, log: &defaultLogger{wrapped: logp.L()}, } diff --git a/pkg/testing/ess/provisioner.go b/pkg/testing/ess/statful_provisioner.go similarity index 82% rename from pkg/testing/ess/provisioner.go rename to pkg/testing/ess/statful_provisioner.go index 47e8d9dcba2..32dc70707ef 100644 --- a/pkg/testing/ess/provisioner.go +++ b/pkg/testing/ess/statful_provisioner.go @@ -14,7 +14,9 @@ import ( "github.com/elastic/elastic-agent/pkg/testing/runner" ) -// ProvisionerConfig is the configuration for the ESS provisioner. +const ProvisionerStateful = "stateful" + +// ProvisionerConfig is the configuration for the ESS statefulProvisioner. 
type ProvisionerConfig struct { Identifier string APIKey string @@ -35,13 +37,13 @@ func (c *ProvisionerConfig) Validate() error { return nil } -type provisioner struct { +type statefulProvisioner struct { logger runner.Logger cfg ProvisionerConfig client *Client } -// NewProvisioner creates the ESS provisioner +// NewProvisioner creates the ESS stateful Provisioner func NewProvisioner(cfg ProvisionerConfig) (runner.StackProvisioner, error) { err := cfg.Validate() if err != nil { @@ -50,18 +52,22 @@ func NewProvisioner(cfg ProvisionerConfig) (runner.StackProvisioner, error) { essClient := NewClient(Config{ ApiKey: cfg.APIKey, }) - return &provisioner{ + return &statefulProvisioner{ cfg: cfg, client: essClient, }, nil } -func (p *provisioner) SetLogger(l runner.Logger) { +func (p *statefulProvisioner) Name() string { + return ProvisionerStateful +} + +func (p *statefulProvisioner) SetLogger(l runner.Logger) { p.logger = l } // Create creates a stack. -func (p *provisioner) Create(ctx context.Context, request runner.StackRequest) (runner.Stack, error) { +func (p *statefulProvisioner) Create(ctx context.Context, request runner.StackRequest) (runner.Stack, error) { // allow up to 2 minutes for request createCtx, createCancel := context.WithTimeout(ctx, 2*time.Minute) defer createCancel() @@ -78,6 +84,7 @@ func (p *provisioner) Create(ctx context.Context, request runner.StackRequest) ( } return runner.Stack{ ID: request.ID, + Provisioner: p.Name(), Version: request.Version, Elasticsearch: resp.ElasticsearchEndpoint, Kibana: resp.KibanaEndpoint, @@ -91,7 +98,7 @@ func (p *provisioner) Create(ctx context.Context, request runner.StackRequest) ( } // WaitForReady should block until the stack is ready or the context is cancelled. -func (p *provisioner) WaitForReady(ctx context.Context, stack runner.Stack) (runner.Stack, error) { +func (p *statefulProvisioner) WaitForReady(ctx context.Context, stack runner.Stack) (runner.Stack, error) { deploymentID, err := p.getDeploymentID(stack) if err != nil { return stack, fmt.Errorf("failed to get deployment ID from the stack: %w", err) @@ -112,7 +119,7 @@ func (p *provisioner) WaitForReady(ctx context.Context, stack runner.Stack) (run } // Delete deletes a stack. 
-func (p *provisioner) Delete(ctx context.Context, stack runner.Stack) error { +func (p *statefulProvisioner) Delete(ctx context.Context, stack runner.Stack) error { deploymentID, err := p.getDeploymentID(stack) if err != nil { return err @@ -126,7 +133,7 @@ func (p *provisioner) Delete(ctx context.Context, stack runner.Stack) error { return p.client.ShutdownDeployment(ctx, deploymentID) } -func (p *provisioner) createDeployment(ctx context.Context, r runner.StackRequest, tags map[string]string) (*CreateDeploymentResponse, error) { +func (p *statefulProvisioner) createDeployment(ctx context.Context, r runner.StackRequest, tags map[string]string) (*CreateDeploymentResponse, error) { ctx, cancel := context.WithTimeout(ctx, 1*time.Minute) defer cancel() @@ -158,7 +165,7 @@ func (p *provisioner) createDeployment(ctx context.Context, r runner.StackReques return resp, nil } -func (p *provisioner) getDeploymentID(stack runner.Stack) (string, error) { +func (p *statefulProvisioner) getDeploymentID(stack runner.Stack) (string, error) { if stack.Internal == nil { return "", fmt.Errorf("missing internal information") } diff --git a/pkg/testing/multipass/provisioner.go b/pkg/testing/multipass/provisioner.go index 7703dc7e335..43a18987ce0 100644 --- a/pkg/testing/multipass/provisioner.go +++ b/pkg/testing/multipass/provisioner.go @@ -24,6 +24,7 @@ import ( const ( Ubuntu = "ubuntu" + Name = "multipass" ) type provisioner struct { @@ -35,6 +36,10 @@ func NewProvisioner() runner.InstanceProvisioner { return &provisioner{} } +func (p *provisioner) Name() string { + return Name +} + func (p *provisioner) SetLogger(l runner.Logger) { p.logger = l } @@ -92,12 +97,13 @@ func (p *provisioner) Provision(ctx context.Context, cfg runner.Config, batches return nil, fmt.Errorf("instance %s is not marked as running", batch.ID) } results = append(results, runner.Instance{ - ID: batch.ID, - Name: batch.ID, - IP: mi.IPv4[0], - Username: "ubuntu", - RemotePath: "/home/ubuntu/agent", - Internal: nil, + ID: batch.ID, + Provisioner: Name, + Name: batch.ID, + IP: mi.IPv4[0], + Username: "ubuntu", + RemotePath: "/home/ubuntu/agent", + Internal: nil, }) } return results, nil diff --git a/pkg/testing/ogc/provisioner.go b/pkg/testing/ogc/provisioner.go index 74c92eb0b30..00a05459881 100644 --- a/pkg/testing/ogc/provisioner.go +++ b/pkg/testing/ogc/provisioner.go @@ -24,6 +24,7 @@ import ( const ( // LayoutIntegrationTag is the tag added to all layouts for the integration testing framework. LayoutIntegrationTag = "agent-integration" + Name = "ogc" ) type provisioner struct { @@ -42,6 +43,10 @@ func NewProvisioner(cfg Config) (runner.InstanceProvisioner, error) { }, nil } +func (p *provisioner) Name() string { + return Name +} + func (p *provisioner) SetLogger(l runner.Logger) { p.logger = l } @@ -83,8 +88,8 @@ func (p *provisioner) Provision(ctx context.Context, cfg runner.Config, batches return nil, err } if len(machines) == 0 { - // print the output so its clear what went wrong - // without this it's unclear where OGC went wrong it + // Print the output so its clear what went wrong. 
+ // Without this it's unclear where OGC went wrong, it // doesn't do a great job of reporting a clean error fmt.Fprintf(os.Stdout, "%s\n", upOutput) return nil, fmt.Errorf("ogc didn't create any machines") @@ -99,14 +104,15 @@ func (p *provisioner) Provision(ctx context.Context, cfg runner.Config, batches // without this it's unclear where OGC went wrong it // doesn't do a great job of reporting a clean error fmt.Fprintf(os.Stdout, "%s\n", upOutput) - return nil, fmt.Errorf("failed to find machine for layout ID: %s", b.ID) + return nil, fmt.Errorf("failed to find machine for batch ID: %s", b.ID) } instances = append(instances, runner.Instance{ - ID: b.ID, - Name: machine.InstanceName, - IP: machine.PublicIP, - Username: machine.Layout.Username, - RemotePath: machine.Layout.RemotePath, + ID: b.ID, + Provisioner: Name, + Name: machine.InstanceName, + IP: machine.PublicIP, + Username: machine.Layout.Username, + RemotePath: machine.Layout.RemotePath, Internal: map[string]interface{}{ "instance_id": machine.InstanceID, }, diff --git a/pkg/testing/runner/provisioner.go b/pkg/testing/runner/provisioner.go index 2708b0d204d..7a27ad6697b 100644 --- a/pkg/testing/runner/provisioner.go +++ b/pkg/testing/runner/provisioner.go @@ -18,6 +18,9 @@ type Instance struct { ID string `yaml:"id"` // Name is the nice-name of the instance. Name string `yaml:"name"` + // Provisioner is the instance provider for the instance. + // See INSTANCE_PROVISIONER environment variable for the supported Provisioner. + Provisioner string `yaml:"provisioner"` // IP is the IP address of the instance. IP string `yaml:"ip"` // Username is the username used to SSH to the instance. @@ -32,6 +35,9 @@ type Instance struct { // InstanceProvisioner performs the provisioning of instances. type InstanceProvisioner interface { + // Name returns the name of the instance provisioner. + Name() string + // SetLogger sets the logger for it to use. SetLogger(l Logger) @@ -54,6 +60,10 @@ type Stack struct { // This must be the same ID used for requesting a stack. ID string `yaml:"id"` + // Provisioner is the stack provisioner. See STACK_PROVISIONER environment + // variable for the supported provisioners. + Provisioner string `yaml:"provisioner"` + // Version is the version of the stack. Version string `yaml:"version"` @@ -89,6 +99,9 @@ type StackRequest struct { // StackProvisioner performs the provisioning of stacks. type StackProvisioner interface { + // Name returns the name of the stack provisioner. + Name() string + // SetLogger sets the logger for it to use. 
SetLogger(l Logger) diff --git a/pkg/testing/runner/runner.go b/pkg/testing/runner/runner.go index a2c77f77aa0..860ad57215e 100644 --- a/pkg/testing/runner/runner.go +++ b/pkg/testing/runner/runner.go @@ -594,23 +594,10 @@ func (r *Runner) startStacks(ctx context.Context) error { var requests []stackReq for _, version := range versions { id := strings.Replace(version, ".", "", -1) - stack, ok := r.findStack(id) - if ok { - requests = append(requests, stackReq{ - request: StackRequest{ - ID: id, - Version: version, - }, - stack: &stack, - }) - } else { - requests = append(requests, stackReq{ - request: StackRequest{ - ID: id, - Version: version, - }, - }) - } + requests = append(requests, stackReq{ + request: StackRequest{ID: id, Version: version}, + stack: r.findStack(id), + }) } reportResult := func(version string, stack Stack, err error) { @@ -706,7 +693,8 @@ func (r *Runner) findInstance(id string) (StateInstance, bool) { r.stateMx.Lock() defer r.stateMx.Unlock() for _, existing := range r.state.Instances { - if existing.ID == id { + if existing.Same(StateInstance{ + Instance: Instance{ID: id, Provisioner: r.ip.Name()}}) { return existing, true } } @@ -718,30 +706,30 @@ func (r *Runner) addOrUpdateInstance(instance StateInstance) error { defer r.stateMx.Unlock() state := r.state - existed := false + found := false for idx, existing := range state.Instances { - if existing.ID == instance.ID { + if existing.Same(instance) { state.Instances[idx] = instance - existed = true + found = true break } } - if !existed { + if !found { state.Instances = append(state.Instances, instance) } r.state = state return r.writeState() } -func (r *Runner) findStack(id string) (Stack, bool) { +func (r *Runner) findStack(id string) *Stack { r.stateMx.Lock() defer r.stateMx.Unlock() for _, existing := range r.state.Stacks { - if existing.ID == id { - return existing, true + if existing.Same(Stack{ID: id, Provisioner: r.sp.Name()}) { + return &existing } } - return Stack{}, false + return nil } func (r *Runner) addOrUpdateStack(stack Stack) error { @@ -749,15 +737,15 @@ func (r *Runner) addOrUpdateStack(stack Stack) error { defer r.stateMx.Unlock() state := r.state - existed := false + found := false for idx, existing := range state.Stacks { - if existing.ID == stack.ID { + if existing.Same(stack) { state.Stacks[idx] = stack - existed = true + found = true break } } - if !existed { + if !found { state.Stacks = append(state.Stacks, stack) } r.state = state @@ -829,6 +817,20 @@ func (r *Runner) mergeResults(results map[string]OSRunnerResult) (Result, error) return complete, nil } +// Same returns true if other is the same instance as this one. +// Two instances are considered the same if their provider and ID are the same. +func (s StateInstance) Same(other StateInstance) bool { + return s.Provisioner == other.Provisioner && + s.ID == other.ID +} + +// Same returns true if other is the same stack as this one. +// Two stacks are considered the same if their provisioner and ID are the same. 
+func (s Stack) Same(other Stack) bool { + return s.Provisioner == other.Provisioner && + s.ID == other.ID +} + func mergePackageResult(pkg OSRunnerPackageResult, batchName string, sudo bool, rawOutput io.Writer, jsonOutput io.Writer, suites *JUnitTestSuites) error { suffix := "" sudoStr := "false" diff --git a/pkg/testing/runner/runner_test.go b/pkg/testing/runner/runner_test.go index c46b3b53761..c638b291bd4 100644 --- a/pkg/testing/runner/runner_test.go +++ b/pkg/testing/runner/runner_test.go @@ -38,12 +38,13 @@ func TestNewRunner_Clean(t *testing.T) { require.NoError(t, err) i1 := Instance{ - ID: "id-1", - Name: "name-1", - IP: "127.0.0.1", - Username: "ubuntu", - RemotePath: "/home/ubuntu/agent", - Internal: map[string]interface{}{}, // ElementsMatch fails without this set + ID: "id-1", + Name: "name-1", + Provisioner: ip.Name(), + IP: "127.0.0.1", + Username: "ubuntu", + RemotePath: "/home/ubuntu/agent", + Internal: map[string]interface{}{}, // ElementsMatch fails without this set } err = r.addOrUpdateInstance(StateInstance{ Instance: i1, @@ -51,12 +52,13 @@ func TestNewRunner_Clean(t *testing.T) { }) require.NoError(t, err) i2 := Instance{ - ID: "id-2", - Name: "name-2", - IP: "127.0.0.2", - Username: "ubuntu", - RemotePath: "/home/ubuntu/agent", - Internal: map[string]interface{}{}, // ElementsMatch fails without this set + ID: "id-2", + Name: "name-2", + Provisioner: ip.Name(), + IP: "127.0.0.2", + Username: "ubuntu", + RemotePath: "/home/ubuntu/agent", + Internal: map[string]interface{}{}, // ElementsMatch fails without this set } err = r.addOrUpdateInstance(StateInstance{ Instance: i2, @@ -64,16 +66,18 @@ func TestNewRunner_Clean(t *testing.T) { }) require.NoError(t, err) s1 := Stack{ - ID: "id-1", - Version: "8.10.0", - Internal: map[string]interface{}{}, // ElementsMatch fails without this set + ID: "id-1", + Provisioner: sp.Name(), + Version: "8.10.0", + Internal: map[string]interface{}{}, // ElementsMatch fails without this set } err = r.addOrUpdateStack(s1) require.NoError(t, err) s2 := Stack{ - ID: "id-2", - Version: "8.9.0", - Internal: map[string]interface{}{}, // ElementsMatch fails without this set + ID: "id-2", + Provisioner: sp.Name(), + Version: "8.9.0", + Internal: map[string]interface{}{}, // ElementsMatch fails without this set } err = r.addOrUpdateStack(s2) require.NoError(t, err) @@ -95,6 +99,10 @@ type fakeInstanceProvisioner struct { instances []Instance } +func (f *fakeInstanceProvisioner) Name() string { + return "fake" +} + func (f *fakeInstanceProvisioner) SetLogger(_ Logger) { } @@ -129,6 +137,10 @@ type fakeStackProvisioner struct { deletedStacks []Stack } +func (f *fakeStackProvisioner) Name() string { + return "fake" +} + func (f *fakeStackProvisioner) SetLogger(_ Logger) { } From d58ac9cb9183317ea0bc58bc6d1e38244000b5a3 Mon Sep 17 00:00:00 2001 From: Shaunak Kashyap Date: Thu, 16 Nov 2023 18:32:18 +0530 Subject: [PATCH 07/21] [Upgrade Details] Implement tracking of `UPG_ROLLBACK` state (#3694) * Remove context and handle cancellation internally instead * More optimizations * Add back context * Adding FSM for upgrades * Implementing TODO * WIP * WIP * Reorganizing imports * Running go mod tidy * Remove Fleet changes * Fixing booboos introduced during conflict resolution * Add nil guard * Setting logger in test * Regenerating protobuf implementation files * Newline fixes * Generating protobuf implementations * Running mage update * Add TODOs * Serialize upgrade details to upgrade marker before starting upgrade watcher * Refactoring: move watch function 
from coordinator -> upgrade package * WIP: unit test * Refactoring: make LoadMarker testable * Fix unit test * Consume from errorCh * Make watchMarker implementation more robust * Clarify godoc * Export SaveMarker * Set UPG_ROLLBACK state * Update integration test to check for UPG_ROLLBACK state after rolling back * Fix test * Add FIXME comment * Running make notice * Avoid data race in test * Remove old comment * Introduce MarkerWatcher * Add upgradeMarkerDirPath to error * Make unit tests pass * Remove FIXME * Initialize upgrade details in marker if needed * Fix tests * [Debugging] Log statements at INFO level * [Debugging] More debug logging * [Debugging] More logging * Fix logic in error check * Removing debugging changes * Fixing data type * Resolve conflicts * Add nil checks * Fix comments formatting. Co-authored-by: Anderson Queiroz * Consistency + readability * Simplifying marker watch implementation * Expand comment * Fix test * Simplify pointer checks + comments * Simplify bitwise logic * Refactoring marker access in preparation for Windows-specific retry logic * Replacing deprecated function calls * Implement retries for marker file access on Windows * Marker CreateMarkerTestDir as test helper * Fix windows linter errors * Simplify implementation * Close fsnotify watcher when goroutine exits * Remove createMarkerTestDir helper function * Check error in test * Prevent timeout * Add warning if upgrade details are unexpectedly missing * Fallback for recording UPG_ROLLBACK * Clear upgrade details upon successful upgrade * Fix fallback logic for recording UPG_ROLLBACK state * Add + update unit tests * Running mage fmt * Remove data races in test code * Reduce max timeout for retry operation * Implement non-existent marker file check into OS-specific read implementations --------- Co-authored-by: Anderson Queiroz --- NOTICE.txt | 76 +++--- go.mod | 2 +- .../handlers/handler_action_upgrade_test.go | 5 + internal/pkg/agent/application/application.go | 5 +- .../application/coordinator/coordinator.go | 53 ++++- .../coordinator/coordinator_test.go | 39 +-- .../coordinator/coordinator_unit_test.go | 13 +- .../application/upgrade/details/details.go | 1 + .../upgrade/details/details_test.go | 23 ++ .../upgrade/marker_access_other.go | 30 +++ .../upgrade/marker_access_windows.go | 76 ++++++ .../application/upgrade/marker_watcher.go | 132 ++++++++++ .../upgrade/marker_watcher_test.go | 225 ++++++++++++++++++ .../application/upgrade/step_download_test.go | 16 +- .../agent/application/upgrade/step_mark.go | 37 ++- .../pkg/agent/application/upgrade/upgrade.go | 25 +- internal/pkg/agent/cmd/watch.go | 46 +++- testing/integration/upgrade_rollback_test.go | 35 +++ 18 files changed, 743 insertions(+), 96 deletions(-) create mode 100644 internal/pkg/agent/application/upgrade/marker_access_other.go create mode 100644 internal/pkg/agent/application/upgrade/marker_access_windows.go create mode 100644 internal/pkg/agent/application/upgrade/marker_watcher.go create mode 100644 internal/pkg/agent/application/upgrade/marker_watcher_test.go diff --git a/NOTICE.txt b/NOTICE.txt index 11fa3452084..a6e55930766 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -2673,6 +2673,44 @@ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+-------------------------------------------------------------------------------- +Dependency : github.com/adriansr/fsnotify +Version: v1.4.8-0.20211018144411-a81f2b630e7c +Licence type (autodetected): BSD-3-Clause +-------------------------------------------------------------------------------- + +Contents of probable licence file $GOMODCACHE/github.com/adriansr/fsnotify@v1.4.8-0.20211018144411-a81f2b630e7c/LICENSE: + +Copyright (c) 2012 The Go Authors. All rights reserved. +Copyright (c) 2012 fsnotify Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + -------------------------------------------------------------------------------- Dependency : github.com/gofrs/flock Version: v0.8.1 @@ -10185,44 +10223,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --------------------------------------------------------------------------------- -Dependency : github.com/adriansr/fsnotify -Version: v1.4.8-0.20211018144411-a81f2b630e7c -Licence type (autodetected): BSD-3-Clause --------------------------------------------------------------------------------- - -Contents of probable licence file $GOMODCACHE/github.com/adriansr/fsnotify@v1.4.8-0.20211018144411-a81f2b630e7c/LICENSE: - -Copyright (c) 2012 The Go Authors. All rights reserved. -Copyright (c) 2012 fsnotify Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -------------------------------------------------------------------------------- Dependency : github.com/ghodss/yaml Version: v1.0.0 diff --git a/go.mod b/go.mod index 2016effc622..897874a5261 100644 --- a/go.mod +++ b/go.mod @@ -22,6 +22,7 @@ require ( github.com/elastic/go-sysinfo v1.11.1 github.com/elastic/go-ucfg v0.8.6 github.com/fatih/color v1.15.0 + github.com/fsnotify/fsnotify v1.6.0 github.com/gofrs/flock v0.8.1 github.com/gofrs/uuid v4.4.0+incompatible github.com/google/go-cmp v0.5.9 @@ -92,7 +93,6 @@ require ( github.com/elastic/gosigar v0.14.2 // indirect github.com/emicklei/go-restful/v3 v3.10.1 // indirect github.com/evanphx/json-patch v5.6.0+incompatible // indirect - github.com/fsnotify/fsnotify v1.6.0 // indirect github.com/ghodss/yaml v1.0.0 // indirect github.com/go-logr/logr v1.2.3 // indirect github.com/go-ole/go-ole v1.2.6 // indirect diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_upgrade_test.go b/internal/pkg/agent/application/actions/handlers/handler_action_upgrade_test.go index b917b00b900..01377e43313 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_upgrade_test.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_upgrade_test.go @@ -14,6 +14,7 @@ import ( "github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator" "github.com/elastic/elastic-agent/internal/pkg/agent/application/info" "github.com/elastic/elastic-agent/internal/pkg/agent/application/reexec" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade" "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" "github.com/elastic/elastic-agent/internal/pkg/config" @@ -51,6 +52,10 @@ func (u *mockUpgradeManager) Ack(ctx context.Context, acker acker.Acker) error { return nil } +func (u *mockUpgradeManager) MarkerWatcher() upgrade.MarkerWatcher { + return nil +} + func TestUpgradeHandler(t *testing.T) { // Create a cancellable context that will shut down the coordinator after // the test. 
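The mock above satisfies the new MarkerWatcher requirement by returning nil, which only works because the coordinator nil-checks the watcher before wiring it up (see the coordinator.go changes below). For tests that want to exercise the wiring, a no-op double that satisfies the interface outright also works; a minimal sketch, assuming only the MarkerWatcher interface (Watch and Run) that this patch introduces, with hypothetical names:

// noopMarkerWatcher is a hypothetical stand-in for upgrade.MarkerWatcher.
type noopMarkerWatcher struct {
	ch chan upgrade.UpdateMarker
}

func newNoopMarkerWatcher() *noopMarkerWatcher {
	return &noopMarkerWatcher{ch: make(chan upgrade.UpdateMarker)}
}

// Watch returns a channel that never receives, so the coordinator's
// run loop simply never sees a marker update.
func (w *noopMarkerWatcher) Watch() <-chan upgrade.UpdateMarker { return w.ch }

// Run returns immediately; the real MarkerFileWatcher starts an
// fsnotify-backed goroutine here instead.
func (w *noopMarkerWatcher) Run(_ context.Context) error { return nil }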
diff --git a/internal/pkg/agent/application/application.go b/internal/pkg/agent/application/application.go index 6ca33c35c05..1f8853120e3 100644 --- a/internal/pkg/agent/application/application.go +++ b/internal/pkg/agent/application/application.go @@ -101,7 +101,10 @@ func New( // monitoring is not supported in bootstrap mode https://github.com/elastic/elastic-agent/issues/1761 isMonitoringSupported := !disableMonitoring && cfg.Settings.V1MonitoringEnabled - upgrader := upgrade.NewUpgrader(log, cfg.Settings.DownloadConfig, agentInfo) + upgrader, err := upgrade.NewUpgrader(log, cfg.Settings.DownloadConfig, agentInfo) + if err != nil { + return nil, nil, nil, fmt.Errorf("failed to create upgrader: %w", err) + } monitor := monitoring.New(isMonitoringSupported, cfg.Settings.DownloadConfig.OS(), cfg.Settings.MonitoringConfig, agentInfo) runtime, err := runtime.NewManager( diff --git a/internal/pkg/agent/application/coordinator/coordinator.go b/internal/pkg/agent/application/coordinator/coordinator.go index 1bbfde40a20..c33cc700729 100644 --- a/internal/pkg/agent/application/coordinator/coordinator.go +++ b/internal/pkg/agent/application/coordinator/coordinator.go @@ -21,6 +21,7 @@ import ( "github.com/elastic/elastic-agent/internal/pkg/agent/application/info" "github.com/elastic/elastic-agent/internal/pkg/agent/application/reexec" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade" "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" "github.com/elastic/elastic-agent/internal/pkg/agent/transpiler" @@ -66,6 +67,9 @@ type UpgradeManager interface { // Ack is used on startup to check if the agent has upgraded and needs to send an ack for the action Ack(ctx context.Context, acker acker.Acker) error + + // MarkerWatcher returns a watcher for the upgrade marker. + MarkerWatcher() upgrade.MarkerWatcher } // MonitorManager provides an interface to perform the monitoring action for the agent. @@ -291,6 +295,8 @@ type managerChans struct { varsManagerUpdate <-chan []*transpiler.Vars varsManagerError <-chan error + + upgradeMarkerUpdate <-chan upgrade.UpdateMarker } // New creates a new coordinator. @@ -372,6 +378,9 @@ func New(logger *logger.Logger, cfg *configuration.Configuration, logLevel logp. c.managerChans.varsManagerUpdate = varsMgr.Watch() c.managerChans.varsManagerError = varsMgr.Errors() } + if upgradeMgr != nil && upgradeMgr.MarkerWatcher() != nil { + c.managerChans.upgradeMarkerUpdate = upgradeMgr.MarkerWatcher().Watch() + } return c } @@ -876,6 +885,18 @@ func (c *Coordinator) runner(ctx context.Context) error { varsErrCh <- nil } + upgradeMarkerWatcherErrCh := make(chan error, 1) + if c.upgradeMgr != nil && c.upgradeMgr.MarkerWatcher() != nil { + err := c.upgradeMgr.MarkerWatcher().Run(ctx) + if err != nil { + upgradeMarkerWatcherErrCh <- err + } else { + upgradeMarkerWatcherErrCh <- nil + } + } else { + upgradeMarkerWatcherErrCh <- nil + } + // Keep looping until the context ends. for ctx.Err() == nil { c.runLoopIteration(ctx) @@ -883,7 +904,7 @@ func (c *Coordinator) runner(ctx context.Context) error { // If we got fatal errors from any of the managers, return them. // Otherwise, just return the context's closing error. 
- err := collectManagerErrors(managerShutdownTimeout, varsErrCh, runtimeErrCh, configErrCh) + err := collectManagerErrors(managerShutdownTimeout, varsErrCh, runtimeErrCh, configErrCh, upgradeMarkerWatcherErrCh) if err != nil { c.logger.Debugf("Manager errors on Coordinator shutdown: %v", err.Error()) return err @@ -968,6 +989,11 @@ func (c *Coordinator) runLoopIteration(ctx context.Context) { if ctx.Err() == nil { c.processLogLevel(ctx, ll) } + + case upgradeMarker := <-c.managerChans.upgradeMarkerUpdate: + if ctx.Err() == nil { + c.setUpgradeDetails(upgradeMarker.Details) + } } // At the end of each iteration, if we made any changes to the state, @@ -1219,19 +1245,20 @@ func (c *Coordinator) filterByCapabilities(comps []component.Component) []compon } // collectManagerErrors listens on the shutdown channels for the -// runtime, config, and vars managers, and waits for up to -// the specified timeout for them to report their final status. +// runtime, config, and vars managers as well as the upgrade marker +// watcher and waits for up to the specified timeout for them to +// report their final status. // It returns any resulting errors as a multierror, or nil if no errors // were reported. // Called on the main Coordinator goroutine. -func collectManagerErrors(timeout time.Duration, varsErrCh, runtimeErrCh, configErrCh chan error) error { - var runtimeErr error - var configErr error - var varsErr error +func collectManagerErrors(timeout time.Duration, varsErrCh, runtimeErrCh, configErrCh, upgradeMarkerWatcherErrCh chan error) error { + var runtimeErr, configErr, varsErr, upgradeMarkerWatcherErr error + var returnedRuntime, returnedConfig, returnedVars, returnedUpgradeMarkerWatcher bool + // in case other components are locked up, let us time out timeoutWait := time.NewTimer(timeout) defer timeoutWait.Stop() - var returnedRuntime, returnedConfig, returnedVars bool + /* Wait for all managers to gently shut down. 
All managers send an error status on their termination channel after their Run method @@ -1248,7 +1275,7 @@ func collectManagerErrors(timeout time.Duration, varsErrCh, runtimeErrCh, config var combinedErr error waitLoop: - for !returnedRuntime || !returnedConfig || !returnedVars { + for !returnedRuntime || !returnedConfig || !returnedVars || !returnedUpgradeMarkerWatcher { select { case runtimeErr = <-runtimeErrCh: returnedRuntime = true @@ -1256,6 +1283,8 @@ waitLoop: returnedConfig = true case varsErr = <-varsErrCh: returnedVars = true + case upgradeMarkerWatcherErr = <-upgradeMarkerWatcherErrCh: + returnedUpgradeMarkerWatcher = true case <-timeoutWait.C: var timeouts []string if !returnedRuntime { @@ -1267,6 +1296,9 @@ waitLoop: if !returnedVars { timeouts = append(timeouts, "no response from vars manager") } + if !returnedUpgradeMarkerWatcher { + timeouts = append(timeouts, "no response from upgrade marker watcher") + } timeoutStr := strings.Join(timeouts, ", ") combinedErr = multierror.Append(combinedErr, fmt.Errorf("timeout while waiting for managers to shut down: %v", timeoutStr)) break waitLoop @@ -1281,6 +1313,9 @@ waitLoop: if varsErr != nil && !errors.Is(varsErr, context.Canceled) { combinedErr = multierror.Append(combinedErr, fmt.Errorf("vars manager: %w", varsErr)) } + if upgradeMarkerWatcherErr != nil && !errors.Is(upgradeMarkerWatcherErr, context.Canceled) { + combinedErr = multierror.Append(combinedErr, fmt.Errorf("upgrade marker watcher: %w", upgradeMarkerWatcherErr)) + } return combinedErr } diff --git a/internal/pkg/agent/application/coordinator/coordinator_test.go b/internal/pkg/agent/application/coordinator/coordinator_test.go index 074e5cf6e74..676906bbde0 100644 --- a/internal/pkg/agent/application/coordinator/coordinator_test.go +++ b/internal/pkg/agent/application/coordinator/coordinator_test.go @@ -15,15 +15,11 @@ import ( "testing" "time" - "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" - - "github.com/stretchr/testify/assert" - "github.com/elastic/elastic-agent-client/v7/pkg/client" - agentclient "github.com/elastic/elastic-agent/pkg/control/v2/client" - "github.com/elastic/elastic-agent/pkg/control/v2/cproto" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.elastic.co/apm/apmtest" "github.com/elastic/elastic-agent-libs/logp" @@ -31,6 +27,8 @@ import ( "github.com/elastic/elastic-agent/internal/pkg/agent/application/info" "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" "github.com/elastic/elastic-agent/internal/pkg/agent/application/reexec" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" "github.com/elastic/elastic-agent/internal/pkg/agent/transpiler" "github.com/elastic/elastic-agent/internal/pkg/capabilities" @@ -39,6 +37,8 @@ import ( "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker" "github.com/elastic/elastic-agent/pkg/component" "github.com/elastic/elastic-agent/pkg/component/runtime" + agentclient "github.com/elastic/elastic-agent/pkg/control/v2/client" + "github.com/elastic/elastic-agent/pkg/control/v2/cproto" "github.com/elastic/elastic-agent/pkg/core/logger" ) @@ -321,7 +321,7 @@ func TestCoordinator_StateSubscribe(t *testing.T) { } func TestCollectManagerErrorsTimeout(t *testing.T) { - handlerChan, _, _, _ := setupManagerShutdownChannels(time.Millisecond) + handlerChan, 
_, _, _, _ := setupManagerShutdownChannels(time.Millisecond) // Don't send anything to the shutdown channels, causing a timeout // in collectManagerErrors waitAndTestError(t, func(err error) bool { @@ -332,7 +332,7 @@ func TestCollectManagerErrorsTimeout(t *testing.T) { } func TestCollectManagerErrorsOneResponse(t *testing.T) { - handlerChan, _, _, config := setupManagerShutdownChannels(10 * time.Millisecond) + handlerChan, _, _, config, _ := setupManagerShutdownChannels(10 * time.Millisecond) // Send an error for the config manager -- we should also get a // timeout error since we don't send anything on the other two channels. @@ -348,26 +348,30 @@ func TestCollectManagerErrorsOneResponse(t *testing.T) { } func TestCollectManagerErrorsAllResponses(t *testing.T) { - handlerChan, runtime, varWatcher, config := setupManagerShutdownChannels(5 * time.Second) + handlerChan, runtime, varWatcher, config, upgradeMarkerWatcher := setupManagerShutdownChannels(5 * time.Second) runtimeErrStr := "runtime error" varsErrStr := "vars error" + upgradeMarkerWatcherErrStr := "upgrade marker watcher error" runtime <- errors.New(runtimeErrStr) varWatcher <- errors.New(varsErrStr) config <- nil + upgradeMarkerWatcher <- errors.New(upgradeMarkerWatcherErrStr) waitAndTestError(t, func(err error) bool { return err != nil && - strings.Contains(err.Error(), "2 errors occurred") && + strings.Contains(err.Error(), "3 errors occurred") && strings.Contains(err.Error(), runtimeErrStr) && - strings.Contains(err.Error(), varsErrStr) + strings.Contains(err.Error(), varsErrStr) && + strings.Contains(err.Error(), upgradeMarkerWatcherErrStr) }, handlerChan) } func TestCollectManagerErrorsAllResponsesNoErrors(t *testing.T) { - handlerChan, runtime, varWatcher, config := setupManagerShutdownChannels(5 * time.Second) + handlerChan, runtime, varWatcher, config, upgradeMarkerWatcher := setupManagerShutdownChannels(5 * time.Second) runtime <- nil varWatcher <- nil config <- context.Canceled + upgradeMarkerWatcher <- nil // All errors are nil or context.Canceled, so collectManagerErrors // should also return nil. 
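The shutdown fan-in being tested here is easy to get subtly wrong, so a standalone sketch of the same shape may help. This is illustrative only, not the production collectManagerErrors (which selects on each named channel directly rather than merging); it assumes imports of context, errors, fmt, and time:

// collectErrs fans n shutdown channels into one and waits for n results or a
// timeout; it mirrors the shape of collectManagerErrors.
func collectErrs(timeout time.Duration, chs ...<-chan error) []error {
	merged := make(chan error, len(chs)) // buffered so forwarders never block on send
	for _, ch := range chs {
		ch := ch
		go func() { merged <- <-ch }() // may outlive a timeout if a manager never reports
	}

	deadline := time.NewTimer(timeout)
	defer deadline.Stop()

	var errs []error
	for i := 0; i < len(chs); i++ {
		select {
		case err := <-merged:
			// context.Canceled just means a clean shutdown, so it is not an error.
			if err != nil && !errors.Is(err, context.Canceled) {
				errs = append(errs, err)
			}
		case <-deadline.C:
			return append(errs, fmt.Errorf("timed out waiting for %d manager(s)", len(chs)-i))
		}
	}
	return errs
}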
@@ -397,18 +401,19 @@ func waitAndTestError(t *testing.T, check func(error) bool, handlerErr chan erro } } -func setupManagerShutdownChannels(timeout time.Duration) (chan error, chan error, chan error, chan error) { +func setupManagerShutdownChannels(timeout time.Duration) (chan error, chan error, chan error, chan error, chan error) { runtime := make(chan error) varWatcher := make(chan error) config := make(chan error) + upgradeMarkerWatcher := make(chan error) handlerChan := make(chan error) go func() { - handlerErr := collectManagerErrors(timeout, varWatcher, runtime, config) + handlerErr := collectManagerErrors(timeout, varWatcher, runtime, config, upgradeMarkerWatcher) handlerChan <- handlerErr }() - return handlerChan, runtime, varWatcher, config + return handlerChan, runtime, varWatcher, config, upgradeMarkerWatcher } func TestCoordinator_ReExec(t *testing.T) { @@ -649,6 +654,10 @@ func (f *fakeUpgradeManager) Ack(ctx context.Context, acker acker.Acker) error { return nil } +func (f *fakeUpgradeManager) MarkerWatcher() upgrade.MarkerWatcher { + return nil +} + type testMonitoringManager struct{} func newTestMonitoringMgr() *testMonitoringManager { return &testMonitoringManager{} } diff --git a/internal/pkg/agent/application/coordinator/coordinator_unit_test.go b/internal/pkg/agent/application/coordinator/coordinator_unit_test.go index fd4034f24c4..956eb33b8c0 100644 --- a/internal/pkg/agent/application/coordinator/coordinator_unit_test.go +++ b/internal/pkg/agent/application/coordinator/coordinator_unit_test.go @@ -334,6 +334,13 @@ func TestCoordinatorReportsInvalidPolicy(t *testing.T) { defer cancel() logger := logp.NewLogger("testing") + upgradeMgr, err := upgrade.NewUpgrader( + logger, + &artifact.Config{}, + &info.AgentInfo{}, + ) + require.NoError(t, err) + // Channels have buffer length 1 so we don't have to run on multiple // goroutines. stateChan := make(chan State, 1) @@ -353,11 +360,7 @@ func TestCoordinatorReportsInvalidPolicy(t *testing.T) { // Policy changes are sent to the upgrade manager, which scans it // for updated artifact URIs. We take advantage of this for the // test by sending an invalid artifact URI to trigger an error. - upgradeMgr: upgrade.NewUpgrader( - logger, - &artifact.Config{}, - &info.AgentInfo{}, - ), + upgradeMgr: upgradeMgr, // Add a placeholder runtime manager that will accept any updates runtimeMgr: &fakeRuntimeManager{}, diff --git a/internal/pkg/agent/application/upgrade/details/details.go b/internal/pkg/agent/application/upgrade/details/details.go index 80d039882a8..884df46a3cc 100644 --- a/internal/pkg/agent/application/upgrade/details/details.go +++ b/internal/pkg/agent/application/upgrade/details/details.go @@ -68,6 +68,7 @@ func NewDetails(targetVersion string, initialState State, actionID string) *Deta // SetState is a convenience method to set the state of the upgrade and // notify all observers. +// Do NOT call SetState with StateFailed; call the Fail method instead. 
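+//
+// A hypothetical usage sketch (not part of this change):
+//
+//	det := NewDetails("8.12.0", StateRequested, "action-id")
+//	det.SetState(StateDownloading) // normal forward transition
+//	det.Fail(err)                  // on error: records StateFailed and the error message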
 func (d *Details) SetState(s State) {
 	d.mu.Lock()
 	defer d.mu.Unlock()
diff --git a/internal/pkg/agent/application/upgrade/details/details_test.go b/internal/pkg/agent/application/upgrade/details/details_test.go
index 670b40705de..2ecd933e9f8 100644
--- a/internal/pkg/agent/application/upgrade/details/details_test.go
+++ b/internal/pkg/agent/application/upgrade/details/details_test.go
@@ -98,3 +98,26 @@ func TestDetailsDownloadRateJSON(t *testing.T) {
 		require.Equal(t, 0.99, unmarshalledDetails.Metadata.DownloadPercent)
 	})
 }
+
+func TestEquals(t *testing.T) {
+	details1 := NewDetails("8.12.0", StateDownloading, "foobar")
+	details1.SetDownloadProgress(0.1234, 34.56)
+	details1.Fail(errors.New("download failed"))
+
+	details2 := NewDetails("8.12.0", StateDownloading, "foobar")
+	details2.SetDownloadProgress(0.1234, 34.56)
+	details2.Fail(errors.New("download failed"))
+
+	details3 := NewDetails("8.12.0", StateDownloading, "foobar")
+
+	require.True(t, details1.Equals(details2))
+	require.False(t, details1.Equals(details3))
+
+	// Nil checks
+	var details4 *Details
+	var details5 *Details
+
+	require.True(t, details4.Equals(details5))
+	require.False(t, details1.Equals(details4))
+	require.False(t, details4.Equals(details1))
+}
diff --git a/internal/pkg/agent/application/upgrade/marker_access_other.go b/internal/pkg/agent/application/upgrade/marker_access_other.go
new file mode 100644
index 00000000000..ed854160e94
--- /dev/null
+++ b/internal/pkg/agent/application/upgrade/marker_access_other.go
@@ -0,0 +1,30 @@
+// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+// or more contributor license agreements. Licensed under the Elastic License;
+// you may not use this file except in compliance with the Elastic License.
+
+//go:build !windows
+
+package upgrade
+
+import (
+	"os"
+
+	"github.com/elastic/elastic-agent/internal/pkg/agent/errors"
+)
+
+// On non-Windows platforms, readMarkerFile simply reads the marker file.
+// See marker_access_windows.go for behavior on Windows platforms.
+func readMarkerFile(markerFile string) ([]byte, error) {
+	markerFileBytes, err := os.ReadFile(markerFile)
+	if errors.Is(err, os.ErrNotExist) {
+		// marker doesn't exist, nothing to do
+		return nil, nil
+	}
+	return markerFileBytes, err
+}
+
+// On non-Windows platforms, writeMarkerFile simply writes the marker file.
+// See marker_access_windows.go for behavior on Windows platforms.
+func writeMarkerFile(markerFile string, markerBytes []byte) error {
+	return os.WriteFile(markerFile, markerBytes, 0600)
+}
diff --git a/internal/pkg/agent/application/upgrade/marker_access_windows.go b/internal/pkg/agent/application/upgrade/marker_access_windows.go
new file mode 100644
index 00000000000..673a57eeabf
--- /dev/null
+++ b/internal/pkg/agent/application/upgrade/marker_access_windows.go
@@ -0,0 +1,76 @@
+// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+// or more contributor license agreements. Licensed under the Elastic License;
+// you may not use this file except in compliance with the Elastic License.
+ +package upgrade + +import ( + "context" + "fmt" + "os" + "time" + + "github.com/elastic/elastic-agent/internal/pkg/agent/errors" + + "github.com/cenkalti/backoff/v4" +) + +const markerAccessTimeout = 10 * time.Second +const markerAccessBackoffInitialInterval = 50 * time.Millisecond +const minMarkerAccessRetries = 5 + +// On Windows, readMarkerFile tries to read the marker file, retrying with +// randomized exponential backoff up to markerAccessTimeout duration. This retry +// mechanism is necessary since the marker file could be accessed by multiple +// processes (the Upgrade Watcher and the main Agent process) at the same time, +// which could fail on Windows. +func readMarkerFile(markerFile string) ([]byte, error) { + var markerFileBytes []byte + readFn := func() error { + var err error + markerFileBytes, err = os.ReadFile(markerFile) + if errors.Is(err, os.ErrNotExist) { + // marker doesn't exist, nothing to do + return nil + } + + return err + } + + if err := accessMarkerFileWithRetries(readFn); err != nil { + return nil, fmt.Errorf("failed to read upgrade marker file [%s] despite retrying: %w", markerFile, err) + } + + return markerFileBytes, nil +} + +// On Windows, writeMarkerFile tries to write the marker file, retrying with +// randomized exponential backoff up to markerAccessTimeout duration. This retry +// mechanism is necessary since the marker file could be accessed by multiple +// processes (the Upgrade Watcher and the main Agent process) at the same time, +// which could fail on Windows. +func writeMarkerFile(markerFile string, markerBytes []byte) error { + writeFn := func() error { + return os.WriteFile(markerFile, markerBytes, 0600) + } + + if err := accessMarkerFileWithRetries(writeFn); err != nil { + return fmt.Errorf("failed to write upgrade marker file [%s] despite retrying: %w", markerFile, err) + } + + return nil +} + +func accessMarkerFileWithRetries(accessFn func() error) error { + expBackoff := backoff.NewExponentialBackOff() + expBackoff.InitialInterval = markerAccessBackoffInitialInterval + expBackoff.MaxInterval = markerAccessTimeout / minMarkerAccessRetries + expBackoff.MaxElapsedTime = markerAccessTimeout + + ctx, cancel := context.WithTimeout(context.Background(), markerAccessTimeout) + defer cancel() + + expBackoffWithTimeout := backoff.WithContext(expBackoff, ctx) + + return backoff.Retry(accessFn, expBackoffWithTimeout) +} diff --git a/internal/pkg/agent/application/upgrade/marker_watcher.go b/internal/pkg/agent/application/upgrade/marker_watcher.go new file mode 100644 index 00000000000..faefc3e64b2 --- /dev/null +++ b/internal/pkg/agent/application/upgrade/marker_watcher.go @@ -0,0 +1,132 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. 
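+//
+// Note: on Windows the marker read/write helpers above retry with randomized
+// exponential backoff. A hypothetical sketch of the call pattern they wrap:
+//
+//	err := accessMarkerFileWithRetries(func() error {
+//		return os.WriteFile(markerFile, data, 0600) // retried until markerAccessTimeout (10s)
+//	})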
+ +package upgrade + +import ( + "context" + "fmt" + "path/filepath" + + "github.com/fsnotify/fsnotify" + + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" + "github.com/elastic/elastic-agent/pkg/core/logger" + "github.com/elastic/elastic-agent/version" +) + +type MarkerWatcher interface { + Watch() <-chan UpdateMarker + Run(ctx context.Context) error +} + +type MarkerFileWatcher struct { + markerFilePath string + logger *logger.Logger + updateCh chan UpdateMarker +} + +func newMarkerFileWatcher(upgradeMarkerFilePath string, logger *logger.Logger) MarkerWatcher { + logger = logger.Named("marker_file_watcher") + + return &MarkerFileWatcher{ + markerFilePath: upgradeMarkerFilePath, + logger: logger, + updateCh: make(chan UpdateMarker), + } +} + +func (mfw *MarkerFileWatcher) Watch() <-chan UpdateMarker { + return mfw.updateCh +} + +func (mfw *MarkerFileWatcher) Run(ctx context.Context) error { + watcher, err := fsnotify.NewWatcher() + if err != nil { + return fmt.Errorf("failed to create upgrade marker watcher: %w", err) + } + + // Watch the upgrade marker file's directory, not the file itself, so we + // notice the file even if it's deleted and recreated. + upgradeMarkerDirPath := filepath.Dir(mfw.markerFilePath) + if err := watcher.Add(upgradeMarkerDirPath); err != nil { + return fmt.Errorf("failed to set watch on upgrade marker's directory [%s]: %w", upgradeMarkerDirPath, err) + } + + // Do an initial read from the upgrade marker file, in case the file + // is already present before the watching starts. + doInitialRead := make(chan struct{}, 1) + doInitialRead <- struct{}{} + + // Handle watching + go func() { + defer watcher.Close() + for { + select { + case <-ctx.Done(): + return + case err, ok := <-watcher.Errors: + if !ok { // Channel was closed (i.e. Watcher.Close() was called). + mfw.logger.Debug("fsnotify.Watcher's error channel was closed") + return + } + mfw.logger.Errorf("upgrade marker watch returned error: %s", err) + continue + case e, ok := <-watcher.Events: + if !ok { // Channel was closed (i.e. Watcher.Close() was called). + mfw.logger.Debug("fsnotify.Watcher's events channel was closed") + return + } + + if e.Name != mfw.markerFilePath { + // Since we are watching the directory that will contain the upgrade + // marker file, we could receive events here for changes to files other + // than the upgrade marker. We ignore such events as we're only concerned + // with changes to the upgrade marker. + continue + } + + switch { + case e.Op&(fsnotify.Create|fsnotify.Write) != 0: + // Upgrade marker file was created or updated; read its contents + // and send them over the update channel. + mfw.processMarker(version.GetAgentPackageVersion()) + } + case <-doInitialRead: + mfw.processMarker(version.GetAgentPackageVersion()) + } + } + }() + + return nil +} + +func (mfw *MarkerFileWatcher) processMarker(currentVersion string) { + marker, err := loadMarker(mfw.markerFilePath) + if err != nil { + mfw.logger.Error(err) + return + } + + // Nothing to do if marker is not (yet) present + if marker == nil { + return + } + + // If the marker exists but the version of Agent we're running right + // now is the same as the prevVersion recorded in the marker, it means + // the upgrade was rolled back. Ideally, this UPG_ROLLBACK state would've + // been recorded in the marker's upgrade details field but, in case it + // isn't for some reason, we fallback to explicitly setting that state as + // part of the upgrade details in the marker. 
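+	//
+	// Example (illustrative): if the running binary reports package version
+	// "8.11.3" and marker.PrevVersion is also "8.11.3", the upgrade to the
+	// new version must have been rolled back, so StateRollback is forced below.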
+ if marker.PrevVersion == currentVersion { + if marker.Details == nil { + marker.Details = details.NewDetails("unknown", details.StateRollback, marker.GetActionID()) + } else { + marker.Details.SetState(details.StateRollback) + } + } + + mfw.updateCh <- *marker +} diff --git a/internal/pkg/agent/application/upgrade/marker_watcher_test.go b/internal/pkg/agent/application/upgrade/marker_watcher_test.go new file mode 100644 index 00000000000..9d54a9bbcae --- /dev/null +++ b/internal/pkg/agent/application/upgrade/marker_watcher_test.go @@ -0,0 +1,225 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +package upgrade + +import ( + "context" + "os" + "path/filepath" + "sync" + "testing" + "time" + + "go.uber.org/zap/zapcore" + + "github.com/stretchr/testify/require" + + "gopkg.in/yaml.v2" + + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" + "github.com/elastic/elastic-agent/pkg/core/logger" +) + +func TestMarkerWatcher(t *testing.T) { + testMarkerDir := t.TempDir() + testMarkerFile := filepath.Join(testMarkerDir, markerFilename) + testLogger, _ := logger.NewTesting("watch_marker") + + markerWatcher := newMarkerFileWatcher(testMarkerFile, testLogger) + + testCtx, testCancel := context.WithCancel(context.Background()) + defer testCancel() + + var testDetails *details.Details + var testDetailsMu sync.Mutex + + var testErr error + go func() { + for { + select { + case <-testCtx.Done(): + return + case marker := <-markerWatcher.Watch(): + testDetailsMu.Lock() + testDetails = marker.Details + testDetailsMu.Unlock() + } + } + }() + + err := markerWatcher.Run(testCtx) + require.NoError(t, err) + + // Write out the expected upgrade details to the test upgrade marker + // file. + expectedDetails := &details.Details{ + TargetVersion: "8.12.0", + State: details.StateWatching, + } + marker := updateMarkerSerializer{ + PrevVersion: "8.6.0", + Details: expectedDetails, + } + data, err := yaml.Marshal(marker) + require.NoError(t, err) + err = os.WriteFile(testMarkerFile, data, 0644) + require.NoError(t, err) + + // We expect that the details that were just written out to the test upgrade + // marker file will be noticed and read by the watchMarker function, and the + // testDetailsObs function will be called with them. 
+ require.Eventually(t, func() bool { + testDetailsMu.Lock() + defer testDetailsMu.Unlock() + + return testDetails != nil && testDetails.Equals(expectedDetails) + }, 1*time.Second, 10*time.Millisecond) + + require.NoError(t, testErr) +} + +func TestProcessMarker(t *testing.T) { + cases := map[string]struct { + markerFileContents string + + expectedErrLogMsg bool + expectedDetails *details.Details + }{ + "failed_loading": { + markerFileContents: ` +invalid +`, + expectedErrLogMsg: true, + expectedDetails: nil, + }, + "no_marker": { + markerFileContents: "", + expectedErrLogMsg: false, + expectedDetails: nil, + }, + "same_version_no_details": { + markerFileContents: ` +prev_version: 8.9.2 +`, + expectedDetails: &details.Details{ + TargetVersion: "unknown", + State: details.StateRollback, + }, + }, + "same_version_with_details_no_state": { + markerFileContents: ` +prev_version: 8.9.2 +details: + target_version: 8.9.2 +`, + expectedErrLogMsg: false, + expectedDetails: &details.Details{ + TargetVersion: "8.9.2", + State: details.StateRollback, + }, + }, + "same_version_with_details_wrong_state": { + markerFileContents: ` +prev_version: 8.9.2 +details: + target_version: 8.9.2 + state: UPG_WATCHING +`, + expectedErrLogMsg: false, + expectedDetails: &details.Details{ + TargetVersion: "8.9.2", + State: details.StateRollback, + }, + }, + "different_version": { + markerFileContents: ` +prev_version: 8.8.2 +details: + target_version: 8.9.2 + state: UPG_WATCHING +`, + expectedErrLogMsg: false, + expectedDetails: &details.Details{ + TargetVersion: "8.9.2", + State: details.StateWatching, + }, + }, + } + + for name, test := range cases { + t.Run(name, func(t *testing.T) { + tmpDir := t.TempDir() + testMarkerFilePath := filepath.Join(tmpDir, markerFilename) + if test.markerFileContents != "" { + err := os.WriteFile(testMarkerFilePath, []byte(test.markerFileContents), 0644) + require.NoError(t, err) + } + log, obs := logger.NewTesting("marker_watcher") + updateCh := make(chan UpdateMarker) + mfw := MarkerFileWatcher{ + markerFilePath: testMarkerFilePath, + logger: log, + updateCh: updateCh, + } + + done := make(chan struct{}) + var markerRead bool + var actualMarker UpdateMarker + var markerMu sync.Mutex + go func() { + for { + select { + case <-done: + return + case m := <-updateCh: + markerMu.Lock() + markerRead = true + actualMarker = m + markerMu.Unlock() + } + } + }() + + mfw.processMarker("8.9.2") + + // error loading marker + if test.expectedErrLogMsg { + done <- struct{}{} + logs := obs.FilterLevelExact(zapcore.ErrorLevel).TakeAll() + require.NotEmpty(t, logs) + + markerMu.Lock() + defer markerMu.Unlock() + require.False(t, markerRead) + + return + } + + // no marker + if test.markerFileContents == "" { + done <- struct{}{} + + markerMu.Lock() + defer markerMu.Unlock() + require.False(t, markerRead) + + return + } + + // marker exists and can be read + require.Eventually(t, func() bool { + markerMu.Lock() + defer markerMu.Unlock() + return markerRead + }, 5*time.Second, 100*time.Millisecond) + + markerMu.Lock() + defer markerMu.Unlock() + + require.True(t, actualMarker.Details.Equals(test.expectedDetails)) + }) + + } +} diff --git a/internal/pkg/agent/application/upgrade/step_download_test.go b/internal/pkg/agent/application/upgrade/step_download_test.go index dcdc4da7de8..2f4500389ac 100644 --- a/internal/pkg/agent/application/upgrade/step_download_test.go +++ b/internal/pkg/agent/application/upgrade/step_download_test.go @@ -89,7 +89,9 @@ func TestDownloadWithRetries(t *testing.T) { return 
&mockDownloader{expectedDownloadPath, nil}, nil } - u := NewUpgrader(testLogger, &settings, &info.AgentInfo{}) + u, err := NewUpgrader(testLogger, &settings, &info.AgentInfo{}) + require.NoError(t, err) + parsedVersion, err := agtversion.ParseVersion("8.9.0") require.NoError(t, err) upgradeDetails := details.NewDetails(parsedVersion.String(), details.StateRequested, "") @@ -124,7 +126,9 @@ func TestDownloadWithRetries(t *testing.T) { return nil, nil } - u := NewUpgrader(testLogger, &settings, &info.AgentInfo{}) + u, err := NewUpgrader(testLogger, &settings, &info.AgentInfo{}) + require.NoError(t, err) + parsedVersion, err := agtversion.ParseVersion("8.9.0") require.NoError(t, err) upgradeDetails := details.NewDetails(parsedVersion.String(), details.StateRequested, "") @@ -161,7 +165,9 @@ func TestDownloadWithRetries(t *testing.T) { return nil, nil } - u := NewUpgrader(testLogger, &settings, &info.AgentInfo{}) + u, err := NewUpgrader(testLogger, &settings, &info.AgentInfo{}) + require.NoError(t, err) + parsedVersion, err := agtversion.ParseVersion("8.9.0") require.NoError(t, err) upgradeDetails := details.NewDetails(parsedVersion.String(), details.StateRequested, "") @@ -186,7 +192,9 @@ func TestDownloadWithRetries(t *testing.T) { return &mockDownloader{"", errors.New("download failed")}, nil } - u := NewUpgrader(testLogger, &settings, &info.AgentInfo{}) + u, err := NewUpgrader(testLogger, &settings, &info.AgentInfo{}) + require.NoError(t, err) + parsedVersion, err := agtversion.ParseVersion("8.9.0") require.NoError(t, err) upgradeDetails := details.NewDetails(parsedVersion.String(), details.StateRequested, "") diff --git a/internal/pkg/agent/application/upgrade/step_mark.go b/internal/pkg/agent/application/upgrade/step_mark.go index 0e9cf0bd92a..b5743582317 100644 --- a/internal/pkg/agent/application/upgrade/step_mark.go +++ b/internal/pkg/agent/application/upgrade/step_mark.go @@ -14,6 +14,7 @@ import ( "gopkg.in/yaml.v2" "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/agent/errors" "github.com/elastic/elastic-agent/internal/pkg/fleetapi" "github.com/elastic/elastic-agent/internal/pkg/release" @@ -37,6 +38,17 @@ type UpdateMarker struct { // Acked is a flag marking whether or not action was acked Acked bool `json:"acked" yaml:"acked"` Action *fleetapi.ActionUpgrade `json:"action" yaml:"action"` + + Details *details.Details `json:"details,omitempty" yaml:"details,omitempty"` +} + +// GetActionID returns the Fleet Action ID associated with the +// upgrade action, if it's recorded in the UpdateMarker. 
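+// For example (illustrative), a standalone (non-Fleet) upgrade leaves
+// Action nil, and GetActionID then returns "" rather than panicking.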
+func (um UpdateMarker) GetActionID() string { + if um.Action != nil { + return um.Action.ActionID + } + return "" } // MarkerActionUpgrade adapter struct compatible with pre 8.3 version of the marker file format @@ -78,6 +90,7 @@ type updateMarkerSerializer struct { PrevHash string `yaml:"prev_hash"` Acked bool `yaml:"acked"` Action *MarkerActionUpgrade `yaml:"action"` + Details *details.Details `yaml:"details"` } func newMarkerSerializer(m *UpdateMarker) *updateMarkerSerializer { @@ -88,11 +101,12 @@ func newMarkerSerializer(m *UpdateMarker) *updateMarkerSerializer { PrevHash: m.PrevHash, Acked: m.Acked, Action: convertToMarkerAction(m.Action), + Details: m.Details, } } // markUpgrade marks update happened so we can handle grace period -func (u *Upgrader) markUpgrade(_ context.Context, log *logger.Logger, hash string, action *fleetapi.ActionUpgrade) error { +func (u *Upgrader) markUpgrade(_ context.Context, log *logger.Logger, hash string, action *fleetapi.ActionUpgrade, upgradeDetails *details.Details) error { prevVersion := release.Version() prevHash := release.Commit() if len(prevHash) > hashLen { @@ -105,6 +119,7 @@ func (u *Upgrader) markUpgrade(_ context.Context, log *logger.Logger, hash strin PrevVersion: prevVersion, PrevHash: prevHash, Action: action, + Details: upgradeDetails, } markerBytes, err := yaml.Marshal(newMarkerSerializer(marker)) @@ -150,14 +165,18 @@ func CleanMarker(log *logger.Logger) error { // LoadMarker loads the update marker. If the file does not exist it returns nil // and no error. func LoadMarker() (*UpdateMarker, error) { - markerFile := markerFilePath() - markerBytes, err := ioutil.ReadFile(markerFile) + return loadMarker(markerFilePath()) +} + +func loadMarker(markerFile string) (*UpdateMarker, error) { + markerBytes, err := readMarkerFile(markerFile) if err != nil { - if errors.Is(err, os.ErrNotExist) { - return nil, nil - } return nil, err } + if markerBytes == nil { + // marker doesn't exist + return nil, nil + } marker := &updateMarkerSerializer{} if err := yaml.Unmarshal(markerBytes, &marker); err != nil { @@ -171,10 +190,11 @@ func LoadMarker() (*UpdateMarker, error) { PrevHash: marker.PrevHash, Acked: marker.Acked, Action: convertToActionUpgrade(marker.Action), + Details: marker.Details, }, nil } -func saveMarker(marker *UpdateMarker) error { +func SaveMarker(marker *UpdateMarker) error { makerSerializer := &updateMarkerSerializer{ Hash: marker.Hash, UpdatedOn: marker.UpdatedOn, @@ -182,13 +202,14 @@ func saveMarker(marker *UpdateMarker) error { PrevHash: marker.PrevHash, Acked: marker.Acked, Action: convertToMarkerAction(marker.Action), + Details: marker.Details, } markerBytes, err := yaml.Marshal(makerSerializer) if err != nil { return err } - return ioutil.WriteFile(markerFilePath(), markerBytes, 0600) + return writeMarkerFile(markerFilePath(), markerBytes) } func markerFilePath() string { diff --git a/internal/pkg/agent/application/upgrade/upgrade.go b/internal/pkg/agent/application/upgrade/upgrade.go index da0ea3df6be..a7c3c7e817b 100644 --- a/internal/pkg/agent/application/upgrade/upgrade.go +++ b/internal/pkg/agent/application/upgrade/upgrade.go @@ -55,6 +55,7 @@ type Upgrader struct { agentInfo *info.AgentInfo upgradeable bool fleetServerURI string + markerWatcher MarkerWatcher } // IsUpgradeable when agent is installed and running as a service or flag was provided. 
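With Details now carried on the marker, SaveMarker and LoadMarker round-trip upgrade state through the marker YAML file. A minimal sketch with hypothetical values (the field names match the UpdateMarker struct in the step_mark.go hunks above):

marker := &upgrade.UpdateMarker{
	Hash:        "newhas", // hash of the version upgraded to (hypothetical)
	UpdatedOn:   time.Now(),
	PrevVersion: "8.11.0", // version upgraded from (hypothetical)
	PrevHash:    "oldhas",
	Details:     details.NewDetails("8.12.0", details.StateWatching, ""),
}
if err := upgrade.SaveMarker(marker); err != nil { // serialized via writeMarkerFile
	return err
}
restored, err := upgrade.LoadMarker() // read back via readMarkerFile
if err != nil {
	return err
}
fmt.Println(restored.Details.State) // UPG_WATCHING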
@@ -65,13 +66,14 @@ func IsUpgradeable() bool {
 }
 
 // NewUpgrader creates an upgrader which is capable of performing upgrade operation
-func NewUpgrader(log *logger.Logger, settings *artifact.Config, agentInfo *info.AgentInfo) *Upgrader {
+func NewUpgrader(log *logger.Logger, settings *artifact.Config, agentInfo *info.AgentInfo) (*Upgrader, error) {
 	return &Upgrader{
-		log:         log,
-		settings:    settings,
-		agentInfo:   agentInfo,
-		upgradeable: IsUpgradeable(),
-	}
+		log:           log,
+		settings:      settings,
+		agentInfo:     agentInfo,
+		upgradeable:   IsUpgradeable(),
+		markerWatcher: newMarkerFileWatcher(markerFilePath(), log),
+	}, nil
 }
 
 // SetClient reloads URI based on up to date fleet client
@@ -181,14 +183,13 @@ func (u *Upgrader) Upgrade(ctx context.Context, version string, sourceURI string
 		return nil, err
 	}
 
-	if err := u.markUpgrade(ctx, u.log, newHash, action); err != nil {
+	det.SetState(details.StateWatching)
+	if err := u.markUpgrade(ctx, u.log, newHash, action, det); err != nil {
 		u.log.Errorw("Rolling back: marking upgrade failed", "error.message", err)
 		rollbackInstall(ctx, u.log, newHash)
 		return nil, err
 	}
 
-	det.SetState(details.StateWatching)
-
 	if err := InvokeWatcher(u.log); err != nil {
 		u.log.Errorw("Rolling back: starting watcher failed", "error.message", err)
 		rollbackInstall(ctx, u.log, newHash)
@@ -237,7 +238,11 @@ func (u *Upgrader) Ack(ctx context.Context, acker acker.Acker) error {
 
 	marker.Acked = true
 
-	return saveMarker(marker)
+	return SaveMarker(marker)
+}
+
+func (u *Upgrader) MarkerWatcher() MarkerWatcher {
+	return u.markerWatcher
 }
 
 func (u *Upgrader) sourceURI(retrievedURI string) string {
diff --git a/internal/pkg/agent/cmd/watch.go b/internal/pkg/agent/cmd/watch.go
index 45648752eda..b3d5727175c 100644
--- a/internal/pkg/agent/cmd/watch.go
+++ b/internal/pkg/agent/cmd/watch.go
@@ -13,22 +13,23 @@ import (
 	"syscall"
 	"time"
 
-	"github.com/elastic/elastic-agent/version"
-
-	"github.com/elastic/elastic-agent/internal/pkg/config"
-
 	"github.com/spf13/cobra"
 
 	"github.com/elastic/elastic-agent-libs/logp"
 	"github.com/elastic/elastic-agent-libs/logp/configure"
 
+	"github.com/elastic/elastic-agent/internal/pkg/agent/application/filelock"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/application/paths"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade"
+	"github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/configuration"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/errors"
 	"github.com/elastic/elastic-agent/internal/pkg/cli"
+	"github.com/elastic/elastic-agent/internal/pkg/config"
 	"github.com/elastic/elastic-agent/internal/pkg/release"
 	"github.com/elastic/elastic-agent/pkg/core/logger"
+	agtversion "github.com/elastic/elastic-agent/pkg/version"
+	"github.com/elastic/elastic-agent/version"
 )
 
 const (
@@ -107,14 +108,49 @@ func watchCmd(log *logp.Logger, cfg *configuration.Configuration) error {
 	errorCheckInterval := cfg.Settings.Upgrade.Watcher.ErrorCheck.Interval
 	ctx := context.Background()
 	if err := watch(ctx, tilGrace, errorCheckInterval, log); err != nil {
-		log.Error("Error detected proceeding to rollback: %v", err)
+		log.Errorf("Error detected, proceeding to rollback: %v", err)
+
+		// If we are upgrading from version >= 8.12.0, marker.Details should be non-nil
+		// because the Agent we upgraded FROM would've written upgrade details in the upgrade
+		// marker. However, if we're upgrading from version < 8.12.0, the marker won't
+		// contain upgrade details, so we populate them now.
+		if marker.Details == nil {
+			fromVersion, err := agtversion.ParseVersion(marker.PrevVersion)
+			if err != nil {
+				log.Warnf("upgrade details are nil, but unable to parse version being upgraded from [%s]: %s", marker.PrevVersion, err.Error())
+			} else if !fromVersion.Less(*agtversion.NewParsedSemVer(8, 12, 0, "", "")) {
+				log.Warnf("upgrade details are unexpectedly nil, upgrading from version [%s]", marker.PrevVersion)
+			}
+
+			marker.Details = details.NewDetails(version.GetAgentPackageVersion(), details.StateRollback, marker.GetActionID())
+		}
+
+		marker.Details.SetState(details.StateRollback)
+		err = upgrade.SaveMarker(marker)
+		if err != nil {
+			log.Errorf("unable to save upgrade marker before attempting to rollback: %s", err.Error())
+		}
+
 		err = upgrade.Rollback(ctx, log, marker.PrevHash, marker.Hash)
 		if err != nil {
 			log.Error("rollback failed", err)
+
+			marker.Details.Fail(err)
+			err = upgrade.SaveMarker(marker)
+			if err != nil {
+				log.Errorf("unable to save upgrade marker after rollback failed: %s", err.Error())
+			}
 		}
 		return err
 	}
 
+	// watch succeeded - upgrade was successful!
+	marker.Details.SetState(details.StateCompleted)
+	err = upgrade.SaveMarker(marker)
+	if err != nil {
+		log.Errorf("unable to save upgrade marker after successful watch: %s", err.Error())
+	}
+
 	// cleanup older versions,
 	// in windows it might leave self untouched, this will get cleaned up
 	// later at the start, because for windows we leave marker untouched.
diff --git a/testing/integration/upgrade_rollback_test.go b/testing/integration/upgrade_rollback_test.go
index 02a209e18eb..c91ac967df0 100644
--- a/testing/integration/upgrade_rollback_test.go
+++ b/testing/integration/upgrade_rollback_test.go
@@ -21,6 +21,7 @@ import (
 	"github.com/stretchr/testify/require"
 
 	"github.com/elastic/elastic-agent/internal/pkg/agent/application/paths"
+	"github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/install"
 	atesting "github.com/elastic/elastic-agent/pkg/testing"
 	"github.com/elastic/elastic-agent/pkg/testing/define"
@@ -119,6 +120,23 @@ inputs:
 	}
 	require.NoError(t, err)
 
+	// ensure that upgrade details now show the state as UPG_ROLLBACK. This is only possible with Elastic
+	// Agent versions >= 8.12.0.
+	startVersion, err := version.ParseVersion(startVersionInfo.Binary.Version)
+	require.NoError(t, err)
+
+	if !startVersion.Less(*version.NewParsedSemVer(8, 12, 0, "", "")) {
+		client := startFixture.Client()
+		err = client.Connect(ctx)
+		require.NoError(t, err)
+
+		state, err := client.State(ctx)
+		require.NoError(t, err)
+
+		require.NotNil(t, state.UpgradeDetails)
+		require.Equal(t, details.StateRollback, state.UpgradeDetails.State)
+	}
+
 	// rollback should stop the watcher
 	// killTimeout is greater than timeout as the watcher should have been
 	// stopped on its own, and we don't want this test to hide that fact
@@ -233,6 +251,23 @@ func TestStandaloneUpgradeRollbackOnRestarts(t *testing.T) {
 	}
 	require.NoError(t, err)
 
+	// ensure that upgrade details now show the state as UPG_ROLLBACK. This is only possible with Elastic
+	// Agent versions >= 8.12.0.
+ startVersion, err := version.ParseVersion(startVersionInfo.Binary.Version) + require.NoError(t, err) + + if !startVersion.Less(*version.NewParsedSemVer(8, 12, 0, "", "")) { + client := startFixture.Client() + err = client.Connect(ctx) + require.NoError(t, err) + + state, err := client.State(ctx) + require.NoError(t, err) + + require.NotNil(t, state.UpgradeDetails) + require.Equal(t, details.StateRollback, state.UpgradeDetails.State) + } + // rollback should stop the watcher // killTimeout is greater than timeout as the watcher should have been // stopped on its own, and we don't want this test to hide that fact From 38bee2471dc687258859d260d68427c42d5e75df Mon Sep 17 00:00:00 2001 From: Shaunak Kashyap Date: Thu, 16 Nov 2023 18:32:55 +0530 Subject: [PATCH 08/21] Send upgrade details to Fleet Server in check-in API requests (#3528) * Adding FSM for upgrades * Implementing TODO * WIP * WIP * Reorganizing imports * Running go mod tidy * Fix type * Handle failures in one place * Remove Fleet changes * Send upgrade details in check-in requests to Fleet * Add unit test * Adding CHANGELOG entry * Compare pointer values so we're not copying locks * Add missing assertion * Update Fleet-managed upgrade E2E test * Make scheduled_at optional so empty time is not serialized * Normalize comparison of hostnames * Convert comment to log * Fix assertion logic * Removing replace directive * Address linter error --- NOTICE.txt | 4 +- ...9016628-send-upgrade-details-to-fleet.yaml | 32 +++++++++ go.mod | 2 +- go.sum | 4 +- .../coordinator/diagnostics_test.go | 2 +- .../gateway/fleet/fleet_gateway.go | 11 +-- .../gateway/fleet/fleet_gateway_test.go | 70 +++++++++++++++++++ .../application/upgrade/details/details.go | 2 +- internal/pkg/fleetapi/checkin_cmd.go | 12 ++-- pkg/control/v2/server/server.go | 7 +- pkg/control/v2/server/server_test.go | 7 +- pkg/testing/tools/fleettools/fleet.go | 3 +- testing/integration/upgrade_fleet_test.go | 10 +++ 13 files changed, 146 insertions(+), 20 deletions(-) create mode 100644 changelog/fragments/1699016628-send-upgrade-details-to-fleet.yaml diff --git a/NOTICE.txt b/NOTICE.txt index a6e55930766..4bbca01b922 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -1166,11 +1166,11 @@ SOFTWARE -------------------------------------------------------------------------------- Dependency : github.com/elastic/elastic-agent-libs -Version: v0.6.2 +Version: v0.7.0 Licence type (autodetected): Apache-2.0 -------------------------------------------------------------------------------- -Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-libs@v0.6.2/LICENSE: +Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-libs@v0.7.0/LICENSE: Apache License Version 2.0, January 2004 diff --git a/changelog/fragments/1699016628-send-upgrade-details-to-fleet.yaml b/changelog/fragments/1699016628-send-upgrade-details-to-fleet.yaml new file mode 100644 index 00000000000..181cc9114f3 --- /dev/null +++ b/changelog/fragments/1699016628-send-upgrade-details-to-fleet.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. 
+# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: feature + +# Change summary; a 80ish characters long description of the change. +summary: Send upgrade details to Fleet + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: elastic-agent + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/elastic-agent/pull/3528 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +issue: https://github.com/elastic/elastic-agent/issues/3119 diff --git a/go.mod b/go.mod index 897874a5261..a0b30a7f3f1 100644 --- a/go.mod +++ b/go.mod @@ -14,7 +14,7 @@ require ( github.com/elastic/e2e-testing v1.1.0 github.com/elastic/elastic-agent-autodiscover v0.6.5 github.com/elastic/elastic-agent-client/v7 v7.5.0 - github.com/elastic/elastic-agent-libs v0.6.2 + github.com/elastic/elastic-agent-libs v0.7.0 github.com/elastic/elastic-agent-system-metrics v0.8.0 github.com/elastic/elastic-transport-go/v8 v8.3.0 github.com/elastic/go-elasticsearch/v8 v8.10.1 diff --git a/go.sum b/go.sum index 54eca1468e7..ffcbb8e262a 100644 --- a/go.sum +++ b/go.sum @@ -781,8 +781,8 @@ github.com/elastic/elastic-agent-autodiscover v0.6.5 h1:5DeMpuNc8c/tN6HN0A4A2uOF github.com/elastic/elastic-agent-autodiscover v0.6.5/go.mod h1:chulyCAyZb/njMHgzkhC/yWnt8v/Y6eCRUhmFVnsA5o= github.com/elastic/elastic-agent-client/v7 v7.5.0 h1:niI3WQ+01Lnp2r5LxK8SyNhrPJe13vBiOkqrDRK2oTA= github.com/elastic/elastic-agent-client/v7 v7.5.0/go.mod h1:DYoX95xjC4BW/p2avyu724Qr2+hoUIz9eCU9CVS1d+0= -github.com/elastic/elastic-agent-libs v0.6.2 h1:tE5pFK4y7xm1FtXm+r+63G7STjJAaWh3+oKIQDzdPDo= -github.com/elastic/elastic-agent-libs v0.6.2/go.mod h1:o+EySawBZGeYu49shJxerg2wRCimS1dhrD4As0MS700= +github.com/elastic/elastic-agent-libs v0.7.0 h1:g/+Gzpn4ayXPFbfZsn5lGjbPR1TGqlVpshJVVUNJGlQ= +github.com/elastic/elastic-agent-libs v0.7.0/go.mod h1:o+EySawBZGeYu49shJxerg2wRCimS1dhrD4As0MS700= github.com/elastic/elastic-agent-system-metrics v0.8.0 h1:EsWbtd83JvnaqnL57bKS1E6GhOdemTRbxdFDcenR8zQ= github.com/elastic/elastic-agent-system-metrics v0.8.0/go.mod h1:9C1UEfj0P687HAzZepHszN6zXA+2tN2Lx3Osvq1zby8= github.com/elastic/elastic-integration-corpus-generator-tool v0.5.0/go.mod h1:uf9N86y+UACGybdEhZLpwZ93XHWVhsYZAA4c2T2v6YM= diff --git a/internal/pkg/agent/application/coordinator/diagnostics_test.go b/internal/pkg/agent/application/coordinator/diagnostics_test.go index 9fec8e1844b..1e12dd38141 100644 --- a/internal/pkg/agent/application/coordinator/diagnostics_test.go +++ b/internal/pkg/agent/application/coordinator/diagnostics_test.go @@ -441,7 +441,7 @@ func TestDiagnosticState(t *testing.T) { ActionID: "foobar", Metadata: details.Metadata{ DownloadPercent: 0.17469, - 
ScheduledAt: now, + ScheduledAt: &now, DownloadRate: 123.56, }, }, diff --git a/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go b/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go index 000ec534bf2..109ece58be9 100644 --- a/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go +++ b/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go @@ -332,11 +332,12 @@ func (f *FleetGateway) execute(ctx context.Context) (*fleetapi.CheckinResponse, // checkin cmd := fleetapi.NewCheckinCmd(f.agentInfo, f.client) req := &fleetapi.CheckinRequest{ - AckToken: ackToken, - Metadata: ecsMeta, - Status: agentStateToString(state.State), - Message: state.Message, - Components: components, + AckToken: ackToken, + Metadata: ecsMeta, + Status: agentStateToString(state.State), + Message: state.Message, + Components: components, + UpgradeDetails: state.UpgradeDetails, } resp, took, err := cmd.Execute(ctx, req) diff --git a/internal/pkg/agent/application/gateway/fleet/fleet_gateway_test.go b/internal/pkg/agent/application/gateway/fleet/fleet_gateway_test.go index 04572eab845..de40565c0b5 100644 --- a/internal/pkg/agent/application/gateway/fleet/fleet_gateway_test.go +++ b/internal/pkg/agent/application/gateway/fleet/fleet_gateway_test.go @@ -7,6 +7,7 @@ package fleet import ( "bytes" "context" + "encoding/json" "fmt" "io" @@ -23,9 +24,11 @@ import ( "gotest.tools/assert" "github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/agent/errors" "github.com/elastic/elastic-agent/internal/pkg/agent/storage" "github.com/elastic/elastic-agent/internal/pkg/agent/storage/store" + "github.com/elastic/elastic-agent/internal/pkg/fleetapi" "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker/noop" "github.com/elastic/elastic-agent/internal/pkg/scheduler" agentclient "github.com/elastic/elastic-agent/pkg/control/v2/client" @@ -308,6 +311,73 @@ func TestFleetGateway(t *testing.T) { require.NoError(t, err) }) + t.Run("Sends upgrade details", func(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + scheduler := scheduler.NewStepper() + client := newTestingClient() + + log, _ := logger.NewTesting("fleet_gateway") + + stateStore := newStateStore(t, log) + + upgradeDetails := &details.Details{ + TargetVersion: "8.12.0", + State: "UPG_WATCHING", + ActionID: "foobarbaz", + } + stateFetcher := func() coordinator.State { + return coordinator.State{ + UpgradeDetails: upgradeDetails, + } + } + + gateway, err := newFleetGatewayWithScheduler( + log, + settings, + agentInfo, + client, + scheduler, + noop.New(), + stateFetcher, + stateStore, + ) + + require.NoError(t, err) + + waitFn := ackSeq( + client.Answer(func(headers http.Header, body io.Reader) (*http.Response, error) { + data, err := io.ReadAll(body) + require.NoError(t, err) + + var checkinRequest fleetapi.CheckinRequest + err = json.Unmarshal(data, &checkinRequest) + require.NoError(t, err) + + require.NotNil(t, checkinRequest.UpgradeDetails) + require.Equal(t, upgradeDetails, checkinRequest.UpgradeDetails) + + resp := wrapStrToResp(http.StatusOK, `{ "actions": [] }`) + return resp, nil + }), + ) + + errCh := runFleetGateway(ctx, gateway) + + // Synchronize scheduler and acking of calls from the worker go routine. 
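+	// (Illustrative clarification: scheduler.Next() releases exactly one
+	// check-in cycle, and waitFn() returns once the client.Answer callback
+	// above has finished asserting on the request body.)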
+ scheduler.Next() + waitFn() + + cancel() + err = <-errCh + require.NoError(t, err) + select { + case actions := <-gateway.Actions(): + t.Errorf("Expected no actions, got %v", actions) + default: + } + }) } func TestRetriesOnFailures(t *testing.T) { diff --git a/internal/pkg/agent/application/upgrade/details/details.go b/internal/pkg/agent/application/upgrade/details/details.go index 884df46a3cc..dcc990d33cd 100644 --- a/internal/pkg/agent/application/upgrade/details/details.go +++ b/internal/pkg/agent/application/upgrade/details/details.go @@ -35,7 +35,7 @@ type Details struct { // Metadata consists of metadata relating to a specific upgrade state type Metadata struct { - ScheduledAt time.Time `json:"scheduled_at,omitempty" yaml:"scheduled_at,omitempty"` + ScheduledAt *time.Time `json:"scheduled_at,omitempty" yaml:"scheduled_at,omitempty"` // DownloadPercent is the percentage of the artifact that has been // downloaded. Minimum value is 0 and maximum value is 1. diff --git a/internal/pkg/fleetapi/checkin_cmd.go b/internal/pkg/fleetapi/checkin_cmd.go index b52eeb3903f..4c14454ce18 100644 --- a/internal/pkg/fleetapi/checkin_cmd.go +++ b/internal/pkg/fleetapi/checkin_cmd.go @@ -14,6 +14,7 @@ import ( "time" "github.com/elastic/elastic-agent/internal/pkg/agent/application/info" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/agent/errors" "github.com/elastic/elastic-agent/internal/pkg/fleetapi/client" ) @@ -47,11 +48,12 @@ type CheckinComponent struct { // CheckinRequest consists of multiple events reported to fleet ui. type CheckinRequest struct { - Status string `json:"status"` - AckToken string `json:"ack_token,omitempty"` - Metadata *info.ECSMeta `json:"local_metadata,omitempty"` - Message string `json:"message"` // V2 Agent message - Components []CheckinComponent `json:"components"` // V2 Agent components + Status string `json:"status"` + AckToken string `json:"ack_token,omitempty"` + Metadata *info.ECSMeta `json:"local_metadata,omitempty"` + Message string `json:"message"` // V2 Agent message + Components []CheckinComponent `json:"components"` // V2 Agent components + UpgradeDetails *details.Details `json:"upgrade_details,omitempty"` } // SerializableEvent is a representation of the event to be send to the Fleet Server API via the checkin diff --git a/pkg/control/v2/server/server.go b/pkg/control/v2/server/server.go index 645068ea64d..149c426e2e4 100644 --- a/pkg/control/v2/server/server.go +++ b/pkg/control/v2/server/server.go @@ -370,12 +370,17 @@ func stateToProto(state *coordinator.State, agentInfo *info.AgentInfo) (*cproto. 
 			State:         string(state.UpgradeDetails.State),
 			ActionId:      state.UpgradeDetails.ActionID,
 			Metadata: &cproto.UpgradeDetailsMetadata{
-				ScheduledAt:     timestamppb.New(state.UpgradeDetails.Metadata.ScheduledAt),
 				DownloadPercent: float32(state.UpgradeDetails.Metadata.DownloadPercent),
 				FailedState:     string(state.UpgradeDetails.Metadata.FailedState),
 				ErrorMsg:        state.UpgradeDetails.Metadata.ErrorMsg,
 			},
 		}
+
+		if state.UpgradeDetails.Metadata.ScheduledAt != nil &&
+			!state.UpgradeDetails.Metadata.ScheduledAt.IsZero() {
+			upgradeDetails.Metadata.ScheduledAt = timestamppb.New(*state.UpgradeDetails.Metadata.ScheduledAt)
+		}
 	}
 
 	return &cproto.StateResponse{
diff --git a/pkg/control/v2/server/server_test.go b/pkg/control/v2/server/server_test.go
index cafab60450f..d7aee6d96c7 100644
--- a/pkg/control/v2/server/server_test.go
+++ b/pkg/control/v2/server/server_test.go
@@ -164,11 +164,16 @@ func TestStateMapping(t *testing.T) {
 
 		if tc.upgradeDetails != nil {
 			expectedMetadata := &cproto.UpgradeDetailsMetadata{
-				ScheduledAt:     timestamppb.New(tc.upgradeDetails.Metadata.ScheduledAt),
 				DownloadPercent: float32(tc.upgradeDetails.Metadata.DownloadPercent),
 				FailedState:     string(tc.upgradeDetails.Metadata.FailedState),
 				ErrorMsg:        tc.upgradeDetails.Metadata.ErrorMsg,
 			}
+
+			if tc.upgradeDetails.Metadata.ScheduledAt != nil &&
+				!tc.upgradeDetails.Metadata.ScheduledAt.IsZero() {
+				expectedMetadata.ScheduledAt = timestamppb.New(*tc.upgradeDetails.Metadata.ScheduledAt)
+			}
+
 			assert.Equal(t, string(tc.upgradeDetails.State), stateResponse.UpgradeDetails.State)
 			assert.Equal(t, tc.upgradeDetails.TargetVersion, stateResponse.UpgradeDetails.TargetVersion)
 			assert.Equal(t, tc.upgradeDetails.ActionID, stateResponse.UpgradeDetails.ActionId)
diff --git a/pkg/testing/tools/fleettools/fleet.go b/pkg/testing/tools/fleettools/fleet.go
index baa1ca659c0..77c64069e41 100644
--- a/pkg/testing/tools/fleettools/fleet.go
+++ b/pkg/testing/tools/fleettools/fleet.go
@@ -9,6 +9,7 @@ import (
 	"errors"
 	"fmt"
 	"os"
+	"strings"
 
 	"github.com/elastic/elastic-agent-libs/kibana"
 )
@@ -25,7 +26,7 @@ func GetAgentByPolicyIDAndHostnameFromList(client *kibana.Client, policyID, host
 		agentHostname := item.LocalMetadata.Host.Hostname
 		agentPolicyID := item.PolicyID
 
-		if agentHostname == hostname && agentPolicyID == policyID {
+		if strings.EqualFold(agentHostname, hostname) && agentPolicyID == policyID {
 			hostnameAgents = append(hostnameAgents, &listAgentsResp.Items[i])
 		}
 	}
diff --git a/testing/integration/upgrade_fleet_test.go b/testing/integration/upgrade_fleet_test.go
index aca07472d58..9c44386eb87 100644
--- a/testing/integration/upgrade_fleet_test.go
+++ b/testing/integration/upgrade_fleet_test.go
@@ -8,6 +8,7 @@ package integration
 
 import (
 	"context"
+	"os"
 	"strings"
 	"testing"
 	"time"
@@ -140,6 +141,15 @@ func testUpgradeFleetManagedElasticAgent(ctx context.Context, t *testing.T, info
 	err = fleettools.UpgradeAgent(kibClient, policy.ID, endVersionInfo.Binary.String(), true)
 	require.NoError(t, err)
 
+	t.Log("Waiting for upgrade details to show up in Fleet")
+	hostname, err := os.Hostname()
+	require.NoError(t, err)
+	require.Eventually(t, func() bool {
+		agent, err := fleettools.GetAgentByPolicyIDAndHostnameFromList(kibClient, policy.ID, hostname)
+		return err == nil && agent.UpgradeDetails != nil
+	}, 5*time.Minute, time.Second)
+
 	// wait for the watcher to show up
 	t.Logf("Waiting for upgrade watcher to start...")
 	err = upgradetest.WaitForWatcher(ctx, 5*time.Minute, 10*time.Second)
From c813d3f12776532e9209d1848e39a3da73f83b98 Mon Sep 17 00:00:00 2001
From:
=?UTF-8?q?Paolo=20Chil=C3=A0?= Date: Thu, 16 Nov 2023 17:54:48 +0100 Subject: [PATCH 09/21] Fix comparison of upgrade metadata (#3779) --- .../pkg/agent/application/dispatcher/dispatcher.go | 2 +- .../agent/application/dispatcher/dispatcher_test.go | 5 +++-- .../agent/application/upgrade/details/details.go | 13 ++++++++++++- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/internal/pkg/agent/application/dispatcher/dispatcher.go b/internal/pkg/agent/application/dispatcher/dispatcher.go index eda450d6a0b..63d00aeeea3 100644 --- a/internal/pkg/agent/application/dispatcher/dispatcher.go +++ b/internal/pkg/agent/application/dispatcher/dispatcher.go @@ -321,7 +321,7 @@ func reportNextScheduledUpgrade(input []fleetapi.Action, detailsSetter details.O upgradeDetails := details.NewDetails(nextUpgrade.Version, details.StateScheduled, nextUpgrade.ID()) startTime, _ := nextUpgrade.StartTime() - upgradeDetails.Metadata.ScheduledAt = startTime + upgradeDetails.Metadata.ScheduledAt = &startTime detailsSetter(upgradeDetails) } diff --git a/internal/pkg/agent/application/dispatcher/dispatcher_test.go b/internal/pkg/agent/application/dispatcher/dispatcher_test.go index 212c50a3068..bf69c76ce74 100644 --- a/internal/pkg/agent/application/dispatcher/dispatcher_test.go +++ b/internal/pkg/agent/application/dispatcher/dispatcher_test.go @@ -493,6 +493,7 @@ func Test_ActionDispatcher_scheduleRetry(t *testing.T) { func TestReportNextScheduledUpgrade(t *testing.T) { now := time.Now().UTC() later := now.Add(3 * time.Hour) + laterTruncate := later.Truncate(time.Second) muchLater := later.Add(3 * time.Hour) cases := map[string]struct { @@ -522,7 +523,7 @@ func TestReportNextScheduledUpgrade(t *testing.T) { State: details.StateScheduled, ActionID: "action2", Metadata: details.Metadata{ - ScheduledAt: later.Truncate(time.Second), + ScheduledAt: &laterTruncate, }, }, }, @@ -544,7 +545,7 @@ func TestReportNextScheduledUpgrade(t *testing.T) { State: details.StateScheduled, ActionID: "action4", Metadata: details.Metadata{ - ScheduledAt: later.Truncate(time.Second), + ScheduledAt: &laterTruncate, }, }, }, diff --git a/internal/pkg/agent/application/upgrade/details/details.go b/internal/pkg/agent/application/upgrade/details/details.go index dcc990d33cd..dae27923ecf 100644 --- a/internal/pkg/agent/application/upgrade/details/details.go +++ b/internal/pkg/agent/application/upgrade/details/details.go @@ -167,13 +167,24 @@ func (d *Details) notifyObserver(observer Observer) { } func (m Metadata) Equals(otherM Metadata) bool { - return m.ScheduledAt.Equal(otherM.ScheduledAt) && + return equalTimePointers(m.ScheduledAt, otherM.ScheduledAt) && m.FailedState == otherM.FailedState && m.ErrorMsg == otherM.ErrorMsg && m.DownloadPercent == otherM.DownloadPercent && m.DownloadRate == otherM.DownloadRate } +func equalTimePointers(t, otherT *time.Time) bool { + if t == otherT { + return true + } + if t == nil || otherT == nil { + return false + } + + return t.Equal(*otherT) +} + func (dr *downloadRate) MarshalJSON() ([]byte, error) { downloadRateBytesPerSecond := float64(*dr) if math.IsInf(downloadRateBytesPerSecond, 0) { From 112f61896951f738c98f0a46c6df4160d31b718f Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 16 Nov 2023 17:39:38 -0500 Subject: [PATCH 10/21] Rework runtime manager updates to block the coordinator less (#3747) --- .../application/coordinator/coordinator.go | 32 +-- .../coordinator/coordinator_state.go | 17 +- .../coordinator/coordinator_test.go | 12 +- .../coordinator/coordinator_unit_test.go | 20 +- 
pkg/component/runtime/manager.go | 248 ++++++++++-------- pkg/component/runtime/manager_test.go | 150 +++++++---- 6 files changed, 280 insertions(+), 199 deletions(-) diff --git a/internal/pkg/agent/application/coordinator/coordinator.go b/internal/pkg/agent/application/coordinator/coordinator.go index c33cc700729..1129ef96e32 100644 --- a/internal/pkg/agent/application/coordinator/coordinator.go +++ b/internal/pkg/agent/application/coordinator/coordinator.go @@ -100,10 +100,7 @@ type RuntimeManager interface { Runner // Update updates the current components model. - Update(model component.Model) error - - // State returns the current components model state. - State() []runtime.ComponentComponentState + Update(model component.Model) // PerformAction executes an action on a unit. PerformAction(ctx context.Context, comp component.Component, unit component.Unit, name string, params map[string]interface{}) (map[string]interface{}, error) @@ -241,17 +238,19 @@ type Coordinator struct { // into the reported state before broadcasting -- State() will report // agentclient.Failed if one of these is set, even if the underlying // coordinator state is agentclient.Healthy. - runtimeMgrErr error // Currently unused - configMgrErr error - actionsErr error - varsMgrErr error + // Errors from the runtime manager report policy update failures and are + // stored in runtimeUpdateErr below. + configMgrErr error + actionsErr error + varsMgrErr error // Errors resulting from different possible failure modes when setting a // new policy. Right now there are three different stages where a policy // update can fail: // - in generateAST, converting the policy to an AST // - in process, converting the AST and vars into a full component model - // - while sending the final component model to the runtime manager + // - while applying the final component model in the runtime manager + // (reported asynchronously via the runtime manager error channel) // // The plan is to improve our preprocessing so we can always detect // failures immediately https://github.com/elastic/elastic-agent/issues/2887. @@ -920,7 +919,13 @@ func (c *Coordinator) runLoopIteration(ctx context.Context) { return case runtimeErr := <-c.managerChans.runtimeManagerError: - c.setRuntimeManagerError(runtimeErr) + // runtime manager errors report the result of a policy update. + // Coordinator transitions from starting to healthy when a policy update + // is successful. 
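+			// A nil error on this channel therefore signals that the most
+			// recent component-model update was applied successfully.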
+ c.setRuntimeUpdateError(runtimeErr) + if runtimeErr == nil { + c.setCoordinatorState(agentclient.Healthy, "Running") + } case configErr := <-c.managerChans.configManagerError: if c.isManaged { @@ -1153,12 +1158,7 @@ func (c *Coordinator) refreshComponentModel(ctx context.Context) (err error) { c.logger.Info("Updating running component model") c.logger.With("components", model.Components).Debug("Updating running component model") - err = c.runtimeMgr.Update(model) - c.setRuntimeUpdateError(err) - if err != nil { - return fmt.Errorf("updating runtime: %w", err) - } - c.setCoordinatorState(agentclient.Healthy, "Running") + c.runtimeMgr.Update(model) return nil } diff --git a/internal/pkg/agent/application/coordinator/coordinator_state.go b/internal/pkg/agent/application/coordinator/coordinator_state.go index 6e645c3a06b..22394ee52e0 100644 --- a/internal/pkg/agent/application/coordinator/coordinator_state.go +++ b/internal/pkg/agent/application/coordinator/coordinator_state.go @@ -63,10 +63,10 @@ func (c *Coordinator) SetUpgradeDetails(upgradeDetails *details.Details) { c.upgradeDetailsChan <- upgradeDetails } -// setRuntimeManagerError updates the error state for the runtime manager. +// setRuntimeUpdateError reports a failed policy update in the runtime manager. // Called on the main Coordinator goroutine. -func (c *Coordinator) setRuntimeManagerError(err error) { - c.runtimeMgrErr = err +func (c *Coordinator) setRuntimeUpdateError(err error) { + c.runtimeUpdateErr = err c.stateNeedsRefresh = true } @@ -107,14 +107,6 @@ func (c *Coordinator) setComponentGenError(err error) { c.stateNeedsRefresh = true } -// setRuntimeUpdateError updates the error state for sending a component model -// update to the runtime manager. -// Called on the main Coordinator goroutine. -func (c *Coordinator) setRuntimeUpdateError(err error) { - c.runtimeUpdateErr = err - c.stateNeedsRefresh = true -} - // setOverrideState is the internal helper to set the override state and // set stateNeedsRefresh. // Must be called on the main Coordinator goroutine. 
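A minimal sketch of the asynchronous contract this series introduces, assuming a running *runtime.Manager `m` (not itself part of the patch; every name used is one that appears in these diffs). Update enqueues the model and the result arrives later on Errors(), which is exactly the pattern the reworked tests follow with `m.Update(...)` then `err = <-m.errCh`:

	// Sketch: drive one component-model update and wait for its result.
	func applyModel(ctx context.Context, m *runtime.Manager, comps []component.Component) error {
		m.Update(component.Model{Components: comps}) // non-blocking enqueue
		select {
		case err := <-m.Errors():
			return err // nil means the model was applied successfully
		case <-ctx.Done():
			return ctx.Err()
		}
	}
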
@@ -201,9 +193,6 @@ func (c *Coordinator) generateReportableState() (s State) { } else if c.runtimeUpdateErr != nil { s.State = agentclient.Failed s.Message = fmt.Sprintf("Runtime update failed: %s", c.runtimeUpdateErr.Error()) - } else if c.runtimeMgrErr != nil { - s.State = agentclient.Failed - s.Message = fmt.Sprintf("Runtime manager: %s", c.runtimeMgrErr.Error()) } else if c.configMgrErr != nil { s.State = agentclient.Failed s.Message = fmt.Sprintf("Config manager: %s", c.configMgrErr.Error()) diff --git a/internal/pkg/agent/application/coordinator/coordinator_test.go b/internal/pkg/agent/application/coordinator/coordinator_test.go index 676906bbde0..49770bc1e97 100644 --- a/internal/pkg/agent/application/coordinator/coordinator_test.go +++ b/internal/pkg/agent/application/coordinator/coordinator_test.go @@ -787,6 +787,8 @@ func (f *fakeVarsManager) Vars(ctx context.Context, vars []*transpiler.Vars) { type fakeRuntimeManager struct { state []runtime.ComponentComponentState updateCallback func([]component.Component) error + result error + errChan chan error } func (r *fakeRuntimeManager) Run(ctx context.Context) error { @@ -796,11 +798,15 @@ func (r *fakeRuntimeManager) Run(ctx context.Context) error { func (r *fakeRuntimeManager) Errors() <-chan error { return nil } -func (r *fakeRuntimeManager) Update(model component.Model) error { +func (r *fakeRuntimeManager) Update(model component.Model) { + r.result = nil if r.updateCallback != nil { - return r.updateCallback(model.Components) + r.result = r.updateCallback(model.Components) + } + if r.errChan != nil { + // If a reporting channel is set, send the result to it + r.errChan <- r.result } - return nil } // State returns the current components model state. diff --git a/internal/pkg/agent/application/coordinator/coordinator_unit_test.go b/internal/pkg/agent/application/coordinator/coordinator_unit_test.go index 956eb33b8c0..89869e48d5e 100644 --- a/internal/pkg/agent/application/coordinator/coordinator_unit_test.go +++ b/internal/pkg/agent/application/coordinator/coordinator_unit_test.go @@ -754,6 +754,7 @@ func TestCoordinatorReportsRuntimeManagerUpdateFailure(t *testing.T) { logger := logp.NewLogger("testing") configChan := make(chan ConfigChange, 1) + updateErrChan := make(chan error, 1) const errorStr = "update failed for testing reasons" // Create a mocked runtime manager that always reports an error @@ -761,6 +762,7 @@ func TestCoordinatorReportsRuntimeManagerUpdateFailure(t *testing.T) { updateCallback: func(comp []component.Component) error { return fmt.Errorf(errorStr) }, + errChan: updateErrChan, } coord := &Coordinator{ @@ -769,6 +771,9 @@ func TestCoordinatorReportsRuntimeManagerUpdateFailure(t *testing.T) { stateBroadcaster: broadcaster.New(State{}, 0, 0), managerChans: managerChans{ configManagerUpdate: configChan, + // Give coordinator the same error channel we set on the runtime + // manager, so it receives the update result. 
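+				// (In production this channel carries the runtime manager's
+				// update results; the test wires it up by hand because it
+				// constructs the Coordinator struct directly.)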
+ runtimeManagerError: updateErrChan, }, runtimeMgr: runtimeManager, vars: emptyVars(t), @@ -781,16 +786,19 @@ func TestCoordinatorReportsRuntimeManagerUpdateFailure(t *testing.T) { configChan <- configChange coord.runLoopIteration(ctx) - // Make sure the failure was reported to the config manager - assert.True(t, configChange.failed, "Config change should report failure if the runtime manager returns an error") - require.Error(t, configChange.err, "Config change should get an error if runtime manager update fails") - assert.Contains(t, configChange.err.Error(), errorStr) + // Make sure the config change was acknowledged to the config manager + // (the failure is not reported here since it happens asynchronously; it + // will appear in the coordinator state afterwards.) + assert.True(t, configChange.acked, "Config change should be acknowledged to the config manager") + assert.NoError(t, configChange.err, "Config change with async error should succeed") - // Make sure the error is saved in Coordinator.runtimeUpdateErr + // Now do another run loop iteration to let the update error propagate, + // and make sure it is reported correctly. + coord.runLoopIteration(ctx) require.Error(t, coord.runtimeUpdateErr, "Runtime update failure should be saved in runtimeUpdateErr") assert.Equal(t, errorStr, coord.runtimeUpdateErr.Error(), "runtimeUpdateErr should match the error reported by the runtime manager") - // Make sure the error is reported in Coordinator state. + // Make sure the error appears in the Coordinator state. state := coord.State() assert.Equal(t, agentclient.Failed, state.State, "Failed policy update should cause failed Coordinator") assert.Contains(t, state.Message, errorStr, "Failed policy update should be reported in Coordinator state message") diff --git a/pkg/component/runtime/manager.go b/pkg/component/runtime/manager.go index 8462ac3c17e..850349406d6 100644 --- a/pkg/component/runtime/manager.go +++ b/pkg/component/runtime/manager.go @@ -93,22 +93,36 @@ type Manager struct { baseLogger *logger.Logger ca *authority.CertificateAuthority listenAddr string + listenPort int agentInfo *info.AgentInfo tracer *apm.Tracer monitor MonitoringManager grpcConfig *configuration.GRPCConfig - // netMx synchronizes the access to listener and server only - netMx sync.RWMutex - listener net.Listener - server *grpc.Server - // Set when the RPC server is ready to receive requests, for use by tests. serverReady *atomic.Bool - // updateMx protects the call to update to ensure that - // only one call to update occurs at a time - updateMx sync.Mutex + // updateChan forwards component model updates from the public Update method + // to the internal run loop. + updateChan chan component.Model + + // Component model update is run asynchronously and pings this channel when + // finished, so the runtime manager loop knows it's safe to advance to the + // next update without ever having to block on the result. + updateDoneChan chan struct{} + + // Next component model update that will be applied, in case we get one + // while a previous update is still in progress. If we get more than one, + // keep only the most recent. + // Only access from the main runtime manager goroutine. + nextUpdate *component.Model + + // Whether we're already waiting on the results of an update call. + // If this is true when the run loop finishes, we need to wait for the + // final update result before shutting down, otherwise the shutdown's + // update call will conflict. + // Only access from the main runtime manager goroutine. 
+	updateInProgress bool
 
 	// currentMx protects access to the current map only
 	currentMx sync.RWMutex
@@ -123,10 +137,9 @@ type Manager struct {
 
 	errCh chan error
 
-	// upon creation the Manager is neither running not shutting down, thus both
-	// flags are needed.
-	running      atomic.Bool
-	shuttingDown atomic.Bool
+	// doneChan is closed when Manager is shutting down to signal that any
+	// pending requests should be canceled.
+	doneChan chan struct{}
 }
 
 // NewManager creates a new manager.
@@ -144,19 +157,22 @@ func NewManager(
 		return nil, err
 	}
 	m := &Manager{
-		logger:        logger,
-		baseLogger:    baseLogger,
-		ca:            ca,
-		listenAddr:    listenAddr,
-		agentInfo:     agentInfo,
-		tracer:        tracer,
-		current:       make(map[string]*componentRuntimeState),
-		shipperConns:  make(map[string]*shipperConn),
-		subscriptions: make(map[string][]*Subscription),
-		errCh:         make(chan error),
-		monitor:       monitor,
-		grpcConfig:    grpcConfig,
-		serverReady:   atomic.NewBool(false),
+		logger:         logger,
+		baseLogger:     baseLogger,
+		ca:             ca,
+		listenAddr:     listenAddr,
+		agentInfo:      agentInfo,
+		tracer:         tracer,
+		current:        make(map[string]*componentRuntimeState),
+		shipperConns:   make(map[string]*shipperConn),
+		subscriptions:  make(map[string][]*Subscription),
+		updateChan:     make(chan component.Model),
+		updateDoneChan: make(chan struct{}),
+		errCh:          make(chan error),
+		monitor:        monitor,
+		grpcConfig:     grpcConfig,
+		serverReady:    atomic.NewBool(false),
+		doneChan:       make(chan struct{}),
 	}
 	return m, nil
 }
@@ -169,16 +185,11 @@ func NewManager(
 //
 // Blocks until the context is done.
 func (m *Manager) Run(ctx context.Context) error {
-	m.running.Store(true)
-	m.shuttingDown.Store(false)
-
-	lis, err := net.Listen("tcp", m.listenAddr)
+	listener, err := net.Listen("tcp", m.listenAddr)
 	if err != nil {
 		return fmt.Errorf("error starting tcp listener for runtime manager: %w", err)
 	}
-	m.netMx.Lock()
-	m.listener = lis
-	m.netMx.Unlock()
+	m.listenPort = listener.Addr().(*net.TCPAddr).Port
 
 	certPool := x509.NewCertPool()
 	if ok := certPool.AppendCertsFromPEM(m.ca.Crt()); !ok {
@@ -205,87 +216,123 @@ func (m *Manager) Run(ctx context.Context) error {
 			grpc.MaxRecvMsgSize(m.grpcConfig.MaxMsgSize),
 		)
 	}
-	m.netMx.Lock()
-	m.server = server
-	m.netMx.Unlock()
-	proto.RegisterElasticAgentServer(m.server, m)
+	proto.RegisterElasticAgentServer(server, m)
 
 	// start serving GRPC connections
-	var wg sync.WaitGroup
-	wg.Add(1)
+	var wgServer sync.WaitGroup
+	wgServer.Add(1)
 	go func() {
-		defer wg.Done()
-		m.serverReady.Store(true)
-		for {
-			err := server.Serve(lis)
-			if err != nil {
-				m.logger.Errorf("control protocol failed: %s", err)
-			}
-			if ctx.Err() != nil {
-				// context has an error don't start again
-				return
-			}
-		}
+		defer wgServer.Done()
+		m.serverLoop(ctx, listener, server)
 	}()
 
-	<-ctx.Done()
-	m.running.Store(false)
-	m.shuttingDown.Store(true)
+	// Start the run loop, which continues on the main goroutine
+	// until the context is canceled.
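+	// runLoop applies at most one component-model update at a time and
+	// coalesces any newer updates that arrive while one is in flight, so a
+	// slow apply can never back up into the Coordinator.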
+ m.runLoop(ctx) + + // Notify components to shutdown and wait for their response m.shutdown() + // Close the rpc listener and wait for serverLoop to return + listener.Close() + wgServer.Wait() + + // Cancel any remaining connections server.Stop() - wg.Wait() - m.netMx.Lock() - m.listener = nil - m.server = nil - m.netMx.Unlock() return ctx.Err() } +// The main run loop for the runtime manager, whose responsibilities are: +// - Accept component model updates from the Coordinator +// - Apply those updates safely without ever blocking, because a block here +// propagates to a block in the Coordinator +// - Close doneChan when the loop ends, so the Coordinator knows not to send +// any more updates +func (m *Manager) runLoop(ctx context.Context) { +LOOP: + for ctx.Err() == nil { + select { + case <-ctx.Done(): + break LOOP + case model := <-m.updateChan: + // We got a new component model from m.Update(), mark it as the + // next update to apply, overwriting any previous pending value. + m.nextUpdate = &model + case <-m.updateDoneChan: + // An update call has finished, we can initiate another when available. + m.updateInProgress = false + } + + // After each select call, check if there's a pending update that + // can be applied. + if m.nextUpdate != nil && !m.updateInProgress { + // There is a component model update available, apply it. + go func(model component.Model) { + // Run the update with tearDown set to true since this is coming + // from a user-initiated policy update + err := m.update(model, true) + + // When update is done, send its result back to the coordinator, + // unless we're shutting down. + select { + case m.errCh <- err: + case <-ctx.Done(): + } + // Signal the runtime manager that we're finished. Note that + // we don't select on ctx.Done() in this case because the runtime + // manager always reads the results of an update once initiated, + // even if it is shutting down. + m.updateDoneChan <- struct{}{} + }(*m.nextUpdate) + m.updateInProgress = true + m.nextUpdate = nil + } + } + // Signal that the run loop is ended to unblock any incoming messages. + // We need to do this before waiting on the final update result, otherwise + // it might be stuck trying to send the result to errCh. + close(m.doneChan) + + if m.updateInProgress { + // Wait for the existing update to finish before shutting down, + // otherwise the new update call closing everything will + // conflict. + <-m.updateDoneChan + m.updateInProgress = false + } +} + +func (m *Manager) serverLoop(ctx context.Context, listener net.Listener, server *grpc.Server) { + m.serverReady.Store(true) + for ctx.Err() == nil { + err := server.Serve(listener) + if err != nil && ctx.Err() == nil { + // Only log an error if we aren't shutting down, otherwise we'll spam + // the logs with "use of closed network connection" for a connection that + // was closed on purpose. + m.logger.Errorf("control protocol listener failed: %s", err) + } + } +} + // Errors returns channel that errors are reported on. func (m *Manager) Errors() <-chan error { return m.errCh } -// Update updates the currComp state of the running components. +// Update forwards a new component model to Manager's run loop. +// When it has been processed, a result will be sent on Manager's +// error channel. // Called from the main Coordinator goroutine. // -// This returns as soon as possible, the work is performed in the background. 
-func (m *Manager) Update(model component.Model) error { - shuttingDown := m.shuttingDown.Load() - if shuttingDown { - // ignore any updates once shutdown started - return nil - } - // teardown is true because the public `Update` method would be coming directly from - // policy so if a component was removed it needs to be torn down. - return m.update(model, true) -} - -// State returns the current component states. -func (m *Manager) State() []ComponentComponentState { - m.currentMx.RLock() - defer m.currentMx.RUnlock() - states := make([]ComponentComponentState, 0, len(m.current)) - for _, crs := range m.current { - var legacyPID string - if crs.runtime != nil { - if commandRuntime, ok := crs.runtime.(*commandRuntime); ok { - if commandRuntime != nil { - procInfo := commandRuntime.proc - if procInfo != nil { - legacyPID = fmt.Sprint(commandRuntime.proc.PID) - } - } - } - } - states = append(states, ComponentComponentState{ - Component: crs.getCurrent(), - State: crs.getLatest(), - LegacyPID: legacyPID, - }) +// If calling from a test, you should read from errCh afterwards to avoid +// blocking Manager's main loop. +func (m *Manager) Update(model component.Model) { + select { + case m.updateChan <- model: + case <-m.doneChan: + // Manager is shutting down, ignore the update } - return states } // PerformAction executes an action on a unit. @@ -658,13 +705,10 @@ func (m *Manager) Actions(server proto.ElasticAgent_ActionsServer) error { } // update updates the current state of the running components. +// It is only called by the main runtime manager goroutine in Manager.Run. // // This returns as soon as possible, work is performed in the background. func (m *Manager) update(model component.Model, teardown bool) error { - // ensure that only one `update` can occur at the same time - m.updateMx.Lock() - defer m.updateMx.Unlock() - // prepare the components to add consistent shipper connection information between // the connected components in the model err := m.connectShippers(model.Components) @@ -891,13 +935,7 @@ func (m *Manager) getRuntimeFromComponent(comp component.Component) *componentRu func (m *Manager) getListenAddr() string { addr := strings.SplitN(m.listenAddr, ":", 2) if len(addr) == 2 && addr[1] == "0" { - m.netMx.RLock() - lis := m.listener - m.netMx.RUnlock() - if lis != nil { - port := lis.Addr().(*net.TCPAddr).Port - return fmt.Sprintf("%s:%d", addr[0], port) - } + return fmt.Sprintf("%s:%d", addr[0], m.listenPort) } return m.listenAddr } diff --git a/pkg/component/runtime/manager_test.go b/pkg/component/runtime/manager_test.go index bb83de6fd04..51bb941bca6 100644 --- a/pkg/component/runtime/manager_test.go +++ b/pkg/component/runtime/manager_test.go @@ -145,7 +145,8 @@ func TestManager_SimpleComponentErr(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(30 * time.Second) @@ -237,7 +238,8 @@ func TestManager_FakeInput_StartStop(t *testing.T) { subErrCh <- fmt.Errorf("unit failed: %s", unit.Message) } else if unit.State == client.UnitStateHealthy { // remove the component which will stop it - err := m.Update(component.Model{Components: []component.Component{}}) + m.Update(component.Model{Components: []component.Component{}}) + err := <-m.errCh if err != nil { subErrCh <- err } @@ -260,7 +262,8 @@ func TestManager_FakeInput_StartStop(t *testing.T) 
{ defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(30 * time.Second) @@ -384,7 +387,8 @@ func TestManager_FakeInput_Features(t *testing.T) { Fqdn: &proto.FQDNFeature{Enabled: true}, } - err := m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err := <-m.errCh if err != nil { subscriptionErrCh <- fmt.Errorf("[case %d]: failed to update component: %w", healthIteration, err) @@ -439,7 +443,8 @@ func TestManager_FakeInput_Features(t *testing.T) { "message": "Fake Healthy", }) - err := m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err := <-m.errCh if err != nil { t.Logf("error updating component state to health: %v", err) @@ -459,7 +464,8 @@ func TestManager_FakeInput_Features(t *testing.T) { defer drainErrChan(managerErrCh) defer drainErrChan(subscriptionErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) timeout := 30 * time.Second @@ -609,7 +615,8 @@ func TestManager_FakeInput_APM(t *testing.T) { comp.Component = &proto.Component{ ApmConfig: initialAPMConfig, } - err := m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err := <-m.errCh if err != nil { subscriptionErrCh <- fmt.Errorf("[case %d]: failed to update component: %w", healthIteration, err) @@ -653,7 +660,8 @@ func TestManager_FakeInput_APM(t *testing.T) { comp.Component = &proto.Component{ ApmConfig: modifiedAPMConfig, } - err := m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err := <-m.errCh if err != nil { subscriptionErrCh <- fmt.Errorf("[case %d]: failed to update component: %w", healthIteration, err) @@ -692,7 +700,8 @@ func TestManager_FakeInput_APM(t *testing.T) { comp.Component = &proto.Component{ ApmConfig: nil, } - err := m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err := <-m.errCh if err != nil { subscriptionErrCh <- fmt.Errorf("[case %d]: failed to update component: %w", healthIteration, err) @@ -741,7 +750,8 @@ func TestManager_FakeInput_APM(t *testing.T) { "message": "Fake Healthy", }) - err := m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err := <-m.errCh if err != nil { t.Logf("error updating component state to health: %v", err) @@ -761,7 +771,8 @@ func TestManager_FakeInput_APM(t *testing.T) { defer drainErrChan(managerErrCh) defer drainErrChan(subscriptionErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) timeout := 30 * time.Second @@ -902,9 +913,10 @@ func TestManager_FakeInput_Limits(t *testing.T) { GoMaxProcs: 101, }, } - err := m.Update(component.Model{ + m.Update(component.Model{ Components: []component.Component{comp}, }) + err := <-m.errCh if err != nil { subscriptionErrCh <- 
fmt.Errorf("[case %d]: failed to update component: %w", healthyIteration, err) @@ -917,9 +929,10 @@ func TestManager_FakeInput_Limits(t *testing.T) { assert.Equal(t, uint64(101), componentState.Component.Limits.GoMaxProcs) comp.Component = nil - err := m.Update(component.Model{ + m.Update(component.Model{ Components: []component.Component{comp}, }) + err := <-m.errCh if err != nil { subscriptionErrCh <- fmt.Errorf("[case %d]: failed to update component: %w", healthyIteration, err) @@ -945,7 +958,8 @@ func TestManager_FakeInput_Limits(t *testing.T) { defer drainErrChan(managerErrCh) defer drainErrChan(subscriptionErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) timeout := 30 * time.Second @@ -1063,9 +1077,10 @@ func TestManager_FakeShipper_Limits(t *testing.T) { GoMaxProcs: 101, }, } - err := m.Update(component.Model{ + m.Update(component.Model{ Components: []component.Component{comp}, }) + err := <-m.errCh if err != nil { subscriptionErrCh <- fmt.Errorf("[case %d]: failed to update component: %w", healthyIteration, err) @@ -1078,9 +1093,10 @@ func TestManager_FakeShipper_Limits(t *testing.T) { assert.Equal(t, uint64(101), componentState.Component.Limits.GoMaxProcs) comp.Component = nil - err := m.Update(component.Model{ + m.Update(component.Model{ Components: []component.Component{comp}, }) + err := <-m.errCh if err != nil { subscriptionErrCh <- fmt.Errorf("[case %d]: failed to update component: %w", healthyIteration, err) @@ -1106,7 +1122,8 @@ func TestManager_FakeShipper_Limits(t *testing.T) { defer drainErrChan(managerErrCh) defer drainErrChan(subscriptionErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) timeout := 30 * time.Second @@ -1222,7 +1239,8 @@ func TestManager_FakeInput_BadUnitToGood(t *testing.T) { } unitBad = false - err := m.Update(component.Model{Components: []component.Component{updatedComp}}) + m.Update(component.Model{Components: []component.Component{updatedComp}}) + err := <-m.errCh if err != nil { subErrCh <- err } @@ -1250,7 +1268,8 @@ func TestManager_FakeInput_BadUnitToGood(t *testing.T) { } } else if unit.State == client.UnitStateHealthy { // bad unit is now healthy; stop the component - err := m.Update(component.Model{Components: []component.Component{}}) + m.Update(component.Model{Components: []component.Component{}}) + err := <-m.errCh if err != nil { subErrCh <- err } @@ -1274,7 +1293,8 @@ func TestManager_FakeInput_BadUnitToGood(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(30 * time.Second) @@ -1365,7 +1385,8 @@ func TestManager_FakeInput_GoodUnitToBad(t *testing.T) { endTimer := time.NewTimer(30 * time.Second) defer endTimer.Stop() - err = m.Update(component.Model{Components: []component.Component{healthyComp}}) + m.Update(component.Model{Components: []component.Component{healthyComp}}) + err = <-m.errCh require.NoError(t, err) // nextState tracks the stage of the test. 
We expect the sequence @@ -1395,7 +1416,8 @@ LOOP: if unit.State == client.UnitStateHealthy { // good unit is healthy; now make it bad t.Logf("marking good-input as having a hard-error for config") - err := m.Update(component.Model{Components: []component.Component{unhealthyComp}}) + m.Update(component.Model{Components: []component.Component{unhealthyComp}}) + err := <-m.errCh require.NoError(t, err, "Component model update should succeed") // We next expect to transition to Failed @@ -1414,7 +1436,8 @@ LOOP: if unit.State == client.UnitStateFailed { // Reached the expected state, now send an empty component model // to stop everything. - err := m.Update(component.Model{Components: []component.Component{}}) + m.Update(component.Model{Components: []component.Component{}}) + err := <-m.errCh require.NoError(t, err, "Component model update should succeed") nextState = client.UnitStateStopped } else { @@ -1445,12 +1468,10 @@ LOOP: } func TestManager_FakeInput_NoDeadlock(t *testing.T) { - /* - NOTE: This is a long-running test that spams the runtime managers `Update` function to try and - trigger a deadlock. This test takes 2 minutes to run trying to re-produce issue: + // NOTE: This is a long-running test that spams the runtime managers `Update` function to try and + // trigger a deadlock. This test takes 2 minutes to run trying to re-produce issue: + // https://github.com/elastic/elastic-agent/issues/2691 - https://github.com/elastic/elastic-agent/issues/2691 - */ testPaths(t) ctx, cancel := context.WithCancel(context.Background()) @@ -1523,7 +1544,8 @@ func TestManager_FakeInput_NoDeadlock(t *testing.T) { } i += 1 comp = updatedComp - err := m.Update(component.Model{Components: []component.Component{updatedComp}}) + m.Update(component.Model{Components: []component.Component{updatedComp}}) + err := <-m.errCh if err != nil { updatedErr <- err return @@ -1564,7 +1586,8 @@ LOOP: case <-endTimer.C: // no deadlock after timeout (all good stop the component) updatedCancel() - _ = m.Update(component.Model{Components: []component.Component{}}) + m.Update(component.Model{Components: []component.Component{}}) + <-m.errCh // Don't care about the result of Update, just that it runs break LOOP case err := <-errCh: require.NoError(t, err) @@ -1656,7 +1679,8 @@ func TestManager_FakeInput_Configure(t *testing.T) { "state": int(client.UnitStateDegraded), "message": "Fake Degraded", }) - err := m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err := <-m.errCh if err != nil { subErrCh <- err } @@ -1679,7 +1703,8 @@ func TestManager_FakeInput_Configure(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(30 * time.Second) @@ -1797,7 +1822,8 @@ func TestManager_FakeInput_RemoveUnit(t *testing.T) { } else if unit1.State == client.UnitStateHealthy { // unit1 is healthy lets remove it from the component comp.Units = comp.Units[0:1] - err := m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh if err != nil { subErrCh <- err } @@ -1832,7 +1858,8 @@ func TestManager_FakeInput_RemoveUnit(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: 
[]component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(30 * time.Second) @@ -1956,7 +1983,8 @@ func TestManager_FakeInput_ActionState(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(30 * time.Second) @@ -2091,7 +2119,8 @@ func TestManager_FakeInput_Restarts(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(30 * time.Second) @@ -2208,7 +2237,8 @@ func TestManager_FakeInput_Restarts_ConfigKill(t *testing.T) { "message": "Fake Healthy", "kill": rp[1], }) - err := m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err := <-m.errCh if err != nil { subErrCh <- err } @@ -2233,7 +2263,8 @@ func TestManager_FakeInput_Restarts_ConfigKill(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(1 * time.Minute) @@ -2347,7 +2378,8 @@ func TestManager_FakeInput_KeepsRestarting(t *testing.T) { "message": fmt.Sprintf("Fake Healthy %d", lastStoppedCount), "kill_on_interval": true, }) - err := m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err := <-m.errCh if err != nil { subErrCh <- err } @@ -2375,7 +2407,8 @@ func TestManager_FakeInput_KeepsRestarting(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(1 * time.Minute) @@ -2493,7 +2526,8 @@ func TestManager_FakeInput_RestartsOnMissedCheckins(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(30 * time.Second) @@ -2610,7 +2644,8 @@ func TestManager_FakeInput_InvalidAction(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(30 * time.Second) @@ -2807,7 +2842,8 @@ func TestManager_FakeInput_MultiComponent(t *testing.T) { defer drainErrChan(subErrCh1) defer drainErrChan(subErrCh2) - err = m.Update(component.Model{Components: components}) + m.Update(component.Model{Components: components}) + err = <-m.errCh require.NoError(t, err) count := 0 @@ -2963,7 +2999,8 @@ func TestManager_FakeInput_LogLevel(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: 
[]component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(30 * time.Second) @@ -3177,7 +3214,8 @@ func TestManager_FakeShipper(t *testing.T) { subErrCh <- err } else { // successful; turn it all off - err := m.Update(component.Model{Components: []component.Component{}}) + m.Update(component.Model{Components: []component.Component{}}) + err = <-m.errCh if err != nil { subErrCh <- err } @@ -3206,7 +3244,8 @@ func TestManager_FakeShipper(t *testing.T) { subErrCh <- err } else { // successful; turn it all off - err := m.Update(component.Model{Components: []component.Component{}}) + m.Update(component.Model{Components: []component.Component{}}) + err := <-m.errCh if err != nil { subErrCh <- err } @@ -3241,7 +3280,8 @@ func TestManager_FakeShipper(t *testing.T) { subErrCh <- err } else { // successful; turn it all off - err := m.Update(component.Model{Components: []component.Component{}}) + m.Update(component.Model{Components: []component.Component{}}) + err := <-m.errCh if err != nil { subErrCh <- err } @@ -3266,7 +3306,8 @@ func TestManager_FakeShipper(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: comps}) + m.Update(component.Model{Components: comps}) + err = <-m.errCh require.NoError(t, err) timeout := 2 * time.Minute @@ -3458,11 +3499,8 @@ func TestManager_FakeInput_OutputChange(t *testing.T) { stateProgressionWG.Done() }() - // Wait manager start running, then check if any error happened - assert.Eventually(t, - func() bool { return m.running.Load() }, - 500*time.Millisecond, - 10*time.Millisecond) + err = waitForReady(waitCtx, m) + require.NoError(t, err, "Manager must finish initializing") select { case err := <-errCh: @@ -3471,7 +3509,8 @@ func TestManager_FakeInput_OutputChange(t *testing.T) { } time.Sleep(100 * time.Millisecond) - err = m.Update(component.Model{Components: components}) + m.Update(component.Model{Components: components}) + err = <-m.errCh require.NoError(t, err) updateSleep := 300 * time.Millisecond @@ -3480,7 +3519,8 @@ func TestManager_FakeInput_OutputChange(t *testing.T) { updateSleep = time.Second } time.Sleep(updateSleep) - err = m.Update(component.Model{Components: components2}) + m.Update(component.Model{Components: components2}) + err = <-m.errCh require.NoError(t, err) count := 0 From adf19dbcdfb5540345d79538623efd97061850b5 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Mon, 20 Nov 2023 15:21:22 +0100 Subject: [PATCH 11/21] Sync k8s migtraion to buildkite (#3762) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Sync k8s migtraion to buildkite * Sync k8s migtraion to buildkite * Sync k8s migtraion to buildkite * Sync k8s migtraion to buildkite * Sync k8s migtraion to buildkite * Sync k8s migtraion to buildkite * Sync k8s migtraion to buildkite * Fix sync-k8s.sh * try monorepo-diff step * try monorepo-diff step * try monorepo-diff step * try monorepo-diff step * try monorepo-diff step * Cleanup * Cleanup * Cleanup * Added retry fro install-gh.sh. 
Cleanup * Update .buildkite/scripts/install-gh.sh Co-authored-by: Paolo Chilà --------- Co-authored-by: Paolo Chilà --- .buildkite/pipeline.yml | 20 +++++++++++ .buildkite/scripts/install-gh.sh | 54 ++++++++++++++++++++++++++++ .buildkite/scripts/install-kind.sh | 2 +- .buildkite/scripts/steps/sync-k8s.sh | 22 ++++++++++++ deploy/kubernetes/Makefile | 2 +- 5 files changed, 98 insertions(+), 2 deletions(-) create mode 100644 .buildkite/scripts/install-gh.sh create mode 100644 .buildkite/scripts/steps/sync-k8s.sh diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 866dc91b367..6c47d08bf92 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -227,3 +227,23 @@ steps: plugins: - junit-annotate#v2.4.1: artifacts: build/TEST-go-integration*.xml + + # Triggers a dynamic step: Sync K8s + # Runs only on main and if k8s files are changed + - label: "Trigger k8s sync" + branches: main + plugins: + - monebag/monorepo-diff#v2.5.9: + diff: "git diff --name-only HEAD~1" + watch: + - path: + - deploy/kubernetes/* + - version/docs/version.asciidoc + config: + label: "Sync K8s" + command: ".buildkite/scripts/steps/sync-k8s.sh" + agents: + provider: "gcp" + image: "family/core-ubuntu-2204" + env: + - GH_VERSION=2.4.0 diff --git a/.buildkite/scripts/install-gh.sh b/.buildkite/scripts/install-gh.sh new file mode 100644 index 00000000000..ff52687d02f --- /dev/null +++ b/.buildkite/scripts/install-gh.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash + +# Required environment variables: +# - GH_VERSION - the version of gh to install +set -exuo pipefail + +echo "--- Install gh cli" + +MSG="environment variable missing." +DEFAULT_HOME="/usr/local" +GH_VERSION=${GH_VERSION:-$MSG} +HOME=${HOME:-$DEFAULT_HOME} +GH_CMD="${HOME}/bin/gh" + +if command -v gh +then + set +e + echo "Found GH. Checking version.." + FOUND_GH_VERSION=$(gh --version 2>&1 >/dev/null | awk '{print $3}') + if [ "$FOUND_GH_VERSION" == "$GH_VERSION" ] + then + echo "GH Versions match: $GH_VERSION. No need to install gh. Exiting." + exit 0 + else + echo "GH Version mismatch. Desired version: $GH_VERSION, found version: $FOUND_GH_VERSION. Installing new version." + fi + set -e +fi + +source .buildkite/scripts/common.sh + +OS=$(uname -s| tr '[:upper:]' '[:lower:]') +ARCH=$(uname -m| tr '[:upper:]' '[:lower:]') +if [ "${ARCH}" == "aarch64" ] ; then + ARCH_SUFFIX=arm64 +else + ARCH_SUFFIX=amd64 +fi + +echo "Downloading gh : ${GH_VERSION}..." 
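+# NOTE: the asset name below is pinned to linux_amd64; the ARCH_SUFFIX
+# computed above is not interpolated into the URL or the extracted path.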
+TMP_DIR=$(mktemp -d) +if retry 5 curl -sL "https://github.com/cli/cli/releases/download/v${GH_VERSION}/gh_${GH_VERSION}_linux_amd64.tar.gz" | tar xz -C $TMP_DIR ; then + mkdir -p "${HOME}/bin" + mv "${TMP_DIR}/gh_${GH_VERSION}_linux_amd64/bin/gh" "${GH_CMD}" + rm -rf "${TMP_DIR}" +else + echo "Something bad with the download, deleting the binary" + if [ -e "${GH_CMD}" ] ; then + rm "${GH_CMD}" + fi + exit 1 +fi + + diff --git a/.buildkite/scripts/install-kind.sh b/.buildkite/scripts/install-kind.sh index 171480d7685..ee0e0039719 100644 --- a/.buildkite/scripts/install-kind.sh +++ b/.buildkite/scripts/install-kind.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -set -exuo pipefail +set -euo pipefail echo "--- Install Kind" diff --git a/.buildkite/scripts/steps/sync-k8s.sh b/.buildkite/scripts/steps/sync-k8s.sh new file mode 100644 index 00000000000..ecd64e815a4 --- /dev/null +++ b/.buildkite/scripts/steps/sync-k8s.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -euo pipefail + +export PATH=$HOME/bin:${PATH} + +source .buildkite/scripts/install-gh.sh +source .buildkite/scripts/common.sh + +export GITHUB_TOKEN=$(retry 5 vault kv get -field token kv/ci-shared/platform-ingest/github_token) + +cd deploy/kubernetes + +echo "--- [File Creation] Create-Needed-Manifest" +WITHOUTCONFIG=true make generate-k8s +./creator_k8s_manifest.sh . + +echo "--- [Clone] Kibana-Repository" +make ci-clone-kibana-repository +cp Makefile ./kibana +cd kibana +echo "--- Create Kibana PR" +make ci-create-kubernetes-templates-pull-request \ No newline at end of file diff --git a/deploy/kubernetes/Makefile b/deploy/kubernetes/Makefile index 6247c9461e1..d02fc6eeb38 100644 --- a/deploy/kubernetes/Makefile +++ b/deploy/kubernetes/Makefile @@ -81,7 +81,7 @@ else echo "INFO: create pull request" @gh pr create \ --title "Update kubernetes templates for elastic-agent" \ - --body "Automated by ${BUILD_URL}" \ + --body "Automated by ${BUILDKITE_BUILD_URL}" \ --label automation \ --label release_note:skip \ --base main \ From bdbba7baa9cca337e3de9d1b833e73e67ab5411f Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Mon, 20 Nov 2023 15:22:04 +0100 Subject: [PATCH 12/21] Jenkins decommission (#3771) --- .ci/Jenkinsfile | 316 ------------------ .ci/jobs/defaults.yml | 19 -- .ci/jobs/elastic-agent-mbp.yml | 43 --- .ci/jobs/elastic-agent-schedule-daily.yml | 26 -- .ci/jobs/folders.yml | 5 - .ci/schedule-daily.groovy | 48 --- .ci/scripts/install-go.bat | 57 ---- .ci/scripts/install-go.sh | 43 --- .../updatecli-bump-golang.yml | 1 + .github/workflows/bump-golang.yml | 5 +- 10 files changed, 3 insertions(+), 560 deletions(-) delete mode 100644 .ci/Jenkinsfile delete mode 100644 .ci/jobs/defaults.yml delete mode 100644 .ci/jobs/elastic-agent-mbp.yml delete mode 100644 .ci/jobs/elastic-agent-schedule-daily.yml delete mode 100644 .ci/jobs/folders.yml delete mode 100644 .ci/schedule-daily.groovy delete mode 100755 .ci/scripts/install-go.bat delete mode 100755 .ci/scripts/install-go.sh rename .ci/bump-golang.yml => .github/updatecli-bump-golang.yml (98%) diff --git a/.ci/Jenkinsfile b/.ci/Jenkinsfile deleted file mode 100644 index 23f3ca1e707..00000000000 --- a/.ci/Jenkinsfile +++ /dev/null @@ -1,316 +0,0 @@ -#!/usr/bin/env groovy - -@Library('apm@current') _ - -pipeline { - agent { label 'ubuntu-22 && immutable' } - environment { - REPO = "elastic-agent" - BASE_DIR = "src/github.com/elastic/${env.REPO}" - JOB_GIT_CREDENTIALS = "f6c7695a-671e-4f4f-a331-acdce44ff9ba" - PIPELINE_LOG_LEVEL = 'INFO' - SNAPSHOT = true - JOB_GCS_CREDENTIALS = 'fleet-ci-gcs-plugin' // 
Support stash/unstash v2 - JOB_GCS_BUCKET = 'fleet-ci-temp' // Support stash/unstash v2 - JOB_GCS_EXT_BUCKET = 'fleet-ci-artifacts' // Support uploadPackagesToGoogleBucket - JOB_GCS_EXT_CREDENTIALS = 'fleet-ci-gcs-plugin-file-credentials' // Support uploadPackagesToGoogleBucket - DOCKER_ELASTIC_SECRET = 'secret/observability-team/ci/docker-registry/prod' - DOCKER_REGISTRY = 'docker.elastic.co' - DEVELOPER_MODE=true - } - options { - timeout(time: 3, unit: 'HOURS') - buildDiscarder(logRotator(numToKeepStr: '20', artifactNumToKeepStr: '20', daysToKeepStr: '30')) - timestamps() - ansiColor('xterm') - disableResume() - durabilityHint('PERFORMANCE_OPTIMIZED') - rateLimitBuilds(throttle: [count: 60, durationName: 'hour', userBoost: true]) - quietPeriod(10) - } - triggers { - issueCommentTrigger("(${obltGitHubComments()}|^run (integration|end-to-end) tests|/package)") - } - parameters { - // disabled by default, but required for merge, there are two GH checks: - // opt-in with 'ci:integration' - booleanParam(name: 'integration_tests_ci', defaultValue: false, description: 'Enable Integration tests') - - // disabled by default, but required for merge: - // opt-in with 'ci:end-to-end' tag on PR - booleanParam(name: 'end_to_end_tests_ci', defaultValue: false, description: 'Enable End-to-End tests') - - // disabled by default, but required for merge: - // opt-in with 'ci:extended-windows' tag on PR - booleanParam(name: 'extended_windows_ci', defaultValue: false, description: 'Enable Extended Windows tests') - - // disabled by default, but required for merge: - // opt-in with 'ci:extended-m1' tag on PR - booleanParam(name: 'extended_m1_ci', defaultValue: false, description: 'Enable M1 tests') - } - stages { - stage('Checkout') { - steps { - pipelineManager([ cancelPreviousRunningBuilds: [ when: 'PR' ] ]) - deleteDir() - gitCheckout(basedir: "${BASE_DIR}", githubNotifyFirstTimeContributor: true) - stashV2(name: 'source', bucket: "${JOB_GCS_BUCKET}", credentialsId: "${JOB_GCS_CREDENTIALS}") - dir("${BASE_DIR}"){ - setEnvVar('ONLY_DOCS', isGitRegionMatch(patterns: [ '.*\\.(asciidoc|md)' ], shouldMatchAll: true).toString()) - setEnvVar('PACKAGING_CHANGES', isGitRegionMatch(patterns: [ '(^dev-tools/packaging/.*|.ci/Jenkinsfile|.go-version|Dockerfile)' ], shouldMatchAll: false).toString()) - setEnvVar('K8S_CHANGES', isGitRegionMatch(patterns: [ '(^deploy/kubernetes/.*|^version/docs/version.asciidoc|.ci/Jenkinsfile)' ], shouldMatchAll: false).toString()) - setEnvVar('EXT_WINDOWS_CHANGES', isGitRegionMatch(patterns: [ '.ci/Jenkinsfile' ], shouldMatchAll: false).toString()) - setEnvVar('EXT_M1_CHANGES', isGitRegionMatch(patterns: [ '.ci/Jenkinsfile' ], shouldMatchAll: false).toString()) - // set the GO_VERSION env variable with the go version to be used in withMageEnv - setEnvVar('GO_VERSION', readFile(file: '.go-version')?.trim()) - } - } - } - stage('Check'){ - steps { - withGithubNotify(context: "Check") { - withMageEnv(){ - dir("${BASE_DIR}"){ - setEnvVar('BEAT_VERSION', sh(label: 'Get beat version', script: 'make get-version', returnStdout: true)?.trim()) - log(level: 'INFO', text: "env.BEAT_VERSION=${env.BEAT_VERSION}") - cmd(label: 'check', script: 'make check-ci') - } - } - } - } - } - stage('Test') { - when { - beforeAgent true - expression { return env.ONLY_DOCS == "false" } - } - failFast false - matrix { - agent {label "${PLATFORM}"} - options { skipDefaultCheckout() } - axes { - axis { - name 'PLATFORM' - // Orka workers are not healthy (memory and connectivity issues) - values 'ubuntu-22 && 
immutable', 'aws && aarch64 && gobld/diskSizeGb:200', 'windows-2016 && windows-immutable', 'windows-2022 && windows-immutable' //, 'macos12 && x86_64' - } - } - stages { - stage('Package') { - when { - beforeAgent true - allOf { - anyOf { - expression { return isE2eEnabled() } - expression { return isPackageEnabled() } - not { changeRequest() } - } - // Run packaging only for the linux specific arch - expression { return (PLATFORM.contains('ubuntu') || PLATFORM.contains('aarch64')) } - } - } - environment { - ARCH = "${PLATFORM.contains('aarch64') ? 'arm64' : 'amd64'}" - DEV = true - EXTERNAL = true - } - steps { - withGithubNotify(context: "Package ${PLATFORM}") { - deleteDir() - unstashV2(name: 'source', bucket: "${JOB_GCS_BUCKET}", credentialsId: "${JOB_GCS_CREDENTIALS}") - withMageEnv(){ - dir("${BASE_DIR}"){ - withPackageEnv("${PLATFORM}") { - cmd(label: 'Go package', script: 'mage package ironbank') - uploadPackagesToGoogleBucket( - credentialsId: env.JOB_GCS_EXT_CREDENTIALS, - repo: env.REPO, - bucket: env.JOB_GCS_EXT_BUCKET, - pattern: "build/distributions/**/*") - pushDockerImages( - registry: env.DOCKER_REGISTRY, - secret: env.DOCKER_ELASTIC_SECRET, - snapshot: env.SNAPSHOT, - version: env.BEAT_VERSION, - images: [ - [ source: "beats/elastic-agent", arch: env.ARCH, target: "observability-ci/elastic-agent"], - [ source: "beats/elastic-agent-oss", arch: env.ARCH, target: "observability-ci/elastic-agent-oss"], - [ source: "beats/elastic-agent-ubi9", arch: env.ARCH, target: "observability-ci/elastic-agent-ubi9"], - [ source: "beats/elastic-agent-complete", arch: env.ARCH, target: "observability-ci/elastic-agent-complete"], - [ source: "beats-ci/elastic-agent-cloud", arch: env.ARCH, target: "observability-ci/elastic-agent-cloud"] - ] - ) - } - } - } - } - } - } - } - } - } - stage('Sync K8s') { //This stage opens a PR to kibana Repository in order to sync k8s manifests - when { - // Only on main branch - // Enable if k8s related changes. - allOf { - branch 'main' // Only runs for branch main - expression { return env.K8S_CHANGES == "true" } // If k8s changes - } - } - failFast false - agent {label 'ubuntu-22 && immutable'} - options { skipDefaultCheckout() } - stages { - stage('OpenKibanaPR') { - steps { - withGhEnv(version: '2.4.0') { - deleteDir() - unstashV2(name: 'source', bucket: "${JOB_GCS_BUCKET}", credentialsId: "${JOB_GCS_CREDENTIALS}") - dir("${BASE_DIR}/deploy/kubernetes"){ - sh(label: '[File Creation] Create-Needed-Manifest', script: """ - WITHOUTCONFIG=true make generate-k8s - ./creator_k8s_manifest.sh . """) - sh(label: '[Clone] Kibana-Repository', script: """ - make ci-clone-kibana-repository - cp Makefile ./kibana - cd kibana - make ci-create-kubernetes-templates-pull-request """) - } - } - } - post { - always { - junit(allowEmptyResults: true, keepLongStdio: true, testResults: "${BASE_DIR}/build/TEST-*.xml") - } - } - } - } - } - } - post { - cleanup { - notifyBuildResult(prComment: true, - analyzeFlakey: !isTag(), jobName: getFlakyJobName(withBranch: (isPR() ? env.CHANGE_TARGET : env.BRANCH_NAME)), - githubIssue: false, // Disable creating gh issues for build failures while the E2E tests are stabilized. - githubLabels: 'Team:Elastic-Agent-Control-Plane') - } - } -} - -// As agreed let's report the code coverage for Linux but no ARM only. 
-def isCodeCoverageEnabled() { - return (isUnix() && !isArm()) -} - -def withPackageEnv(platform, Closure body) { - if (isUnix()) { - if (isDarwin()) { - withPackageDarwinEnv() { - body() - } - } else { - if (isArm()) { - withPackageArmEnv() { - body() - } - } else { - withPackageLinuxEnv() { - body() - } - } - } - } else { - error 'withPackageEnv: unsupported platform' - } -} - -def withPackageLinuxEnv(Closure body) { - // Copied from https://github.com/elastic/beats/blob/e6e65aa92fe355c95789691ebf5a3bcecaf5b4ea/.ci/packaging.groovy#L126-L142 - def PLATFORMS = [ '+all', - 'linux/amd64', - 'linux/386', - 'linux/arm64', - // armv7 packaging isn't working, and we don't currently - // need it for release. Do not re-enable it without - // confirming it is fixed, you will break the packaging - // pipeline! - //'linux/armv7', - // The platforms above are disabled temporarly as crossbuild images are - // not available. See: https://github.com/elastic/golang-crossbuild/issues/71 - //'linux/ppc64le', - //'linux/mips64', - //'linux/s390x', - 'windows/amd64', - 'windows/386' - ].join(' ') - withEnv([ - "PLATFORMS=${PLATFORMS}" - ]) { - body() - } -} - -def withPackageArmEnv(Closure body) { - // Copied from https://github.com/elastic/beats/blob/e6e65aa92fe355c95789691ebf5a3bcecaf5b4ea/.ci/packaging.groovy#L126-L142 - def PLATFORMS = [ 'linux/arm64' ].join(' ') - withEnv([ - "PLATFORMS=${PLATFORMS}", - "PACKAGES=docker" - ]) { - body() - } -} - -def withPackageDarwinEnv(Closure body) { - // Copied from https://github.com/elastic/beats/blob/e6e65aa92fe355c95789691ebf5a3bcecaf5b4ea/.ci/packaging.groovy#L126-L142 - def PLATFORMS = [ '+all', - 'darwin/amd64', - 'darwin/arm64', - ].join(' ') - withEnv([ - "PLATFORMS=${PLATFORMS}" - ]) { - body() - } -} - -def runK8s(Map args=[:]) { - withGithubNotify(context: args.context) { - withMageEnv(){ - withKindEnv(args) { - dir("${BASE_DIR}"){ - sh(label: "Deploy to kubernetes",script: "make -C deploy/kubernetes test") - } - } - } - } -} - -/** -* Wrapper to know if the build should enalbe the e2e stage -*/ -def isE2eEnabled() { - return params.end_to_end_tests_ci || env.GITHUB_COMMENT?.contains('e2e tests') || matchesPrLabel(label: 'ci:end-to-end') -} - -/** -* Wrapper to know if the build should enalbe the package stage -*/ -def isPackageEnabled() { - return env.PACKAGING_CHANGES == "true" || env.GITHUB_COMMENT?.contains('package') || matchesPrLabel(label: 'ci:package') -} - -/** -* Wrapper to know if the build should enable the windows extended support -*/ -def isExtendedWindowsEnabled() { - return env.EXT_WINDOWS_CHANGES == "true" || params.extended_windows_ci || env.GITHUB_COMMENT?.contains('extended windows') || matchesPrLabel(label: 'ci:extended-windows') -} - -/** -* Wrapper to know if the build should enable the M1 extended support -*/ -def isExtendedM1Enabled() { - return env.EXT_M1_CHANGES == "true" || params.extended_m1_ci || env.GITHUB_COMMENT?.contains('extended m1') || matchesPrLabel(label: 'ci:extended-m1') -} diff --git a/.ci/jobs/defaults.yml b/.ci/jobs/defaults.yml deleted file mode 100644 index a2bcaae57e1..00000000000 --- a/.ci/jobs/defaults.yml +++ /dev/null @@ -1,19 +0,0 @@ - ---- - -##### GLOBAL METADATA - -- meta: - cluster: fleet-ci - -##### JOB DEFAULTS - -- job: - logrotate: - numToKeep: 20 - node: linux - concurrent: true - publishers: - - email: - recipients: infra-root+build@elastic.co - prune-dead-branches: true diff --git a/.ci/jobs/elastic-agent-mbp.yml b/.ci/jobs/elastic-agent-mbp.yml deleted file mode 100644 index 
8947d15880a..00000000000 --- a/.ci/jobs/elastic-agent-mbp.yml +++ /dev/null @@ -1,43 +0,0 @@ ---- -- job: - name: "elastic-agent/elastic-agent-mbp" - display-name: elastic-agent - description: "Elastic agent" - project-type: multibranch - script-path: .ci/Jenkinsfile - scm: - - github: - branch-discovery: no-pr - discover-pr-forks-strategy: merge-current - discover-pr-forks-trust: permission - discover-pr-origin: merge-current - discover-tags: true - head-filter-regex: '(main|7\.17|8\.\d+|PR-.*|v\d+\.\d+\.\d+)' - notification-context: 'fleet-ci' - repo: elastic-agent - repo-owner: elastic - credentials-id: 2a9602aa-ab9f-4e52-baf3-b71ca88469c7-UserAndToken - ssh-checkout: - credentials: f6c7695a-671e-4f4f-a331-acdce44ff9ba - build-strategies: - - tags: - ignore-tags-older-than: -1 - ignore-tags-newer-than: -1 - - regular-branches: true - - change-request: - ignore-target-only-changes: true - clean: - after: true - before: true - prune: true - shallow-clone: true - depth: 4 - do-not-fetch-tags: true - submodule: - disable: false - recursive: true - parent-credentials: true - timeout: 100 - timeout: '15' - use-author: true - wipe-workspace: true diff --git a/.ci/jobs/elastic-agent-schedule-daily.yml b/.ci/jobs/elastic-agent-schedule-daily.yml deleted file mode 100644 index 2fd728a9e8c..00000000000 --- a/.ci/jobs/elastic-agent-schedule-daily.yml +++ /dev/null @@ -1,26 +0,0 @@ ---- -- job: - name: "elastic-agent/elastic-agent-schedule-daily" - display-name: Jobs scheduled daily (weekdays) - description: Jobs scheduled daily (weekdays) - view: Beats - project-type: pipeline - parameters: - - string: - name: branch_specifier - default: main - description: the Git branch specifier to build - pipeline-scm: - script-path: .ci/schedule-daily.groovy - scm: - - git: - url: git@github.com:elastic/elastic-agent.git - refspec: +refs/heads/*:refs/remotes/origin/* - wipe-workspace: 'True' - name: origin - shallow-clone: true - credentials-id: f6c7695a-671e-4f4f-a331-acdce44ff9ba - branches: - - $branch_specifier - triggers: - - timed: 'H H(2-3) * * 1-5' diff --git a/.ci/jobs/folders.yml b/.ci/jobs/folders.yml deleted file mode 100644 index cf309e259a3..00000000000 --- a/.ci/jobs/folders.yml +++ /dev/null @@ -1,5 +0,0 @@ ---- -- job: - name: "elastic-agent" - description: "Elastic Agent" - project-type: folder diff --git a/.ci/schedule-daily.groovy b/.ci/schedule-daily.groovy deleted file mode 100644 index 70ad931ecda..00000000000 --- a/.ci/schedule-daily.groovy +++ /dev/null @@ -1,48 +0,0 @@ -@Library('apm@current') _ - -pipeline { - agent none - environment { - NOTIFY_TO = credentials('notify-to') - PIPELINE_LOG_LEVEL = 'INFO' - } - options { - timeout(time: 1, unit: 'HOURS') - buildDiscarder(logRotator(numToKeepStr: '20', artifactNumToKeepStr: '20')) - timestamps() - ansiColor('xterm') - disableResume() - durabilityHint('PERFORMANCE_OPTIMIZED') - } - triggers { - cron('H H(2-3) * * 1-5') - } - stages { - stage('Nighly beats builds') { - steps { - runBuilds(quietPeriodFactor: 2000, branches: ['main', '8.', '8.']) - } - } - } - post { - cleanup { - notifyBuildResult(prComment: false) - } - } -} - -def runBuilds(Map args = [:]) { - def branches = getBranchesFromAliases(aliases: args.branches) - def quietPeriod = 0 - branches.each { branch -> - build(quietPeriod: quietPeriod, - job: "elastic-agent/elastic-agent-mbp/${branch}", - parameters: [ - booleanParam(name: 'integration_tests_ci', value: true), - // Disable running e2e until we fix the 2e2 testing suite - booleanParam(name: 'end_to_end_tests_ci', value: 
false) - ], - wait: false, propagate: false) - quietPeriod += args.quietPeriodFactor - } -} diff --git a/.ci/scripts/install-go.bat b/.ci/scripts/install-go.bat deleted file mode 100755 index 29448bd4f63..00000000000 --- a/.ci/scripts/install-go.bat +++ /dev/null @@ -1,57 +0,0 @@ -set GOPATH=%WORKSPACE% -set MAGEFILE_CACHE=%WORKSPACE%\.magefile - -set PATH=%WORKSPACE%\bin;C:\ProgramData\chocolatey\bin;%PATH% - -curl --version >nul 2>&1 && ( - echo found curl -) || ( - choco install curl -y --no-progress --skipdownloadcache -) - -mkdir %WORKSPACE%\bin - -IF EXIST "%PROGRAMFILES(X86)%" ( - REM Force the gvm installation. - SET GVM_BIN=gvm.exe - curl -L -o %WORKSPACE%\bin\gvm.exe https://github.com/andrewkroh/gvm/releases/download/v0.3.0/gvm-windows-amd64.exe - IF ERRORLEVEL 1 ( - REM gvm installation has failed. - del bin\gvm.exe /s /f /q - exit /b 1 - ) -) ELSE ( - REM Windows 7 workers got a broken gvm installation. - curl -L -o %WORKSPACE%\bin\gvm.exe https://github.com/andrewkroh/gvm/releases/download/v0.3.0/gvm-windows-386.exe - IF ERRORLEVEL 1 ( - REM gvm installation has failed. - del bin\gvm.exe /s /f /q - exit /b 1 - ) -) - -SET GVM_BIN=gvm.exe -WHERE /q %GVM_BIN% -%GVM_BIN% version - -REM Install the given go version -%GVM_BIN% --debug install %GO_VERSION% - -REM Configure the given go version -FOR /f "tokens=*" %%i IN ('"%GVM_BIN%" use %GO_VERSION% --format=batch') DO %%i - -go env -IF ERRORLEVEL 1 ( - REM go is not configured correctly. - rmdir %WORKSPACE%\.gvm /s /q - exit /b 1 -) - -where mage -mage -version -IF ERRORLEVEL 1 ( - go get github.com/magefile/mage - IF ERRORLEVEL 1 ( - exit /b 1 - ) -) diff --git a/.ci/scripts/install-go.sh b/.ci/scripts/install-go.sh deleted file mode 100755 index 31566c08726..00000000000 --- a/.ci/scripts/install-go.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -MSG="environment variable missing" -GO_VERSION=${GO_VERSION:?$MSG} -PROPERTIES_FILE=${PROPERTIES_FILE:-"go_env.properties"} -HOME=${HOME:?$MSG} -OS=$(uname -s| tr '[:upper:]' '[:lower:]') -ARCH=$(uname -m| tr '[:upper:]' '[:lower:]') -GVM_CMD="${HOME}/bin/gvm" - -if command -v go -then - set +e - echo "Found Go. Checking version.." - FOUND_GO_VERSION=$(go version|awk '{print $3}'|sed s/go//) - if [ "$FOUND_GO_VERSION" == "$GO_VERSION" ] - then - echo "Versions match. No need to install Go. Exiting." 
-        exit 0
-    fi
-    set -e
-fi
-
-if [ "${ARCH}" == "aarch64" ] ; then
-    GVM_ARCH_SUFFIX=arm64
-elif [ "${ARCH}" == "x86_64" ] ; then
-    GVM_ARCH_SUFFIX=amd64
-elif [ "${ARCH}" == "i686" ] ; then
-    GVM_ARCH_SUFFIX=386
-else
-    GVM_ARCH_SUFFIX=arm
-fi
-
-echo "UNMET DEP: Installing Go"
-mkdir -p "${HOME}/bin"
-
-curl -sSLo "${GVM_CMD}" "https://github.com/andrewkroh/gvm/releases/download/v0.3.0/gvm-${OS}-${GVM_ARCH_SUFFIX}"
-chmod +x "${GVM_CMD}"
-
-${GVM_CMD} "${GO_VERSION}" |cut -d ' ' -f 2|tr -d '\"' > ${PROPERTIES_FILE}
-
-eval "$("${GVM_CMD}" "${GO_VERSION}")"
diff --git a/.ci/bump-golang.yml b/.github/updatecli-bump-golang.yml
similarity index 98%
rename from .ci/bump-golang.yml
rename to .github/updatecli-bump-golang.yml
index 470c6f4c8d5..df5d8104048 100644
--- a/.ci/bump-golang.yml
+++ b/.github/updatecli-bump-golang.yml
@@ -1,3 +1,4 @@
+# update-cli configuration for automated go updates
 ---
 name: Bump golang-version to latest version
 
diff --git a/.github/workflows/bump-golang.yml b/.github/workflows/bump-golang.yml
index 30b8fbd89b3..34d488b9d72 100644
--- a/.github/workflows/bump-golang.yml
+++ b/.github/workflows/bump-golang.yml
@@ -4,7 +4,7 @@ name: bump-golang
 on:
   workflow_dispatch:
   schedule:
-    - cron: '0 20 * * 6'
+    - cron: "0 20 * * 6"
 
 permissions:
   contents: read
@@ -13,7 +13,6 @@ jobs:
   bump:
     runs-on: ubuntu-latest
     steps:
-
       - uses: actions/checkout@v3
 
       - uses: elastic/apm-pipeline-library/.github/actions/updatecli@current
@@ -21,4 +20,4 @@
           vaultUrl: ${{ secrets.VAULT_ADDR }}
           vaultRoleId: ${{ secrets.VAULT_ROLE_ID }}
           vaultSecretId: ${{ secrets.VAULT_SECRET_ID }}
-          pipeline: ./.ci/bump-golang.yml
+          pipeline: ./.github/updatecli-bump-golang.yml

From 08c51501035a2aaae1d52500db1f40f671d40815 Mon Sep 17 00:00:00 2001
From: Pavel Zorin
Date: Mon, 20 Nov 2023 15:25:55 +0100
Subject: [PATCH 13/21] Migrate check-ci step to buildkite (#3759)

* Migrate check-ci step to buildkite

* Remove check-ci stage from Jenkinsfile

* Migrate check-ci step to buildkite

* Migrate check-ci step to buildkite

* chmod check-ci.sh

* Removed package step
---
 .buildkite/pipeline.yml              |  9 +++++++++
 .buildkite/scripts/steps/check-ci.sh | 12 ++++++++++++
 2 files changed, 21 insertions(+)
 create mode 100755 .buildkite/scripts/steps/check-ci.sh

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 6c47d08bf92..f251bcaf81c 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -4,6 +4,15 @@ env:
   VAULT_PATH: "kv/ci-shared/observability-ingest/cloud/gcp"
   DOCKER_REGISTRY: "docker.elastic.co"
 steps:
+  - label: "check-ci"
+    key: "check-ci"
+    command: ".buildkite/scripts/steps/check-ci.sh"
+    agents:
+      provider: "gcp"
+      image: "family/core-ubuntu-2204"
+    retry:
+      manual:
+        allowed: true
   - group: "Unit tests"
     key: "unit-tests"
     steps:
diff --git a/.buildkite/scripts/steps/check-ci.sh b/.buildkite/scripts/steps/check-ci.sh
new file mode 100755
index 00000000000..46923d1b7ef
--- /dev/null
+++ b/.buildkite/scripts/steps/check-ci.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+source .buildkite/scripts/common.sh
+
+echo "--- Check CI"
+go version
+mage --version
+BEAT_VERSION=$(make get-version)
+echo "Beat version: $BEAT_VERSION"
+make check-ci
\ No newline at end of file

From d5f34dfd21a969f7422efb5e0f085d96d89d4059 Mon Sep 17 00:00:00 2001
From: Alex K <8418476+fearful-symmetry@users.noreply.github.com>
Date: Mon, 20 Nov 2023 08:00:44 -0800
Subject: [PATCH 14/21] Add serverless beats tests (#3658)

* cleaning up

* final bit of cleanup

* fix magefile, cleanup docs

* clean up 
errors, make linter happy * fix headers * fix fields in runner config * add dashboard checks * clean up, refactor * clean up * tinker with env vars * fix defaults in fixture * check binary name in test setup * allow ilm override in tests * fix filebeat tests, add cleanup * tinker with dashboards * fix ilm tests * use API keys for auth * add additional integration tests * remove beats-specific code * hack in serverless tests * tinker with tests * change env var naming * actually use correct provisioner name * tinker with buildkite again * fix things after refactor * fix buildkite * fix my bash scripts * my bash is a tad rusty * tinker with script hooks * not sure what ci role I broke * clean up es handlers * deal with recent refactor * fix my broken refactor * change url, see what happens * tinker with tests more * swap pipelines, see what happens * break apart beat runners * create test * add more tests * override tests in progress * finish overwrite tests * still adding tests * cleanup * fix env var names * fix filebeat setup, test names * fixing up filebeat * use templates, fix ES query * tinker with buildkite * still tinkering * still trying to get it to work * still trying to get it to work * add script, see what happens * tinkering with script * tinkering with script * trying to diagnose buildkite * still fighting with buildkite * still trying to get it to work * set workspace * change workspace * change workspace, again * change workspace, again * trying to debug buildkite * add debug statements * run mage twice * I swear I'll figure this out * potential script install issue? * use different condition for if block * fix potential wrong flag in setup scripts * change -p flag * fix workspace path, add tests,remove debug code * use proper env vars * fix mage call, directories * set working directories * now trying to get auth working * change name to make setup happy * disable cleanup * re-enable cleanup * run one test at a time * refactor, run filebeat * move function def * fix if blocks * fix test run, add support for auditbeat and packetbeat * add packetbeat support, run auditbeat tests * add serverless buildkite logic * add more documentation * fix up tests --- .buildkite/pipeline.yml | 12 + .buildkite/scripts/common.sh | 4 +- .buildkite/scripts/steps/beats_tests.sh | 70 +++ magefile.go | 27 + pkg/testing/tools/estools/elasticsearch.go | 2 +- testing/integration/beats_serverless_test.go | 628 +++++++++++++++++++ 6 files changed, 740 insertions(+), 3 deletions(-) create mode 100755 .buildkite/scripts/steps/beats_tests.sh create mode 100644 testing/integration/beats_serverless_test.go diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index f251bcaf81c..e7484aa3fe6 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -228,6 +228,18 @@ steps: provider: "gcp" machineType: "n1-standard-8" + - label: "Serverless Beats Tests" + key: "serverless-beats-integration-tests" + command: ".buildkite/scripts/steps/beats_tests.sh" + if: "build.env('CRON') == 'yes'" + agents: + provider: "gcp" + machineType: "n1-standard-8" + retry: + manual: + allowed: true + + - wait: ~ continue_on_failure: true - label: "Processing test results" diff --git a/.buildkite/scripts/common.sh b/.buildkite/scripts/common.sh index 16d32fb77ab..86c370ae360 100644 --- a/.buildkite/scripts/common.sh +++ b/.buildkite/scripts/common.sh @@ -53,7 +53,7 @@ getOSOptions() { # Wrapper function for executing mage mage() { go version - if ! [ -x "$(type -p mage | sed 's/mage is //g')" ]; + if ! 
[ -x "$(type -P mage | sed 's/mage is //g')" ]; then echo "installing mage ${SETUP_MAGE_VERSION}" make mage @@ -68,7 +68,7 @@ mage() { # Wrapper function for executing go go(){ # Search for the go in the Path - if ! [ -x "$(type -p go | sed 's/go is //g')" ]; + if ! [ -x "$(type -P go | sed 's/go is //g')" ]; then getOSOptions echo "installing golang "${GO_VERSION}" for "${AGENT_OS_NAME}/${AGENT_OS_ARCH}" " diff --git a/.buildkite/scripts/steps/beats_tests.sh b/.buildkite/scripts/steps/beats_tests.sh new file mode 100755 index 00000000000..b62bc947b6e --- /dev/null +++ b/.buildkite/scripts/steps/beats_tests.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +set -euo pipefail + +#========================= +# NOTE: This entire script is a temporary hack until we have buildkite set up on the beats repo. +# until then, we need some kind of serverless integration tests, hence this script, which just clones the beats repo, +# and runs the serverless integration suite against different beats +# After buildkite is set up on beats, this file/PR should be reverted. +#========================== + +source .buildkite/scripts/common.sh +STACK_PROVISIONER="${1:-"serverless"}" + +run_test_for_beat(){ + local beat_name=$1 + + #build + export WORKSPACE="build/beats/x-pack/${beat_name}" + SNAPSHOT=true PLATFORMS=linux/amd64 PACKAGES=tar.gz,zip mage package + + #run + export AGENT_BUILD_DIR="build/beats/x-pack/${beat_name}/build/distributions" + export WORKSPACE=$(pwd) + + set +e + TEST_INTEG_CLEAN_ON_EXIT=true TEST_PLATFORMS="linux/amd64" STACK_PROVISIONER="$STACK_PROVISIONER" SNAPSHOT=true mage integration:testBeatServerless $beat_name + TESTS_EXIT_STATUS=$? + set -e + + return $TESTS_EXIT_STATUS +} +#run mage before setup, since this will install go and mage +#the setup scripts will do a few things that assume we're running out of elastic-agent and will break things for beats, so run before we do actual setup +mage -l + +mkdir -p build +cd build + +git clone --filter=tree:0 git@github.com:elastic/beats.git +cd .. + +# export WORKSPACE=beats/x-pack/metricbeat + +# SNAPSHOT=true PLATFORMS=linux/amd64,windows/amd64 PACKAGES=tar.gz,zip mage package + + +# cd .. + +# export AGENT_BUILD_DIR=build/beats/x-pack/metricbeat/build/distributions +# export WORKSPACE=$(pwd) + +# set +e +# TEST_INTEG_CLEAN_ON_EXIT=true TEST_PLATFORMS="linux/amd64" STACK_PROVISIONER="$STACK_PROVISIONER" SNAPSHOT=true mage integration:testBeatServerless metricbeat +# TESTS_EXIT_STATUS=$? +# set -e + +# exit $TESTS_EXIT_STATUS + +echo "testing metricbeat..." +run_test_for_beat metricbeat + + + +echo "testing filebeat..." +run_test_for_beat filebeat + + + +echo "testing auditbeat..." 
+run_test_for_beat auditbeat
diff --git a/magefile.go b/magefile.go
index 6f29cd68991..fe320754b91 100644
--- a/magefile.go
+++ b/magefile.go
@@ -1567,6 +1567,33 @@ func (Integration) PrepareOnRemote() {
 	mg.Deps(mage.InstallGoTestTools)
 }
 
+// Run beat serverless tests
+func (Integration) TestBeatServerless(ctx context.Context, beatname string) error {
+	beatBuildPath := filepath.Join("..", "beats", "x-pack", beatname, "build", "distributions")
+	if os.Getenv("AGENT_BUILD_DIR") == "" {
+		err := os.Setenv("AGENT_BUILD_DIR", beatBuildPath)
+		if err != nil {
+			return fmt.Errorf("error setting build dir: %w", err)
+		}
+	}
+
+	// a bit of bypass logic; run as serverless by default
+	if os.Getenv("STACK_PROVISIONER") == "" {
+		err := os.Setenv("STACK_PROVISIONER", "serverless")
+		if err != nil {
+			return fmt.Errorf("error setting serverless stack var: %w", err)
+		}
+	} else if os.Getenv("STACK_PROVISIONER") == "stateful" {
+		fmt.Printf(">>> Warning: running TestBeatServerless as stateful\n")
+	}
+
+	err := os.Setenv("TEST_BINARY_NAME", beatname)
+	if err != nil {
+		return fmt.Errorf("error setting binary name: %w", err)
+	}
+	return integRunner(ctx, false, "TestBeatsServerless")
+}
+
 // TestOnRemote shouldn't be called locally (called on remote host to perform testing)
 func (Integration) TestOnRemote(ctx context.Context) error {
 	mg.Deps(Build.TestBinaries)
diff --git a/pkg/testing/tools/estools/elasticsearch.go b/pkg/testing/tools/estools/elasticsearch.go
index 304e917d7ee..1c85ed788f3 100644
--- a/pkg/testing/tools/estools/elasticsearch.go
+++ b/pkg/testing/tools/estools/elasticsearch.go
@@ -212,7 +212,7 @@ func GetLatestDocumentMatchingQuery(ctx context.Context, client elastictransport
 	queryRaw := map[string]interface{}{
 		"query": query,
 		"sort": map[string]interface{}{
-			"timestamp": "desc",
+			"@timestamp": "desc",
 		},
 		"size": 1,
 	}
diff --git a/testing/integration/beats_serverless_test.go b/testing/integration/beats_serverless_test.go
new file mode 100644
index 00000000000..57123e9142e
--- /dev/null
+++ b/testing/integration/beats_serverless_test.go
@@ -0,0 +1,628 @@
+// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+// or more contributor license agreements. Licensed under the Elastic License;
+// you may not use this file except in compliance with the Elastic License. 
+
+//go:build integration
+
+package integration
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/url"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+	"text/template"
+	"time"
+
+	"github.com/gofrs/uuid"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"github.com/stretchr/testify/suite"
+
+	"github.com/elastic/elastic-agent-libs/mapstr"
+	atesting "github.com/elastic/elastic-agent/pkg/testing"
+	"github.com/elastic/elastic-agent/pkg/testing/define"
+	"github.com/elastic/elastic-agent/pkg/testing/tools"
+	"github.com/elastic/elastic-agent/pkg/testing/tools/estools"
+)
+
+type BeatRunner struct {
+	suite.Suite
+	requirementsInfo *define.Info
+	agentFixture *atesting.Fixture
+
+	// connection info
+	ESHost string
+	user string
+	pass string
+	kibHost string
+
+	testUuid string
+	testbeatName string
+
+	skipCleanup bool
+}
+
+func TestBeatsServerless(t *testing.T) {
+	info := define.Require(t, define.Requirements{
+		OS: []define.OS{
+			{Type: define.Linux},
+		},
+		Stack: &define.Stack{},
+		Local: false,
+		Sudo: true,
+	})
+
+	suite.Run(t, &BeatRunner{requirementsInfo: info})
+}
+
+func (runner *BeatRunner) SetupSuite() {
+	runner.skipCleanup = false
+
+	runner.testbeatName = os.Getenv("TEST_BINARY_NAME")
+	if runner.testbeatName == "" {
+		runner.T().Fatalf("TEST_BINARY_NAME must be set")
+	}
+	if runner.testbeatName == "elastic-agent" {
+		runner.T().Skipf("tests must be run against a beat, not elastic-agent")
+	}
+
+	if runner.testbeatName != "filebeat" && runner.testbeatName != "metricbeat" && runner.testbeatName != "auditbeat" && runner.testbeatName != "packetbeat" {
+		runner.T().Skip("test only supports metricbeat, filebeat, auditbeat, or packetbeat")
+	}
+	runner.T().Logf("running serverless tests with %s", runner.testbeatName)
+
+	agentFixture, err := define.NewFixtureWithBinary(runner.T(), define.Version(), runner.testbeatName, "/home/ubuntu", atesting.WithRunLength(time.Minute*3), atesting.WithAdditionalArgs([]string{"-E", "output.elasticsearch.allow_older_versions=true"}))
+	runner.agentFixture = agentFixture
+	require.NoError(runner.T(), err)
+
+	// the require.* code will fail without these, so assume the values are non-nil
+	runner.ESHost = os.Getenv("ELASTICSEARCH_HOST")
+	runner.user = os.Getenv("ELASTICSEARCH_USERNAME")
+	runner.pass = os.Getenv("ELASTICSEARCH_PASSWORD")
+	runner.kibHost = os.Getenv("KIBANA_HOST")
+
+	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
+	defer cancel()
+
+	beatOutConfig := `
+output.elasticsearch:
+  hosts: ["{{.es_host}}"]
+  api_key: "{{.key_user}}:{{.key_pass}}"
+setup.kibana:
+  host: {{.kb_host}}
+processors:
+  - add_fields:
+      target: host
+      fields:
+        test-id: {{.test_id}}
+{{.beat_cfg}}
+`
+
+	mbCfg := `
+metricbeat.config.modules:
+  path: ${path.config}/modules.d/*.yml
+`
+
+	fbCfg := `
+filebeat.modules:
+  - module: system
+    syslog:
+      enabled: true
+    auth:
+      enabled: true
+filebeat.config.modules:
+  - modules: system
+    syslog:
+      enabled: true
+    auth:
+      enabled: true
+`
+	auditbeatCfg := `
+auditbeat.modules:
+
+- module: file_integrity
+  paths:
+  - /bin
+  - /usr/bin
+  - /sbin
+  - /usr/sbin
+  - /etc
+`
+
+	packetbeatCfg := `
+`
+
+	tmpl, err := template.New("config").Parse(beatOutConfig)
+	require.NoError(runner.T(), err)
+
+	apiResp, err := estools.CreateAPIKey(ctx, runner.requirementsInfo.ESClient, estools.APIKeyRequest{Name: "test-api-key", Expiration: "1d"})
+	require.NoError(runner.T(), err)
+
+	// beats likes to add standard ports to URLs that don't have them, and ESS will sometimes return 
a URL without a port, assuming :443 + // so try to fix that here + fixedKibanaHost := runner.kibHost + parsedKibana, err := url.Parse(runner.kibHost) + require.NoError(runner.T(), err) + if parsedKibana.Port() == "" { + fixedKibanaHost = fmt.Sprintf("%s:443", fixedKibanaHost) + } + + fixedESHost := runner.ESHost + parsedES, err := url.Parse(runner.ESHost) + require.NoError(runner.T(), err) + if parsedES.Port() == "" { + fixedESHost = fmt.Sprintf("%s:443", fixedESHost) + } + + runner.T().Logf("configuring beats with %s / %s", fixedESHost, fixedKibanaHost) + + testUuid, err := uuid.NewV4() + require.NoError(runner.T(), err) + runner.testUuid = testUuid.String() + + additionalCfg := mbCfg + if runner.testbeatName == "filebeat" { + additionalCfg = fbCfg + } else if runner.testbeatName == "auditbeat" { + additionalCfg = auditbeatCfg + } else if runner.testbeatName == "packetbeat" { + additionalCfg = packetbeatCfg + } + + tmpl_map := map[string]string{"es_host": fixedESHost, "key_user": apiResp.Id, "key_pass": apiResp.APIKey, "kb_host": fixedKibanaHost, "test_id": testUuid.String(), "beat_cfg": additionalCfg} + parsedCfg := bytes.Buffer{} + err = tmpl.Execute(&parsedCfg, tmpl_map) + require.NoError(runner.T(), err) + + err = runner.agentFixture.WriteFileToWorkDir(ctx, parsedCfg.String(), fmt.Sprintf("%s.yml", runner.testbeatName)) + require.NoError(runner.T(), err) +} + +// run the beat with default metricsets, ensure no errors in logs + data is ingested +func (runner *BeatRunner) TestRunAndCheckData() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute*4) + defer cancel() + + // in case there's already a running template, delete it, forcing the beat to re-install + runner.CleanupTemplates(ctx) + + err := runner.agentFixture.RunBeat(ctx) + require.NoError(runner.T(), err) + + docs, err := estools.GetLatestDocumentMatchingQuery(ctx, runner.requirementsInfo.ESClient, map[string]interface{}{ + "match": map[string]interface{}{ + "host.test-id": runner.testUuid, + }, + }, fmt.Sprintf("*%s*", runner.testbeatName)) + require.NoError(runner.T(), err) + require.NotEmpty(runner.T(), docs.Hits.Hits) +} + +// tests the [beat] setup --dashboards command +func (runner *BeatRunner) TestSetupDashboards() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute*3) //dashboards seem to take a while + defer cancel() + + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", runner.agentFixture.WorkDir(), "setup", "--dashboards"}) + assert.NoError(runner.T(), err) + runner.T().Logf("got response from dashboard setup: %s", string(resp)) + require.True(runner.T(), strings.Contains(string(resp), "Loaded dashboards")) + + dashList, err := tools.GetDashboards(ctx, runner.requirementsInfo.KibanaClient) + require.NoError(runner.T(), err) + + // interesting hack in cases where we don't have a clean environment + // check to see if any of the dashboards were created recently + found := false + for _, dash := range dashList { + if time.Since(dash.UpdatedAt) < time.Minute*5 { + found = true + break + } + } + require.True(runner.T(), found, fmt.Sprintf("could not find dashboard newer than 5 minutes, out of %d dashboards", len(dashList))) + + runner.Run("export dashboards", runner.SubtestExportDashboards) + // cleanup + if !runner.skipCleanup { + for _, dash := range dashList { + err = tools.DeleteDashboard(ctx, runner.requirementsInfo.KibanaClient, dash.ID) + if err != nil { + runner.T().Logf("WARNING: could not delete dashboards after test: %s", err) + break + } + } + } +} + 
+// tests the [beat] export dashboard command +func (runner *BeatRunner) SubtestExportDashboards() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute*2) + defer cancel() + outDir := runner.T().TempDir() + + dashlist, err := tools.GetDashboards(ctx, runner.requirementsInfo.KibanaClient) + require.NoError(runner.T(), err) + require.NotEmpty(runner.T(), dashlist) + + exportOut, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "export", + "dashboard", "--folder", outDir, "--id", dashlist[0].ID}) + + runner.T().Logf("got output: %s", exportOut) + assert.NoError(runner.T(), err) + + inFolder, err := os.ReadDir(filepath.Join(outDir, "/_meta/kibana/8/dashboard")) + require.NoError(runner.T(), err) + runner.T().Logf("got log contents: %#v", inFolder) + require.NotEmpty(runner.T(), inFolder) +} + +// NOTE for the below tests: the testing framework doesn't guarantee a new stack instance each time, +// which means we might be running against a stack where a previous test has already done setup. +// perhaps CI should run `mage integration:clean` first? + +// tests the [beat] setup --pipelines command +func (runner *BeatRunner) TestSetupPipelines() { + if runner.testbeatName != "filebeat" { + runner.T().Skip("pipelines only available on filebeat") + } + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + + defer func() { + // cleanup + if !runner.skipCleanup { + err := estools.DeletePipelines(ctx, runner.requirementsInfo.ESClient, "*filebeat*") + if err != nil { + runner.T().Logf("WARNING: could not clean up pipelines: %s", err) + } + } + + }() + + // need to actually enable something that has pipelines + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", runner.agentFixture.WorkDir(), + "setup", "--pipelines", "--modules", "apache", "-M", "apache.error.enabled=true", "-M", "apache.access.enabled=true"}) + assert.NoError(runner.T(), err) + + runner.T().Logf("got response from pipeline setup: %s", string(resp)) + + pipelines, err := estools.GetPipelines(ctx, runner.requirementsInfo.ESClient, "*filebeat*") + require.NoError(runner.T(), err) + require.NotEmpty(runner.T(), pipelines) + +} + +// test beat setup --index-management with ILM disabled +func (runner *BeatRunner) TestIndexManagementNoILM() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + defer func() { + runner.CleanupTemplates(ctx) + }() + + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "setup", + "--index-management", + "--E=setup.ilm.enabled=false"}) + runner.T().Logf("got response from management setup: %s", string(resp)) + assert.NoError(runner.T(), err) + // we should not print a warning if we've explicitly disabled ILM + assert.NotContains(runner.T(), string(resp), "not supported") + + tmpls, err := estools.GetIndexTemplatesForPattern(ctx, runner.requirementsInfo.ESClient, fmt.Sprintf("*%s*", runner.testbeatName)) + require.NoError(runner.T(), err) + for _, tmpl := range tmpls.IndexTemplates { + runner.T().Logf("got template: %s", tmpl.Name) + } + require.NotEmpty(runner.T(), tmpls.IndexTemplates) + + runner.Run("export templates", runner.SubtestExportTemplates) + runner.Run("export index patterns", runner.SubtestExportIndexPatterns) + +} + +// tests setup with all default settings +func (runner *BeatRunner) TestWithAllDefaults() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + 
defer func() { + runner.CleanupTemplates(ctx) + }() + + // pre-delete in case something else missed cleanup + runner.CleanupTemplates(ctx) + + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "setup", + "--index-management"}) + runner.T().Logf("got response from management setup: %s", string(resp)) + require.NoError(runner.T(), err) + + streams, err := estools.GetDataStreamsForPattern(ctx, runner.requirementsInfo.ESClient, fmt.Sprintf("%s*", runner.testbeatName)) + require.NoError(runner.T(), err) + + require.NotEmpty(runner.T(), streams.DataStreams) + +} + +// test the setup process with mismatching template and DSL names +func (runner *BeatRunner) TestCustomBadNames() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + + defer func() { + runner.CleanupTemplates(ctx) + }() + + resp, err := runner.agentFixture.Exec(ctx, []string{"-e", "--path.home", + runner.agentFixture.WorkDir(), + "setup", + "--index-management", + "--E=setup.dsl.enabled=true", "--E=setup.dsl.data_stream_pattern='custom-bad-name'", "--E=setup.template.name='custom-name'", "--E=setup.template.pattern='custom-name'"}) + runner.T().Logf("got response from management setup: %s", string(resp)) + require.NoError(runner.T(), err) + + require.True(runner.T(), strings.Contains(string(resp), "Additional updates & overwrites to this config will not work.")) + +} + +func (runner *BeatRunner) TestOverwriteWithCustomName() { + //an updated policy that has a different value than the default of 7d + updatedPolicy := mapstr.M{ + "data_retention": "1d", + } + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + defer func() { + runner.CleanupTemplates(ctx) + }() + + lctemp := runner.T().TempDir() + raw, err := json.MarshalIndent(updatedPolicy, "", " ") + require.NoError(runner.T(), err) + + lifecyclePath := filepath.Join(lctemp, "dsl_policy.json") + + err = os.WriteFile(lifecyclePath, raw, 0o744) + require.NoError(runner.T(), err) + + runner.CleanupTemplates(ctx) + + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "setup", + "--index-management", + "--E=setup.dsl.enabled=true", "--E=setup.dsl.data_stream_pattern='custom-name'", "--E=setup.template.name='custom-name'", "--E=setup.template.pattern='custom-name'"}) + runner.T().Logf("got response from management setup: %s", string(resp)) + require.NoError(runner.T(), err) + + runner.CheckDSLPolicy(ctx, "*custom-name*", "7d") + + resp, err = runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "setup", + "--index-management", + "--E=setup.dsl.enabled=true", "--E=setup.dsl.overwrite=true", "--E=setup.dsl.data_stream_pattern='custom-name'", + "--E=setup.template.name='custom-name'", "--E=setup.template.pattern='custom-name'", fmt.Sprintf("--E=setup.dsl.policy_file=%s", lifecyclePath)}) + runner.T().Logf("got response from management setup: %s", string(resp)) + require.NoError(runner.T(), err) + + runner.CheckDSLPolicy(ctx, "*custom-name*", "1d") + +} + +// TestWithCustomLifecyclePolicy uploads a custom DSL policy +func (runner *BeatRunner) TestWithCustomLifecyclePolicy() { + //create a custom policy file + dslPolicy := mapstr.M{ + "data_retention": "1d", + } + + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + defer func() { + runner.CleanupTemplates(ctx) + }() + + lctemp := runner.T().TempDir() + raw, err := json.MarshalIndent(dslPolicy, 
"", " ") + require.NoError(runner.T(), err) + + lifecyclePath := filepath.Join(lctemp, "dsl_policy.json") + + err = os.WriteFile(lifecyclePath, raw, 0o744) + require.NoError(runner.T(), err) + + runner.CleanupTemplates(ctx) + + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "setup", + "--index-management", + "--E=setup.dsl.enabled=true", fmt.Sprintf("--E=setup.dsl.policy_file=%s", lifecyclePath)}) + runner.T().Logf("got response from management setup: %s", string(resp)) + require.NoError(runner.T(), err) + + runner.CheckDSLPolicy(ctx, fmt.Sprintf("%s*", runner.testbeatName), "1d") + +} + +// tests beat setup --index-management with ILM explicitly set +// On serverless, this should fail. +func (runner *BeatRunner) TestIndexManagementILMEnabledFailure() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + info, err := estools.GetPing(ctx, runner.requirementsInfo.ESClient) + require.NoError(runner.T(), err) + + if info.Version.BuildFlavor != "serverless" { + runner.T().Skip("must run on serverless") + } + + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "setup", + "--index-management", + "--E=setup.ilm.enabled=true", "--E=setup.ilm.overwrite=true"}) + runner.T().Logf("got response from management setup: %s", string(resp)) + require.Error(runner.T(), err) + assert.Contains(runner.T(), string(resp), "error creating") +} + +// tests setup with both ILM and DSL enabled, should fail +func (runner *BeatRunner) TestBothLifecyclesEnabled() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "setup", + "--index-management", + "--E=setup.ilm.enabled=true", "--E=setup.dsl.enabled=true"}) + runner.T().Logf("got response from management setup: %s", string(resp)) + require.Error(runner.T(), err) +} + +// disable all lifecycle management, ensure it's actually disabled +func (runner *BeatRunner) TestAllLifecyclesDisabled() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + defer func() { + runner.CleanupTemplates(ctx) + }() + + runner.CleanupTemplates(ctx) + + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "setup", + "--index-management", + "--E=setup.ilm.enabled=false", "--E=setup.dsl.enabled=false"}) + runner.T().Logf("got response from management setup: %s", string(resp)) + require.NoError(runner.T(), err) + + // make sure we have data streams, but there's no lifecycles + streams, err := estools.GetDataStreamsForPattern(ctx, runner.requirementsInfo.ESClient, fmt.Sprintf("*%s*", runner.testbeatName)) + require.NoError(runner.T(), err) + + require.NotEmpty(runner.T(), streams.DataStreams, "found no datastreams") + foundPolicy := false + for _, stream := range streams.DataStreams { + if stream.Lifecycle.DataRetention != "" { + foundPolicy = true + break + } + } + require.False(runner.T(), foundPolicy, "Found a lifecycle policy despite disabling lifecycles. 
Found: %#v", streams) +} + +// the export command doesn't actually make a network connection, +// so this won't fail +func (runner *BeatRunner) TestExport() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + info, err := estools.GetPing(ctx, runner.requirementsInfo.ESClient) + require.NoError(runner.T(), err) + + if info.Version.BuildFlavor != "serverless" { + runner.T().Skip("must run on serverless") + } + + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "export", "ilm-policy", "--E=setup.ilm.enabled=true"}) + runner.T().Logf("got response from export: %s", string(resp)) + assert.NoError(runner.T(), err) + // check to see if we got a valid output + policy := map[string]interface{}{} + err = json.Unmarshal(resp, &policy) + require.NoError(runner.T(), err) + + require.NotEmpty(runner.T(), policy["policy"]) +} + +// tests beat export with DSL +func (runner *BeatRunner) TestExportDSL() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "export", "ilm-policy", "--E=setup.dsl.enabled=true"}) + runner.T().Logf("got response from export: %s", string(resp)) + assert.NoError(runner.T(), err) + // check to see if we got a valid output + policy := map[string]interface{}{} + err = json.Unmarshal(resp, &policy) + require.NoError(runner.T(), err) + + require.NotEmpty(runner.T(), policy["data_retention"]) +} + +func (runner *BeatRunner) SubtestExportTemplates() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute*2) + defer cancel() + outDir := runner.T().TempDir() + + _, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "export", + "template", "--dir", outDir}) + assert.NoError(runner.T(), err) + + inFolder, err := os.ReadDir(filepath.Join(outDir, "/template")) + require.NoError(runner.T(), err) + runner.T().Logf("got log contents: %#v", inFolder) + require.NotEmpty(runner.T(), inFolder) +} + +func (runner *BeatRunner) SubtestExportIndexPatterns() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute*2) + defer cancel() + + rawPattern, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "export", + "index-pattern"}) + assert.NoError(runner.T(), err) + + idxPattern := map[string]interface{}{} + + err = json.Unmarshal(rawPattern, &idxPattern) + require.NoError(runner.T(), err) + require.NotNil(runner.T(), idxPattern["attributes"]) +} + +// CheckDSLPolicy checks if we have a match for the given DSL policy given a template name and policy data_retention +func (runner *BeatRunner) CheckDSLPolicy(ctx context.Context, tmpl string, policy string) { + streams, err := estools.GetDataStreamsForPattern(ctx, runner.requirementsInfo.ESClient, tmpl) + require.NoError(runner.T(), err) + + foundCustom := false + for _, stream := range streams.DataStreams { + if stream.Lifecycle.DataRetention == policy { + foundCustom = true + break + } + } + + require.True(runner.T(), foundCustom, "did not find our lifecycle policy. 
Found: %#v", streams)
+}
+
+// CleanupTemplates removes any existing index templates and data streams
+func (runner *BeatRunner) CleanupTemplates(ctx context.Context) {
+	if !runner.skipCleanup {
+		_ = estools.DeleteIndexTemplatesDataStreams(ctx, runner.requirementsInfo.ESClient, fmt.Sprintf("%s*", runner.testbeatName))
+		_ = estools.DeleteIndexTemplatesDataStreams(ctx, runner.requirementsInfo.ESClient, "*custom-name*")
+	}
+}

From fa5de97768314d906ff7db74b450c3c78f405689 Mon Sep 17 00:00:00 2001
From: Shaunak Kashyap
Date: Thu, 23 Nov 2023 03:48:22 +0530
Subject: [PATCH 15/21] [Upgrade Details] Use `details.DownloadRate` type from
 elastic-agent-libs (#3793)

* Use type from elastic-agent-libs

* Updating SHA on elastic-agent-libs replace dependency

* Fix test

* Bumping up dependency on elastic-agent-libs

* Remove unused type
---
 NOTICE.txt                                        |  4 +-
 go.mod                                            |  2 +-
 go.sum                                            |  4 +-
 .../application/upgrade/details/details.go        | 47 ++-----------------
 .../upgrade/details/details_test.go               |  2 +-
 5 files changed, 9 insertions(+), 50 deletions(-)

diff --git a/NOTICE.txt b/NOTICE.txt
index 4bbca01b922..4e9b7cc65ae 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -1166,11 +1166,11 @@ SOFTWARE
 
 --------------------------------------------------------------------------------
 Dependency : github.com/elastic/elastic-agent-libs
-Version: v0.7.0
+Version: v0.7.2
 Licence type (autodetected): Apache-2.0
 --------------------------------------------------------------------------------
 
-Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-libs@v0.7.0/LICENSE:
+Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-libs@v0.7.2/LICENSE:
 
                                  Apache License
                            Version 2.0, January 2004
diff --git a/go.mod b/go.mod
index a0b30a7f3f1..2c550799de2 100644
--- a/go.mod
+++ b/go.mod
@@ -14,7 +14,7 @@ require (
 	github.com/elastic/e2e-testing v1.1.0
 	github.com/elastic/elastic-agent-autodiscover v0.6.5
 	github.com/elastic/elastic-agent-client/v7 v7.5.0
-	github.com/elastic/elastic-agent-libs v0.7.0
+	github.com/elastic/elastic-agent-libs v0.7.2
 	github.com/elastic/elastic-agent-system-metrics v0.8.0
 	github.com/elastic/elastic-transport-go/v8 v8.3.0
 	github.com/elastic/go-elasticsearch/v8 v8.10.1
diff --git a/go.sum b/go.sum
index ffcbb8e262a..f428a834747 100644
--- a/go.sum
+++ b/go.sum
@@ -781,8 +781,8 @@ github.com/elastic/elastic-agent-autodiscover v0.6.5 h1:5DeMpuNc8c/tN6HN0A4A2uOF
 github.com/elastic/elastic-agent-autodiscover v0.6.5/go.mod h1:chulyCAyZb/njMHgzkhC/yWnt8v/Y6eCRUhmFVnsA5o=
 github.com/elastic/elastic-agent-client/v7 v7.5.0 h1:niI3WQ+01Lnp2r5LxK8SyNhrPJe13vBiOkqrDRK2oTA=
 github.com/elastic/elastic-agent-client/v7 v7.5.0/go.mod h1:DYoX95xjC4BW/p2avyu724Qr2+hoUIz9eCU9CVS1d+0=
-github.com/elastic/elastic-agent-libs v0.7.0 h1:g/+Gzpn4ayXPFbfZsn5lGjbPR1TGqlVpshJVVUNJGlQ=
-github.com/elastic/elastic-agent-libs v0.7.0/go.mod h1:o+EySawBZGeYu49shJxerg2wRCimS1dhrD4As0MS700=
+github.com/elastic/elastic-agent-libs v0.7.2 h1:yT0hF0UAxJCdQqhHh6SFpgYrcpB10oFzPj8IaytPS2o=
+github.com/elastic/elastic-agent-libs v0.7.2/go.mod h1:pVBEElQJUO9mr4WStWNXuQGsJn54lcjAoYAHmsvBLBc=
 github.com/elastic/elastic-agent-system-metrics v0.8.0 h1:EsWbtd83JvnaqnL57bKS1E6GhOdemTRbxdFDcenR8zQ=
 github.com/elastic/elastic-agent-system-metrics v0.8.0/go.mod h1:9C1UEfj0P687HAzZepHszN6zXA+2tN2Lx3Osvq1zby8=
 github.com/elastic/elastic-integration-corpus-generator-tool v0.5.0/go.mod h1:uf9N86y+UACGybdEhZLpwZ93XHWVhsYZAA4c2T2v6YM=
diff --git a/internal/pkg/agent/application/upgrade/details/details.go 
b/internal/pkg/agent/application/upgrade/details/details.go index dae27923ecf..3522b181e59 100644 --- a/internal/pkg/agent/application/upgrade/details/details.go +++ b/internal/pkg/agent/application/upgrade/details/details.go @@ -5,20 +5,12 @@ package details import ( - "encoding/json" - "fmt" - "math" - "strings" "sync" "time" - "github.com/docker/go-units" + "github.com/elastic/elastic-agent-libs/upgrade/details" ) -// downloadRate is a float64 that can be safely marshalled to JSON -// when the value is Infinity. The rate is always in bytes/second units. -type downloadRate float64 - // Observer is a function that will be called with upgrade details type Observer func(details *Details) @@ -43,7 +35,7 @@ type Metadata struct { // DownloadRate is the rate, in bytes per second, at which the download // is progressing. - DownloadRate downloadRate `json:"download_rate,omitempty" yaml:"download_rate,omitempty"` + DownloadRate details.DownloadRate `json:"download_rate,omitempty" yaml:"download_rate,omitempty"` // FailedState is the state an upgrade was in if/when it failed. Use the // Fail() method of UpgradeDetails to correctly record details when @@ -93,7 +85,7 @@ func (d *Details) SetDownloadProgress(percent, rateBytesPerSecond float64) { defer d.mu.Unlock() d.Metadata.DownloadPercent = percent - d.Metadata.DownloadRate = downloadRate(rateBytesPerSecond) + d.Metadata.DownloadRate = details.DownloadRate(rateBytesPerSecond) d.notifyObservers() } @@ -184,36 +176,3 @@ func equalTimePointers(t, otherT *time.Time) bool { return t.Equal(*otherT) } - -func (dr *downloadRate) MarshalJSON() ([]byte, error) { - downloadRateBytesPerSecond := float64(*dr) - if math.IsInf(downloadRateBytesPerSecond, 0) { - return json.Marshal("+Inf bps") - } - - return json.Marshal( - fmt.Sprintf("%sps", units.HumanSizeWithPrecision(downloadRateBytesPerSecond, 2)), - ) -} - -func (dr *downloadRate) UnmarshalJSON(data []byte) error { - var downloadRateStr string - err := json.Unmarshal(data, &downloadRateStr) - if err != nil { - return err - } - - if downloadRateStr == "+Inf bps" { - *dr = downloadRate(math.Inf(1)) - return nil - } - - downloadRateStr = strings.TrimSuffix(downloadRateStr, "ps") - downloadRateBytesPerSecond, err := units.FromHumanSize(downloadRateStr) - if err != nil { - return err - } - - *dr = downloadRate(downloadRateBytesPerSecond) - return nil -} diff --git a/internal/pkg/agent/application/upgrade/details/details_test.go b/internal/pkg/agent/application/upgrade/details/details_test.go index 2ecd933e9f8..d1ade774e7b 100644 --- a/internal/pkg/agent/application/upgrade/details/details_test.go +++ b/internal/pkg/agent/application/upgrade/details/details_test.go @@ -80,7 +80,7 @@ func TestDetailsDownloadRateJSON(t *testing.T) { var unmarshalledDetails Details err = json.Unmarshal(data, &unmarshalledDetails) require.NoError(t, err) - require.Equal(t, float64(1800), float64(unmarshalledDetails.Metadata.DownloadRate)) + require.Equal(t, float64(1794), float64(unmarshalledDetails.Metadata.DownloadRate)) require.Equal(t, .8, unmarshalledDetails.Metadata.DownloadPercent) }) From 37f01b24273ad8f18bce46b8c7df8b35913be860 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 22 Nov 2023 20:10:23 -0500 Subject: [PATCH 16/21] Log some useful details during startup (#3781) * add some helpful log messages * Infof not Info --- internal/pkg/agent/application/application.go | 1 + pkg/component/runtime/manager.go | 1 + 2 files changed, 2 insertions(+) diff --git a/internal/pkg/agent/application/application.go 
b/internal/pkg/agent/application/application.go
index 1f8853120e3..c277382ae93 100644
--- a/internal/pkg/agent/application/application.go
+++ b/internal/pkg/agent/application/application.go
@@ -86,6 +86,7 @@ func New(
 		// monitoring is always disabled in testing mode
 		disableMonitoring = true
 	} else {
+		log.Infof("Loading baseline config from %v", pathConfigFile)
 		rawConfig, err = config.LoadFile(pathConfigFile)
 		if err != nil {
 			return nil, nil, nil, fmt.Errorf("failed to load configuration: %w", err)
diff --git a/pkg/component/runtime/manager.go b/pkg/component/runtime/manager.go
index 850349406d6..81fde1090b0 100644
--- a/pkg/component/runtime/manager.go
+++ b/pkg/component/runtime/manager.go
@@ -203,6 +203,7 @@ func (m *Manager) Run(ctx context.Context) error {
 	})
 
 	var server *grpc.Server
+	m.logger.Infof("Starting grpc control protocol listener on port %v with max_message_size %v", m.grpcConfig.Port, m.grpcConfig.MaxMsgSize)
 	if m.tracer != nil {
 		apmInterceptor := apmgrpc.NewUnaryServerInterceptor(apmgrpc.WithRecovery(), apmgrpc.WithTracer(m.tracer))
 		server = grpc.NewServer(

From a6e0c1aef848cd29632e3d205425cc312d1a634b Mon Sep 17 00:00:00 2001
From: Victor Martinez
Date: Thu, 23 Nov 2023 10:59:36 +0100
Subject: [PATCH 17/21] ironbank: use ubi:9.3 (#3787)

---
 dev-tools/packaging/packages.yml                              | 2 +-
 dev-tools/packaging/templates/ironbank/Dockerfile.tmpl        | 2 +-
 .../packaging/templates/ironbank/hardening_manifest.yaml.tmpl | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/dev-tools/packaging/packages.yml b/dev-tools/packaging/packages.yml
index ec7f7e659c0..597c610271c 100644
--- a/dev-tools/packaging/packages.yml
+++ b/dev-tools/packaging/packages.yml
@@ -427,7 +427,7 @@ shared:
   - &docker_arm_ubi_spec
     extra_vars:
       image_name: '{{.BeatName}}-ubi9'
-      from: 'registry.access.redhat.com/ubi9/ubi-minimal:9.2'
+      from: 'registry.access.redhat.com/ubi9/ubi-minimal:9.3'
 
   - &elastic_docker_spec
     extra_vars:
diff --git a/dev-tools/packaging/templates/ironbank/Dockerfile.tmpl b/dev-tools/packaging/templates/ironbank/Dockerfile.tmpl
index 8d4c4aa9990..58d05bab9e0 100644
--- a/dev-tools/packaging/templates/ironbank/Dockerfile.tmpl
+++ b/dev-tools/packaging/templates/ironbank/Dockerfile.tmpl
@@ -4,7 +4,7 @@
 ################################################################################
 ARG BASE_REGISTRY=registry1.dsop.io
 ARG BASE_IMAGE=redhat/ubi/ubi9
-ARG BASE_TAG=9.2
+ARG BASE_TAG=9.3
 
 FROM ${BASE_REGISTRY}/${BASE_IMAGE}:${BASE_TAG} as prep_files
 
diff --git a/dev-tools/packaging/templates/ironbank/hardening_manifest.yaml.tmpl b/dev-tools/packaging/templates/ironbank/hardening_manifest.yaml.tmpl
index 00be4eb62ae..94f4dba3408 100644
--- a/dev-tools/packaging/templates/ironbank/hardening_manifest.yaml.tmpl
+++ b/dev-tools/packaging/templates/ironbank/hardening_manifest.yaml.tmpl
@@ -14,7 +14,7 @@ tags:
 # Build args passed to Dockerfile ARGs
 args:
   BASE_IMAGE: "redhat/ubi/ubi9"
-  BASE_TAG: "9.2"
+  BASE_TAG: "9.3"
   ELASTIC_STACK: "{{ beat_version }}"
   ELASTIC_PRODUCT: "elastic-agent"
 
From fb869d62f9fcf858ad14639f161d9daa7413d14a Mon Sep 17 00:00:00 2001
From: Tiago Queiroz
Date: Thu, 23 Nov 2023 14:58:31 +0100
Subject: [PATCH 18/21] Limit batchID length. (#3786)

If the BatchID is longer than 63 characters, OGC will fail to
instantiate a GCP VM. This commit fixes that by using an MD5 hash of
the whole ID and ensuring the final ID is at most 63 characters long.
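
For illustration, here is a minimal standalone sketch of the scheme (the
helper name and the sample ID below are hypothetical; the real logic is the
createBatchID change in this patch): keep a prefix of the original ID and
append the 32-character hex MD5 digest, so any over-long ID collapses to a
stable, still-unique 63-character ID.

```go
package main

import (
	"crypto/md5"
	"fmt"
	"strings"
)

// shortenID mirrors the truncation scheme: IDs at or under the limit pass
// through unchanged; longer IDs keep a 30-character prefix, a "-", and the
// 32-character hex MD5 of the full ID (30 + 1 + 32 = 63).
func shortenID(id string) string {
	const maxIDLen = 63
	if len(id) > maxIDLen {
		hash := fmt.Sprintf("%x", md5.Sum([]byte(id))) // 32 hex characters
		id = id[:maxIDLen-len(hash)-1] + "-" + hash
	}
	return strings.ToLower(id)
}

func main() {
	long := strings.Repeat("ubuntu-2204-sudo-integration-", 3) // 87 characters
	short := shortenID(long)
	fmt.Println(len(short), short) // always prints 63 and a deterministic ID
}
```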
---
 pkg/testing/runner/runner.go | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/pkg/testing/runner/runner.go b/pkg/testing/runner/runner.go
index 860ad57215e..aab9b695a2b 100644
--- a/pkg/testing/runner/runner.go
+++ b/pkg/testing/runner/runner.go
@@ -7,6 +7,7 @@ package runner
 import (
 	"bytes"
 	"context"
+	"crypto/md5"
 	"errors"
 	"fmt"
 	"io"
@@ -15,6 +16,7 @@ import (
 	"strings"
 	"sync"
 	"time"
+	"unicode/utf8"
 
 	"gopkg.in/yaml.v2"
 
@@ -1019,6 +1021,16 @@ func createBatchID(batch OSBatch) string {
 			id += "-" + batch.Batch.SudoTests[0].Tests[0].Name
 		}
 	}
+
+	// The batchID needs to be at most 63 characters long otherwise
+	// OGC will fail to instantiate the VM.
+	maxIDLen := 63
+	if len(id) > maxIDLen {
+		hash := fmt.Sprintf("%x", md5.Sum([]byte(id)))
+		hashLen := utf8.RuneCountInString(hash)
+		id = id[:maxIDLen-hashLen-1] + "-" + hash
+	}
+
 	return strings.ToLower(id)
 }

From a14c51f043070c1fbdbd9c5c83307c5f34216b97 Mon Sep 17 00:00:00 2001
From: Pavel Zorin
Date: Thu, 23 Nov 2023 15:05:03 +0100
Subject: [PATCH 19/21] [CI] Strict unit test artifact paths. Removes
 TEST-go-unit-cov.xml from artifacts (#3780)

---
 .buildkite/pipeline.yml | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index e7484aa3fe6..ddd92c8d3d5 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -20,7 +20,8 @@ steps:
     key: "unit-tests-2204"
     command: ".buildkite/scripts/steps/unit-tests.sh"
     artifact_paths:
-      - "build/TEST-**"
+      - "build/TEST-go-unit.html"
+      - "build/TEST-go-unit.xml"
       - "build/diagnostics/*"
       - "coverage.out"
     agents:
@@ -34,7 +35,8 @@ steps:
     key: "unit-tests-2204-arm64"
    command: ".buildkite/scripts/steps/unit-tests.sh"
     artifact_paths:
-      - "build/TEST-**"
+      - "build/TEST-go-unit.html"
+      - "build/TEST-go-unit.xml"
       - "build/diagnostics/*"
       - "coverage.out"
     agents:
@@ -50,7 +52,8 @@ steps:
     key: "unit-tests-win2022"
     command: ".\\.buildkite\\scripts\\steps\\unit-tests.ps1"
     artifact_paths:
-      - "build/TEST-**"
+      - "build/TEST-go-unit.html"
+      - "build/TEST-go-unit.xml"
       - "build/diagnostics/*"
       - "coverage.out"
     agents:
@@ -67,7 +70,8 @@ steps:
     key: "unit-tests-win2016"
     command: ".\\.buildkite\\scripts\\steps\\unit-tests.ps1"
     artifact_paths:
-      - "build/TEST-**"
+      - "build/TEST-go-unit.html"
+      - "build/TEST-go-unit.xml"
       - "build/diagnostics/*"
       - "coverage.out"
     agents:
@@ -86,7 +90,8 @@ steps:
     command: ".buildkite/scripts/steps/unit-tests.sh"
     branches: main
     artifact_paths:
-      - "build/TEST-**"
+      - "build/TEST-go-unit.html"
+      - "build/TEST-go-unit.xml"
       - "build/diagnostics/*"
       - "coverage.out"
     agents:
@@ -100,7 +105,8 @@ steps:
     key: "unit-tests-macos-13"
     command: ".buildkite/scripts/steps/unit-tests.sh"
     artifact_paths:
-      - "build/TEST-**"
+      - "build/TEST-go-unit.html"
+      - "build/TEST-go-unit.xml"
       - "build/diagnostics/*"
       - "coverage.out"
     agents:
@@ -117,7 +123,8 @@ steps:
     key: "unit-tests-win10"
     command: ".\\.buildkite\\scripts\\steps\\unit-tests.ps1"
     artifact_paths:
-      - "build/TEST-**"
+      - "build/TEST-go-unit.html"
+      - "build/TEST-go-unit.xml"
       - "build/diagnostics/*"
       - "coverage.out"
     agents:
@@ -133,7 +140,8 @@ steps:
     key: "unit-tests-win11"
     command: ".\\.buildkite\\scripts\\steps\\unit-tests.ps1"
     artifact_paths:
-      - "build/TEST-**"
+      - "build/TEST-go-unit.html"
+      - "build/TEST-go-unit.xml"
       - "build/diagnostics/*"
       - "coverage.out"
     agents:
@@ -161,7 +169,7 @@ steps:
       unit-tests-win11
       "
     artifact_paths:
-      - "build/TEST-**"
+      - "build/TEST-go-unit.cov"
     agents:
       image: "golang:1.20.10"
    depends_on:
@@ -177,10 +185,6 @@ steps:
       K8S_VERSION: "v{{matrix.k8s_version}}"
       KIND_VERSION: "v0.20.0"
     command: ".buildkite/scripts/steps/k8s-tests.sh"
-    artifact_paths:
-      - "build/TEST-**"
-      - "build/diagnostics/*"
-      - "coverage.out"
     agents:
       provider: "gcp"
       image: "family/core-ubuntu-2204"

From 3e93536ce64a328ebf0f2cf92d6d59663c9c8393 Mon Sep 17 00:00:00 2001
From: Michael Wolf
Date: Thu, 23 Nov 2023 18:55:19 -0800
Subject: [PATCH 20/21] Change the default FLEET_INSECURE values in Kubernetes
 templates to false. (#3805)

The agent documentation states that the default value for FLEET_INSECURE
is false and that setting it to true is not recommended. This commit
therefore changes the Kubernetes templates to use the recommended value.

---
 .../base/elastic-agent-managed-daemonset.yaml            | 4 ++--
 .../base/elastic-agent-standalone-daemonset.yaml         | 2 +-
 .../base/elastic-agent-managed-daemonset.yaml            | 4 ++--
 .../extra/elastic-agent-managed-statefulset.yaml         | 4 ++--
 deploy/kubernetes/elastic-agent-managed-kubernetes.yaml  | 4 ++--
 .../elastic-agent-managed-daemonset.yaml                 | 4 ++--
 6 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-managed/base/elastic-agent-managed-daemonset.yaml b/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-managed/base/elastic-agent-managed-daemonset.yaml
index c529b15460b..c40326b22b6 100644
--- a/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-managed/base/elastic-agent-managed-daemonset.yaml
+++ b/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-managed/base/elastic-agent-managed-daemonset.yaml
@@ -37,7 +37,7 @@ spec:
           value: "1"
         # Set to true to communicate with Fleet with either insecure HTTP or unverified HTTPS
         - name: FLEET_INSECURE
-          value: "true"
+          value: "false"
         # Fleet Server URL to enroll the Elastic Agent into
         # FLEET_URL can be found in Kibana, go to Management > Fleet > Settings
         - name: FLEET_URL
@@ -62,7 +62,7 @@ spec:
           valueFrom:
             fieldRef:
              fieldPath: metadata.name
-        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac. 
+        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac.
         # For more info: https://www.elastic.co/guide/en/beats/metricbeat/current/add-host-metadata.html
         - name: ELASTIC_NETINFO
           value: "false"
diff --git a/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-standalone/base/elastic-agent-standalone-daemonset.yaml b/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-standalone/base/elastic-agent-standalone-daemonset.yaml
index 9cdcf8670d6..35dd5612674 100644
--- a/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-standalone/base/elastic-agent-standalone-daemonset.yaml
+++ b/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-standalone/base/elastic-agent-standalone-daemonset.yaml
@@ -63,7 +63,7 @@ spec:
              fieldPath: metadata.name
          - name: STATE_PATH
            value: "/etc/elastic-agent"
-        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac. 
+        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac.
         # For more info: https://www.elastic.co/guide/en/beats/metricbeat/current/add-host-metadata.html
         - name: ELASTIC_NETINFO
           value: "false"
diff --git a/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-managed/base/elastic-agent-managed-daemonset.yaml b/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-managed/base/elastic-agent-managed-daemonset.yaml
index 6f5344825c0..d3b384e7a56 100644
--- a/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-managed/base/elastic-agent-managed-daemonset.yaml
+++ b/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-managed/base/elastic-agent-managed-daemonset.yaml
@@ -37,7 +37,7 @@ spec:
           value: "1"
         # Set to true to communicate with Fleet with either insecure HTTP or unverified HTTPS
         - name: FLEET_INSECURE
-          value: "true"
+          value: "false"
         # Fleet Server URL to enroll the Elastic Agent into
         # FLEET_URL can be found in Kibana, go to Management > Fleet > Settings
         - name: FLEET_URL
@@ -62,7 +62,7 @@ spec:
           valueFrom:
             fieldRef:
              fieldPath: metadata.name
-        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac. 
+        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac.
         # For more info: https://www.elastic.co/guide/en/beats/metricbeat/current/add-host-metadata.html
         - name: ELASTIC_NETINFO
           value: "false"
diff --git a/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-managed/extra/elastic-agent-managed-statefulset.yaml b/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-managed/extra/elastic-agent-managed-statefulset.yaml
index 3a934d02685..bc493635411 100644
--- a/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-managed/extra/elastic-agent-managed-statefulset.yaml
+++ b/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-managed/extra/elastic-agent-managed-statefulset.yaml
@@ -37,7 +37,7 @@ spec:
           value: "1"
         # Set to true to communicate with Fleet with either insecure HTTP or unverified HTTPS
         - name: FLEET_INSECURE
-          value: "true"
+          value: "false"
         # Fleet Server URL to enroll the Elastic Agent into
         # FLEET_URL can be found in Kibana, go to Management > Fleet > Settings
         - name: FLEET_URL
@@ -62,7 +62,7 @@ spec:
           valueFrom:
             fieldRef:
              fieldPath: metadata.name
-        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac. 
+        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac.
         # For more info: https://www.elastic.co/guide/en/beats/metricbeat/current/add-host-metadata.html
         - name: ELASTIC_NETINFO
           value: "false"
diff --git a/deploy/kubernetes/elastic-agent-managed-kubernetes.yaml b/deploy/kubernetes/elastic-agent-managed-kubernetes.yaml
index 44df212a629..0b9526f287d 100644
--- a/deploy/kubernetes/elastic-agent-managed-kubernetes.yaml
+++ b/deploy/kubernetes/elastic-agent-managed-kubernetes.yaml
@@ -37,7 +37,7 @@ spec:
           value: "1"
         # Set to true to communicate with Fleet with either insecure HTTP or unverified HTTPS
         - name: FLEET_INSECURE
-          value: "true"
+          value: "false"
         # Fleet Server URL to enroll the Elastic Agent into
         # FLEET_URL can be found in Kibana, go to Management > Fleet > Settings
         - name: FLEET_URL
@@ -62,7 +62,7 @@ spec:
           valueFrom:
             fieldRef:
              fieldPath: metadata.name
-        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac. 
+        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac.
         # For more info: https://www.elastic.co/guide/en/beats/metricbeat/current/add-host-metadata.html
         - name: ELASTIC_NETINFO
           value: "false"
diff --git a/deploy/kubernetes/elastic-agent-managed/elastic-agent-managed-daemonset.yaml b/deploy/kubernetes/elastic-agent-managed/elastic-agent-managed-daemonset.yaml
index 36d5afef3be..7a028b524c1 100644
--- a/deploy/kubernetes/elastic-agent-managed/elastic-agent-managed-daemonset.yaml
+++ b/deploy/kubernetes/elastic-agent-managed/elastic-agent-managed-daemonset.yaml
@@ -37,7 +37,7 @@ spec:
           value: "1"
         # Set to true to communicate with Fleet with either insecure HTTP or unverified HTTPS
         - name: FLEET_INSECURE
-          value: "true"
+          value: "false"
         # Fleet Server URL to enroll the Elastic Agent into
         # FLEET_URL can be found in Kibana, go to Management > Fleet > Settings
         - name: FLEET_URL
@@ -62,7 +62,7 @@ spec:
           valueFrom:
             fieldRef:
              fieldPath: metadata.name
-        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac. 
+        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac.
         # For more info: https://www.elastic.co/guide/en/beats/metricbeat/current/add-host-metadata.html
         - name: ELASTIC_NETINFO
           value: "false"

From 97e82175953af75fbe29fccc3cfcc5688a96dc8f Mon Sep 17 00:00:00 2001
From: Blake Rouse
Date: Fri, 24 Nov 2023 09:52:13 -0500
Subject: [PATCH 21/21] [Non-Root] Hide --unprivileged installation flag until
 supported (#3799)

* Hide --unprivileged.

* Fix lint.

---------

Co-authored-by: Pierre HILBERT
---
 internal/pkg/agent/cmd/install.go | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/internal/pkg/agent/cmd/install.go b/internal/pkg/agent/cmd/install.go
index 2b96dae2d7a..a1415d0e59b 100644
--- a/internal/pkg/agent/cmd/install.go
+++ b/internal/pkg/agent/cmd/install.go
@@ -47,7 +47,8 @@ would like the Agent to operate.
 	cmd.Flags().BoolP("force", "f", false, "Force overwrite the current installation and do not prompt for confirmation")
 	cmd.Flags().BoolP("non-interactive", "n", false, "Install Elastic Agent in non-interactive mode which will not prompt on missing parameters but fails instead.")
 	cmd.Flags().String(flagInstallBasePath, paths.DefaultBasePath, "The path where the Elastic Agent will be installed. It must be an absolute path.")
-	cmd.Flags().Bool(flagInstallUnprivileged, false, "Installed Elastic Agent will create an 'elastic-agent' user and run as that user.")
+	cmd.Flags().Bool(flagInstallUnprivileged, false, "Installed Elastic Agent will create an 'elastic-agent' user and run as that user. (experimental)")
+	_ = cmd.Flags().MarkHidden(flagInstallUnprivileged) // Hidden until fully supported
 
 	addEnrollFlags(cmd)
 
 	return cmd
@@ -77,6 +78,9 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error {
 	if unprivileged && runtime.GOOS != "linux" {
 		return fmt.Errorf("unable to perform install command, unprivileged is currently only supported on Linux")
 	}
+	if unprivileged {
+		fmt.Fprintln(streams.Out, "Unprivileged installation mode enabled; this is an experimental and currently unsupported feature.")
+	}
 
 	topPath := paths.InstallPath(basePath)
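A note on the `MarkHidden` pattern used above: hiding a flag only removes it
from `--help` output; it remains fully parseable, so early adopters can still
opt in. A minimal sketch of the pattern, assuming the spf13/cobra and pflag
APIs (the command and flag names mirror the patch, but the program itself is
illustrative):

```go
package main

import (
	"fmt"
	"os"

	"github.com/spf13/cobra"
)

func main() {
	cmd := &cobra.Command{
		Use: "install",
		RunE: func(cmd *cobra.Command, _ []string) error {
			unprivileged, err := cmd.Flags().GetBool("unprivileged")
			if err != nil {
				return err
			}
			if unprivileged {
				// Still honored even though it no longer appears in --help.
				fmt.Println("unprivileged mode enabled (experimental)")
			}
			return nil
		},
	}
	cmd.Flags().Bool("unprivileged", false, "Run as a dedicated unprivileged user. (experimental)")
	_ = cmd.Flags().MarkHidden("unprivileged") // hidden from help, still parseable
	if err := cmd.Execute(); err != nil {
		os.Exit(1)
	}
}
```

Because the flag stays registered, any automation already passing
`--unprivileged` keeps working while the help text stops advertising it.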