From e01dff8f1148c2d1ce614bf2c6f20a67ace4f9b9 Mon Sep 17 00:00:00 2001 From: "mergify[bot]" <37929162+mergify[bot]@users.noreply.github.com> Date: Wed, 15 Nov 2023 08:15:11 +0100 Subject: [PATCH 01/21] Changelog for 8.11.1 (#3751) (#3767) * Changelog for 8.11.1 * Update changelog/8.11.1.asciidoc Co-authored-by: David Kilfoyle <41695641+kilfoyle@users.noreply.github.com> --------- Co-authored-by: David Kilfoyle <41695641+kilfoyle@users.noreply.github.com> (cherry picked from commit 9f6cf349e44688ff26e47c2fbfbc934bf28fd3d2) Co-authored-by: Pierre HILBERT --- changelog/8.11.1.asciidoc | 33 +++++++++++++++++++ changelog/8.11.1.yaml | 14 ++++++++ ...35373-Add-component-fields-to-metrics.yaml | 32 ------------------ 3 files changed, 47 insertions(+), 32 deletions(-) create mode 100644 changelog/8.11.1.asciidoc create mode 100644 changelog/8.11.1.yaml delete mode 100644 changelog/fragments/1697735373-Add-component-fields-to-metrics.yaml diff --git a/changelog/8.11.1.asciidoc b/changelog/8.11.1.asciidoc new file mode 100644 index 00000000000..8fba60d7440 --- /dev/null +++ b/changelog/8.11.1.asciidoc @@ -0,0 +1,33 @@ +// begin 8.11.1 relnotes + +[[release-notes-8.11.1]] +== 8.11.1 + +Review important information about the 8.11.1 release. + + + + + + + + + +[discrete] +[[new-features-8.11.1]] +=== New features + +The 8.11.1 release adds the following new and notable features. + + +elastic-agent:: + +* Add component {id, binary} to monitoring metrics from {agent} and {beats}. {elastic-agent-pull}https://github.com/elastic/elastic-agent/pull/3626[#https://github.com/elastic/elastic-agent/pull/3626] {elastic-agent-issue}https://github.com/elastic/integrations/issues/7977[#https://github.com/elastic/integrations/issues/7977] + + + + + + + +// end 8.11.1 relnotes diff --git a/changelog/8.11.1.yaml b/changelog/8.11.1.yaml new file mode 100644 index 00000000000..73c0e4888c0 --- /dev/null +++ b/changelog/8.11.1.yaml @@ -0,0 +1,14 @@ +version: 8.11.1 +entries: + - kind: feature + summary: Add component {id, binary} to monitoring metrics from Elastic-Agent and Beats + description: "" + component: elastic-agent + pr: + - https://github.com/elastic/elastic-agent/pull/3626 + issue: + - https://github.com/elastic/integrations/issues/7977 + timestamp: 1697735373 + file: + name: 1697735373-Add-component-fields-to-metrics.yaml + checksum: e78d27aa33269834757dceee23e195ae6ea8ecd5 diff --git a/changelog/fragments/1697735373-Add-component-fields-to-metrics.yaml b/changelog/fragments/1697735373-Add-component-fields-to-metrics.yaml deleted file mode 100644 index 780093d1434..00000000000 --- a/changelog/fragments/1697735373-Add-component-fields-to-metrics.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Kind can be one of: -# - breaking-change: a change to previously-documented behavior -# - deprecation: functionality that is being removed in a later release -# - bug-fix: fixes a problem in a previous version -# - enhancement: extends functionality but does not break or fix existing behavior -# - feature: new functionality -# - known-issue: problems that we are aware of in a given version -# - security: impacts on the security of a product or a user’s deployment. -# - upgrade: important information for someone upgrading from a prior version -# - other: does not fit into any of the other categories -kind: feature - -# Change summary; a 80ish characters long description of the change. 
-summary: Add component.{id, binary} to monitoring metrics from Elastic-Agent and Beats
-
-# Long description; in case the summary is not enough to describe the change
-# this field accommodate a description without length limits.
-# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
-#description:
-
-# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
-component: elastic-agent
-
-# PR URL; optional; the PR number that added the changeset.
-# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
-# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
-# Please provide it if you are adding a fragment for a different PR.
-pr: https://github.com/elastic/elastic-agent/pull/3626
-
-# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
-# If not present is automatically filled by the tooling with the issue linked to the PR number.
-issue: https://github.com/elastic/integrations/issues/7977

From 26947871957102cd972210202f15217789ff5cb9 Mon Sep 17 00:00:00 2001
From: Lee E Hinman <57081003+leehinman@users.noreply.github.com>
Date: Wed, 15 Nov 2023 00:26:48 -0800
Subject: [PATCH 02/21] Update pgp-workaround.md (#3766)

* Update pgp-workaround.md

Add info on TLS

* Update docs/pgp-workaround.md

Co-authored-by: Craig MacKenzie

---------

Co-authored-by: Craig MacKenzie
---
 docs/pgp-workaround.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/pgp-workaround.md b/docs/pgp-workaround.md
index 1da21bde919..621a22514ce 100644
--- a/docs/pgp-workaround.md
+++ b/docs/pgp-workaround.md
@@ -56,3 +56,7 @@ host { 'elastic-artifacts':
   path: '/etc/hosts'
   line: ' artifacts.elastic.co'
 ```
+
+## TLS
+
+Because the connection uses `https`, the certificate returned by the host impersonating `https://artifacts.elastic.co` must include `artifacts.elastic.co` as one of its Subject Alternative Names.

From 91c03aa7170141c550d9c8c70626fb2f4f0a1a0f Mon Sep 17 00:00:00 2001
From: Dan Kortschak <90160302+efd6@users.noreply.github.com>
Date: Thu, 16 Nov 2023 05:54:47 +1030
Subject: [PATCH 03/21] internal/pkg/agent/application/coordinator: fix logging calls (#3679)

Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
---
 .../fragments/1698808222-orphan-verbs.yaml    | 32 +++++++++++++++++++
 .../application/coordinator/coordinator.go    |  4 +--
 2 files changed, 34 insertions(+), 2 deletions(-)
 create mode 100644 changelog/fragments/1698808222-orphan-verbs.yaml

diff --git a/changelog/fragments/1698808222-orphan-verbs.yaml b/changelog/fragments/1698808222-orphan-verbs.yaml
new file mode 100644
index 00000000000..a423b94619f
--- /dev/null
+++ b/changelog/fragments/1698808222-orphan-verbs.yaml
@@ -0,0 +1,32 @@
+# Kind can be one of:
+# - breaking-change: a change to previously-documented behavior
+# - deprecation: functionality that is being removed in a later release
+# - bug-fix: fixes a problem in a previous version
+# - enhancement: extends functionality but does not break or fix existing behavior
+# - feature: new functionality
+# - known-issue: problems that we are aware of in a given version
+# - security: impacts on the security of a product or a user’s deployment.
+# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: bug-fix + +# Change summary; a 80ish characters long description of the change. +summary: Fix logging calls incorrectly using non-f variants and missing args. + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: elastic-agent + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +#pr: https://github.com/owner/repo/1234 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +#issue: https://github.com/owner/repo/1234 diff --git a/internal/pkg/agent/application/coordinator/coordinator.go b/internal/pkg/agent/application/coordinator/coordinator.go index bce2188c0e7..1bbfde40a20 100644 --- a/internal/pkg/agent/application/coordinator/coordinator.go +++ b/internal/pkg/agent/application/coordinator/coordinator.go @@ -1206,11 +1206,11 @@ func (c *Coordinator) filterByCapabilities(comps []component.Component) []compon for _, component := range comps { // If this is an input component (not a shipper), make sure its type is allowed if component.InputSpec != nil && !c.caps.AllowInput(component.InputType) { - c.logger.Info("Component '%v' with input type '%v' filtered by capabilities.yml", component.InputType) + c.logger.Infof("Component '%v' with input type '%v' filtered by capabilities.yml", component.ID, component.InputType) continue } if !c.caps.AllowOutput(component.OutputType) { - c.logger.Info("Component '%v' with output type '%v' filtered by capabilities.yml", component.ID, component.OutputType) + c.logger.Infof("Component '%v' with output type '%v' filtered by capabilities.yml", component.ID, component.OutputType) continue } result = append(result, component) From a0d3dbdefeb7996f981fd6cbc22c5fe84c722b54 Mon Sep 17 00:00:00 2001 From: apmmachine <58790750+apmmachine@users.noreply.github.com> Date: Wed, 15 Nov 2023 15:49:54 -0500 Subject: [PATCH 04/21] [Automation] Bump Golang version to 1.20.11 (#3748) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: Update .golangci.yml Made with ❤️️ by updatecli * chore: Update .go-version Made with ❤️️ by updatecli * chore: Update from dockerfiles Made with ❤️️ by updatecli * chore: Update version.asciidoc Made with ❤️️ by updatecli * Add changelog fragment. 
--------- Co-authored-by: apmmachine Co-authored-by: Craig MacKenzie --- .go-version | 2 +- .golangci.yml | 8 ++--- Dockerfile | 2 +- Dockerfile.skaffold | 2 +- .../1700000391-Upgrade-to-Go-1.20.11.yaml | 32 +++++++++++++++++++ version/docs/version.asciidoc | 2 +- 6 files changed, 40 insertions(+), 8 deletions(-) create mode 100644 changelog/fragments/1700000391-Upgrade-to-Go-1.20.11.yaml diff --git a/.go-version b/.go-version index acdfc7930c8..4bb1a22f8ec 100644 --- a/.go-version +++ b/.go-version @@ -1 +1 @@ -1.20.10 +1.20.11 diff --git a/.golangci.yml b/.golangci.yml index c72fb52ef8d..b3bcd8666a7 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -116,7 +116,7 @@ linters-settings: gosimple: # Select the Go version to target. The default is '1.13'. - go: "1.20.10" + go: "1.20.11" nakedret: # make an issue if func has more lines of code than this setting and it has naked returns; default is 30 @@ -136,17 +136,17 @@ linters-settings: staticcheck: # Select the Go version to target. The default is '1.13'. - go: "1.20.10" + go: "1.20.11" checks: ["all"] stylecheck: # Select the Go version to target. The default is '1.13'. - go: "1.20.10" + go: "1.20.11" checks: ["all"] unused: # Select the Go version to target. The default is '1.13'. - go: "1.20.10" + go: "1.20.11" gosec: excludes: diff --git a/Dockerfile b/Dockerfile index f3d3475041b..3f727a50317 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -ARG GO_VERSION=1.20.10 +ARG GO_VERSION=1.20.11 FROM circleci/golang:${GO_VERSION} diff --git a/Dockerfile.skaffold b/Dockerfile.skaffold index 4122e146bf6..96d0a1ea800 100644 --- a/Dockerfile.skaffold +++ b/Dockerfile.skaffold @@ -1,4 +1,4 @@ -ARG GO_VERSION=1.20.10 +ARG GO_VERSION=1.20.11 ARG crossbuild_image="docker.elastic.co/beats-dev/golang-crossbuild" ARG AGENT_VERSION=8.9.0-SNAPSHOT ARG AGENT_IMAGE="docker.elastic.co/beats/elastic-agent" diff --git a/changelog/fragments/1700000391-Upgrade-to-Go-1.20.11.yaml b/changelog/fragments/1700000391-Upgrade-to-Go-1.20.11.yaml new file mode 100644 index 00000000000..e7b4c74c851 --- /dev/null +++ b/changelog/fragments/1700000391-Upgrade-to-Go-1.20.11.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: security + +# Change summary; a 80ish characters long description of the change. +summary: Upgrade to Go 1.20.11. + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: "elastic-agent" + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. 
+# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/elastic-agent/pull/3748 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +#issue: https://github.com/owner/repo/1234 diff --git a/version/docs/version.asciidoc b/version/docs/version.asciidoc index 87bce0ff1c3..f4443fa3f95 100644 --- a/version/docs/version.asciidoc +++ b/version/docs/version.asciidoc @@ -3,7 +3,7 @@ // FIXME: once elastic.co docs have been switched over to use `main`, remove // the `doc-site-branch` line below as well as any references to it in the code. :doc-site-branch: master -:go-version: 1.20.10 +:go-version: 1.20.11 :release-state: unreleased :python: 3.7 :docker: 1.12 From e0bac95c7c56e96d2210205f63b6a05303f06e5c Mon Sep 17 00:00:00 2001 From: Shaunak Kashyap Date: Thu, 16 Nov 2023 14:48:12 +0530 Subject: [PATCH 05/21] [Upgrade Details] Implement tracking of `UPG_SCHEDULED` state (#3757) * Report queued upgrades details as UPG_SCHEDULED * Add unit test * Fix linter error * Fix mock calls * Implement Equals to address FIXME * Check all errors from StartTime() call --- .../application/dispatcher/dispatcher.go | 52 +++++++- .../application/dispatcher/dispatcher_test.go | 119 ++++++++++++++++-- .../pkg/agent/application/managed_mode.go | 11 +- .../agent/application/managed_mode_test.go | 14 ++- .../application/upgrade/details/details.go | 26 ++++ 5 files changed, 199 insertions(+), 23 deletions(-) diff --git a/internal/pkg/agent/application/dispatcher/dispatcher.go b/internal/pkg/agent/application/dispatcher/dispatcher.go index 92ac050f9ab..eda450d6a0b 100644 --- a/internal/pkg/agent/application/dispatcher/dispatcher.go +++ b/internal/pkg/agent/application/dispatcher/dispatcher.go @@ -14,6 +14,7 @@ import ( "go.elastic.co/apm" "github.com/elastic/elastic-agent/internal/pkg/agent/application/actions" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/agent/errors" "github.com/elastic/elastic-agent/internal/pkg/fleetapi" "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker" @@ -31,7 +32,7 @@ type priorityQueue interface { // Dispatcher processes actions coming from fleet api. type Dispatcher interface { - Dispatch(context.Context, acker.Acker, ...fleetapi.Action) + Dispatch(context.Context, details.Observer, acker.Acker, ...fleetapi.Action) Errors() <-chan error } @@ -101,7 +102,7 @@ func (ad *ActionDispatcher) key(a fleetapi.Action) string { // Dispatch will handle action queue operations, and retries. // Any action that implements the ScheduledAction interface may be added/removed from the queue based on StartTime. // Any action that implements the RetryableAction interface will be rescheduled if the handler returns an error. 
-func (ad *ActionDispatcher) Dispatch(ctx context.Context, acker acker.Acker, actions ...fleetapi.Action) { +func (ad *ActionDispatcher) Dispatch(ctx context.Context, detailsSetter details.Observer, acker acker.Acker, actions ...fleetapi.Action) { var err error span, ctx := apm.StartSpan(ctx, "dispatch", "app.internal") defer func() { @@ -110,6 +111,7 @@ func (ad *ActionDispatcher) Dispatch(ctx context.Context, acker acker.Acker, act }() ad.removeQueuedUpgrades(actions) + reportNextScheduledUpgrade(actions, detailsSetter, ad.log) actions = ad.queueScheduledActions(actions) actions = ad.dispatchCancelActions(ctx, actions, acker) queued, expired := ad.gatherQueuedActions(time.Now().UTC()) @@ -277,3 +279,49 @@ func (ad *ActionDispatcher) scheduleRetry(ctx context.Context, action fleetapi.R ad.log.Errorf("Unable to commit action retry (id %s) to fleet-server: %v", action.ID(), err) } } + +func reportNextScheduledUpgrade(input []fleetapi.Action, detailsSetter details.Observer, log *logger.Logger) { + var nextUpgrade *fleetapi.ActionUpgrade + for _, action := range input { + sAction, ok := action.(fleetapi.ScheduledAction) + if !ok { + continue + } + + uAction, ok := sAction.(*fleetapi.ActionUpgrade) + if !ok { + continue + } + + start, err := uAction.StartTime() + if err != nil { + log.Errorf("failed to get start time for scheduled upgrade action [id = %s]", uAction.ID()) + continue + } + + if !start.After(time.Now()) { + continue + } + + if nextUpgrade == nil { + nextUpgrade = uAction + continue + } + + nextUpgradeStartTime, _ := nextUpgrade.StartTime() + if start.Before(nextUpgradeStartTime) { + nextUpgrade = uAction + } + } + + // If there no scheduled upgrade, there are no upgrade details to report. + if nextUpgrade == nil { + return + } + + upgradeDetails := details.NewDetails(nextUpgrade.Version, details.StateScheduled, nextUpgrade.ID()) + startTime, _ := nextUpgrade.StartTime() + upgradeDetails.Metadata.ScheduledAt = startTime + + detailsSetter(upgradeDetails) +} diff --git a/internal/pkg/agent/application/dispatcher/dispatcher_test.go b/internal/pkg/agent/application/dispatcher/dispatcher_test.go index c82214f2299..212c50a3068 100644 --- a/internal/pkg/agent/application/dispatcher/dispatcher_test.go +++ b/internal/pkg/agent/application/dispatcher/dispatcher_test.go @@ -9,13 +9,17 @@ import ( "testing" "time" + "go.uber.org/zap/zapcore" + "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/agent/errors" "github.com/elastic/elastic-agent/internal/pkg/fleetapi" "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker" "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker/noop" + "github.com/elastic/elastic-agent/pkg/core/logger" ) type mockHandler struct { @@ -107,6 +111,7 @@ func (m *mockQueue) Save() error { } func TestActionDispatcher(t *testing.T) { + detailsSetter := func(upgradeDetails *details.Details) {} ack := noop.New() t.Run("Success to dispatch multiples events", func(t *testing.T) { @@ -139,7 +144,7 @@ func TestActionDispatcher(t *testing.T) { dispatchCtx, cancelFn := context.WithCancel(ctx) defer cancelFn() - go d.Dispatch(dispatchCtx, ack, action1, action2) + go d.Dispatch(dispatchCtx, detailsSetter, ack, action1, action2) if err := <-d.Errors(); err != nil { t.Fatalf("Unexpected error: %v", err) } @@ -166,7 +171,7 @@ func TestActionDispatcher(t *testing.T) { dispatchCtx, cancelFn := context.WithCancel(ctx) defer 
cancelFn() - go d.Dispatch(dispatchCtx, ack, action) + go d.Dispatch(dispatchCtx, detailsSetter, ack, action) if err := <-d.Errors(); err != nil { t.Fatalf("Unexpected error: %v", err) } @@ -216,7 +221,7 @@ func TestActionDispatcher(t *testing.T) { dispatchCtx, cancelFn := context.WithCancel(context.Background()) defer cancelFn() - go d.Dispatch(dispatchCtx, ack, action1, action2) + go d.Dispatch(dispatchCtx, detailsSetter, ack, action1, action2) if err := <-d.Errors(); err != nil { t.Fatalf("Unexpected error: %v", err) } @@ -247,7 +252,7 @@ func TestActionDispatcher(t *testing.T) { dispatchCtx, cancelFn := context.WithCancel(context.Background()) defer cancelFn() - go d.Dispatch(dispatchCtx, ack, action) + go d.Dispatch(dispatchCtx, detailsSetter, ack, action) select { case err := <-d.Errors(): t.Fatalf("Unexpected error: %v", err) @@ -287,7 +292,7 @@ func TestActionDispatcher(t *testing.T) { dispatchCtx, cancelFn := context.WithCancel(context.Background()) defer cancelFn() - go d.Dispatch(dispatchCtx, ack, action2) + go d.Dispatch(dispatchCtx, detailsSetter, ack, action2) if err := <-d.Errors(); err != nil { t.Fatalf("Unexpected error: %v", err) } @@ -310,7 +315,7 @@ func TestActionDispatcher(t *testing.T) { dispatchCtx, cancelFn := context.WithCancel(context.Background()) defer cancelFn() - go d.Dispatch(dispatchCtx, ack) + go d.Dispatch(dispatchCtx, detailsSetter, ack) select { case err := <-d.Errors(): t.Fatalf("Unexpected error: %v", err) @@ -345,7 +350,7 @@ func TestActionDispatcher(t *testing.T) { dispatchCtx, cancelFn := context.WithCancel(context.Background()) defer cancelFn() - go d.Dispatch(dispatchCtx, ack, action) + go d.Dispatch(dispatchCtx, detailsSetter, ack, action) if err := <-d.Errors(); err != nil { t.Fatalf("Unexpected error: %v", err) } @@ -379,7 +384,7 @@ func TestActionDispatcher(t *testing.T) { // launch in another routing and sleep to check if an error is generated dispatchCtx, cancelFn := context.WithCancel(context.Background()) defer cancelFn() - go d.Dispatch(dispatchCtx, ack, action1, action2) + go d.Dispatch(dispatchCtx, detailsSetter, ack, action1, action2) time.Sleep(time.Millisecond * 200) select { case <-d.Errors(): @@ -422,7 +427,7 @@ func TestActionDispatcher(t *testing.T) { // launch in another routing and sleep to check if an error is generated dispatchCtx1, cancelFn1 := context.WithCancel(context.Background()) defer cancelFn1() - go d.Dispatch(dispatchCtx1, ack, action1) + go d.Dispatch(dispatchCtx1, detailsSetter, ack, action1) select { case err := <-d.Errors(): if err == nil { @@ -433,7 +438,7 @@ func TestActionDispatcher(t *testing.T) { dispatchCtx2, cancelFn2 := context.WithCancel(context.Background()) defer cancelFn2() - go d.Dispatch(dispatchCtx2, ack, action2) + go d.Dispatch(dispatchCtx2, detailsSetter, ack, action2) select { case err := <-d.Errors(): if err != nil { @@ -484,3 +489,97 @@ func Test_ActionDispatcher_scheduleRetry(t *testing.T) { action.AssertExpectations(t) }) } + +func TestReportNextScheduledUpgrade(t *testing.T) { + now := time.Now().UTC() + later := now.Add(3 * time.Hour) + muchLater := later.Add(3 * time.Hour) + + cases := map[string]struct { + actions []fleetapi.Action + expectedDetails *details.Details + expectedErrLogMsg string + }{ + "no_scheduled_upgrades": { + actions: []fleetapi.Action{ + &fleetapi.ActionUpgrade{ + ActionID: "action1", + Version: "8.12.3", + }, + }, + expectedErrLogMsg: "failed to get start time for scheduled upgrade action [id = action1]", + }, + "one_scheduled_upgrade": { + actions: 
[]fleetapi.Action{ + &fleetapi.ActionUpgrade{ + ActionID: "action2", + ActionStartTime: later.Format(time.RFC3339), + Version: "8.13.0", + }, + }, + expectedDetails: &details.Details{ + TargetVersion: "8.13.0", + State: details.StateScheduled, + ActionID: "action2", + Metadata: details.Metadata{ + ScheduledAt: later.Truncate(time.Second), + }, + }, + }, + "many_scheduled_upgrades": { + actions: []fleetapi.Action{ + &fleetapi.ActionUpgrade{ + ActionID: "action3", + ActionStartTime: muchLater.Format(time.RFC3339), + Version: "8.14.1", + }, + &fleetapi.ActionUpgrade{ + ActionID: "action4", + ActionStartTime: later.Format(time.RFC3339), + Version: "8.13.5", + }, + }, + expectedDetails: &details.Details{ + TargetVersion: "8.13.5", + State: details.StateScheduled, + ActionID: "action4", + Metadata: details.Metadata{ + ScheduledAt: later.Truncate(time.Second), + }, + }, + }, + "invalid_time_scheduled_upgrade": { + actions: []fleetapi.Action{ + &fleetapi.ActionUpgrade{ + ActionID: "action1", + Version: "8.13.2", + ActionStartTime: "invalid", + }, + }, + expectedErrLogMsg: "failed to get start time for scheduled upgrade action [id = action1]", + }, + } + + for name, test := range cases { + t.Run(name, func(t *testing.T) { + var actualDetails *details.Details + detailsSetter := func(upgradeDetails *details.Details) { + actualDetails = upgradeDetails + } + log, obs := logger.NewTesting("report_next_upgrade_details") + + reportNextScheduledUpgrade(test.actions, detailsSetter, log) + + require.True(t, test.expectedDetails.Equals(actualDetails)) + + logs := obs.TakeAll() + if test.expectedErrLogMsg != "" { + require.Len(t, logs, 1) + require.Equal(t, zapcore.ErrorLevel, logs[0].Level) + require.Equal(t, test.expectedErrLogMsg, logs[0].Message) + } else { + require.Empty(t, logs) + } + }) + } +} diff --git a/internal/pkg/agent/application/managed_mode.go b/internal/pkg/agent/application/managed_mode.go index 9ced07b678f..524500c661a 100644 --- a/internal/pkg/agent/application/managed_mode.go +++ b/internal/pkg/agent/application/managed_mode.go @@ -17,6 +17,7 @@ import ( fleetgateway "github.com/elastic/elastic-agent/internal/pkg/agent/application/gateway/fleet" "github.com/elastic/elastic-agent/internal/pkg/agent/application/info" "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" "github.com/elastic/elastic-agent/internal/pkg/agent/errors" "github.com/elastic/elastic-agent/internal/pkg/agent/storage" @@ -163,7 +164,7 @@ func (m *managedConfigManager) Run(ctx context.Context) error { // persisted action on disk we should be able to ask Fleet to get the latest configuration. // But at the moment this is not possible because the policy change was acked. m.log.Info("restoring current policy from disk") - m.dispatcher.Dispatch(ctx, actionAcker, actions...) + m.dispatcher.Dispatch(ctx, m.coord.SetUpgradeDetails, actionAcker, actions...) stateRestored = true } @@ -229,24 +230,24 @@ func (m *managedConfigManager) Run(ctx context.Context) error { return gateway.Run(ctx) }) - go runDispatcher(ctx, m.dispatcher, gateway, actionAcker, dispatchFlushInterval) + go runDispatcher(ctx, m.dispatcher, gateway, m.coord.SetUpgradeDetails, actionAcker, dispatchFlushInterval) <-ctx.Done() return gatewayRunner.Err() } // runDispatcher passes actions collected from gateway to dispatcher or calls Dispatch with no actions every flushInterval. 
-func runDispatcher(ctx context.Context, actionDispatcher dispatcher.Dispatcher, fleetGateway coordinator.FleetGateway, actionAcker acker.Acker, flushInterval time.Duration) { +func runDispatcher(ctx context.Context, actionDispatcher dispatcher.Dispatcher, fleetGateway coordinator.FleetGateway, detailsSetter details.Observer, actionAcker acker.Acker, flushInterval time.Duration) { t := time.NewTimer(flushInterval) for { select { case <-ctx.Done(): return case <-t.C: // periodically call the dispatcher to handle scheduled actions. - actionDispatcher.Dispatch(ctx, actionAcker) + actionDispatcher.Dispatch(ctx, detailsSetter, actionAcker) t.Reset(flushInterval) case actions := <-fleetGateway.Actions(): - actionDispatcher.Dispatch(ctx, actionAcker, actions...) + actionDispatcher.Dispatch(ctx, detailsSetter, actionAcker, actions...) t.Reset(flushInterval) } } diff --git a/internal/pkg/agent/application/managed_mode_test.go b/internal/pkg/agent/application/managed_mode_test.go index 25149896ab6..e7fca7ca2b5 100644 --- a/internal/pkg/agent/application/managed_mode_test.go +++ b/internal/pkg/agent/application/managed_mode_test.go @@ -10,6 +10,7 @@ import ( "testing" "time" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/fleetapi" "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker" "github.com/elastic/elastic-agent/internal/pkg/fleetapi/client" @@ -22,8 +23,8 @@ type mockDispatcher struct { mock.Mock } -func (m *mockDispatcher) Dispatch(ctx context.Context, ack acker.Acker, actions ...fleetapi.Action) { - m.Called(ctx, ack, actions) +func (m *mockDispatcher) Dispatch(ctx context.Context, detailsSetter details.Observer, ack acker.Acker, actions ...fleetapi.Action) { + m.Called(ctx, detailsSetter, ack, actions) } func (m *mockDispatcher) Errors() <-chan error { @@ -97,7 +98,7 @@ func Test_runDispatcher(t *testing.T) { }, mockDispatcher: func() *mockDispatcher { dispatcher := &mockDispatcher{} - dispatcher.On("Dispatch", mock.Anything, mock.Anything, mock.Anything).Once() + dispatcher.On("Dispatch", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Once() return dispatcher }, interval: time.Second, @@ -110,8 +111,8 @@ func Test_runDispatcher(t *testing.T) { }, mockDispatcher: func() *mockDispatcher { dispatcher := &mockDispatcher{} - dispatcher.On("Dispatch", mock.Anything, mock.Anything, mock.Anything).Once() - dispatcher.On("Dispatch", mock.Anything, mock.Anything, mock.Anything).Maybe() // allow a second call in case there are timing issues in the CI pipeline + dispatcher.On("Dispatch", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Once() + dispatcher.On("Dispatch", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Maybe() // allow a second call in case there are timing issues in the CI pipeline return dispatcher }, interval: time.Millisecond * 60, @@ -126,11 +127,12 @@ func Test_runDispatcher(t *testing.T) { ch := make(chan []fleetapi.Action, 1) gateway := tc.mockGateway(ch) dispatcher := tc.mockDispatcher() + detailsSetter := func(upgradeDetails *details.Details) {} acker := &mockAcker{} ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond*100) defer cancel() - runDispatcher(ctx, dispatcher, gateway, acker, tc.interval) + runDispatcher(ctx, dispatcher, gateway, detailsSetter, acker, tc.interval) assert.Empty(t, ch) gateway.AssertExpectations(t) diff --git a/internal/pkg/agent/application/upgrade/details/details.go 
b/internal/pkg/agent/application/upgrade/details/details.go index 5bc1e409cad..80d039882a8 100644 --- a/internal/pkg/agent/application/upgrade/details/details.go +++ b/internal/pkg/agent/application/upgrade/details/details.go @@ -127,6 +127,24 @@ func (d *Details) RegisterObserver(observer Observer) { d.notifyObserver(observer) } +// Equals compares the non-lock fields of two Details structs. +func (d *Details) Equals(otherD *Details) bool { + // If both addresses are equal or both are nil + if d == otherD { + return true + } + + // If only one is nil but the other is not + if d == nil || otherD == nil { + return false + } + + return d.State == otherD.State && + d.TargetVersion == otherD.TargetVersion && + d.ActionID == otherD.ActionID && + d.Metadata.Equals(otherD.Metadata) +} + func (d *Details) notifyObservers() { for _, observer := range d.observers { d.notifyObserver(observer) @@ -147,6 +165,14 @@ func (d *Details) notifyObserver(observer Observer) { } } +func (m Metadata) Equals(otherM Metadata) bool { + return m.ScheduledAt.Equal(otherM.ScheduledAt) && + m.FailedState == otherM.FailedState && + m.ErrorMsg == otherM.ErrorMsg && + m.DownloadPercent == otherM.DownloadPercent && + m.DownloadRate == otherM.DownloadRate +} + func (dr *downloadRate) MarshalJSON() ([]byte, error) { downloadRateBytesPerSecond := float64(*dr) if math.IsInf(downloadRateBytesPerSecond, 0) { From 234577d6d3269890201135e983439a7ccdcc8b3c Mon Sep 17 00:00:00 2001 From: Anderson Queiroz Date: Thu, 16 Nov 2023 07:47:13 -0300 Subject: [PATCH 06/21] add provisioner to Stack and Instance Provisioners (#3734) The Stack and Instance types now have a provisioner property indicating which provisioner provisioned them. Currently it's used when saving and retrieving instances and stack from the saved state --- magefile.go | 19 +++--- ...provision.go => serverless_provisioner.go} | 25 +++++--- pkg/testing/ess/serverless_test.go | 2 +- ...{provisioner.go => statful_provisioner.go} | 27 +++++--- pkg/testing/multipass/provisioner.go | 18 ++++-- pkg/testing/ogc/provisioner.go | 22 ++++--- pkg/testing/runner/provisioner.go | 13 ++++ pkg/testing/runner/runner.go | 62 ++++++++++--------- pkg/testing/runner/runner_test.go | 48 ++++++++------ 9 files changed, 147 insertions(+), 89 deletions(-) rename pkg/testing/ess/{serverless_provision.go => serverless_provisioner.go} (90%) rename pkg/testing/ess/{provisioner.go => statful_provisioner.go} (82%) diff --git a/magefile.go b/magefile.go index c6a73d147c3..6f29cd68991 100644 --- a/magefile.go +++ b/magefile.go @@ -1772,10 +1772,14 @@ func createTestRunner(matrix bool, singleTest string, goTestFlags string, batche fmt.Printf(">>>> Using %s instance provisioner\n", instanceProvisionerMode) stackProvisionerMode := os.Getenv("STACK_PROVISIONER") if stackProvisionerMode == "" { - stackProvisionerMode = "stateful" + stackProvisionerMode = ess.ProvisionerStateful } - if stackProvisionerMode != "stateful" && stackProvisionerMode != "serverless" { - return nil, fmt.Errorf("STACK_PROVISIONER environment variable must be one of 'serverless' or 'stateful', not %s", stackProvisionerMode) + if stackProvisionerMode != ess.ProvisionerStateful && + stackProvisionerMode != ess.ProvisionerServerless { + return nil, fmt.Errorf("STACK_PROVISIONER environment variable must be one of %q or %q, not %s", + ess.ProvisionerStateful, + ess.ProvisionerServerless, + stackProvisionerMode) } fmt.Printf(">>>> Using %s stack provisioner\n", stackProvisionerMode) @@ -1831,9 +1835,9 @@ func createTestRunner(matrix bool, 
singleTest string, goTestFlags string, batche } var instanceProvisioner runner.InstanceProvisioner - if instanceProvisionerMode == "multipass" { + if instanceProvisionerMode == multipass.Name { instanceProvisioner = multipass.NewProvisioner() - } else if instanceProvisionerMode == "ogc" { + } else if instanceProvisionerMode == ogc.Name { instanceProvisioner, err = ogc.NewProvisioner(ogcCfg) if err != nil { return nil, err @@ -1848,12 +1852,13 @@ func createTestRunner(matrix bool, singleTest string, goTestFlags string, batche Region: essRegion, } var stackProvisioner runner.StackProvisioner - if stackProvisionerMode == "stateful" { + if stackProvisionerMode == ess.ProvisionerStateful { stackProvisioner, err = ess.NewProvisioner(provisionCfg) if err != nil { return nil, err } - } else if stackProvisionerMode == "serverless" { + + } else if stackProvisionerMode == ess.ProvisionerServerless { stackProvisioner, err = ess.NewServerlessProvisioner(provisionCfg) if err != nil { return nil, err diff --git a/pkg/testing/ess/serverless_provision.go b/pkg/testing/ess/serverless_provisioner.go similarity index 90% rename from pkg/testing/ess/serverless_provision.go rename to pkg/testing/ess/serverless_provisioner.go index c9656f628af..08f01479b82 100644 --- a/pkg/testing/ess/serverless_provision.go +++ b/pkg/testing/ess/serverless_provisioner.go @@ -16,8 +16,10 @@ import ( "github.com/elastic/elastic-agent/pkg/testing/runner" ) -// ServerlessProvision contains -type ServerlessProvision struct { +const ProvisionerServerless = "serverless" + +// ServerlessProvisioner contains +type ServerlessProvisioner struct { cfg ProvisionerConfig log runner.Logger } @@ -46,7 +48,7 @@ type ServerlessRegions struct { // NewServerlessProvisioner creates a new StackProvisioner instance for serverless func NewServerlessProvisioner(cfg ProvisionerConfig) (runner.StackProvisioner, error) { - prov := &ServerlessProvision{ + prov := &ServerlessProvisioner{ cfg: cfg, log: &defaultLogger{wrapped: logp.L()}, } @@ -57,13 +59,17 @@ func NewServerlessProvisioner(cfg ProvisionerConfig) (runner.StackProvisioner, e return prov, nil } +func (prov *ServerlessProvisioner) Name() string { + return ProvisionerServerless +} + // SetLogger sets the logger for the -func (prov *ServerlessProvision) SetLogger(l runner.Logger) { +func (prov *ServerlessProvisioner) SetLogger(l runner.Logger) { prov.log = l } // Create creates a stack. -func (prov *ServerlessProvision) Create(ctx context.Context, request runner.StackRequest) (runner.Stack, error) { +func (prov *ServerlessProvisioner) Create(ctx context.Context, request runner.StackRequest) (runner.Stack, error) { // allow up to 4 minutes for requests createCtx, createCancel := context.WithTimeout(ctx, 4*time.Minute) defer createCancel() @@ -82,6 +88,7 @@ func (prov *ServerlessProvision) Create(ctx context.Context, request runner.Stac } stack := runner.Stack{ ID: request.ID, + Provisioner: prov.Name(), Version: request.Version, Elasticsearch: client.proj.Endpoints.Elasticsearch, Kibana: client.proj.Endpoints.Kibana, @@ -98,7 +105,7 @@ func (prov *ServerlessProvision) Create(ctx context.Context, request runner.Stac } // WaitForReady should block until the stack is ready or the context is cancelled. 
-func (prov *ServerlessProvision) WaitForReady(ctx context.Context, stack runner.Stack) (runner.Stack, error) { +func (prov *ServerlessProvisioner) WaitForReady(ctx context.Context, stack runner.Stack) (runner.Stack, error) { deploymentID, deploymentType, err := prov.getDeploymentInfo(stack) if err != nil { return stack, fmt.Errorf("failed to get deployment info from the stack: %w", err) @@ -155,7 +162,7 @@ func (prov *ServerlessProvision) WaitForReady(ctx context.Context, stack runner. } // Delete deletes a stack. -func (prov *ServerlessProvision) Delete(ctx context.Context, stack runner.Stack) error { +func (prov *ServerlessProvisioner) Delete(ctx context.Context, stack runner.Stack) error { deploymentID, deploymentType, err := prov.getDeploymentInfo(stack) if err != nil { return fmt.Errorf("failed to get deployment info from the stack: %w", err) @@ -181,7 +188,7 @@ func (prov *ServerlessProvision) Delete(ctx context.Context, stack runner.Stack) // CheckCloudRegion checks to see if the provided region is valid for the serverless // if we have an invalid region, overwrite with a valid one. // The "normal" and serverless ESS APIs have different regions, hence why we need this. -func (prov *ServerlessProvision) CheckCloudRegion() error { +func (prov *ServerlessProvisioner) CheckCloudRegion() error { urlPath := fmt.Sprintf("%s/api/v1/serverless/regions", serverlessURL) httpHandler, err := http.NewRequestWithContext(context.Background(), "GET", urlPath, nil) @@ -231,7 +238,7 @@ func (prov *ServerlessProvision) CheckCloudRegion() error { return nil } -func (prov *ServerlessProvision) getDeploymentInfo(stack runner.Stack) (string, string, error) { +func (prov *ServerlessProvisioner) getDeploymentInfo(stack runner.Stack) (string, string, error) { if stack.Internal == nil { return "", "", fmt.Errorf("missing internal information") } diff --git a/pkg/testing/ess/serverless_test.go b/pkg/testing/ess/serverless_test.go index 2fc8e0075b1..f19bd91b7e2 100644 --- a/pkg/testing/ess/serverless_test.go +++ b/pkg/testing/ess/serverless_test.go @@ -25,7 +25,7 @@ func TestProvisionGetRegions(t *testing.T) { require.True(t, found) cfg := ProvisionerConfig{Region: "bad-region-ID", APIKey: key} - prov := &ServerlessProvision{ + prov := &ServerlessProvisioner{ cfg: cfg, log: &defaultLogger{wrapped: logp.L()}, } diff --git a/pkg/testing/ess/provisioner.go b/pkg/testing/ess/statful_provisioner.go similarity index 82% rename from pkg/testing/ess/provisioner.go rename to pkg/testing/ess/statful_provisioner.go index 47e8d9dcba2..32dc70707ef 100644 --- a/pkg/testing/ess/provisioner.go +++ b/pkg/testing/ess/statful_provisioner.go @@ -14,7 +14,9 @@ import ( "github.com/elastic/elastic-agent/pkg/testing/runner" ) -// ProvisionerConfig is the configuration for the ESS provisioner. +const ProvisionerStateful = "stateful" + +// ProvisionerConfig is the configuration for the ESS statefulProvisioner. 
type ProvisionerConfig struct { Identifier string APIKey string @@ -35,13 +37,13 @@ func (c *ProvisionerConfig) Validate() error { return nil } -type provisioner struct { +type statefulProvisioner struct { logger runner.Logger cfg ProvisionerConfig client *Client } -// NewProvisioner creates the ESS provisioner +// NewProvisioner creates the ESS stateful Provisioner func NewProvisioner(cfg ProvisionerConfig) (runner.StackProvisioner, error) { err := cfg.Validate() if err != nil { @@ -50,18 +52,22 @@ func NewProvisioner(cfg ProvisionerConfig) (runner.StackProvisioner, error) { essClient := NewClient(Config{ ApiKey: cfg.APIKey, }) - return &provisioner{ + return &statefulProvisioner{ cfg: cfg, client: essClient, }, nil } -func (p *provisioner) SetLogger(l runner.Logger) { +func (p *statefulProvisioner) Name() string { + return ProvisionerStateful +} + +func (p *statefulProvisioner) SetLogger(l runner.Logger) { p.logger = l } // Create creates a stack. -func (p *provisioner) Create(ctx context.Context, request runner.StackRequest) (runner.Stack, error) { +func (p *statefulProvisioner) Create(ctx context.Context, request runner.StackRequest) (runner.Stack, error) { // allow up to 2 minutes for request createCtx, createCancel := context.WithTimeout(ctx, 2*time.Minute) defer createCancel() @@ -78,6 +84,7 @@ func (p *provisioner) Create(ctx context.Context, request runner.StackRequest) ( } return runner.Stack{ ID: request.ID, + Provisioner: p.Name(), Version: request.Version, Elasticsearch: resp.ElasticsearchEndpoint, Kibana: resp.KibanaEndpoint, @@ -91,7 +98,7 @@ func (p *provisioner) Create(ctx context.Context, request runner.StackRequest) ( } // WaitForReady should block until the stack is ready or the context is cancelled. -func (p *provisioner) WaitForReady(ctx context.Context, stack runner.Stack) (runner.Stack, error) { +func (p *statefulProvisioner) WaitForReady(ctx context.Context, stack runner.Stack) (runner.Stack, error) { deploymentID, err := p.getDeploymentID(stack) if err != nil { return stack, fmt.Errorf("failed to get deployment ID from the stack: %w", err) @@ -112,7 +119,7 @@ func (p *provisioner) WaitForReady(ctx context.Context, stack runner.Stack) (run } // Delete deletes a stack. 
-func (p *provisioner) Delete(ctx context.Context, stack runner.Stack) error { +func (p *statefulProvisioner) Delete(ctx context.Context, stack runner.Stack) error { deploymentID, err := p.getDeploymentID(stack) if err != nil { return err @@ -126,7 +133,7 @@ func (p *provisioner) Delete(ctx context.Context, stack runner.Stack) error { return p.client.ShutdownDeployment(ctx, deploymentID) } -func (p *provisioner) createDeployment(ctx context.Context, r runner.StackRequest, tags map[string]string) (*CreateDeploymentResponse, error) { +func (p *statefulProvisioner) createDeployment(ctx context.Context, r runner.StackRequest, tags map[string]string) (*CreateDeploymentResponse, error) { ctx, cancel := context.WithTimeout(ctx, 1*time.Minute) defer cancel() @@ -158,7 +165,7 @@ func (p *provisioner) createDeployment(ctx context.Context, r runner.StackReques return resp, nil } -func (p *provisioner) getDeploymentID(stack runner.Stack) (string, error) { +func (p *statefulProvisioner) getDeploymentID(stack runner.Stack) (string, error) { if stack.Internal == nil { return "", fmt.Errorf("missing internal information") } diff --git a/pkg/testing/multipass/provisioner.go b/pkg/testing/multipass/provisioner.go index 7703dc7e335..43a18987ce0 100644 --- a/pkg/testing/multipass/provisioner.go +++ b/pkg/testing/multipass/provisioner.go @@ -24,6 +24,7 @@ import ( const ( Ubuntu = "ubuntu" + Name = "multipass" ) type provisioner struct { @@ -35,6 +36,10 @@ func NewProvisioner() runner.InstanceProvisioner { return &provisioner{} } +func (p *provisioner) Name() string { + return Name +} + func (p *provisioner) SetLogger(l runner.Logger) { p.logger = l } @@ -92,12 +97,13 @@ func (p *provisioner) Provision(ctx context.Context, cfg runner.Config, batches return nil, fmt.Errorf("instance %s is not marked as running", batch.ID) } results = append(results, runner.Instance{ - ID: batch.ID, - Name: batch.ID, - IP: mi.IPv4[0], - Username: "ubuntu", - RemotePath: "/home/ubuntu/agent", - Internal: nil, + ID: batch.ID, + Provisioner: Name, + Name: batch.ID, + IP: mi.IPv4[0], + Username: "ubuntu", + RemotePath: "/home/ubuntu/agent", + Internal: nil, }) } return results, nil diff --git a/pkg/testing/ogc/provisioner.go b/pkg/testing/ogc/provisioner.go index 74c92eb0b30..00a05459881 100644 --- a/pkg/testing/ogc/provisioner.go +++ b/pkg/testing/ogc/provisioner.go @@ -24,6 +24,7 @@ import ( const ( // LayoutIntegrationTag is the tag added to all layouts for the integration testing framework. LayoutIntegrationTag = "agent-integration" + Name = "ogc" ) type provisioner struct { @@ -42,6 +43,10 @@ func NewProvisioner(cfg Config) (runner.InstanceProvisioner, error) { }, nil } +func (p *provisioner) Name() string { + return Name +} + func (p *provisioner) SetLogger(l runner.Logger) { p.logger = l } @@ -83,8 +88,8 @@ func (p *provisioner) Provision(ctx context.Context, cfg runner.Config, batches return nil, err } if len(machines) == 0 { - // print the output so its clear what went wrong - // without this it's unclear where OGC went wrong it + // Print the output so its clear what went wrong. 
+ // Without this it's unclear where OGC went wrong, it // doesn't do a great job of reporting a clean error fmt.Fprintf(os.Stdout, "%s\n", upOutput) return nil, fmt.Errorf("ogc didn't create any machines") @@ -99,14 +104,15 @@ func (p *provisioner) Provision(ctx context.Context, cfg runner.Config, batches // without this it's unclear where OGC went wrong it // doesn't do a great job of reporting a clean error fmt.Fprintf(os.Stdout, "%s\n", upOutput) - return nil, fmt.Errorf("failed to find machine for layout ID: %s", b.ID) + return nil, fmt.Errorf("failed to find machine for batch ID: %s", b.ID) } instances = append(instances, runner.Instance{ - ID: b.ID, - Name: machine.InstanceName, - IP: machine.PublicIP, - Username: machine.Layout.Username, - RemotePath: machine.Layout.RemotePath, + ID: b.ID, + Provisioner: Name, + Name: machine.InstanceName, + IP: machine.PublicIP, + Username: machine.Layout.Username, + RemotePath: machine.Layout.RemotePath, Internal: map[string]interface{}{ "instance_id": machine.InstanceID, }, diff --git a/pkg/testing/runner/provisioner.go b/pkg/testing/runner/provisioner.go index 2708b0d204d..7a27ad6697b 100644 --- a/pkg/testing/runner/provisioner.go +++ b/pkg/testing/runner/provisioner.go @@ -18,6 +18,9 @@ type Instance struct { ID string `yaml:"id"` // Name is the nice-name of the instance. Name string `yaml:"name"` + // Provisioner is the instance provider for the instance. + // See INSTANCE_PROVISIONER environment variable for the supported Provisioner. + Provisioner string `yaml:"provisioner"` // IP is the IP address of the instance. IP string `yaml:"ip"` // Username is the username used to SSH to the instance. @@ -32,6 +35,9 @@ type Instance struct { // InstanceProvisioner performs the provisioning of instances. type InstanceProvisioner interface { + // Name returns the name of the instance provisioner. + Name() string + // SetLogger sets the logger for it to use. SetLogger(l Logger) @@ -54,6 +60,10 @@ type Stack struct { // This must be the same ID used for requesting a stack. ID string `yaml:"id"` + // Provisioner is the stack provisioner. See STACK_PROVISIONER environment + // variable for the supported provisioners. + Provisioner string `yaml:"provisioner"` + // Version is the version of the stack. Version string `yaml:"version"` @@ -89,6 +99,9 @@ type StackRequest struct { // StackProvisioner performs the provisioning of stacks. type StackProvisioner interface { + // Name returns the name of the stack provisioner. + Name() string + // SetLogger sets the logger for it to use. 
SetLogger(l Logger) diff --git a/pkg/testing/runner/runner.go b/pkg/testing/runner/runner.go index a2c77f77aa0..860ad57215e 100644 --- a/pkg/testing/runner/runner.go +++ b/pkg/testing/runner/runner.go @@ -594,23 +594,10 @@ func (r *Runner) startStacks(ctx context.Context) error { var requests []stackReq for _, version := range versions { id := strings.Replace(version, ".", "", -1) - stack, ok := r.findStack(id) - if ok { - requests = append(requests, stackReq{ - request: StackRequest{ - ID: id, - Version: version, - }, - stack: &stack, - }) - } else { - requests = append(requests, stackReq{ - request: StackRequest{ - ID: id, - Version: version, - }, - }) - } + requests = append(requests, stackReq{ + request: StackRequest{ID: id, Version: version}, + stack: r.findStack(id), + }) } reportResult := func(version string, stack Stack, err error) { @@ -706,7 +693,8 @@ func (r *Runner) findInstance(id string) (StateInstance, bool) { r.stateMx.Lock() defer r.stateMx.Unlock() for _, existing := range r.state.Instances { - if existing.ID == id { + if existing.Same(StateInstance{ + Instance: Instance{ID: id, Provisioner: r.ip.Name()}}) { return existing, true } } @@ -718,30 +706,30 @@ func (r *Runner) addOrUpdateInstance(instance StateInstance) error { defer r.stateMx.Unlock() state := r.state - existed := false + found := false for idx, existing := range state.Instances { - if existing.ID == instance.ID { + if existing.Same(instance) { state.Instances[idx] = instance - existed = true + found = true break } } - if !existed { + if !found { state.Instances = append(state.Instances, instance) } r.state = state return r.writeState() } -func (r *Runner) findStack(id string) (Stack, bool) { +func (r *Runner) findStack(id string) *Stack { r.stateMx.Lock() defer r.stateMx.Unlock() for _, existing := range r.state.Stacks { - if existing.ID == id { - return existing, true + if existing.Same(Stack{ID: id, Provisioner: r.sp.Name()}) { + return &existing } } - return Stack{}, false + return nil } func (r *Runner) addOrUpdateStack(stack Stack) error { @@ -749,15 +737,15 @@ func (r *Runner) addOrUpdateStack(stack Stack) error { defer r.stateMx.Unlock() state := r.state - existed := false + found := false for idx, existing := range state.Stacks { - if existing.ID == stack.ID { + if existing.Same(stack) { state.Stacks[idx] = stack - existed = true + found = true break } } - if !existed { + if !found { state.Stacks = append(state.Stacks, stack) } r.state = state @@ -829,6 +817,20 @@ func (r *Runner) mergeResults(results map[string]OSRunnerResult) (Result, error) return complete, nil } +// Same returns true if other is the same instance as this one. +// Two instances are considered the same if their provider and ID are the same. +func (s StateInstance) Same(other StateInstance) bool { + return s.Provisioner == other.Provisioner && + s.ID == other.ID +} + +// Same returns true if other is the same stack as this one. +// Two stacks are considered the same if their provisioner and ID are the same. 
+func (s Stack) Same(other Stack) bool { + return s.Provisioner == other.Provisioner && + s.ID == other.ID +} + func mergePackageResult(pkg OSRunnerPackageResult, batchName string, sudo bool, rawOutput io.Writer, jsonOutput io.Writer, suites *JUnitTestSuites) error { suffix := "" sudoStr := "false" diff --git a/pkg/testing/runner/runner_test.go b/pkg/testing/runner/runner_test.go index c46b3b53761..c638b291bd4 100644 --- a/pkg/testing/runner/runner_test.go +++ b/pkg/testing/runner/runner_test.go @@ -38,12 +38,13 @@ func TestNewRunner_Clean(t *testing.T) { require.NoError(t, err) i1 := Instance{ - ID: "id-1", - Name: "name-1", - IP: "127.0.0.1", - Username: "ubuntu", - RemotePath: "/home/ubuntu/agent", - Internal: map[string]interface{}{}, // ElementsMatch fails without this set + ID: "id-1", + Name: "name-1", + Provisioner: ip.Name(), + IP: "127.0.0.1", + Username: "ubuntu", + RemotePath: "/home/ubuntu/agent", + Internal: map[string]interface{}{}, // ElementsMatch fails without this set } err = r.addOrUpdateInstance(StateInstance{ Instance: i1, @@ -51,12 +52,13 @@ func TestNewRunner_Clean(t *testing.T) { }) require.NoError(t, err) i2 := Instance{ - ID: "id-2", - Name: "name-2", - IP: "127.0.0.2", - Username: "ubuntu", - RemotePath: "/home/ubuntu/agent", - Internal: map[string]interface{}{}, // ElementsMatch fails without this set + ID: "id-2", + Name: "name-2", + Provisioner: ip.Name(), + IP: "127.0.0.2", + Username: "ubuntu", + RemotePath: "/home/ubuntu/agent", + Internal: map[string]interface{}{}, // ElementsMatch fails without this set } err = r.addOrUpdateInstance(StateInstance{ Instance: i2, @@ -64,16 +66,18 @@ func TestNewRunner_Clean(t *testing.T) { }) require.NoError(t, err) s1 := Stack{ - ID: "id-1", - Version: "8.10.0", - Internal: map[string]interface{}{}, // ElementsMatch fails without this set + ID: "id-1", + Provisioner: sp.Name(), + Version: "8.10.0", + Internal: map[string]interface{}{}, // ElementsMatch fails without this set } err = r.addOrUpdateStack(s1) require.NoError(t, err) s2 := Stack{ - ID: "id-2", - Version: "8.9.0", - Internal: map[string]interface{}{}, // ElementsMatch fails without this set + ID: "id-2", + Provisioner: sp.Name(), + Version: "8.9.0", + Internal: map[string]interface{}{}, // ElementsMatch fails without this set } err = r.addOrUpdateStack(s2) require.NoError(t, err) @@ -95,6 +99,10 @@ type fakeInstanceProvisioner struct { instances []Instance } +func (f *fakeInstanceProvisioner) Name() string { + return "fake" +} + func (f *fakeInstanceProvisioner) SetLogger(_ Logger) { } @@ -129,6 +137,10 @@ type fakeStackProvisioner struct { deletedStacks []Stack } +func (f *fakeStackProvisioner) Name() string { + return "fake" +} + func (f *fakeStackProvisioner) SetLogger(_ Logger) { } From d58ac9cb9183317ea0bc58bc6d1e38244000b5a3 Mon Sep 17 00:00:00 2001 From: Shaunak Kashyap Date: Thu, 16 Nov 2023 18:32:18 +0530 Subject: [PATCH 07/21] [Upgrade Details] Implement tracking of `UPG_ROLLBACK` state (#3694) * Remove context and handle cancellation internally instead * More optimizations * Add back context * Adding FSM for upgrades * Implementing TODO * WIP * WIP * Reorganizing imports * Running go mod tidy * Remove Fleet changes * Fixing booboos introduced during conflict resolution * Add nil guard * Setting logger in test * Regenerating protobuf implementation files * Newline fixes * Generating protobuf implementations * Running mage update * Add TODOs * Serialize upgrade details to upgrade marker before starting upgrade watcher * Refactoring: move watch function 
from coordinator -> upgrade package * WIP: unit test * Refactoring: make LoadMarker testable * Fix unit test * Consume from errorCh * Make watchMarker implementation more robust * Clarify godoc * Export SaveMarker * Set UPG_ROLLBACK state * Update integration test to check for UPG_ROLLBACK state after rolling back * Fix test * Add FIXME comment * Running make notice * Avoid data race in test * Remove old comment * Introduce MarkerWatcher * Add upgradeMarkerDirPath to error * Make unit tests pass * Remove FIXME * Initialize upgrade details in marker if needed * Fix tests * [Debugging] Log statements at INFO level * [Debugging] More debug logging * [Debugging] More logging * Fix logic in error check * Removing debugging changes * Fixing data type * Resolve conflicts * Add nil checks * Fix comments formatting. Co-authored-by: Anderson Queiroz * Consistency + readability * Simplifying marker watch implementation * Expand comment * Fix test * Simplify pointer checks + comments * Simplify bitwise logic * Refactoring marker access in preparation for Windows-specific retry logic * Replacing deprecated function calls * Implement retries for marker file access on Windows * Marker CreateMarkerTestDir as test helper * Fix windows linter errors * Simplify implementation * Close fsnotify watcher when goroutine exits * Remove createMarkerTestDir helper function * Check error in test * Prevent timeout * Add warning if upgrade details are unexpectedly missing * Fallback for recording UPG_ROLLBACK * Clear upgrade details upon successful upgrade * Fix fallback logic for recording UPG_ROLLBACK state * Add + update unit tests * Running mage fmt * Remove data races in test code * Reduce max timeout for retry operation * Implement non-existent marker file check into OS-specific read implementations --------- Co-authored-by: Anderson Queiroz --- NOTICE.txt | 76 +++--- go.mod | 2 +- .../handlers/handler_action_upgrade_test.go | 5 + internal/pkg/agent/application/application.go | 5 +- .../application/coordinator/coordinator.go | 53 ++++- .../coordinator/coordinator_test.go | 39 +-- .../coordinator/coordinator_unit_test.go | 13 +- .../application/upgrade/details/details.go | 1 + .../upgrade/details/details_test.go | 23 ++ .../upgrade/marker_access_other.go | 30 +++ .../upgrade/marker_access_windows.go | 76 ++++++ .../application/upgrade/marker_watcher.go | 132 ++++++++++ .../upgrade/marker_watcher_test.go | 225 ++++++++++++++++++ .../application/upgrade/step_download_test.go | 16 +- .../agent/application/upgrade/step_mark.go | 37 ++- .../pkg/agent/application/upgrade/upgrade.go | 25 +- internal/pkg/agent/cmd/watch.go | 46 +++- testing/integration/upgrade_rollback_test.go | 35 +++ 18 files changed, 743 insertions(+), 96 deletions(-) create mode 100644 internal/pkg/agent/application/upgrade/marker_access_other.go create mode 100644 internal/pkg/agent/application/upgrade/marker_access_windows.go create mode 100644 internal/pkg/agent/application/upgrade/marker_watcher.go create mode 100644 internal/pkg/agent/application/upgrade/marker_watcher_test.go diff --git a/NOTICE.txt b/NOTICE.txt index 11fa3452084..a6e55930766 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -2673,6 +2673,44 @@ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+-------------------------------------------------------------------------------- +Dependency : github.com/adriansr/fsnotify +Version: v1.4.8-0.20211018144411-a81f2b630e7c +Licence type (autodetected): BSD-3-Clause +-------------------------------------------------------------------------------- + +Contents of probable licence file $GOMODCACHE/github.com/adriansr/fsnotify@v1.4.8-0.20211018144411-a81f2b630e7c/LICENSE: + +Copyright (c) 2012 The Go Authors. All rights reserved. +Copyright (c) 2012 fsnotify Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + -------------------------------------------------------------------------------- Dependency : github.com/gofrs/flock Version: v0.8.1 @@ -10185,44 +10223,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --------------------------------------------------------------------------------- -Dependency : github.com/adriansr/fsnotify -Version: v1.4.8-0.20211018144411-a81f2b630e7c -Licence type (autodetected): BSD-3-Clause --------------------------------------------------------------------------------- - -Contents of probable licence file $GOMODCACHE/github.com/adriansr/fsnotify@v1.4.8-0.20211018144411-a81f2b630e7c/LICENSE: - -Copyright (c) 2012 The Go Authors. All rights reserved. -Copyright (c) 2012 fsnotify Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -------------------------------------------------------------------------------- Dependency : github.com/ghodss/yaml Version: v1.0.0 diff --git a/go.mod b/go.mod index 2016effc622..897874a5261 100644 --- a/go.mod +++ b/go.mod @@ -22,6 +22,7 @@ require ( github.com/elastic/go-sysinfo v1.11.1 github.com/elastic/go-ucfg v0.8.6 github.com/fatih/color v1.15.0 + github.com/fsnotify/fsnotify v1.6.0 github.com/gofrs/flock v0.8.1 github.com/gofrs/uuid v4.4.0+incompatible github.com/google/go-cmp v0.5.9 @@ -92,7 +93,6 @@ require ( github.com/elastic/gosigar v0.14.2 // indirect github.com/emicklei/go-restful/v3 v3.10.1 // indirect github.com/evanphx/json-patch v5.6.0+incompatible // indirect - github.com/fsnotify/fsnotify v1.6.0 // indirect github.com/ghodss/yaml v1.0.0 // indirect github.com/go-logr/logr v1.2.3 // indirect github.com/go-ole/go-ole v1.2.6 // indirect diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_upgrade_test.go b/internal/pkg/agent/application/actions/handlers/handler_action_upgrade_test.go index b917b00b900..01377e43313 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_upgrade_test.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_upgrade_test.go @@ -14,6 +14,7 @@ import ( "github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator" "github.com/elastic/elastic-agent/internal/pkg/agent/application/info" "github.com/elastic/elastic-agent/internal/pkg/agent/application/reexec" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade" "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" "github.com/elastic/elastic-agent/internal/pkg/config" @@ -51,6 +52,10 @@ func (u *mockUpgradeManager) Ack(ctx context.Context, acker acker.Acker) error { return nil } +func (u *mockUpgradeManager) MarkerWatcher() upgrade.MarkerWatcher { + return nil +} + func TestUpgradeHandler(t *testing.T) { // Create a cancellable context that will shut down the coordinator after // the test. 
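The mock above satisfies the new MarkerWatcher requirement by returning nil, which only works because the coordinator nil-checks the watcher before wiring it up (see the coordinator.go changes below). For tests that want to exercise the wiring, a no-op double that satisfies the interface outright also works; a minimal sketch, assuming only the MarkerWatcher interface (Watch and Run) that this patch introduces, with hypothetical names:

// noopMarkerWatcher is a hypothetical stand-in for upgrade.MarkerWatcher.
type noopMarkerWatcher struct {
	ch chan upgrade.UpdateMarker
}

func newNoopMarkerWatcher() *noopMarkerWatcher {
	return &noopMarkerWatcher{ch: make(chan upgrade.UpdateMarker)}
}

// Watch returns a channel that never receives, so the coordinator's
// run loop simply never sees a marker update.
func (w *noopMarkerWatcher) Watch() <-chan upgrade.UpdateMarker { return w.ch }

// Run returns immediately; the real MarkerFileWatcher starts an
// fsnotify-backed goroutine here instead.
func (w *noopMarkerWatcher) Run(_ context.Context) error { return nil }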
diff --git a/internal/pkg/agent/application/application.go b/internal/pkg/agent/application/application.go index 6ca33c35c05..1f8853120e3 100644 --- a/internal/pkg/agent/application/application.go +++ b/internal/pkg/agent/application/application.go @@ -101,7 +101,10 @@ func New( // monitoring is not supported in bootstrap mode https://github.com/elastic/elastic-agent/issues/1761 isMonitoringSupported := !disableMonitoring && cfg.Settings.V1MonitoringEnabled - upgrader := upgrade.NewUpgrader(log, cfg.Settings.DownloadConfig, agentInfo) + upgrader, err := upgrade.NewUpgrader(log, cfg.Settings.DownloadConfig, agentInfo) + if err != nil { + return nil, nil, nil, fmt.Errorf("failed to create upgrader: %w", err) + } monitor := monitoring.New(isMonitoringSupported, cfg.Settings.DownloadConfig.OS(), cfg.Settings.MonitoringConfig, agentInfo) runtime, err := runtime.NewManager( diff --git a/internal/pkg/agent/application/coordinator/coordinator.go b/internal/pkg/agent/application/coordinator/coordinator.go index 1bbfde40a20..c33cc700729 100644 --- a/internal/pkg/agent/application/coordinator/coordinator.go +++ b/internal/pkg/agent/application/coordinator/coordinator.go @@ -21,6 +21,7 @@ import ( "github.com/elastic/elastic-agent/internal/pkg/agent/application/info" "github.com/elastic/elastic-agent/internal/pkg/agent/application/reexec" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade" "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" "github.com/elastic/elastic-agent/internal/pkg/agent/transpiler" @@ -66,6 +67,9 @@ type UpgradeManager interface { // Ack is used on startup to check if the agent has upgraded and needs to send an ack for the action Ack(ctx context.Context, acker acker.Acker) error + + // MarkerWatcher returns a watcher for the upgrade marker. + MarkerWatcher() upgrade.MarkerWatcher } // MonitorManager provides an interface to perform the monitoring action for the agent. @@ -291,6 +295,8 @@ type managerChans struct { varsManagerUpdate <-chan []*transpiler.Vars varsManagerError <-chan error + + upgradeMarkerUpdate <-chan upgrade.UpdateMarker } // New creates a new coordinator. @@ -372,6 +378,9 @@ func New(logger *logger.Logger, cfg *configuration.Configuration, logLevel logp. c.managerChans.varsManagerUpdate = varsMgr.Watch() c.managerChans.varsManagerError = varsMgr.Errors() } + if upgradeMgr != nil && upgradeMgr.MarkerWatcher() != nil { + c.managerChans.upgradeMarkerUpdate = upgradeMgr.MarkerWatcher().Watch() + } return c } @@ -876,6 +885,18 @@ func (c *Coordinator) runner(ctx context.Context) error { varsErrCh <- nil } + upgradeMarkerWatcherErrCh := make(chan error, 1) + if c.upgradeMgr != nil && c.upgradeMgr.MarkerWatcher() != nil { + err := c.upgradeMgr.MarkerWatcher().Run(ctx) + if err != nil { + upgradeMarkerWatcherErrCh <- err + } else { + upgradeMarkerWatcherErrCh <- nil + } + } else { + upgradeMarkerWatcherErrCh <- nil + } + // Keep looping until the context ends. for ctx.Err() == nil { c.runLoopIteration(ctx) @@ -883,7 +904,7 @@ func (c *Coordinator) runner(ctx context.Context) error { // If we got fatal errors from any of the managers, return them. // Otherwise, just return the context's closing error. 
- err := collectManagerErrors(managerShutdownTimeout, varsErrCh, runtimeErrCh, configErrCh) + err := collectManagerErrors(managerShutdownTimeout, varsErrCh, runtimeErrCh, configErrCh, upgradeMarkerWatcherErrCh) if err != nil { c.logger.Debugf("Manager errors on Coordinator shutdown: %v", err.Error()) return err @@ -968,6 +989,11 @@ func (c *Coordinator) runLoopIteration(ctx context.Context) { if ctx.Err() == nil { c.processLogLevel(ctx, ll) } + + case upgradeMarker := <-c.managerChans.upgradeMarkerUpdate: + if ctx.Err() == nil { + c.setUpgradeDetails(upgradeMarker.Details) + } } // At the end of each iteration, if we made any changes to the state, @@ -1219,19 +1245,20 @@ func (c *Coordinator) filterByCapabilities(comps []component.Component) []compon } // collectManagerErrors listens on the shutdown channels for the -// runtime, config, and vars managers, and waits for up to -// the specified timeout for them to report their final status. +// runtime, config, and vars managers as well as the upgrade marker +// watcher and waits for up to the specified timeout for them to +// report their final status. // It returns any resulting errors as a multierror, or nil if no errors // were reported. // Called on the main Coordinator goroutine. -func collectManagerErrors(timeout time.Duration, varsErrCh, runtimeErrCh, configErrCh chan error) error { - var runtimeErr error - var configErr error - var varsErr error +func collectManagerErrors(timeout time.Duration, varsErrCh, runtimeErrCh, configErrCh, upgradeMarkerWatcherErrCh chan error) error { + var runtimeErr, configErr, varsErr, upgradeMarkerWatcherErr error + var returnedRuntime, returnedConfig, returnedVars, returnedUpgradeMarkerWatcher bool + // in case other components are locked up, let us time out timeoutWait := time.NewTimer(timeout) defer timeoutWait.Stop() - var returnedRuntime, returnedConfig, returnedVars bool + /* Wait for all managers to gently shut down. 
All managers send an error status on their termination channel after their Run method @@ -1248,7 +1275,7 @@ func collectManagerErrors(timeout time.Duration, varsErrCh, runtimeErrCh, config var combinedErr error waitLoop: - for !returnedRuntime || !returnedConfig || !returnedVars { + for !returnedRuntime || !returnedConfig || !returnedVars || !returnedUpgradeMarkerWatcher { select { case runtimeErr = <-runtimeErrCh: returnedRuntime = true @@ -1256,6 +1283,8 @@ waitLoop: returnedConfig = true case varsErr = <-varsErrCh: returnedVars = true + case upgradeMarkerWatcherErr = <-upgradeMarkerWatcherErrCh: + returnedUpgradeMarkerWatcher = true case <-timeoutWait.C: var timeouts []string if !returnedRuntime { @@ -1267,6 +1296,9 @@ waitLoop: if !returnedVars { timeouts = append(timeouts, "no response from vars manager") } + if !returnedUpgradeMarkerWatcher { + timeouts = append(timeouts, "no response from upgrade marker watcher") + } timeoutStr := strings.Join(timeouts, ", ") combinedErr = multierror.Append(combinedErr, fmt.Errorf("timeout while waiting for managers to shut down: %v", timeoutStr)) break waitLoop @@ -1281,6 +1313,9 @@ waitLoop: if varsErr != nil && !errors.Is(varsErr, context.Canceled) { combinedErr = multierror.Append(combinedErr, fmt.Errorf("vars manager: %w", varsErr)) } + if upgradeMarkerWatcherErr != nil && !errors.Is(upgradeMarkerWatcherErr, context.Canceled) { + combinedErr = multierror.Append(combinedErr, fmt.Errorf("upgrade marker watcher: %w", upgradeMarkerWatcherErr)) + } return combinedErr } diff --git a/internal/pkg/agent/application/coordinator/coordinator_test.go b/internal/pkg/agent/application/coordinator/coordinator_test.go index 074e5cf6e74..676906bbde0 100644 --- a/internal/pkg/agent/application/coordinator/coordinator_test.go +++ b/internal/pkg/agent/application/coordinator/coordinator_test.go @@ -15,15 +15,11 @@ import ( "testing" "time" - "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" - - "github.com/stretchr/testify/assert" - "github.com/elastic/elastic-agent-client/v7/pkg/client" - agentclient "github.com/elastic/elastic-agent/pkg/control/v2/client" - "github.com/elastic/elastic-agent/pkg/control/v2/cproto" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.elastic.co/apm/apmtest" "github.com/elastic/elastic-agent-libs/logp" @@ -31,6 +27,8 @@ import ( "github.com/elastic/elastic-agent/internal/pkg/agent/application/info" "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" "github.com/elastic/elastic-agent/internal/pkg/agent/application/reexec" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" "github.com/elastic/elastic-agent/internal/pkg/agent/transpiler" "github.com/elastic/elastic-agent/internal/pkg/capabilities" @@ -39,6 +37,8 @@ import ( "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker" "github.com/elastic/elastic-agent/pkg/component" "github.com/elastic/elastic-agent/pkg/component/runtime" + agentclient "github.com/elastic/elastic-agent/pkg/control/v2/client" + "github.com/elastic/elastic-agent/pkg/control/v2/cproto" "github.com/elastic/elastic-agent/pkg/core/logger" ) @@ -321,7 +321,7 @@ func TestCoordinator_StateSubscribe(t *testing.T) { } func TestCollectManagerErrorsTimeout(t *testing.T) { - handlerChan, _, _, _ := setupManagerShutdownChannels(time.Millisecond) + handlerChan, 
_, _, _, _ := setupManagerShutdownChannels(time.Millisecond) // Don't send anything to the shutdown channels, causing a timeout // in collectManagerErrors waitAndTestError(t, func(err error) bool { @@ -332,7 +332,7 @@ func TestCollectManagerErrorsTimeout(t *testing.T) { } func TestCollectManagerErrorsOneResponse(t *testing.T) { - handlerChan, _, _, config := setupManagerShutdownChannels(10 * time.Millisecond) + handlerChan, _, _, config, _ := setupManagerShutdownChannels(10 * time.Millisecond) // Send an error for the config manager -- we should also get a // timeout error since we don't send anything on the other two channels. @@ -348,26 +348,30 @@ func TestCollectManagerErrorsOneResponse(t *testing.T) { } func TestCollectManagerErrorsAllResponses(t *testing.T) { - handlerChan, runtime, varWatcher, config := setupManagerShutdownChannels(5 * time.Second) + handlerChan, runtime, varWatcher, config, upgradeMarkerWatcher := setupManagerShutdownChannels(5 * time.Second) runtimeErrStr := "runtime error" varsErrStr := "vars error" + upgradeMarkerWatcherErrStr := "upgrade marker watcher error" runtime <- errors.New(runtimeErrStr) varWatcher <- errors.New(varsErrStr) config <- nil + upgradeMarkerWatcher <- errors.New(upgradeMarkerWatcherErrStr) waitAndTestError(t, func(err error) bool { return err != nil && - strings.Contains(err.Error(), "2 errors occurred") && + strings.Contains(err.Error(), "3 errors occurred") && strings.Contains(err.Error(), runtimeErrStr) && - strings.Contains(err.Error(), varsErrStr) + strings.Contains(err.Error(), varsErrStr) && + strings.Contains(err.Error(), upgradeMarkerWatcherErrStr) }, handlerChan) } func TestCollectManagerErrorsAllResponsesNoErrors(t *testing.T) { - handlerChan, runtime, varWatcher, config := setupManagerShutdownChannels(5 * time.Second) + handlerChan, runtime, varWatcher, config, upgradeMarkerWatcher := setupManagerShutdownChannels(5 * time.Second) runtime <- nil varWatcher <- nil config <- context.Canceled + upgradeMarkerWatcher <- nil // All errors are nil or context.Canceled, so collectManagerErrors // should also return nil. 
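The shutdown fan-in being tested here is easy to get subtly wrong, so a standalone sketch of the same shape may help. This is illustrative only, not the production collectManagerErrors (which selects on each named channel directly rather than merging); it assumes imports of context, errors, fmt, and time:

// collectErrs fans n shutdown channels into one and waits for n results or a
// timeout; it mirrors the shape of collectManagerErrors.
func collectErrs(timeout time.Duration, chs ...<-chan error) []error {
	merged := make(chan error, len(chs)) // buffered so forwarders never block on send
	for _, ch := range chs {
		ch := ch
		go func() { merged <- <-ch }() // may outlive a timeout if a manager never reports
	}

	deadline := time.NewTimer(timeout)
	defer deadline.Stop()

	var errs []error
	for i := 0; i < len(chs); i++ {
		select {
		case err := <-merged:
			// context.Canceled just means a clean shutdown, so it is not an error.
			if err != nil && !errors.Is(err, context.Canceled) {
				errs = append(errs, err)
			}
		case <-deadline.C:
			return append(errs, fmt.Errorf("timed out waiting for %d manager(s)", len(chs)-i))
		}
	}
	return errs
}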
@@ -397,18 +401,19 @@ func waitAndTestError(t *testing.T, check func(error) bool, handlerErr chan erro } } -func setupManagerShutdownChannels(timeout time.Duration) (chan error, chan error, chan error, chan error) { +func setupManagerShutdownChannels(timeout time.Duration) (chan error, chan error, chan error, chan error, chan error) { runtime := make(chan error) varWatcher := make(chan error) config := make(chan error) + upgradeMarkerWatcher := make(chan error) handlerChan := make(chan error) go func() { - handlerErr := collectManagerErrors(timeout, varWatcher, runtime, config) + handlerErr := collectManagerErrors(timeout, varWatcher, runtime, config, upgradeMarkerWatcher) handlerChan <- handlerErr }() - return handlerChan, runtime, varWatcher, config + return handlerChan, runtime, varWatcher, config, upgradeMarkerWatcher } func TestCoordinator_ReExec(t *testing.T) { @@ -649,6 +654,10 @@ func (f *fakeUpgradeManager) Ack(ctx context.Context, acker acker.Acker) error { return nil } +func (f *fakeUpgradeManager) MarkerWatcher() upgrade.MarkerWatcher { + return nil +} + type testMonitoringManager struct{} func newTestMonitoringMgr() *testMonitoringManager { return &testMonitoringManager{} } diff --git a/internal/pkg/agent/application/coordinator/coordinator_unit_test.go b/internal/pkg/agent/application/coordinator/coordinator_unit_test.go index fd4034f24c4..956eb33b8c0 100644 --- a/internal/pkg/agent/application/coordinator/coordinator_unit_test.go +++ b/internal/pkg/agent/application/coordinator/coordinator_unit_test.go @@ -334,6 +334,13 @@ func TestCoordinatorReportsInvalidPolicy(t *testing.T) { defer cancel() logger := logp.NewLogger("testing") + upgradeMgr, err := upgrade.NewUpgrader( + logger, + &artifact.Config{}, + &info.AgentInfo{}, + ) + require.NoError(t, err) + // Channels have buffer length 1 so we don't have to run on multiple // goroutines. stateChan := make(chan State, 1) @@ -353,11 +360,7 @@ func TestCoordinatorReportsInvalidPolicy(t *testing.T) { // Policy changes are sent to the upgrade manager, which scans it // for updated artifact URIs. We take advantage of this for the // test by sending an invalid artifact URI to trigger an error. - upgradeMgr: upgrade.NewUpgrader( - logger, - &artifact.Config{}, - &info.AgentInfo{}, - ), + upgradeMgr: upgradeMgr, // Add a placeholder runtime manager that will accept any updates runtimeMgr: &fakeRuntimeManager{}, diff --git a/internal/pkg/agent/application/upgrade/details/details.go b/internal/pkg/agent/application/upgrade/details/details.go index 80d039882a8..884df46a3cc 100644 --- a/internal/pkg/agent/application/upgrade/details/details.go +++ b/internal/pkg/agent/application/upgrade/details/details.go @@ -68,6 +68,7 @@ func NewDetails(targetVersion string, initialState State, actionID string) *Deta // SetState is a convenience method to set the state of the upgrade and // notify all observers. +// Do NOT call SetState with StateFailed; call the Fail method instead. 
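+//
+// A hypothetical usage sketch (not part of this change):
+//
+//	det := NewDetails("8.12.0", StateRequested, "action-id")
+//	det.SetState(StateDownloading) // normal forward transition
+//	det.Fail(err)                  // on error: records StateFailed and the error message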
 func (d *Details) SetState(s State) {
 	d.mu.Lock()
 	defer d.mu.Unlock()
diff --git a/internal/pkg/agent/application/upgrade/details/details_test.go b/internal/pkg/agent/application/upgrade/details/details_test.go
index 670b40705de..2ecd933e9f8 100644
--- a/internal/pkg/agent/application/upgrade/details/details_test.go
+++ b/internal/pkg/agent/application/upgrade/details/details_test.go
@@ -98,3 +98,26 @@ func TestDetailsDownloadRateJSON(t *testing.T) {
 		require.Equal(t, 0.99, unmarshalledDetails.Metadata.DownloadPercent)
 	})
 }
+
+func TestEquals(t *testing.T) {
+	details1 := NewDetails("8.12.0", StateDownloading, "foobar")
+	details1.SetDownloadProgress(0.1234, 34.56)
+	details1.Fail(errors.New("download failed"))
+
+	details2 := NewDetails("8.12.0", StateDownloading, "foobar")
+	details2.SetDownloadProgress(0.1234, 34.56)
+	details2.Fail(errors.New("download failed"))
+
+	details3 := NewDetails("8.12.0", StateDownloading, "foobar")
+
+	require.True(t, details1.Equals(details2))
+	require.False(t, details1.Equals(details3))
+
+	// Nil checks
+	var details4 *Details
+	var details5 *Details
+
+	require.True(t, details4.Equals(details5))
+	require.False(t, details1.Equals(details4))
+	require.False(t, details4.Equals(details1))
+}
diff --git a/internal/pkg/agent/application/upgrade/marker_access_other.go b/internal/pkg/agent/application/upgrade/marker_access_other.go
new file mode 100644
index 00000000000..ed854160e94
--- /dev/null
+++ b/internal/pkg/agent/application/upgrade/marker_access_other.go
@@ -0,0 +1,30 @@
+// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+// or more contributor license agreements. Licensed under the Elastic License;
+// you may not use this file except in compliance with the Elastic License.
+
+//go:build !windows
+
+package upgrade
+
+import (
+	"os"
+
+	"github.com/elastic/elastic-agent/internal/pkg/agent/errors"
+)
+
+// On non-Windows platforms, readMarkerFile simply reads the marker file.
+// See marker_access_windows.go for behavior on Windows platforms.
+func readMarkerFile(markerFile string) ([]byte, error) {
+	markerFileBytes, err := os.ReadFile(markerFile)
+	if errors.Is(err, os.ErrNotExist) {
+		// marker doesn't exist, nothing to do
+		return nil, nil
+	}
+	return markerFileBytes, err
+}
+
+// On non-Windows platforms, writeMarkerFile simply writes the marker file.
+// See marker_access_windows.go for behavior on Windows platforms.
+func writeMarkerFile(markerFile string, markerBytes []byte) error {
+	return os.WriteFile(markerFile, markerBytes, 0600)
+}
diff --git a/internal/pkg/agent/application/upgrade/marker_access_windows.go b/internal/pkg/agent/application/upgrade/marker_access_windows.go
new file mode 100644
index 00000000000..673a57eeabf
--- /dev/null
+++ b/internal/pkg/agent/application/upgrade/marker_access_windows.go
@@ -0,0 +1,76 @@
+// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+// or more contributor license agreements. Licensed under the Elastic License;
+// you may not use this file except in compliance with the Elastic License.
+ +package upgrade + +import ( + "context" + "fmt" + "os" + "time" + + "github.com/elastic/elastic-agent/internal/pkg/agent/errors" + + "github.com/cenkalti/backoff/v4" +) + +const markerAccessTimeout = 10 * time.Second +const markerAccessBackoffInitialInterval = 50 * time.Millisecond +const minMarkerAccessRetries = 5 + +// On Windows, readMarkerFile tries to read the marker file, retrying with +// randomized exponential backoff up to markerAccessTimeout duration. This retry +// mechanism is necessary since the marker file could be accessed by multiple +// processes (the Upgrade Watcher and the main Agent process) at the same time, +// which could fail on Windows. +func readMarkerFile(markerFile string) ([]byte, error) { + var markerFileBytes []byte + readFn := func() error { + var err error + markerFileBytes, err = os.ReadFile(markerFile) + if errors.Is(err, os.ErrNotExist) { + // marker doesn't exist, nothing to do + return nil + } + + return err + } + + if err := accessMarkerFileWithRetries(readFn); err != nil { + return nil, fmt.Errorf("failed to read upgrade marker file [%s] despite retrying: %w", markerFile, err) + } + + return markerFileBytes, nil +} + +// On Windows, writeMarkerFile tries to write the marker file, retrying with +// randomized exponential backoff up to markerAccessTimeout duration. This retry +// mechanism is necessary since the marker file could be accessed by multiple +// processes (the Upgrade Watcher and the main Agent process) at the same time, +// which could fail on Windows. +func writeMarkerFile(markerFile string, markerBytes []byte) error { + writeFn := func() error { + return os.WriteFile(markerFile, markerBytes, 0600) + } + + if err := accessMarkerFileWithRetries(writeFn); err != nil { + return fmt.Errorf("failed to write upgrade marker file [%s] despite retrying: %w", markerFile, err) + } + + return nil +} + +func accessMarkerFileWithRetries(accessFn func() error) error { + expBackoff := backoff.NewExponentialBackOff() + expBackoff.InitialInterval = markerAccessBackoffInitialInterval + expBackoff.MaxInterval = markerAccessTimeout / minMarkerAccessRetries + expBackoff.MaxElapsedTime = markerAccessTimeout + + ctx, cancel := context.WithTimeout(context.Background(), markerAccessTimeout) + defer cancel() + + expBackoffWithTimeout := backoff.WithContext(expBackoff, ctx) + + return backoff.Retry(accessFn, expBackoffWithTimeout) +} diff --git a/internal/pkg/agent/application/upgrade/marker_watcher.go b/internal/pkg/agent/application/upgrade/marker_watcher.go new file mode 100644 index 00000000000..faefc3e64b2 --- /dev/null +++ b/internal/pkg/agent/application/upgrade/marker_watcher.go @@ -0,0 +1,132 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. 
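+//
+// Note: on Windows the marker read/write helpers above retry with randomized
+// exponential backoff. A hypothetical sketch of the call pattern they wrap:
+//
+//	err := accessMarkerFileWithRetries(func() error {
+//		return os.WriteFile(markerFile, data, 0600) // retried until markerAccessTimeout (10s)
+//	})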
+ +package upgrade + +import ( + "context" + "fmt" + "path/filepath" + + "github.com/fsnotify/fsnotify" + + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" + "github.com/elastic/elastic-agent/pkg/core/logger" + "github.com/elastic/elastic-agent/version" +) + +type MarkerWatcher interface { + Watch() <-chan UpdateMarker + Run(ctx context.Context) error +} + +type MarkerFileWatcher struct { + markerFilePath string + logger *logger.Logger + updateCh chan UpdateMarker +} + +func newMarkerFileWatcher(upgradeMarkerFilePath string, logger *logger.Logger) MarkerWatcher { + logger = logger.Named("marker_file_watcher") + + return &MarkerFileWatcher{ + markerFilePath: upgradeMarkerFilePath, + logger: logger, + updateCh: make(chan UpdateMarker), + } +} + +func (mfw *MarkerFileWatcher) Watch() <-chan UpdateMarker { + return mfw.updateCh +} + +func (mfw *MarkerFileWatcher) Run(ctx context.Context) error { + watcher, err := fsnotify.NewWatcher() + if err != nil { + return fmt.Errorf("failed to create upgrade marker watcher: %w", err) + } + + // Watch the upgrade marker file's directory, not the file itself, so we + // notice the file even if it's deleted and recreated. + upgradeMarkerDirPath := filepath.Dir(mfw.markerFilePath) + if err := watcher.Add(upgradeMarkerDirPath); err != nil { + return fmt.Errorf("failed to set watch on upgrade marker's directory [%s]: %w", upgradeMarkerDirPath, err) + } + + // Do an initial read from the upgrade marker file, in case the file + // is already present before the watching starts. + doInitialRead := make(chan struct{}, 1) + doInitialRead <- struct{}{} + + // Handle watching + go func() { + defer watcher.Close() + for { + select { + case <-ctx.Done(): + return + case err, ok := <-watcher.Errors: + if !ok { // Channel was closed (i.e. Watcher.Close() was called). + mfw.logger.Debug("fsnotify.Watcher's error channel was closed") + return + } + mfw.logger.Errorf("upgrade marker watch returned error: %s", err) + continue + case e, ok := <-watcher.Events: + if !ok { // Channel was closed (i.e. Watcher.Close() was called). + mfw.logger.Debug("fsnotify.Watcher's events channel was closed") + return + } + + if e.Name != mfw.markerFilePath { + // Since we are watching the directory that will contain the upgrade + // marker file, we could receive events here for changes to files other + // than the upgrade marker. We ignore such events as we're only concerned + // with changes to the upgrade marker. + continue + } + + switch { + case e.Op&(fsnotify.Create|fsnotify.Write) != 0: + // Upgrade marker file was created or updated; read its contents + // and send them over the update channel. + mfw.processMarker(version.GetAgentPackageVersion()) + } + case <-doInitialRead: + mfw.processMarker(version.GetAgentPackageVersion()) + } + } + }() + + return nil +} + +func (mfw *MarkerFileWatcher) processMarker(currentVersion string) { + marker, err := loadMarker(mfw.markerFilePath) + if err != nil { + mfw.logger.Error(err) + return + } + + // Nothing to do if marker is not (yet) present + if marker == nil { + return + } + + // If the marker exists but the version of Agent we're running right + // now is the same as the prevVersion recorded in the marker, it means + // the upgrade was rolled back. Ideally, this UPG_ROLLBACK state would've + // been recorded in the marker's upgrade details field but, in case it + // isn't for some reason, we fallback to explicitly setting that state as + // part of the upgrade details in the marker. 
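+	//
+	// Example (illustrative): if the running binary reports package version
+	// "8.11.3" and marker.PrevVersion is also "8.11.3", the upgrade to the
+	// new version must have been rolled back, so StateRollback is forced below.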
+ if marker.PrevVersion == currentVersion { + if marker.Details == nil { + marker.Details = details.NewDetails("unknown", details.StateRollback, marker.GetActionID()) + } else { + marker.Details.SetState(details.StateRollback) + } + } + + mfw.updateCh <- *marker +} diff --git a/internal/pkg/agent/application/upgrade/marker_watcher_test.go b/internal/pkg/agent/application/upgrade/marker_watcher_test.go new file mode 100644 index 00000000000..9d54a9bbcae --- /dev/null +++ b/internal/pkg/agent/application/upgrade/marker_watcher_test.go @@ -0,0 +1,225 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +package upgrade + +import ( + "context" + "os" + "path/filepath" + "sync" + "testing" + "time" + + "go.uber.org/zap/zapcore" + + "github.com/stretchr/testify/require" + + "gopkg.in/yaml.v2" + + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" + "github.com/elastic/elastic-agent/pkg/core/logger" +) + +func TestMarkerWatcher(t *testing.T) { + testMarkerDir := t.TempDir() + testMarkerFile := filepath.Join(testMarkerDir, markerFilename) + testLogger, _ := logger.NewTesting("watch_marker") + + markerWatcher := newMarkerFileWatcher(testMarkerFile, testLogger) + + testCtx, testCancel := context.WithCancel(context.Background()) + defer testCancel() + + var testDetails *details.Details + var testDetailsMu sync.Mutex + + var testErr error + go func() { + for { + select { + case <-testCtx.Done(): + return + case marker := <-markerWatcher.Watch(): + testDetailsMu.Lock() + testDetails = marker.Details + testDetailsMu.Unlock() + } + } + }() + + err := markerWatcher.Run(testCtx) + require.NoError(t, err) + + // Write out the expected upgrade details to the test upgrade marker + // file. + expectedDetails := &details.Details{ + TargetVersion: "8.12.0", + State: details.StateWatching, + } + marker := updateMarkerSerializer{ + PrevVersion: "8.6.0", + Details: expectedDetails, + } + data, err := yaml.Marshal(marker) + require.NoError(t, err) + err = os.WriteFile(testMarkerFile, data, 0644) + require.NoError(t, err) + + // We expect that the details that were just written out to the test upgrade + // marker file will be noticed and read by the watchMarker function, and the + // testDetailsObs function will be called with them. 
+ require.Eventually(t, func() bool { + testDetailsMu.Lock() + defer testDetailsMu.Unlock() + + return testDetails != nil && testDetails.Equals(expectedDetails) + }, 1*time.Second, 10*time.Millisecond) + + require.NoError(t, testErr) +} + +func TestProcessMarker(t *testing.T) { + cases := map[string]struct { + markerFileContents string + + expectedErrLogMsg bool + expectedDetails *details.Details + }{ + "failed_loading": { + markerFileContents: ` +invalid +`, + expectedErrLogMsg: true, + expectedDetails: nil, + }, + "no_marker": { + markerFileContents: "", + expectedErrLogMsg: false, + expectedDetails: nil, + }, + "same_version_no_details": { + markerFileContents: ` +prev_version: 8.9.2 +`, + expectedDetails: &details.Details{ + TargetVersion: "unknown", + State: details.StateRollback, + }, + }, + "same_version_with_details_no_state": { + markerFileContents: ` +prev_version: 8.9.2 +details: + target_version: 8.9.2 +`, + expectedErrLogMsg: false, + expectedDetails: &details.Details{ + TargetVersion: "8.9.2", + State: details.StateRollback, + }, + }, + "same_version_with_details_wrong_state": { + markerFileContents: ` +prev_version: 8.9.2 +details: + target_version: 8.9.2 + state: UPG_WATCHING +`, + expectedErrLogMsg: false, + expectedDetails: &details.Details{ + TargetVersion: "8.9.2", + State: details.StateRollback, + }, + }, + "different_version": { + markerFileContents: ` +prev_version: 8.8.2 +details: + target_version: 8.9.2 + state: UPG_WATCHING +`, + expectedErrLogMsg: false, + expectedDetails: &details.Details{ + TargetVersion: "8.9.2", + State: details.StateWatching, + }, + }, + } + + for name, test := range cases { + t.Run(name, func(t *testing.T) { + tmpDir := t.TempDir() + testMarkerFilePath := filepath.Join(tmpDir, markerFilename) + if test.markerFileContents != "" { + err := os.WriteFile(testMarkerFilePath, []byte(test.markerFileContents), 0644) + require.NoError(t, err) + } + log, obs := logger.NewTesting("marker_watcher") + updateCh := make(chan UpdateMarker) + mfw := MarkerFileWatcher{ + markerFilePath: testMarkerFilePath, + logger: log, + updateCh: updateCh, + } + + done := make(chan struct{}) + var markerRead bool + var actualMarker UpdateMarker + var markerMu sync.Mutex + go func() { + for { + select { + case <-done: + return + case m := <-updateCh: + markerMu.Lock() + markerRead = true + actualMarker = m + markerMu.Unlock() + } + } + }() + + mfw.processMarker("8.9.2") + + // error loading marker + if test.expectedErrLogMsg { + done <- struct{}{} + logs := obs.FilterLevelExact(zapcore.ErrorLevel).TakeAll() + require.NotEmpty(t, logs) + + markerMu.Lock() + defer markerMu.Unlock() + require.False(t, markerRead) + + return + } + + // no marker + if test.markerFileContents == "" { + done <- struct{}{} + + markerMu.Lock() + defer markerMu.Unlock() + require.False(t, markerRead) + + return + } + + // marker exists and can be read + require.Eventually(t, func() bool { + markerMu.Lock() + defer markerMu.Unlock() + return markerRead + }, 5*time.Second, 100*time.Millisecond) + + markerMu.Lock() + defer markerMu.Unlock() + + require.True(t, actualMarker.Details.Equals(test.expectedDetails)) + }) + + } +} diff --git a/internal/pkg/agent/application/upgrade/step_download_test.go b/internal/pkg/agent/application/upgrade/step_download_test.go index dcdc4da7de8..2f4500389ac 100644 --- a/internal/pkg/agent/application/upgrade/step_download_test.go +++ b/internal/pkg/agent/application/upgrade/step_download_test.go @@ -89,7 +89,9 @@ func TestDownloadWithRetries(t *testing.T) { return 
&mockDownloader{expectedDownloadPath, nil}, nil } - u := NewUpgrader(testLogger, &settings, &info.AgentInfo{}) + u, err := NewUpgrader(testLogger, &settings, &info.AgentInfo{}) + require.NoError(t, err) + parsedVersion, err := agtversion.ParseVersion("8.9.0") require.NoError(t, err) upgradeDetails := details.NewDetails(parsedVersion.String(), details.StateRequested, "") @@ -124,7 +126,9 @@ func TestDownloadWithRetries(t *testing.T) { return nil, nil } - u := NewUpgrader(testLogger, &settings, &info.AgentInfo{}) + u, err := NewUpgrader(testLogger, &settings, &info.AgentInfo{}) + require.NoError(t, err) + parsedVersion, err := agtversion.ParseVersion("8.9.0") require.NoError(t, err) upgradeDetails := details.NewDetails(parsedVersion.String(), details.StateRequested, "") @@ -161,7 +165,9 @@ func TestDownloadWithRetries(t *testing.T) { return nil, nil } - u := NewUpgrader(testLogger, &settings, &info.AgentInfo{}) + u, err := NewUpgrader(testLogger, &settings, &info.AgentInfo{}) + require.NoError(t, err) + parsedVersion, err := agtversion.ParseVersion("8.9.0") require.NoError(t, err) upgradeDetails := details.NewDetails(parsedVersion.String(), details.StateRequested, "") @@ -186,7 +192,9 @@ func TestDownloadWithRetries(t *testing.T) { return &mockDownloader{"", errors.New("download failed")}, nil } - u := NewUpgrader(testLogger, &settings, &info.AgentInfo{}) + u, err := NewUpgrader(testLogger, &settings, &info.AgentInfo{}) + require.NoError(t, err) + parsedVersion, err := agtversion.ParseVersion("8.9.0") require.NoError(t, err) upgradeDetails := details.NewDetails(parsedVersion.String(), details.StateRequested, "") diff --git a/internal/pkg/agent/application/upgrade/step_mark.go b/internal/pkg/agent/application/upgrade/step_mark.go index 0e9cf0bd92a..b5743582317 100644 --- a/internal/pkg/agent/application/upgrade/step_mark.go +++ b/internal/pkg/agent/application/upgrade/step_mark.go @@ -14,6 +14,7 @@ import ( "gopkg.in/yaml.v2" "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/agent/errors" "github.com/elastic/elastic-agent/internal/pkg/fleetapi" "github.com/elastic/elastic-agent/internal/pkg/release" @@ -37,6 +38,17 @@ type UpdateMarker struct { // Acked is a flag marking whether or not action was acked Acked bool `json:"acked" yaml:"acked"` Action *fleetapi.ActionUpgrade `json:"action" yaml:"action"` + + Details *details.Details `json:"details,omitempty" yaml:"details,omitempty"` +} + +// GetActionID returns the Fleet Action ID associated with the +// upgrade action, if it's recorded in the UpdateMarker. 
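+// For example (illustrative), a standalone (non-Fleet) upgrade leaves
+// Action nil, and GetActionID then returns "" rather than panicking.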
+func (um UpdateMarker) GetActionID() string { + if um.Action != nil { + return um.Action.ActionID + } + return "" } // MarkerActionUpgrade adapter struct compatible with pre 8.3 version of the marker file format @@ -78,6 +90,7 @@ type updateMarkerSerializer struct { PrevHash string `yaml:"prev_hash"` Acked bool `yaml:"acked"` Action *MarkerActionUpgrade `yaml:"action"` + Details *details.Details `yaml:"details"` } func newMarkerSerializer(m *UpdateMarker) *updateMarkerSerializer { @@ -88,11 +101,12 @@ func newMarkerSerializer(m *UpdateMarker) *updateMarkerSerializer { PrevHash: m.PrevHash, Acked: m.Acked, Action: convertToMarkerAction(m.Action), + Details: m.Details, } } // markUpgrade marks update happened so we can handle grace period -func (u *Upgrader) markUpgrade(_ context.Context, log *logger.Logger, hash string, action *fleetapi.ActionUpgrade) error { +func (u *Upgrader) markUpgrade(_ context.Context, log *logger.Logger, hash string, action *fleetapi.ActionUpgrade, upgradeDetails *details.Details) error { prevVersion := release.Version() prevHash := release.Commit() if len(prevHash) > hashLen { @@ -105,6 +119,7 @@ func (u *Upgrader) markUpgrade(_ context.Context, log *logger.Logger, hash strin PrevVersion: prevVersion, PrevHash: prevHash, Action: action, + Details: upgradeDetails, } markerBytes, err := yaml.Marshal(newMarkerSerializer(marker)) @@ -150,14 +165,18 @@ func CleanMarker(log *logger.Logger) error { // LoadMarker loads the update marker. If the file does not exist it returns nil // and no error. func LoadMarker() (*UpdateMarker, error) { - markerFile := markerFilePath() - markerBytes, err := ioutil.ReadFile(markerFile) + return loadMarker(markerFilePath()) +} + +func loadMarker(markerFile string) (*UpdateMarker, error) { + markerBytes, err := readMarkerFile(markerFile) if err != nil { - if errors.Is(err, os.ErrNotExist) { - return nil, nil - } return nil, err } + if markerBytes == nil { + // marker doesn't exist + return nil, nil + } marker := &updateMarkerSerializer{} if err := yaml.Unmarshal(markerBytes, &marker); err != nil { @@ -171,10 +190,11 @@ func LoadMarker() (*UpdateMarker, error) { PrevHash: marker.PrevHash, Acked: marker.Acked, Action: convertToActionUpgrade(marker.Action), + Details: marker.Details, }, nil } -func saveMarker(marker *UpdateMarker) error { +func SaveMarker(marker *UpdateMarker) error { makerSerializer := &updateMarkerSerializer{ Hash: marker.Hash, UpdatedOn: marker.UpdatedOn, @@ -182,13 +202,14 @@ func saveMarker(marker *UpdateMarker) error { PrevHash: marker.PrevHash, Acked: marker.Acked, Action: convertToMarkerAction(marker.Action), + Details: marker.Details, } markerBytes, err := yaml.Marshal(makerSerializer) if err != nil { return err } - return ioutil.WriteFile(markerFilePath(), markerBytes, 0600) + return writeMarkerFile(markerFilePath(), markerBytes) } func markerFilePath() string { diff --git a/internal/pkg/agent/application/upgrade/upgrade.go b/internal/pkg/agent/application/upgrade/upgrade.go index da0ea3df6be..a7c3c7e817b 100644 --- a/internal/pkg/agent/application/upgrade/upgrade.go +++ b/internal/pkg/agent/application/upgrade/upgrade.go @@ -55,6 +55,7 @@ type Upgrader struct { agentInfo *info.AgentInfo upgradeable bool fleetServerURI string + markerWatcher MarkerWatcher } // IsUpgradeable when agent is installed and running as a service or flag was provided. 
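With Details now carried on the marker, SaveMarker and LoadMarker round-trip upgrade state through the marker YAML file. A minimal sketch with hypothetical values (the field names match the UpdateMarker struct in the step_mark.go hunks above):

marker := &upgrade.UpdateMarker{
	Hash:        "newhas", // hash of the version upgraded to (hypothetical)
	UpdatedOn:   time.Now(),
	PrevVersion: "8.11.0", // version upgraded from (hypothetical)
	PrevHash:    "oldhas",
	Details:     details.NewDetails("8.12.0", details.StateWatching, ""),
}
if err := upgrade.SaveMarker(marker); err != nil { // serialized via writeMarkerFile
	return err
}
restored, err := upgrade.LoadMarker() // read back via readMarkerFile
if err != nil {
	return err
}
fmt.Println(restored.Details.State) // UPG_WATCHING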
@@ -65,13 +66,14 @@ func IsUpgradeable() bool {
 }
 
 // NewUpgrader creates an upgrader which is capable of performing upgrade operation
-func NewUpgrader(log *logger.Logger, settings *artifact.Config, agentInfo *info.AgentInfo) *Upgrader {
+func NewUpgrader(log *logger.Logger, settings *artifact.Config, agentInfo *info.AgentInfo) (*Upgrader, error) {
 	return &Upgrader{
-		log:         log,
-		settings:    settings,
-		agentInfo:   agentInfo,
-		upgradeable: IsUpgradeable(),
-	}
+		log:           log,
+		settings:      settings,
+		agentInfo:     agentInfo,
+		upgradeable:   IsUpgradeable(),
+		markerWatcher: newMarkerFileWatcher(markerFilePath(), log),
+	}, nil
 }
 
 // SetClient reloads URI based on up to date fleet client
@@ -181,14 +183,13 @@ func (u *Upgrader) Upgrade(ctx context.Context, version string, sourceURI string
 		return nil, err
 	}
 
-	if err := u.markUpgrade(ctx, u.log, newHash, action); err != nil {
+	det.SetState(details.StateWatching)
+	if err := u.markUpgrade(ctx, u.log, newHash, action, det); err != nil {
 		u.log.Errorw("Rolling back: marking upgrade failed", "error.message", err)
 		rollbackInstall(ctx, u.log, newHash)
 		return nil, err
 	}
 
-	det.SetState(details.StateWatching)
-
 	if err := InvokeWatcher(u.log); err != nil {
 		u.log.Errorw("Rolling back: starting watcher failed", "error.message", err)
 		rollbackInstall(ctx, u.log, newHash)
@@ -237,7 +238,11 @@ func (u *Upgrader) Ack(ctx context.Context, acker acker.Acker) error {
 
 	marker.Acked = true
 
-	return saveMarker(marker)
+	return SaveMarker(marker)
+}
+
+func (u *Upgrader) MarkerWatcher() MarkerWatcher {
+	return u.markerWatcher
 }
 
 func (u *Upgrader) sourceURI(retrievedURI string) string {
diff --git a/internal/pkg/agent/cmd/watch.go b/internal/pkg/agent/cmd/watch.go
index 45648752eda..b3d5727175c 100644
--- a/internal/pkg/agent/cmd/watch.go
+++ b/internal/pkg/agent/cmd/watch.go
@@ -13,22 +13,23 @@ import (
 	"syscall"
 	"time"
 
-	"github.com/elastic/elastic-agent/version"
-
-	"github.com/elastic/elastic-agent/internal/pkg/config"
-
 	"github.com/spf13/cobra"
 
 	"github.com/elastic/elastic-agent-libs/logp"
 	"github.com/elastic/elastic-agent-libs/logp/configure"
 
+	"github.com/elastic/elastic-agent/internal/pkg/agent/application/filelock"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/application/paths"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade"
+	"github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/configuration"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/errors"
 	"github.com/elastic/elastic-agent/internal/pkg/cli"
+	"github.com/elastic/elastic-agent/internal/pkg/config"
 	"github.com/elastic/elastic-agent/internal/pkg/release"
 	"github.com/elastic/elastic-agent/pkg/core/logger"
+	agtversion "github.com/elastic/elastic-agent/pkg/version"
+	"github.com/elastic/elastic-agent/version"
 )
 
 const (
@@ -107,14 +108,49 @@ func watchCmd(log *logp.Logger, cfg *configuration.Configuration) error {
 	errorCheckInterval := cfg.Settings.Upgrade.Watcher.ErrorCheck.Interval
 	ctx := context.Background()
 	if err := watch(ctx, tilGrace, errorCheckInterval, log); err != nil {
-		log.Error("Error detected proceeding to rollback: %v", err)
+		log.Errorf("Error detected, proceeding to rollback: %v", err)
+
+		// If we are upgrading from version >= 8.12.0, marker.Details should be non-nil
+		// because the Agent we upgraded FROM would've written upgrade details in the upgrade
+		// marker. However, if we're upgrading from version < 8.12.0, the marker won't
+		// contain upgrade details, so we populate them now.
+		if marker.Details == nil {
+			fromVersion, err := agtversion.ParseVersion(marker.PrevVersion)
+			if err != nil {
+				log.Warnf("upgrade details are nil, but unable to parse version being upgraded from [%s]: %s", marker.PrevVersion, err.Error())
+			} else if !fromVersion.Less(*agtversion.NewParsedSemVer(8, 12, 0, "", "")) {
+				log.Warnf("upgrade details are unexpectedly nil, upgrading from version [%s]", marker.PrevVersion)
+			}
+
+			marker.Details = details.NewDetails(version.GetAgentPackageVersion(), details.StateRollback, marker.GetActionID())
+		}
+
+		marker.Details.SetState(details.StateRollback)
+		err = upgrade.SaveMarker(marker)
+		if err != nil {
+			log.Errorf("unable to save upgrade marker before attempting to rollback: %s", err.Error())
+		}
+
 		err = upgrade.Rollback(ctx, log, marker.PrevHash, marker.Hash)
 		if err != nil {
 			log.Error("rollback failed", err)
+
+			marker.Details.Fail(err)
+			err = upgrade.SaveMarker(marker)
+			if err != nil {
+				log.Errorf("unable to save upgrade marker after rollback failed: %s", err.Error())
+			}
 		}
 		return err
 	}
 
+	// watch succeeded - upgrade was successful!
+	marker.Details.SetState(details.StateCompleted)
+	err = upgrade.SaveMarker(marker)
+	if err != nil {
+		log.Errorf("unable to save upgrade marker after successful watch: %s", err.Error())
+	}
+
 	// cleanup older versions,
 	// in windows it might leave self untouched, this will get cleaned up
 	// later at the start, because for windows we leave marker untouched.
diff --git a/testing/integration/upgrade_rollback_test.go b/testing/integration/upgrade_rollback_test.go
index 02a209e18eb..c91ac967df0 100644
--- a/testing/integration/upgrade_rollback_test.go
+++ b/testing/integration/upgrade_rollback_test.go
@@ -21,6 +21,7 @@ import (
 	"github.com/stretchr/testify/require"
 
 	"github.com/elastic/elastic-agent/internal/pkg/agent/application/paths"
+	"github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/install"
 	atesting "github.com/elastic/elastic-agent/pkg/testing"
 	"github.com/elastic/elastic-agent/pkg/testing/define"
@@ -119,6 +120,23 @@ inputs:
 	}
 	require.NoError(t, err)
 
+	// ensure that upgrade details now show the state as UPG_ROLLBACK. This is only possible with Elastic
+	// Agent versions >= 8.12.0.
+	startVersion, err := version.ParseVersion(startVersionInfo.Binary.Version)
+	require.NoError(t, err)
+
+	if !startVersion.Less(*version.NewParsedSemVer(8, 12, 0, "", "")) {
+		client := startFixture.Client()
+		err = client.Connect(ctx)
+		require.NoError(t, err)
+
+		state, err := client.State(ctx)
+		require.NoError(t, err)
+
+		require.NotNil(t, state.UpgradeDetails)
+		require.Equal(t, details.StateRollback, state.UpgradeDetails.State)
+	}
+
 	// rollback should stop the watcher
 	// killTimeout is greater than timeout as the watcher should have been
 	// stopped on its own, and we don't want this test to hide that fact
@@ -233,6 +251,23 @@ func TestStandaloneUpgradeRollbackOnRestarts(t *testing.T) {
 	}
 	require.NoError(t, err)
 
+	// ensure that upgrade details now show the state as UPG_ROLLBACK. This is only possible with Elastic
+	// Agent versions >= 8.12.0.
+ startVersion, err := version.ParseVersion(startVersionInfo.Binary.Version) + require.NoError(t, err) + + if !startVersion.Less(*version.NewParsedSemVer(8, 12, 0, "", "")) { + client := startFixture.Client() + err = client.Connect(ctx) + require.NoError(t, err) + + state, err := client.State(ctx) + require.NoError(t, err) + + require.NotNil(t, state.UpgradeDetails) + require.Equal(t, details.StateRollback, state.UpgradeDetails.State) + } + // rollback should stop the watcher // killTimeout is greater than timeout as the watcher should have been // stopped on its own, and we don't want this test to hide that fact From 38bee2471dc687258859d260d68427c42d5e75df Mon Sep 17 00:00:00 2001 From: Shaunak Kashyap Date: Thu, 16 Nov 2023 18:32:55 +0530 Subject: [PATCH 08/21] Send upgrade details to Fleet Server in check-in API requests (#3528) * Adding FSM for upgrades * Implementing TODO * WIP * WIP * Reorganizing imports * Running go mod tidy * Fix type * Handle failures in one place * Remove Fleet changes * Send upgrade details in check-in requests to Fleet * Add unit test * Adding CHANGELOG entry * Compare pointer values so we're not copying locks * Add missing assertion * Update Fleet-managed upgrade E2E test * Make scheduled_at optional so empty time is not serialized * Normalize comparison of hostnames * Convert comment to log * Fix assertion logic * Removing replace directive * Address linter error --- NOTICE.txt | 4 +- ...9016628-send-upgrade-details-to-fleet.yaml | 32 +++++++++ go.mod | 2 +- go.sum | 4 +- .../coordinator/diagnostics_test.go | 2 +- .../gateway/fleet/fleet_gateway.go | 11 +-- .../gateway/fleet/fleet_gateway_test.go | 70 +++++++++++++++++++ .../application/upgrade/details/details.go | 2 +- internal/pkg/fleetapi/checkin_cmd.go | 12 ++-- pkg/control/v2/server/server.go | 7 +- pkg/control/v2/server/server_test.go | 7 +- pkg/testing/tools/fleettools/fleet.go | 3 +- testing/integration/upgrade_fleet_test.go | 10 +++ 13 files changed, 146 insertions(+), 20 deletions(-) create mode 100644 changelog/fragments/1699016628-send-upgrade-details-to-fleet.yaml diff --git a/NOTICE.txt b/NOTICE.txt index a6e55930766..4bbca01b922 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -1166,11 +1166,11 @@ SOFTWARE -------------------------------------------------------------------------------- Dependency : github.com/elastic/elastic-agent-libs -Version: v0.6.2 +Version: v0.7.0 Licence type (autodetected): Apache-2.0 -------------------------------------------------------------------------------- -Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-libs@v0.6.2/LICENSE: +Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-libs@v0.7.0/LICENSE: Apache License Version 2.0, January 2004 diff --git a/changelog/fragments/1699016628-send-upgrade-details-to-fleet.yaml b/changelog/fragments/1699016628-send-upgrade-details-to-fleet.yaml new file mode 100644 index 00000000000..181cc9114f3 --- /dev/null +++ b/changelog/fragments/1699016628-send-upgrade-details-to-fleet.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. 
+# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: feature + +# Change summary; a 80ish characters long description of the change. +summary: Send upgrade details to Fleet + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: elastic-agent + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/elastic-agent/pull/3528 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +issue: https://github.com/elastic/elastic-agent/issues/3119 diff --git a/go.mod b/go.mod index 897874a5261..a0b30a7f3f1 100644 --- a/go.mod +++ b/go.mod @@ -14,7 +14,7 @@ require ( github.com/elastic/e2e-testing v1.1.0 github.com/elastic/elastic-agent-autodiscover v0.6.5 github.com/elastic/elastic-agent-client/v7 v7.5.0 - github.com/elastic/elastic-agent-libs v0.6.2 + github.com/elastic/elastic-agent-libs v0.7.0 github.com/elastic/elastic-agent-system-metrics v0.8.0 github.com/elastic/elastic-transport-go/v8 v8.3.0 github.com/elastic/go-elasticsearch/v8 v8.10.1 diff --git a/go.sum b/go.sum index 54eca1468e7..ffcbb8e262a 100644 --- a/go.sum +++ b/go.sum @@ -781,8 +781,8 @@ github.com/elastic/elastic-agent-autodiscover v0.6.5 h1:5DeMpuNc8c/tN6HN0A4A2uOF github.com/elastic/elastic-agent-autodiscover v0.6.5/go.mod h1:chulyCAyZb/njMHgzkhC/yWnt8v/Y6eCRUhmFVnsA5o= github.com/elastic/elastic-agent-client/v7 v7.5.0 h1:niI3WQ+01Lnp2r5LxK8SyNhrPJe13vBiOkqrDRK2oTA= github.com/elastic/elastic-agent-client/v7 v7.5.0/go.mod h1:DYoX95xjC4BW/p2avyu724Qr2+hoUIz9eCU9CVS1d+0= -github.com/elastic/elastic-agent-libs v0.6.2 h1:tE5pFK4y7xm1FtXm+r+63G7STjJAaWh3+oKIQDzdPDo= -github.com/elastic/elastic-agent-libs v0.6.2/go.mod h1:o+EySawBZGeYu49shJxerg2wRCimS1dhrD4As0MS700= +github.com/elastic/elastic-agent-libs v0.7.0 h1:g/+Gzpn4ayXPFbfZsn5lGjbPR1TGqlVpshJVVUNJGlQ= +github.com/elastic/elastic-agent-libs v0.7.0/go.mod h1:o+EySawBZGeYu49shJxerg2wRCimS1dhrD4As0MS700= github.com/elastic/elastic-agent-system-metrics v0.8.0 h1:EsWbtd83JvnaqnL57bKS1E6GhOdemTRbxdFDcenR8zQ= github.com/elastic/elastic-agent-system-metrics v0.8.0/go.mod h1:9C1UEfj0P687HAzZepHszN6zXA+2tN2Lx3Osvq1zby8= github.com/elastic/elastic-integration-corpus-generator-tool v0.5.0/go.mod h1:uf9N86y+UACGybdEhZLpwZ93XHWVhsYZAA4c2T2v6YM= diff --git a/internal/pkg/agent/application/coordinator/diagnostics_test.go b/internal/pkg/agent/application/coordinator/diagnostics_test.go index 9fec8e1844b..1e12dd38141 100644 --- a/internal/pkg/agent/application/coordinator/diagnostics_test.go +++ b/internal/pkg/agent/application/coordinator/diagnostics_test.go @@ -441,7 +441,7 @@ func TestDiagnosticState(t *testing.T) { ActionID: "foobar", Metadata: details.Metadata{ DownloadPercent: 0.17469, - 
ScheduledAt: now, + ScheduledAt: &now, DownloadRate: 123.56, }, }, diff --git a/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go b/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go index 000ec534bf2..109ece58be9 100644 --- a/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go +++ b/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go @@ -332,11 +332,12 @@ func (f *FleetGateway) execute(ctx context.Context) (*fleetapi.CheckinResponse, // checkin cmd := fleetapi.NewCheckinCmd(f.agentInfo, f.client) req := &fleetapi.CheckinRequest{ - AckToken: ackToken, - Metadata: ecsMeta, - Status: agentStateToString(state.State), - Message: state.Message, - Components: components, + AckToken: ackToken, + Metadata: ecsMeta, + Status: agentStateToString(state.State), + Message: state.Message, + Components: components, + UpgradeDetails: state.UpgradeDetails, } resp, took, err := cmd.Execute(ctx, req) diff --git a/internal/pkg/agent/application/gateway/fleet/fleet_gateway_test.go b/internal/pkg/agent/application/gateway/fleet/fleet_gateway_test.go index 04572eab845..de40565c0b5 100644 --- a/internal/pkg/agent/application/gateway/fleet/fleet_gateway_test.go +++ b/internal/pkg/agent/application/gateway/fleet/fleet_gateway_test.go @@ -7,6 +7,7 @@ package fleet import ( "bytes" "context" + "encoding/json" "fmt" "io" @@ -23,9 +24,11 @@ import ( "gotest.tools/assert" "github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/agent/errors" "github.com/elastic/elastic-agent/internal/pkg/agent/storage" "github.com/elastic/elastic-agent/internal/pkg/agent/storage/store" + "github.com/elastic/elastic-agent/internal/pkg/fleetapi" "github.com/elastic/elastic-agent/internal/pkg/fleetapi/acker/noop" "github.com/elastic/elastic-agent/internal/pkg/scheduler" agentclient "github.com/elastic/elastic-agent/pkg/control/v2/client" @@ -308,6 +311,73 @@ func TestFleetGateway(t *testing.T) { require.NoError(t, err) }) + t.Run("Sends upgrade details", func(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + scheduler := scheduler.NewStepper() + client := newTestingClient() + + log, _ := logger.NewTesting("fleet_gateway") + + stateStore := newStateStore(t, log) + + upgradeDetails := &details.Details{ + TargetVersion: "8.12.0", + State: "UPG_WATCHING", + ActionID: "foobarbaz", + } + stateFetcher := func() coordinator.State { + return coordinator.State{ + UpgradeDetails: upgradeDetails, + } + } + + gateway, err := newFleetGatewayWithScheduler( + log, + settings, + agentInfo, + client, + scheduler, + noop.New(), + stateFetcher, + stateStore, + ) + + require.NoError(t, err) + + waitFn := ackSeq( + client.Answer(func(headers http.Header, body io.Reader) (*http.Response, error) { + data, err := io.ReadAll(body) + require.NoError(t, err) + + var checkinRequest fleetapi.CheckinRequest + err = json.Unmarshal(data, &checkinRequest) + require.NoError(t, err) + + require.NotNil(t, checkinRequest.UpgradeDetails) + require.Equal(t, upgradeDetails, checkinRequest.UpgradeDetails) + + resp := wrapStrToResp(http.StatusOK, `{ "actions": [] }`) + return resp, nil + }), + ) + + errCh := runFleetGateway(ctx, gateway) + + // Synchronize scheduler and acking of calls from the worker go routine. 
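+	// (Illustrative clarification: scheduler.Next() releases exactly one
+	// check-in cycle, and waitFn() returns once the client.Answer callback
+	// above has finished asserting on the request body.)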
+ scheduler.Next() + waitFn() + + cancel() + err = <-errCh + require.NoError(t, err) + select { + case actions := <-gateway.Actions(): + t.Errorf("Expected no actions, got %v", actions) + default: + } + }) } func TestRetriesOnFailures(t *testing.T) { diff --git a/internal/pkg/agent/application/upgrade/details/details.go b/internal/pkg/agent/application/upgrade/details/details.go index 884df46a3cc..dcc990d33cd 100644 --- a/internal/pkg/agent/application/upgrade/details/details.go +++ b/internal/pkg/agent/application/upgrade/details/details.go @@ -35,7 +35,7 @@ type Details struct { // Metadata consists of metadata relating to a specific upgrade state type Metadata struct { - ScheduledAt time.Time `json:"scheduled_at,omitempty" yaml:"scheduled_at,omitempty"` + ScheduledAt *time.Time `json:"scheduled_at,omitempty" yaml:"scheduled_at,omitempty"` // DownloadPercent is the percentage of the artifact that has been // downloaded. Minimum value is 0 and maximum value is 1. diff --git a/internal/pkg/fleetapi/checkin_cmd.go b/internal/pkg/fleetapi/checkin_cmd.go index b52eeb3903f..4c14454ce18 100644 --- a/internal/pkg/fleetapi/checkin_cmd.go +++ b/internal/pkg/fleetapi/checkin_cmd.go @@ -14,6 +14,7 @@ import ( "time" "github.com/elastic/elastic-agent/internal/pkg/agent/application/info" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details" "github.com/elastic/elastic-agent/internal/pkg/agent/errors" "github.com/elastic/elastic-agent/internal/pkg/fleetapi/client" ) @@ -47,11 +48,12 @@ type CheckinComponent struct { // CheckinRequest consists of multiple events reported to fleet ui. type CheckinRequest struct { - Status string `json:"status"` - AckToken string `json:"ack_token,omitempty"` - Metadata *info.ECSMeta `json:"local_metadata,omitempty"` - Message string `json:"message"` // V2 Agent message - Components []CheckinComponent `json:"components"` // V2 Agent components + Status string `json:"status"` + AckToken string `json:"ack_token,omitempty"` + Metadata *info.ECSMeta `json:"local_metadata,omitempty"` + Message string `json:"message"` // V2 Agent message + Components []CheckinComponent `json:"components"` // V2 Agent components + UpgradeDetails *details.Details `json:"upgrade_details,omitempty"` } // SerializableEvent is a representation of the event to be send to the Fleet Server API via the checkin diff --git a/pkg/control/v2/server/server.go b/pkg/control/v2/server/server.go index 645068ea64d..149c426e2e4 100644 --- a/pkg/control/v2/server/server.go +++ b/pkg/control/v2/server/server.go @@ -370,12 +370,17 @@ func stateToProto(state *coordinator.State, agentInfo *info.AgentInfo) (*cproto. 
 			State:         string(state.UpgradeDetails.State),
 			ActionId:      state.UpgradeDetails.ActionID,
 			Metadata: &cproto.UpgradeDetailsMetadata{
-				ScheduledAt:     timestamppb.New(state.UpgradeDetails.Metadata.ScheduledAt),
 				DownloadPercent: float32(state.UpgradeDetails.Metadata.DownloadPercent),
 				FailedState:     string(state.UpgradeDetails.Metadata.FailedState),
 				ErrorMsg:        state.UpgradeDetails.Metadata.ErrorMsg,
 			},
 		}
+
+		if state.UpgradeDetails.Metadata.ScheduledAt != nil &&
+			!state.UpgradeDetails.Metadata.ScheduledAt.IsZero() {
+			upgradeDetails.Metadata.ScheduledAt = timestamppb.New(*state.UpgradeDetails.Metadata.ScheduledAt)
+		}
 	}
 
 	return &cproto.StateResponse{
diff --git a/pkg/control/v2/server/server_test.go b/pkg/control/v2/server/server_test.go
index cafab60450f..d7aee6d96c7 100644
--- a/pkg/control/v2/server/server_test.go
+++ b/pkg/control/v2/server/server_test.go
@@ -164,11 +164,16 @@ func TestStateMapping(t *testing.T) {
 
 		if tc.upgradeDetails != nil {
 			expectedMetadata := &cproto.UpgradeDetailsMetadata{
-				ScheduledAt:     timestamppb.New(tc.upgradeDetails.Metadata.ScheduledAt),
 				DownloadPercent: float32(tc.upgradeDetails.Metadata.DownloadPercent),
 				FailedState:     string(tc.upgradeDetails.Metadata.FailedState),
 				ErrorMsg:        tc.upgradeDetails.Metadata.ErrorMsg,
 			}
+
+			if tc.upgradeDetails.Metadata.ScheduledAt != nil &&
+				!tc.upgradeDetails.Metadata.ScheduledAt.IsZero() {
+				expectedMetadata.ScheduledAt = timestamppb.New(*tc.upgradeDetails.Metadata.ScheduledAt)
+			}
+
 			assert.Equal(t, string(tc.upgradeDetails.State), stateResponse.UpgradeDetails.State)
 			assert.Equal(t, tc.upgradeDetails.TargetVersion, stateResponse.UpgradeDetails.TargetVersion)
 			assert.Equal(t, tc.upgradeDetails.ActionID, stateResponse.UpgradeDetails.ActionId)
diff --git a/pkg/testing/tools/fleettools/fleet.go b/pkg/testing/tools/fleettools/fleet.go
index baa1ca659c0..77c64069e41 100644
--- a/pkg/testing/tools/fleettools/fleet.go
+++ b/pkg/testing/tools/fleettools/fleet.go
@@ -9,6 +9,7 @@ import (
 	"errors"
 	"fmt"
 	"os"
+	"strings"
 
 	"github.com/elastic/elastic-agent-libs/kibana"
 )
@@ -25,7 +26,7 @@ func GetAgentByPolicyIDAndHostnameFromList(client *kibana.Client, policyID, host
 		agentHostname := item.LocalMetadata.Host.Hostname
 		agentPolicyID := item.PolicyID
 
-		if agentHostname == hostname && agentPolicyID == policyID {
+		if strings.EqualFold(agentHostname, hostname) && agentPolicyID == policyID {
 			hostnameAgents = append(hostnameAgents, &listAgentsResp.Items[i])
 		}
 	}
diff --git a/testing/integration/upgrade_fleet_test.go b/testing/integration/upgrade_fleet_test.go
index aca07472d58..9c44386eb87 100644
--- a/testing/integration/upgrade_fleet_test.go
+++ b/testing/integration/upgrade_fleet_test.go
@@ -8,6 +8,7 @@ package integration
 
 import (
 	"context"
+	"os"
 	"strings"
 	"testing"
 	"time"
@@ -140,6 +141,15 @@ func testUpgradeFleetManagedElasticAgent(ctx context.Context, t *testing.T, info
 	err = fleettools.UpgradeAgent(kibClient, policy.ID, endVersionInfo.Binary.String(), true)
 	require.NoError(t, err)
 
+	t.Log("Waiting for upgrade details to show up in Fleet")
+	hostname, err := os.Hostname()
+	require.NoError(t, err)
+	require.Eventually(t, func() bool {
+		agent, err := fleettools.GetAgentByPolicyIDAndHostnameFromList(kibClient, policy.ID, hostname)
+		return err == nil && agent.UpgradeDetails != nil
+	}, 5*time.Minute, time.Second)
+
 	// wait for the watcher to show up
 	t.Logf("Waiting for upgrade watcher to start...")
 	err = upgradetest.WaitForWatcher(ctx, 5*time.Minute, 10*time.Second)
From c813d3f12776532e9209d1848e39a3da73f83b98 Mon Sep 17 00:00:00 2001
From:
=?UTF-8?q?Paolo=20Chil=C3=A0?= Date: Thu, 16 Nov 2023 17:54:48 +0100 Subject: [PATCH 09/21] Fix comparison of upgrade metadata (#3779) --- .../pkg/agent/application/dispatcher/dispatcher.go | 2 +- .../agent/application/dispatcher/dispatcher_test.go | 5 +++-- .../agent/application/upgrade/details/details.go | 13 ++++++++++++- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/internal/pkg/agent/application/dispatcher/dispatcher.go b/internal/pkg/agent/application/dispatcher/dispatcher.go index eda450d6a0b..63d00aeeea3 100644 --- a/internal/pkg/agent/application/dispatcher/dispatcher.go +++ b/internal/pkg/agent/application/dispatcher/dispatcher.go @@ -321,7 +321,7 @@ func reportNextScheduledUpgrade(input []fleetapi.Action, detailsSetter details.O upgradeDetails := details.NewDetails(nextUpgrade.Version, details.StateScheduled, nextUpgrade.ID()) startTime, _ := nextUpgrade.StartTime() - upgradeDetails.Metadata.ScheduledAt = startTime + upgradeDetails.Metadata.ScheduledAt = &startTime detailsSetter(upgradeDetails) } diff --git a/internal/pkg/agent/application/dispatcher/dispatcher_test.go b/internal/pkg/agent/application/dispatcher/dispatcher_test.go index 212c50a3068..bf69c76ce74 100644 --- a/internal/pkg/agent/application/dispatcher/dispatcher_test.go +++ b/internal/pkg/agent/application/dispatcher/dispatcher_test.go @@ -493,6 +493,7 @@ func Test_ActionDispatcher_scheduleRetry(t *testing.T) { func TestReportNextScheduledUpgrade(t *testing.T) { now := time.Now().UTC() later := now.Add(3 * time.Hour) + laterTruncate := later.Truncate(time.Second) muchLater := later.Add(3 * time.Hour) cases := map[string]struct { @@ -522,7 +523,7 @@ func TestReportNextScheduledUpgrade(t *testing.T) { State: details.StateScheduled, ActionID: "action2", Metadata: details.Metadata{ - ScheduledAt: later.Truncate(time.Second), + ScheduledAt: &laterTruncate, }, }, }, @@ -544,7 +545,7 @@ func TestReportNextScheduledUpgrade(t *testing.T) { State: details.StateScheduled, ActionID: "action4", Metadata: details.Metadata{ - ScheduledAt: later.Truncate(time.Second), + ScheduledAt: &laterTruncate, }, }, }, diff --git a/internal/pkg/agent/application/upgrade/details/details.go b/internal/pkg/agent/application/upgrade/details/details.go index dcc990d33cd..dae27923ecf 100644 --- a/internal/pkg/agent/application/upgrade/details/details.go +++ b/internal/pkg/agent/application/upgrade/details/details.go @@ -167,13 +167,24 @@ func (d *Details) notifyObserver(observer Observer) { } func (m Metadata) Equals(otherM Metadata) bool { - return m.ScheduledAt.Equal(otherM.ScheduledAt) && + return equalTimePointers(m.ScheduledAt, otherM.ScheduledAt) && m.FailedState == otherM.FailedState && m.ErrorMsg == otherM.ErrorMsg && m.DownloadPercent == otherM.DownloadPercent && m.DownloadRate == otherM.DownloadRate } +func equalTimePointers(t, otherT *time.Time) bool { + if t == otherT { + return true + } + if t == nil || otherT == nil { + return false + } + + return t.Equal(*otherT) +} + func (dr *downloadRate) MarshalJSON() ([]byte, error) { downloadRateBytesPerSecond := float64(*dr) if math.IsInf(downloadRateBytesPerSecond, 0) { From 112f61896951f738c98f0a46c6df4160d31b718f Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 16 Nov 2023 17:39:38 -0500 Subject: [PATCH 10/21] Rework runtime manager updates to block the coordinator less (#3747) --- .../application/coordinator/coordinator.go | 32 +-- .../coordinator/coordinator_state.go | 17 +- .../coordinator/coordinator_test.go | 12 +- .../coordinator/coordinator_unit_test.go | 20 +- 
pkg/component/runtime/manager.go | 248 ++++++++++-------- pkg/component/runtime/manager_test.go | 150 +++++++---- 6 files changed, 280 insertions(+), 199 deletions(-) diff --git a/internal/pkg/agent/application/coordinator/coordinator.go b/internal/pkg/agent/application/coordinator/coordinator.go index c33cc700729..1129ef96e32 100644 --- a/internal/pkg/agent/application/coordinator/coordinator.go +++ b/internal/pkg/agent/application/coordinator/coordinator.go @@ -100,10 +100,7 @@ type RuntimeManager interface { Runner // Update updates the current components model. - Update(model component.Model) error - - // State returns the current components model state. - State() []runtime.ComponentComponentState + Update(model component.Model) // PerformAction executes an action on a unit. PerformAction(ctx context.Context, comp component.Component, unit component.Unit, name string, params map[string]interface{}) (map[string]interface{}, error) @@ -241,17 +238,19 @@ type Coordinator struct { // into the reported state before broadcasting -- State() will report // agentclient.Failed if one of these is set, even if the underlying // coordinator state is agentclient.Healthy. - runtimeMgrErr error // Currently unused - configMgrErr error - actionsErr error - varsMgrErr error + // Errors from the runtime manager report policy update failures and are + // stored in runtimeUpdateErr below. + configMgrErr error + actionsErr error + varsMgrErr error // Errors resulting from different possible failure modes when setting a // new policy. Right now there are three different stages where a policy // update can fail: // - in generateAST, converting the policy to an AST // - in process, converting the AST and vars into a full component model - // - while sending the final component model to the runtime manager + // - while applying the final component model in the runtime manager + // (reported asynchronously via the runtime manager error channel) // // The plan is to improve our preprocessing so we can always detect // failures immediately https://github.com/elastic/elastic-agent/issues/2887. @@ -920,7 +919,13 @@ func (c *Coordinator) runLoopIteration(ctx context.Context) { return case runtimeErr := <-c.managerChans.runtimeManagerError: - c.setRuntimeManagerError(runtimeErr) + // runtime manager errors report the result of a policy update. + // Coordinator transitions from starting to healthy when a policy update + // is successful. 
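+			// A nil error on this channel therefore signals that the most
+			// recent component-model update was applied successfully.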
+ c.setRuntimeUpdateError(runtimeErr) + if runtimeErr == nil { + c.setCoordinatorState(agentclient.Healthy, "Running") + } case configErr := <-c.managerChans.configManagerError: if c.isManaged { @@ -1153,12 +1158,7 @@ func (c *Coordinator) refreshComponentModel(ctx context.Context) (err error) { c.logger.Info("Updating running component model") c.logger.With("components", model.Components).Debug("Updating running component model") - err = c.runtimeMgr.Update(model) - c.setRuntimeUpdateError(err) - if err != nil { - return fmt.Errorf("updating runtime: %w", err) - } - c.setCoordinatorState(agentclient.Healthy, "Running") + c.runtimeMgr.Update(model) return nil } diff --git a/internal/pkg/agent/application/coordinator/coordinator_state.go b/internal/pkg/agent/application/coordinator/coordinator_state.go index 6e645c3a06b..22394ee52e0 100644 --- a/internal/pkg/agent/application/coordinator/coordinator_state.go +++ b/internal/pkg/agent/application/coordinator/coordinator_state.go @@ -63,10 +63,10 @@ func (c *Coordinator) SetUpgradeDetails(upgradeDetails *details.Details) { c.upgradeDetailsChan <- upgradeDetails } -// setRuntimeManagerError updates the error state for the runtime manager. +// setRuntimeUpdateError reports a failed policy update in the runtime manager. // Called on the main Coordinator goroutine. -func (c *Coordinator) setRuntimeManagerError(err error) { - c.runtimeMgrErr = err +func (c *Coordinator) setRuntimeUpdateError(err error) { + c.runtimeUpdateErr = err c.stateNeedsRefresh = true } @@ -107,14 +107,6 @@ func (c *Coordinator) setComponentGenError(err error) { c.stateNeedsRefresh = true } -// setRuntimeUpdateError updates the error state for sending a component model -// update to the runtime manager. -// Called on the main Coordinator goroutine. -func (c *Coordinator) setRuntimeUpdateError(err error) { - c.runtimeUpdateErr = err - c.stateNeedsRefresh = true -} - // setOverrideState is the internal helper to set the override state and // set stateNeedsRefresh. // Must be called on the main Coordinator goroutine. 
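A minimal sketch of the asynchronous contract this series introduces, assuming a running *runtime.Manager `m` (not itself part of the patch; every name used is one that appears in these diffs). Update enqueues the model and the result arrives later on Errors(), which is exactly the pattern the reworked tests follow with `m.Update(...)` then `err = <-m.errCh`:

	// Sketch: drive one component-model update and wait for its result.
	func applyModel(ctx context.Context, m *runtime.Manager, comps []component.Component) error {
		m.Update(component.Model{Components: comps}) // non-blocking enqueue
		select {
		case err := <-m.Errors():
			return err // nil means the model was applied successfully
		case <-ctx.Done():
			return ctx.Err()
		}
	}
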
@@ -201,9 +193,6 @@ func (c *Coordinator) generateReportableState() (s State) { } else if c.runtimeUpdateErr != nil { s.State = agentclient.Failed s.Message = fmt.Sprintf("Runtime update failed: %s", c.runtimeUpdateErr.Error()) - } else if c.runtimeMgrErr != nil { - s.State = agentclient.Failed - s.Message = fmt.Sprintf("Runtime manager: %s", c.runtimeMgrErr.Error()) } else if c.configMgrErr != nil { s.State = agentclient.Failed s.Message = fmt.Sprintf("Config manager: %s", c.configMgrErr.Error()) diff --git a/internal/pkg/agent/application/coordinator/coordinator_test.go b/internal/pkg/agent/application/coordinator/coordinator_test.go index 676906bbde0..49770bc1e97 100644 --- a/internal/pkg/agent/application/coordinator/coordinator_test.go +++ b/internal/pkg/agent/application/coordinator/coordinator_test.go @@ -787,6 +787,8 @@ func (f *fakeVarsManager) Vars(ctx context.Context, vars []*transpiler.Vars) { type fakeRuntimeManager struct { state []runtime.ComponentComponentState updateCallback func([]component.Component) error + result error + errChan chan error } func (r *fakeRuntimeManager) Run(ctx context.Context) error { @@ -796,11 +798,15 @@ func (r *fakeRuntimeManager) Run(ctx context.Context) error { func (r *fakeRuntimeManager) Errors() <-chan error { return nil } -func (r *fakeRuntimeManager) Update(model component.Model) error { +func (r *fakeRuntimeManager) Update(model component.Model) { + r.result = nil if r.updateCallback != nil { - return r.updateCallback(model.Components) + r.result = r.updateCallback(model.Components) + } + if r.errChan != nil { + // If a reporting channel is set, send the result to it + r.errChan <- r.result } - return nil } // State returns the current components model state. diff --git a/internal/pkg/agent/application/coordinator/coordinator_unit_test.go b/internal/pkg/agent/application/coordinator/coordinator_unit_test.go index 956eb33b8c0..89869e48d5e 100644 --- a/internal/pkg/agent/application/coordinator/coordinator_unit_test.go +++ b/internal/pkg/agent/application/coordinator/coordinator_unit_test.go @@ -754,6 +754,7 @@ func TestCoordinatorReportsRuntimeManagerUpdateFailure(t *testing.T) { logger := logp.NewLogger("testing") configChan := make(chan ConfigChange, 1) + updateErrChan := make(chan error, 1) const errorStr = "update failed for testing reasons" // Create a mocked runtime manager that always reports an error @@ -761,6 +762,7 @@ func TestCoordinatorReportsRuntimeManagerUpdateFailure(t *testing.T) { updateCallback: func(comp []component.Component) error { return fmt.Errorf(errorStr) }, + errChan: updateErrChan, } coord := &Coordinator{ @@ -769,6 +771,9 @@ func TestCoordinatorReportsRuntimeManagerUpdateFailure(t *testing.T) { stateBroadcaster: broadcaster.New(State{}, 0, 0), managerChans: managerChans{ configManagerUpdate: configChan, + // Give coordinator the same error channel we set on the runtime + // manager, so it receives the update result. 
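+				// (In production this channel carries the runtime manager's
+				// update results; the test wires it up by hand because it
+				// constructs the Coordinator struct directly.)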
+ runtimeManagerError: updateErrChan, }, runtimeMgr: runtimeManager, vars: emptyVars(t), @@ -781,16 +786,19 @@ func TestCoordinatorReportsRuntimeManagerUpdateFailure(t *testing.T) { configChan <- configChange coord.runLoopIteration(ctx) - // Make sure the failure was reported to the config manager - assert.True(t, configChange.failed, "Config change should report failure if the runtime manager returns an error") - require.Error(t, configChange.err, "Config change should get an error if runtime manager update fails") - assert.Contains(t, configChange.err.Error(), errorStr) + // Make sure the config change was acknowledged to the config manager + // (the failure is not reported here since it happens asynchronously; it + // will appear in the coordinator state afterwards.) + assert.True(t, configChange.acked, "Config change should be acknowledged to the config manager") + assert.NoError(t, configChange.err, "Config change with async error should succeed") - // Make sure the error is saved in Coordinator.runtimeUpdateErr + // Now do another run loop iteration to let the update error propagate, + // and make sure it is reported correctly. + coord.runLoopIteration(ctx) require.Error(t, coord.runtimeUpdateErr, "Runtime update failure should be saved in runtimeUpdateErr") assert.Equal(t, errorStr, coord.runtimeUpdateErr.Error(), "runtimeUpdateErr should match the error reported by the runtime manager") - // Make sure the error is reported in Coordinator state. + // Make sure the error appears in the Coordinator state. state := coord.State() assert.Equal(t, agentclient.Failed, state.State, "Failed policy update should cause failed Coordinator") assert.Contains(t, state.Message, errorStr, "Failed policy update should be reported in Coordinator state message") diff --git a/pkg/component/runtime/manager.go b/pkg/component/runtime/manager.go index 8462ac3c17e..850349406d6 100644 --- a/pkg/component/runtime/manager.go +++ b/pkg/component/runtime/manager.go @@ -93,22 +93,36 @@ type Manager struct { baseLogger *logger.Logger ca *authority.CertificateAuthority listenAddr string + listenPort int agentInfo *info.AgentInfo tracer *apm.Tracer monitor MonitoringManager grpcConfig *configuration.GRPCConfig - // netMx synchronizes the access to listener and server only - netMx sync.RWMutex - listener net.Listener - server *grpc.Server - // Set when the RPC server is ready to receive requests, for use by tests. serverReady *atomic.Bool - // updateMx protects the call to update to ensure that - // only one call to update occurs at a time - updateMx sync.Mutex + // updateChan forwards component model updates from the public Update method + // to the internal run loop. + updateChan chan component.Model + + // Component model update is run asynchronously and pings this channel when + // finished, so the runtime manager loop knows it's safe to advance to the + // next update without ever having to block on the result. + updateDoneChan chan struct{} + + // Next component model update that will be applied, in case we get one + // while a previous update is still in progress. If we get more than one, + // keep only the most recent. + // Only access from the main runtime manager goroutine. + nextUpdate *component.Model + + // Whether we're already waiting on the results of an update call. + // If this is true when the run loop finishes, we need to wait for the + // final update result before shutting down, otherwise the shutdown's + // update call will conflict. + // Only access from the main runtime manager goroutine. 
+	updateInProgress bool
 
 	// currentMx protects access to the current map only
 	currentMx sync.RWMutex
@@ -123,10 +137,9 @@ type Manager struct {
 
 	errCh chan error
 
-	// upon creation the Manager is neither running not shutting down, thus both
-	// flags are needed.
-	running      atomic.Bool
-	shuttingDown atomic.Bool
+	// doneChan is closed when Manager is shutting down to signal that any
+	// pending requests should be canceled.
+	doneChan chan struct{}
 }
 
 // NewManager creates a new manager.
@@ -144,19 +157,22 @@ func NewManager(
 		return nil, err
 	}
 	m := &Manager{
-		logger:        logger,
-		baseLogger:    baseLogger,
-		ca:            ca,
-		listenAddr:    listenAddr,
-		agentInfo:     agentInfo,
-		tracer:        tracer,
-		current:       make(map[string]*componentRuntimeState),
-		shipperConns:  make(map[string]*shipperConn),
-		subscriptions: make(map[string][]*Subscription),
-		errCh:         make(chan error),
-		monitor:       monitor,
-		grpcConfig:    grpcConfig,
-		serverReady:   atomic.NewBool(false),
+		logger:         logger,
+		baseLogger:     baseLogger,
+		ca:             ca,
+		listenAddr:     listenAddr,
+		agentInfo:      agentInfo,
+		tracer:         tracer,
+		current:        make(map[string]*componentRuntimeState),
+		shipperConns:   make(map[string]*shipperConn),
+		subscriptions:  make(map[string][]*Subscription),
+		updateChan:     make(chan component.Model),
+		updateDoneChan: make(chan struct{}),
+		errCh:          make(chan error),
+		monitor:        monitor,
+		grpcConfig:     grpcConfig,
+		serverReady:    atomic.NewBool(false),
+		doneChan:       make(chan struct{}),
 	}
 	return m, nil
 }
@@ -169,16 +185,11 @@ func NewManager(
 //
 // Blocks until the context is done.
 func (m *Manager) Run(ctx context.Context) error {
-	m.running.Store(true)
-	m.shuttingDown.Store(false)
-
-	lis, err := net.Listen("tcp", m.listenAddr)
+	listener, err := net.Listen("tcp", m.listenAddr)
 	if err != nil {
 		return fmt.Errorf("error starting tcp listener for runtime manager: %w", err)
 	}
-	m.netMx.Lock()
-	m.listener = lis
-	m.netMx.Unlock()
+	m.listenPort = listener.Addr().(*net.TCPAddr).Port
 
 	certPool := x509.NewCertPool()
 	if ok := certPool.AppendCertsFromPEM(m.ca.Crt()); !ok {
@@ -205,87 +216,123 @@ func (m *Manager) Run(ctx context.Context) error {
 			grpc.MaxRecvMsgSize(m.grpcConfig.MaxMsgSize),
 		)
 	}
-	m.netMx.Lock()
-	m.server = server
-	m.netMx.Unlock()
-	proto.RegisterElasticAgentServer(m.server, m)
+	proto.RegisterElasticAgentServer(server, m)
 
 	// start serving GRPC connections
-	var wg sync.WaitGroup
-	wg.Add(1)
+	var wgServer sync.WaitGroup
+	wgServer.Add(1)
 	go func() {
-		defer wg.Done()
-		m.serverReady.Store(true)
-		for {
-			err := server.Serve(lis)
-			if err != nil {
-				m.logger.Errorf("control protocol failed: %s", err)
-			}
-			if ctx.Err() != nil {
-				// context has an error don't start again
-				return
-			}
-		}
+		defer wgServer.Done()
+		m.serverLoop(ctx, listener, server)
 	}()
 
-	<-ctx.Done()
-	m.running.Store(false)
-	m.shuttingDown.Store(true)
+	// Start the run loop, which continues on the main goroutine
+	// until the context is canceled.
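+	// runLoop applies at most one component-model update at a time and
+	// coalesces any newer updates that arrive while one is in flight, so a
+	// slow apply can never back up into the Coordinator.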
+ m.runLoop(ctx) + + // Notify components to shutdown and wait for their response m.shutdown() + // Close the rpc listener and wait for serverLoop to return + listener.Close() + wgServer.Wait() + + // Cancel any remaining connections server.Stop() - wg.Wait() - m.netMx.Lock() - m.listener = nil - m.server = nil - m.netMx.Unlock() return ctx.Err() } +// The main run loop for the runtime manager, whose responsibilities are: +// - Accept component model updates from the Coordinator +// - Apply those updates safely without ever blocking, because a block here +// propagates to a block in the Coordinator +// - Close doneChan when the loop ends, so the Coordinator knows not to send +// any more updates +func (m *Manager) runLoop(ctx context.Context) { +LOOP: + for ctx.Err() == nil { + select { + case <-ctx.Done(): + break LOOP + case model := <-m.updateChan: + // We got a new component model from m.Update(), mark it as the + // next update to apply, overwriting any previous pending value. + m.nextUpdate = &model + case <-m.updateDoneChan: + // An update call has finished, we can initiate another when available. + m.updateInProgress = false + } + + // After each select call, check if there's a pending update that + // can be applied. + if m.nextUpdate != nil && !m.updateInProgress { + // There is a component model update available, apply it. + go func(model component.Model) { + // Run the update with tearDown set to true since this is coming + // from a user-initiated policy update + err := m.update(model, true) + + // When update is done, send its result back to the coordinator, + // unless we're shutting down. + select { + case m.errCh <- err: + case <-ctx.Done(): + } + // Signal the runtime manager that we're finished. Note that + // we don't select on ctx.Done() in this case because the runtime + // manager always reads the results of an update once initiated, + // even if it is shutting down. + m.updateDoneChan <- struct{}{} + }(*m.nextUpdate) + m.updateInProgress = true + m.nextUpdate = nil + } + } + // Signal that the run loop is ended to unblock any incoming messages. + // We need to do this before waiting on the final update result, otherwise + // it might be stuck trying to send the result to errCh. + close(m.doneChan) + + if m.updateInProgress { + // Wait for the existing update to finish before shutting down, + // otherwise the new update call closing everything will + // conflict. + <-m.updateDoneChan + m.updateInProgress = false + } +} + +func (m *Manager) serverLoop(ctx context.Context, listener net.Listener, server *grpc.Server) { + m.serverReady.Store(true) + for ctx.Err() == nil { + err := server.Serve(listener) + if err != nil && ctx.Err() == nil { + // Only log an error if we aren't shutting down, otherwise we'll spam + // the logs with "use of closed network connection" for a connection that + // was closed on purpose. + m.logger.Errorf("control protocol listener failed: %s", err) + } + } +} + // Errors returns channel that errors are reported on. func (m *Manager) Errors() <-chan error { return m.errCh } -// Update updates the currComp state of the running components. +// Update forwards a new component model to Manager's run loop. +// When it has been processed, a result will be sent on Manager's +// error channel. // Called from the main Coordinator goroutine. // -// This returns as soon as possible, the work is performed in the background. 
-func (m *Manager) Update(model component.Model) error { - shuttingDown := m.shuttingDown.Load() - if shuttingDown { - // ignore any updates once shutdown started - return nil - } - // teardown is true because the public `Update` method would be coming directly from - // policy so if a component was removed it needs to be torn down. - return m.update(model, true) -} - -// State returns the current component states. -func (m *Manager) State() []ComponentComponentState { - m.currentMx.RLock() - defer m.currentMx.RUnlock() - states := make([]ComponentComponentState, 0, len(m.current)) - for _, crs := range m.current { - var legacyPID string - if crs.runtime != nil { - if commandRuntime, ok := crs.runtime.(*commandRuntime); ok { - if commandRuntime != nil { - procInfo := commandRuntime.proc - if procInfo != nil { - legacyPID = fmt.Sprint(commandRuntime.proc.PID) - } - } - } - } - states = append(states, ComponentComponentState{ - Component: crs.getCurrent(), - State: crs.getLatest(), - LegacyPID: legacyPID, - }) +// If calling from a test, you should read from errCh afterwards to avoid +// blocking Manager's main loop. +func (m *Manager) Update(model component.Model) { + select { + case m.updateChan <- model: + case <-m.doneChan: + // Manager is shutting down, ignore the update } - return states } // PerformAction executes an action on a unit. @@ -658,13 +705,10 @@ func (m *Manager) Actions(server proto.ElasticAgent_ActionsServer) error { } // update updates the current state of the running components. +// It is only called by the main runtime manager goroutine in Manager.Run. // // This returns as soon as possible, work is performed in the background. func (m *Manager) update(model component.Model, teardown bool) error { - // ensure that only one `update` can occur at the same time - m.updateMx.Lock() - defer m.updateMx.Unlock() - // prepare the components to add consistent shipper connection information between // the connected components in the model err := m.connectShippers(model.Components) @@ -891,13 +935,7 @@ func (m *Manager) getRuntimeFromComponent(comp component.Component) *componentRu func (m *Manager) getListenAddr() string { addr := strings.SplitN(m.listenAddr, ":", 2) if len(addr) == 2 && addr[1] == "0" { - m.netMx.RLock() - lis := m.listener - m.netMx.RUnlock() - if lis != nil { - port := lis.Addr().(*net.TCPAddr).Port - return fmt.Sprintf("%s:%d", addr[0], port) - } + return fmt.Sprintf("%s:%d", addr[0], m.listenPort) } return m.listenAddr } diff --git a/pkg/component/runtime/manager_test.go b/pkg/component/runtime/manager_test.go index bb83de6fd04..51bb941bca6 100644 --- a/pkg/component/runtime/manager_test.go +++ b/pkg/component/runtime/manager_test.go @@ -145,7 +145,8 @@ func TestManager_SimpleComponentErr(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(30 * time.Second) @@ -237,7 +238,8 @@ func TestManager_FakeInput_StartStop(t *testing.T) { subErrCh <- fmt.Errorf("unit failed: %s", unit.Message) } else if unit.State == client.UnitStateHealthy { // remove the component which will stop it - err := m.Update(component.Model{Components: []component.Component{}}) + m.Update(component.Model{Components: []component.Component{}}) + err := <-m.errCh if err != nil { subErrCh <- err } @@ -260,7 +262,8 @@ func TestManager_FakeInput_StartStop(t *testing.T) 
{ defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(30 * time.Second) @@ -384,7 +387,8 @@ func TestManager_FakeInput_Features(t *testing.T) { Fqdn: &proto.FQDNFeature{Enabled: true}, } - err := m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err := <-m.errCh if err != nil { subscriptionErrCh <- fmt.Errorf("[case %d]: failed to update component: %w", healthIteration, err) @@ -439,7 +443,8 @@ func TestManager_FakeInput_Features(t *testing.T) { "message": "Fake Healthy", }) - err := m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err := <-m.errCh if err != nil { t.Logf("error updating component state to health: %v", err) @@ -459,7 +464,8 @@ func TestManager_FakeInput_Features(t *testing.T) { defer drainErrChan(managerErrCh) defer drainErrChan(subscriptionErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) timeout := 30 * time.Second @@ -609,7 +615,8 @@ func TestManager_FakeInput_APM(t *testing.T) { comp.Component = &proto.Component{ ApmConfig: initialAPMConfig, } - err := m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err := <-m.errCh if err != nil { subscriptionErrCh <- fmt.Errorf("[case %d]: failed to update component: %w", healthIteration, err) @@ -653,7 +660,8 @@ func TestManager_FakeInput_APM(t *testing.T) { comp.Component = &proto.Component{ ApmConfig: modifiedAPMConfig, } - err := m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err := <-m.errCh if err != nil { subscriptionErrCh <- fmt.Errorf("[case %d]: failed to update component: %w", healthIteration, err) @@ -692,7 +700,8 @@ func TestManager_FakeInput_APM(t *testing.T) { comp.Component = &proto.Component{ ApmConfig: nil, } - err := m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err := <-m.errCh if err != nil { subscriptionErrCh <- fmt.Errorf("[case %d]: failed to update component: %w", healthIteration, err) @@ -741,7 +750,8 @@ func TestManager_FakeInput_APM(t *testing.T) { "message": "Fake Healthy", }) - err := m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err := <-m.errCh if err != nil { t.Logf("error updating component state to health: %v", err) @@ -761,7 +771,8 @@ func TestManager_FakeInput_APM(t *testing.T) { defer drainErrChan(managerErrCh) defer drainErrChan(subscriptionErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) timeout := 30 * time.Second @@ -902,9 +913,10 @@ func TestManager_FakeInput_Limits(t *testing.T) { GoMaxProcs: 101, }, } - err := m.Update(component.Model{ + m.Update(component.Model{ Components: []component.Component{comp}, }) + err := <-m.errCh if err != nil { subscriptionErrCh <- 
fmt.Errorf("[case %d]: failed to update component: %w", healthyIteration, err) @@ -917,9 +929,10 @@ func TestManager_FakeInput_Limits(t *testing.T) { assert.Equal(t, uint64(101), componentState.Component.Limits.GoMaxProcs) comp.Component = nil - err := m.Update(component.Model{ + m.Update(component.Model{ Components: []component.Component{comp}, }) + err := <-m.errCh if err != nil { subscriptionErrCh <- fmt.Errorf("[case %d]: failed to update component: %w", healthyIteration, err) @@ -945,7 +958,8 @@ func TestManager_FakeInput_Limits(t *testing.T) { defer drainErrChan(managerErrCh) defer drainErrChan(subscriptionErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) timeout := 30 * time.Second @@ -1063,9 +1077,10 @@ func TestManager_FakeShipper_Limits(t *testing.T) { GoMaxProcs: 101, }, } - err := m.Update(component.Model{ + m.Update(component.Model{ Components: []component.Component{comp}, }) + err := <-m.errCh if err != nil { subscriptionErrCh <- fmt.Errorf("[case %d]: failed to update component: %w", healthyIteration, err) @@ -1078,9 +1093,10 @@ func TestManager_FakeShipper_Limits(t *testing.T) { assert.Equal(t, uint64(101), componentState.Component.Limits.GoMaxProcs) comp.Component = nil - err := m.Update(component.Model{ + m.Update(component.Model{ Components: []component.Component{comp}, }) + err := <-m.errCh if err != nil { subscriptionErrCh <- fmt.Errorf("[case %d]: failed to update component: %w", healthyIteration, err) @@ -1106,7 +1122,8 @@ func TestManager_FakeShipper_Limits(t *testing.T) { defer drainErrChan(managerErrCh) defer drainErrChan(subscriptionErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) timeout := 30 * time.Second @@ -1222,7 +1239,8 @@ func TestManager_FakeInput_BadUnitToGood(t *testing.T) { } unitBad = false - err := m.Update(component.Model{Components: []component.Component{updatedComp}}) + m.Update(component.Model{Components: []component.Component{updatedComp}}) + err := <-m.errCh if err != nil { subErrCh <- err } @@ -1250,7 +1268,8 @@ func TestManager_FakeInput_BadUnitToGood(t *testing.T) { } } else if unit.State == client.UnitStateHealthy { // bad unit is now healthy; stop the component - err := m.Update(component.Model{Components: []component.Component{}}) + m.Update(component.Model{Components: []component.Component{}}) + err := <-m.errCh if err != nil { subErrCh <- err } @@ -1274,7 +1293,8 @@ func TestManager_FakeInput_BadUnitToGood(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(30 * time.Second) @@ -1365,7 +1385,8 @@ func TestManager_FakeInput_GoodUnitToBad(t *testing.T) { endTimer := time.NewTimer(30 * time.Second) defer endTimer.Stop() - err = m.Update(component.Model{Components: []component.Component{healthyComp}}) + m.Update(component.Model{Components: []component.Component{healthyComp}}) + err = <-m.errCh require.NoError(t, err) // nextState tracks the stage of the test. 
We expect the sequence @@ -1395,7 +1416,8 @@ LOOP: if unit.State == client.UnitStateHealthy { // good unit is healthy; now make it bad t.Logf("marking good-input as having a hard-error for config") - err := m.Update(component.Model{Components: []component.Component{unhealthyComp}}) + m.Update(component.Model{Components: []component.Component{unhealthyComp}}) + err := <-m.errCh require.NoError(t, err, "Component model update should succeed") // We next expect to transition to Failed @@ -1414,7 +1436,8 @@ LOOP: if unit.State == client.UnitStateFailed { // Reached the expected state, now send an empty component model // to stop everything. - err := m.Update(component.Model{Components: []component.Component{}}) + m.Update(component.Model{Components: []component.Component{}}) + err := <-m.errCh require.NoError(t, err, "Component model update should succeed") nextState = client.UnitStateStopped } else { @@ -1445,12 +1468,10 @@ LOOP: } func TestManager_FakeInput_NoDeadlock(t *testing.T) { - /* - NOTE: This is a long-running test that spams the runtime managers `Update` function to try and - trigger a deadlock. This test takes 2 minutes to run trying to re-produce issue: + // NOTE: This is a long-running test that spams the runtime managers `Update` function to try and + // trigger a deadlock. This test takes 2 minutes to run trying to re-produce issue: + // https://github.com/elastic/elastic-agent/issues/2691 - https://github.com/elastic/elastic-agent/issues/2691 - */ testPaths(t) ctx, cancel := context.WithCancel(context.Background()) @@ -1523,7 +1544,8 @@ func TestManager_FakeInput_NoDeadlock(t *testing.T) { } i += 1 comp = updatedComp - err := m.Update(component.Model{Components: []component.Component{updatedComp}}) + m.Update(component.Model{Components: []component.Component{updatedComp}}) + err := <-m.errCh if err != nil { updatedErr <- err return @@ -1564,7 +1586,8 @@ LOOP: case <-endTimer.C: // no deadlock after timeout (all good stop the component) updatedCancel() - _ = m.Update(component.Model{Components: []component.Component{}}) + m.Update(component.Model{Components: []component.Component{}}) + <-m.errCh // Don't care about the result of Update, just that it runs break LOOP case err := <-errCh: require.NoError(t, err) @@ -1656,7 +1679,8 @@ func TestManager_FakeInput_Configure(t *testing.T) { "state": int(client.UnitStateDegraded), "message": "Fake Degraded", }) - err := m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err := <-m.errCh if err != nil { subErrCh <- err } @@ -1679,7 +1703,8 @@ func TestManager_FakeInput_Configure(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(30 * time.Second) @@ -1797,7 +1822,8 @@ func TestManager_FakeInput_RemoveUnit(t *testing.T) { } else if unit1.State == client.UnitStateHealthy { // unit1 is healthy lets remove it from the component comp.Units = comp.Units[0:1] - err := m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh if err != nil { subErrCh <- err } @@ -1832,7 +1858,8 @@ func TestManager_FakeInput_RemoveUnit(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: 
[]component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(30 * time.Second) @@ -1956,7 +1983,8 @@ func TestManager_FakeInput_ActionState(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(30 * time.Second) @@ -2091,7 +2119,8 @@ func TestManager_FakeInput_Restarts(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(30 * time.Second) @@ -2208,7 +2237,8 @@ func TestManager_FakeInput_Restarts_ConfigKill(t *testing.T) { "message": "Fake Healthy", "kill": rp[1], }) - err := m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err := <-m.errCh if err != nil { subErrCh <- err } @@ -2233,7 +2263,8 @@ func TestManager_FakeInput_Restarts_ConfigKill(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(1 * time.Minute) @@ -2347,7 +2378,8 @@ func TestManager_FakeInput_KeepsRestarting(t *testing.T) { "message": fmt.Sprintf("Fake Healthy %d", lastStoppedCount), "kill_on_interval": true, }) - err := m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err := <-m.errCh if err != nil { subErrCh <- err } @@ -2375,7 +2407,8 @@ func TestManager_FakeInput_KeepsRestarting(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(1 * time.Minute) @@ -2493,7 +2526,8 @@ func TestManager_FakeInput_RestartsOnMissedCheckins(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(30 * time.Second) @@ -2610,7 +2644,8 @@ func TestManager_FakeInput_InvalidAction(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: []component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(30 * time.Second) @@ -2807,7 +2842,8 @@ func TestManager_FakeInput_MultiComponent(t *testing.T) { defer drainErrChan(subErrCh1) defer drainErrChan(subErrCh2) - err = m.Update(component.Model{Components: components}) + m.Update(component.Model{Components: components}) + err = <-m.errCh require.NoError(t, err) count := 0 @@ -2963,7 +2999,8 @@ func TestManager_FakeInput_LogLevel(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: 
[]component.Component{comp}}) + m.Update(component.Model{Components: []component.Component{comp}}) + err = <-m.errCh require.NoError(t, err) endTimer := time.NewTimer(30 * time.Second) @@ -3177,7 +3214,8 @@ func TestManager_FakeShipper(t *testing.T) { subErrCh <- err } else { // successful; turn it all off - err := m.Update(component.Model{Components: []component.Component{}}) + m.Update(component.Model{Components: []component.Component{}}) + err = <-m.errCh if err != nil { subErrCh <- err } @@ -3206,7 +3244,8 @@ func TestManager_FakeShipper(t *testing.T) { subErrCh <- err } else { // successful; turn it all off - err := m.Update(component.Model{Components: []component.Component{}}) + m.Update(component.Model{Components: []component.Component{}}) + err := <-m.errCh if err != nil { subErrCh <- err } @@ -3241,7 +3280,8 @@ func TestManager_FakeShipper(t *testing.T) { subErrCh <- err } else { // successful; turn it all off - err := m.Update(component.Model{Components: []component.Component{}}) + m.Update(component.Model{Components: []component.Component{}}) + err := <-m.errCh if err != nil { subErrCh <- err } @@ -3266,7 +3306,8 @@ func TestManager_FakeShipper(t *testing.T) { defer drainErrChan(errCh) defer drainErrChan(subErrCh) - err = m.Update(component.Model{Components: comps}) + m.Update(component.Model{Components: comps}) + err = <-m.errCh require.NoError(t, err) timeout := 2 * time.Minute @@ -3458,11 +3499,8 @@ func TestManager_FakeInput_OutputChange(t *testing.T) { stateProgressionWG.Done() }() - // Wait manager start running, then check if any error happened - assert.Eventually(t, - func() bool { return m.running.Load() }, - 500*time.Millisecond, - 10*time.Millisecond) + err = waitForReady(waitCtx, m) + require.NoError(t, err, "Manager must finish initializing") select { case err := <-errCh: @@ -3471,7 +3509,8 @@ func TestManager_FakeInput_OutputChange(t *testing.T) { } time.Sleep(100 * time.Millisecond) - err = m.Update(component.Model{Components: components}) + m.Update(component.Model{Components: components}) + err = <-m.errCh require.NoError(t, err) updateSleep := 300 * time.Millisecond @@ -3480,7 +3519,8 @@ func TestManager_FakeInput_OutputChange(t *testing.T) { updateSleep = time.Second } time.Sleep(updateSleep) - err = m.Update(component.Model{Components: components2}) + m.Update(component.Model{Components: components2}) + err = <-m.errCh require.NoError(t, err) count := 0 From adf19dbcdfb5540345d79538623efd97061850b5 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Mon, 20 Nov 2023 15:21:22 +0100 Subject: [PATCH 11/21] Sync k8s migtraion to buildkite (#3762) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Sync k8s migtraion to buildkite * Sync k8s migtraion to buildkite * Sync k8s migtraion to buildkite * Sync k8s migtraion to buildkite * Sync k8s migtraion to buildkite * Sync k8s migtraion to buildkite * Sync k8s migtraion to buildkite * Fix sync-k8s.sh * try monorepo-diff step * try monorepo-diff step * try monorepo-diff step * try monorepo-diff step * try monorepo-diff step * Cleanup * Cleanup * Cleanup * Added retry fro install-gh.sh. 
Cleanup * Update .buildkite/scripts/install-gh.sh Co-authored-by: Paolo Chilà --------- Co-authored-by: Paolo Chilà --- .buildkite/pipeline.yml | 20 +++++++++++ .buildkite/scripts/install-gh.sh | 54 ++++++++++++++++++++++++++++ .buildkite/scripts/install-kind.sh | 2 +- .buildkite/scripts/steps/sync-k8s.sh | 22 ++++++++++++ deploy/kubernetes/Makefile | 2 +- 5 files changed, 98 insertions(+), 2 deletions(-) create mode 100644 .buildkite/scripts/install-gh.sh create mode 100644 .buildkite/scripts/steps/sync-k8s.sh diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 866dc91b367..6c47d08bf92 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -227,3 +227,23 @@ steps: plugins: - junit-annotate#v2.4.1: artifacts: build/TEST-go-integration*.xml + + # Triggers a dynamic step: Sync K8s + # Runs only on main and if k8s files are changed + - label: "Trigger k8s sync" + branches: main + plugins: + - monebag/monorepo-diff#v2.5.9: + diff: "git diff --name-only HEAD~1" + watch: + - path: + - deploy/kubernetes/* + - version/docs/version.asciidoc + config: + label: "Sync K8s" + command: ".buildkite/scripts/steps/sync-k8s.sh" + agents: + provider: "gcp" + image: "family/core-ubuntu-2204" + env: + - GH_VERSION=2.4.0 diff --git a/.buildkite/scripts/install-gh.sh b/.buildkite/scripts/install-gh.sh new file mode 100644 index 00000000000..ff52687d02f --- /dev/null +++ b/.buildkite/scripts/install-gh.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash + +# Required environment variables: +# - GH_VERSION - the version of gh to install +set -exuo pipefail + +echo "--- Install gh cli" + +MSG="environment variable missing." +DEFAULT_HOME="/usr/local" +GH_VERSION=${GH_VERSION:-$MSG} +HOME=${HOME:-$DEFAULT_HOME} +GH_CMD="${HOME}/bin/gh" + +if command -v gh +then + set +e + echo "Found GH. Checking version.." + FOUND_GH_VERSION=$(gh --version 2>&1 >/dev/null | awk '{print $3}') + if [ "$FOUND_GH_VERSION" == "$GH_VERSION" ] + then + echo "GH Versions match: $GH_VERSION. No need to install gh. Exiting." + exit 0 + else + echo "GH Version mismatch. Desired version: $GH_VERSION, found version: $FOUND_GH_VERSION. Installing new version." + fi + set -e +fi + +source .buildkite/scripts/common.sh + +OS=$(uname -s| tr '[:upper:]' '[:lower:]') +ARCH=$(uname -m| tr '[:upper:]' '[:lower:]') +if [ "${ARCH}" == "aarch64" ] ; then + ARCH_SUFFIX=arm64 +else + ARCH_SUFFIX=amd64 +fi + +echo "Downloading gh : ${GH_VERSION}..." 
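+# NOTE: the asset name below is pinned to linux_amd64; the ARCH_SUFFIX
+# computed above is not interpolated into the URL or the extracted path.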
+TMP_DIR=$(mktemp -d) +if retry 5 curl -sL "https://github.com/cli/cli/releases/download/v${GH_VERSION}/gh_${GH_VERSION}_linux_amd64.tar.gz" | tar xz -C $TMP_DIR ; then + mkdir -p "${HOME}/bin" + mv "${TMP_DIR}/gh_${GH_VERSION}_linux_amd64/bin/gh" "${GH_CMD}" + rm -rf "${TMP_DIR}" +else + echo "Something bad with the download, deleting the binary" + if [ -e "${GH_CMD}" ] ; then + rm "${GH_CMD}" + fi + exit 1 +fi + + diff --git a/.buildkite/scripts/install-kind.sh b/.buildkite/scripts/install-kind.sh index 171480d7685..ee0e0039719 100644 --- a/.buildkite/scripts/install-kind.sh +++ b/.buildkite/scripts/install-kind.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -set -exuo pipefail +set -euo pipefail echo "--- Install Kind" diff --git a/.buildkite/scripts/steps/sync-k8s.sh b/.buildkite/scripts/steps/sync-k8s.sh new file mode 100644 index 00000000000..ecd64e815a4 --- /dev/null +++ b/.buildkite/scripts/steps/sync-k8s.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -euo pipefail + +export PATH=$HOME/bin:${PATH} + +source .buildkite/scripts/install-gh.sh +source .buildkite/scripts/common.sh + +export GITHUB_TOKEN=$(retry 5 vault kv get -field token kv/ci-shared/platform-ingest/github_token) + +cd deploy/kubernetes + +echo "--- [File Creation] Create-Needed-Manifest" +WITHOUTCONFIG=true make generate-k8s +./creator_k8s_manifest.sh . + +echo "--- [Clone] Kibana-Repository" +make ci-clone-kibana-repository +cp Makefile ./kibana +cd kibana +echo "--- Create Kibana PR" +make ci-create-kubernetes-templates-pull-request \ No newline at end of file diff --git a/deploy/kubernetes/Makefile b/deploy/kubernetes/Makefile index 6247c9461e1..d02fc6eeb38 100644 --- a/deploy/kubernetes/Makefile +++ b/deploy/kubernetes/Makefile @@ -81,7 +81,7 @@ else echo "INFO: create pull request" @gh pr create \ --title "Update kubernetes templates for elastic-agent" \ - --body "Automated by ${BUILD_URL}" \ + --body "Automated by ${BUILDKITE_BUILD_URL}" \ --label automation \ --label release_note:skip \ --base main \ From bdbba7baa9cca337e3de9d1b833e73e67ab5411f Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Mon, 20 Nov 2023 15:22:04 +0100 Subject: [PATCH 12/21] Jenkins decommission (#3771) --- .ci/Jenkinsfile | 316 ------------------ .ci/jobs/defaults.yml | 19 -- .ci/jobs/elastic-agent-mbp.yml | 43 --- .ci/jobs/elastic-agent-schedule-daily.yml | 26 -- .ci/jobs/folders.yml | 5 - .ci/schedule-daily.groovy | 48 --- .ci/scripts/install-go.bat | 57 ---- .ci/scripts/install-go.sh | 43 --- .../updatecli-bump-golang.yml | 1 + .github/workflows/bump-golang.yml | 5 +- 10 files changed, 3 insertions(+), 560 deletions(-) delete mode 100644 .ci/Jenkinsfile delete mode 100644 .ci/jobs/defaults.yml delete mode 100644 .ci/jobs/elastic-agent-mbp.yml delete mode 100644 .ci/jobs/elastic-agent-schedule-daily.yml delete mode 100644 .ci/jobs/folders.yml delete mode 100644 .ci/schedule-daily.groovy delete mode 100755 .ci/scripts/install-go.bat delete mode 100755 .ci/scripts/install-go.sh rename .ci/bump-golang.yml => .github/updatecli-bump-golang.yml (98%) diff --git a/.ci/Jenkinsfile b/.ci/Jenkinsfile deleted file mode 100644 index 23f3ca1e707..00000000000 --- a/.ci/Jenkinsfile +++ /dev/null @@ -1,316 +0,0 @@ -#!/usr/bin/env groovy - -@Library('apm@current') _ - -pipeline { - agent { label 'ubuntu-22 && immutable' } - environment { - REPO = "elastic-agent" - BASE_DIR = "src/github.com/elastic/${env.REPO}" - JOB_GIT_CREDENTIALS = "f6c7695a-671e-4f4f-a331-acdce44ff9ba" - PIPELINE_LOG_LEVEL = 'INFO' - SNAPSHOT = true - JOB_GCS_CREDENTIALS = 'fleet-ci-gcs-plugin' // 
Support stash/unstash v2 - JOB_GCS_BUCKET = 'fleet-ci-temp' // Support stash/unstash v2 - JOB_GCS_EXT_BUCKET = 'fleet-ci-artifacts' // Support uploadPackagesToGoogleBucket - JOB_GCS_EXT_CREDENTIALS = 'fleet-ci-gcs-plugin-file-credentials' // Support uploadPackagesToGoogleBucket - DOCKER_ELASTIC_SECRET = 'secret/observability-team/ci/docker-registry/prod' - DOCKER_REGISTRY = 'docker.elastic.co' - DEVELOPER_MODE=true - } - options { - timeout(time: 3, unit: 'HOURS') - buildDiscarder(logRotator(numToKeepStr: '20', artifactNumToKeepStr: '20', daysToKeepStr: '30')) - timestamps() - ansiColor('xterm') - disableResume() - durabilityHint('PERFORMANCE_OPTIMIZED') - rateLimitBuilds(throttle: [count: 60, durationName: 'hour', userBoost: true]) - quietPeriod(10) - } - triggers { - issueCommentTrigger("(${obltGitHubComments()}|^run (integration|end-to-end) tests|/package)") - } - parameters { - // disabled by default, but required for merge, there are two GH checks: - // opt-in with 'ci:integration' - booleanParam(name: 'integration_tests_ci', defaultValue: false, description: 'Enable Integration tests') - - // disabled by default, but required for merge: - // opt-in with 'ci:end-to-end' tag on PR - booleanParam(name: 'end_to_end_tests_ci', defaultValue: false, description: 'Enable End-to-End tests') - - // disabled by default, but required for merge: - // opt-in with 'ci:extended-windows' tag on PR - booleanParam(name: 'extended_windows_ci', defaultValue: false, description: 'Enable Extended Windows tests') - - // disabled by default, but required for merge: - // opt-in with 'ci:extended-m1' tag on PR - booleanParam(name: 'extended_m1_ci', defaultValue: false, description: 'Enable M1 tests') - } - stages { - stage('Checkout') { - steps { - pipelineManager([ cancelPreviousRunningBuilds: [ when: 'PR' ] ]) - deleteDir() - gitCheckout(basedir: "${BASE_DIR}", githubNotifyFirstTimeContributor: true) - stashV2(name: 'source', bucket: "${JOB_GCS_BUCKET}", credentialsId: "${JOB_GCS_CREDENTIALS}") - dir("${BASE_DIR}"){ - setEnvVar('ONLY_DOCS', isGitRegionMatch(patterns: [ '.*\\.(asciidoc|md)' ], shouldMatchAll: true).toString()) - setEnvVar('PACKAGING_CHANGES', isGitRegionMatch(patterns: [ '(^dev-tools/packaging/.*|.ci/Jenkinsfile|.go-version|Dockerfile)' ], shouldMatchAll: false).toString()) - setEnvVar('K8S_CHANGES', isGitRegionMatch(patterns: [ '(^deploy/kubernetes/.*|^version/docs/version.asciidoc|.ci/Jenkinsfile)' ], shouldMatchAll: false).toString()) - setEnvVar('EXT_WINDOWS_CHANGES', isGitRegionMatch(patterns: [ '.ci/Jenkinsfile' ], shouldMatchAll: false).toString()) - setEnvVar('EXT_M1_CHANGES', isGitRegionMatch(patterns: [ '.ci/Jenkinsfile' ], shouldMatchAll: false).toString()) - // set the GO_VERSION env variable with the go version to be used in withMageEnv - setEnvVar('GO_VERSION', readFile(file: '.go-version')?.trim()) - } - } - } - stage('Check'){ - steps { - withGithubNotify(context: "Check") { - withMageEnv(){ - dir("${BASE_DIR}"){ - setEnvVar('BEAT_VERSION', sh(label: 'Get beat version', script: 'make get-version', returnStdout: true)?.trim()) - log(level: 'INFO', text: "env.BEAT_VERSION=${env.BEAT_VERSION}") - cmd(label: 'check', script: 'make check-ci') - } - } - } - } - } - stage('Test') { - when { - beforeAgent true - expression { return env.ONLY_DOCS == "false" } - } - failFast false - matrix { - agent {label "${PLATFORM}"} - options { skipDefaultCheckout() } - axes { - axis { - name 'PLATFORM' - // Orka workers are not healthy (memory and connectivity issues) - values 'ubuntu-22 && 
immutable', 'aws && aarch64 && gobld/diskSizeGb:200', 'windows-2016 && windows-immutable', 'windows-2022 && windows-immutable' //, 'macos12 && x86_64' - } - } - stages { - stage('Package') { - when { - beforeAgent true - allOf { - anyOf { - expression { return isE2eEnabled() } - expression { return isPackageEnabled() } - not { changeRequest() } - } - // Run packaging only for the linux specific arch - expression { return (PLATFORM.contains('ubuntu') || PLATFORM.contains('aarch64')) } - } - } - environment { - ARCH = "${PLATFORM.contains('aarch64') ? 'arm64' : 'amd64'}" - DEV = true - EXTERNAL = true - } - steps { - withGithubNotify(context: "Package ${PLATFORM}") { - deleteDir() - unstashV2(name: 'source', bucket: "${JOB_GCS_BUCKET}", credentialsId: "${JOB_GCS_CREDENTIALS}") - withMageEnv(){ - dir("${BASE_DIR}"){ - withPackageEnv("${PLATFORM}") { - cmd(label: 'Go package', script: 'mage package ironbank') - uploadPackagesToGoogleBucket( - credentialsId: env.JOB_GCS_EXT_CREDENTIALS, - repo: env.REPO, - bucket: env.JOB_GCS_EXT_BUCKET, - pattern: "build/distributions/**/*") - pushDockerImages( - registry: env.DOCKER_REGISTRY, - secret: env.DOCKER_ELASTIC_SECRET, - snapshot: env.SNAPSHOT, - version: env.BEAT_VERSION, - images: [ - [ source: "beats/elastic-agent", arch: env.ARCH, target: "observability-ci/elastic-agent"], - [ source: "beats/elastic-agent-oss", arch: env.ARCH, target: "observability-ci/elastic-agent-oss"], - [ source: "beats/elastic-agent-ubi9", arch: env.ARCH, target: "observability-ci/elastic-agent-ubi9"], - [ source: "beats/elastic-agent-complete", arch: env.ARCH, target: "observability-ci/elastic-agent-complete"], - [ source: "beats-ci/elastic-agent-cloud", arch: env.ARCH, target: "observability-ci/elastic-agent-cloud"] - ] - ) - } - } - } - } - } - } - } - } - } - stage('Sync K8s') { //This stage opens a PR to kibana Repository in order to sync k8s manifests - when { - // Only on main branch - // Enable if k8s related changes. - allOf { - branch 'main' // Only runs for branch main - expression { return env.K8S_CHANGES == "true" } // If k8s changes - } - } - failFast false - agent {label 'ubuntu-22 && immutable'} - options { skipDefaultCheckout() } - stages { - stage('OpenKibanaPR') { - steps { - withGhEnv(version: '2.4.0') { - deleteDir() - unstashV2(name: 'source', bucket: "${JOB_GCS_BUCKET}", credentialsId: "${JOB_GCS_CREDENTIALS}") - dir("${BASE_DIR}/deploy/kubernetes"){ - sh(label: '[File Creation] Create-Needed-Manifest', script: """ - WITHOUTCONFIG=true make generate-k8s - ./creator_k8s_manifest.sh . """) - sh(label: '[Clone] Kibana-Repository', script: """ - make ci-clone-kibana-repository - cp Makefile ./kibana - cd kibana - make ci-create-kubernetes-templates-pull-request """) - } - } - } - post { - always { - junit(allowEmptyResults: true, keepLongStdio: true, testResults: "${BASE_DIR}/build/TEST-*.xml") - } - } - } - } - } - } - post { - cleanup { - notifyBuildResult(prComment: true, - analyzeFlakey: !isTag(), jobName: getFlakyJobName(withBranch: (isPR() ? env.CHANGE_TARGET : env.BRANCH_NAME)), - githubIssue: false, // Disable creating gh issues for build failures while the E2E tests are stabilized. - githubLabels: 'Team:Elastic-Agent-Control-Plane') - } - } -} - -// As agreed let's report the code coverage for Linux but no ARM only. 
-def isCodeCoverageEnabled() { - return (isUnix() && !isArm()) -} - -def withPackageEnv(platform, Closure body) { - if (isUnix()) { - if (isDarwin()) { - withPackageDarwinEnv() { - body() - } - } else { - if (isArm()) { - withPackageArmEnv() { - body() - } - } else { - withPackageLinuxEnv() { - body() - } - } - } - } else { - error 'withPackageEnv: unsupported platform' - } -} - -def withPackageLinuxEnv(Closure body) { - // Copied from https://github.com/elastic/beats/blob/e6e65aa92fe355c95789691ebf5a3bcecaf5b4ea/.ci/packaging.groovy#L126-L142 - def PLATFORMS = [ '+all', - 'linux/amd64', - 'linux/386', - 'linux/arm64', - // armv7 packaging isn't working, and we don't currently - // need it for release. Do not re-enable it without - // confirming it is fixed, you will break the packaging - // pipeline! - //'linux/armv7', - // The platforms above are disabled temporarly as crossbuild images are - // not available. See: https://github.com/elastic/golang-crossbuild/issues/71 - //'linux/ppc64le', - //'linux/mips64', - //'linux/s390x', - 'windows/amd64', - 'windows/386' - ].join(' ') - withEnv([ - "PLATFORMS=${PLATFORMS}" - ]) { - body() - } -} - -def withPackageArmEnv(Closure body) { - // Copied from https://github.com/elastic/beats/blob/e6e65aa92fe355c95789691ebf5a3bcecaf5b4ea/.ci/packaging.groovy#L126-L142 - def PLATFORMS = [ 'linux/arm64' ].join(' ') - withEnv([ - "PLATFORMS=${PLATFORMS}", - "PACKAGES=docker" - ]) { - body() - } -} - -def withPackageDarwinEnv(Closure body) { - // Copied from https://github.com/elastic/beats/blob/e6e65aa92fe355c95789691ebf5a3bcecaf5b4ea/.ci/packaging.groovy#L126-L142 - def PLATFORMS = [ '+all', - 'darwin/amd64', - 'darwin/arm64', - ].join(' ') - withEnv([ - "PLATFORMS=${PLATFORMS}" - ]) { - body() - } -} - -def runK8s(Map args=[:]) { - withGithubNotify(context: args.context) { - withMageEnv(){ - withKindEnv(args) { - dir("${BASE_DIR}"){ - sh(label: "Deploy to kubernetes",script: "make -C deploy/kubernetes test") - } - } - } - } -} - -/** -* Wrapper to know if the build should enalbe the e2e stage -*/ -def isE2eEnabled() { - return params.end_to_end_tests_ci || env.GITHUB_COMMENT?.contains('e2e tests') || matchesPrLabel(label: 'ci:end-to-end') -} - -/** -* Wrapper to know if the build should enalbe the package stage -*/ -def isPackageEnabled() { - return env.PACKAGING_CHANGES == "true" || env.GITHUB_COMMENT?.contains('package') || matchesPrLabel(label: 'ci:package') -} - -/** -* Wrapper to know if the build should enable the windows extended support -*/ -def isExtendedWindowsEnabled() { - return env.EXT_WINDOWS_CHANGES == "true" || params.extended_windows_ci || env.GITHUB_COMMENT?.contains('extended windows') || matchesPrLabel(label: 'ci:extended-windows') -} - -/** -* Wrapper to know if the build should enable the M1 extended support -*/ -def isExtendedM1Enabled() { - return env.EXT_M1_CHANGES == "true" || params.extended_m1_ci || env.GITHUB_COMMENT?.contains('extended m1') || matchesPrLabel(label: 'ci:extended-m1') -} diff --git a/.ci/jobs/defaults.yml b/.ci/jobs/defaults.yml deleted file mode 100644 index a2bcaae57e1..00000000000 --- a/.ci/jobs/defaults.yml +++ /dev/null @@ -1,19 +0,0 @@ - ---- - -##### GLOBAL METADATA - -- meta: - cluster: fleet-ci - -##### JOB DEFAULTS - -- job: - logrotate: - numToKeep: 20 - node: linux - concurrent: true - publishers: - - email: - recipients: infra-root+build@elastic.co - prune-dead-branches: true diff --git a/.ci/jobs/elastic-agent-mbp.yml b/.ci/jobs/elastic-agent-mbp.yml deleted file mode 100644 index 
8947d15880a..00000000000 --- a/.ci/jobs/elastic-agent-mbp.yml +++ /dev/null @@ -1,43 +0,0 @@ ---- -- job: - name: "elastic-agent/elastic-agent-mbp" - display-name: elastic-agent - description: "Elastic agent" - project-type: multibranch - script-path: .ci/Jenkinsfile - scm: - - github: - branch-discovery: no-pr - discover-pr-forks-strategy: merge-current - discover-pr-forks-trust: permission - discover-pr-origin: merge-current - discover-tags: true - head-filter-regex: '(main|7\.17|8\.\d+|PR-.*|v\d+\.\d+\.\d+)' - notification-context: 'fleet-ci' - repo: elastic-agent - repo-owner: elastic - credentials-id: 2a9602aa-ab9f-4e52-baf3-b71ca88469c7-UserAndToken - ssh-checkout: - credentials: f6c7695a-671e-4f4f-a331-acdce44ff9ba - build-strategies: - - tags: - ignore-tags-older-than: -1 - ignore-tags-newer-than: -1 - - regular-branches: true - - change-request: - ignore-target-only-changes: true - clean: - after: true - before: true - prune: true - shallow-clone: true - depth: 4 - do-not-fetch-tags: true - submodule: - disable: false - recursive: true - parent-credentials: true - timeout: 100 - timeout: '15' - use-author: true - wipe-workspace: true diff --git a/.ci/jobs/elastic-agent-schedule-daily.yml b/.ci/jobs/elastic-agent-schedule-daily.yml deleted file mode 100644 index 2fd728a9e8c..00000000000 --- a/.ci/jobs/elastic-agent-schedule-daily.yml +++ /dev/null @@ -1,26 +0,0 @@ ---- -- job: - name: "elastic-agent/elastic-agent-schedule-daily" - display-name: Jobs scheduled daily (weekdays) - description: Jobs scheduled daily (weekdays) - view: Beats - project-type: pipeline - parameters: - - string: - name: branch_specifier - default: main - description: the Git branch specifier to build - pipeline-scm: - script-path: .ci/schedule-daily.groovy - scm: - - git: - url: git@github.com:elastic/elastic-agent.git - refspec: +refs/heads/*:refs/remotes/origin/* - wipe-workspace: 'True' - name: origin - shallow-clone: true - credentials-id: f6c7695a-671e-4f4f-a331-acdce44ff9ba - branches: - - $branch_specifier - triggers: - - timed: 'H H(2-3) * * 1-5' diff --git a/.ci/jobs/folders.yml b/.ci/jobs/folders.yml deleted file mode 100644 index cf309e259a3..00000000000 --- a/.ci/jobs/folders.yml +++ /dev/null @@ -1,5 +0,0 @@ ---- -- job: - name: "elastic-agent" - description: "Elastic Agent" - project-type: folder diff --git a/.ci/schedule-daily.groovy b/.ci/schedule-daily.groovy deleted file mode 100644 index 70ad931ecda..00000000000 --- a/.ci/schedule-daily.groovy +++ /dev/null @@ -1,48 +0,0 @@ -@Library('apm@current') _ - -pipeline { - agent none - environment { - NOTIFY_TO = credentials('notify-to') - PIPELINE_LOG_LEVEL = 'INFO' - } - options { - timeout(time: 1, unit: 'HOURS') - buildDiscarder(logRotator(numToKeepStr: '20', artifactNumToKeepStr: '20')) - timestamps() - ansiColor('xterm') - disableResume() - durabilityHint('PERFORMANCE_OPTIMIZED') - } - triggers { - cron('H H(2-3) * * 1-5') - } - stages { - stage('Nighly beats builds') { - steps { - runBuilds(quietPeriodFactor: 2000, branches: ['main', '8.', '8.']) - } - } - } - post { - cleanup { - notifyBuildResult(prComment: false) - } - } -} - -def runBuilds(Map args = [:]) { - def branches = getBranchesFromAliases(aliases: args.branches) - def quietPeriod = 0 - branches.each { branch -> - build(quietPeriod: quietPeriod, - job: "elastic-agent/elastic-agent-mbp/${branch}", - parameters: [ - booleanParam(name: 'integration_tests_ci', value: true), - // Disable running e2e until we fix the 2e2 testing suite - booleanParam(name: 'end_to_end_tests_ci', value: 
false) - ], - wait: false, propagate: false) - quietPeriod += args.quietPeriodFactor - } -} diff --git a/.ci/scripts/install-go.bat b/.ci/scripts/install-go.bat deleted file mode 100755 index 29448bd4f63..00000000000 --- a/.ci/scripts/install-go.bat +++ /dev/null @@ -1,57 +0,0 @@ -set GOPATH=%WORKSPACE% -set MAGEFILE_CACHE=%WORKSPACE%\.magefile - -set PATH=%WORKSPACE%\bin;C:\ProgramData\chocolatey\bin;%PATH% - -curl --version >nul 2>&1 && ( - echo found curl -) || ( - choco install curl -y --no-progress --skipdownloadcache -) - -mkdir %WORKSPACE%\bin - -IF EXIST "%PROGRAMFILES(X86)%" ( - REM Force the gvm installation. - SET GVM_BIN=gvm.exe - curl -L -o %WORKSPACE%\bin\gvm.exe https://github.com/andrewkroh/gvm/releases/download/v0.3.0/gvm-windows-amd64.exe - IF ERRORLEVEL 1 ( - REM gvm installation has failed. - del bin\gvm.exe /s /f /q - exit /b 1 - ) -) ELSE ( - REM Windows 7 workers got a broken gvm installation. - curl -L -o %WORKSPACE%\bin\gvm.exe https://github.com/andrewkroh/gvm/releases/download/v0.3.0/gvm-windows-386.exe - IF ERRORLEVEL 1 ( - REM gvm installation has failed. - del bin\gvm.exe /s /f /q - exit /b 1 - ) -) - -SET GVM_BIN=gvm.exe -WHERE /q %GVM_BIN% -%GVM_BIN% version - -REM Install the given go version -%GVM_BIN% --debug install %GO_VERSION% - -REM Configure the given go version -FOR /f "tokens=*" %%i IN ('"%GVM_BIN%" use %GO_VERSION% --format=batch') DO %%i - -go env -IF ERRORLEVEL 1 ( - REM go is not configured correctly. - rmdir %WORKSPACE%\.gvm /s /q - exit /b 1 -) - -where mage -mage -version -IF ERRORLEVEL 1 ( - go get github.com/magefile/mage - IF ERRORLEVEL 1 ( - exit /b 1 - ) -) diff --git a/.ci/scripts/install-go.sh b/.ci/scripts/install-go.sh deleted file mode 100755 index 31566c08726..00000000000 --- a/.ci/scripts/install-go.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail - -MSG="environment variable missing" -GO_VERSION=${GO_VERSION:?$MSG} -PROPERTIES_FILE=${PROPERTIES_FILE:-"go_env.properties"} -HOME=${HOME:?$MSG} -OS=$(uname -s| tr '[:upper:]' '[:lower:]') -ARCH=$(uname -m| tr '[:upper:]' '[:lower:]') -GVM_CMD="${HOME}/bin/gvm" - -if command -v go -then - set +e - echo "Found Go. Checking version.." - FOUND_GO_VERSION=$(go version|awk '{print $3}'|sed s/go//) - if [ "$FOUND_GO_VERSION" == "$GO_VERSION" ] - then - echo "Versions match. No need to install Go. Exiting." 
-        exit 0
-    fi
-    set -e
-fi
-
-if [ "${ARCH}" == "aarch64" ] ; then
-    GVM_ARCH_SUFFIX=arm64
-elif [ "${ARCH}" == "x86_64" ] ; then
-    GVM_ARCH_SUFFIX=amd64
-elif [ "${ARCH}" == "i686" ] ; then
-    GVM_ARCH_SUFFIX=386
-else
-    GVM_ARCH_SUFFIX=arm
-fi
-
-echo "UNMET DEP: Installing Go"
-mkdir -p "${HOME}/bin"
-
-curl -sSLo "${GVM_CMD}" "https://github.com/andrewkroh/gvm/releases/download/v0.3.0/gvm-${OS}-${GVM_ARCH_SUFFIX}"
-chmod +x "${GVM_CMD}"
-
-${GVM_CMD} "${GO_VERSION}" |cut -d ' ' -f 2|tr -d '\"' > ${PROPERTIES_FILE}
-
-eval "$("${GVM_CMD}" "${GO_VERSION}")"
diff --git a/.ci/bump-golang.yml b/.github/updatecli-bump-golang.yml
similarity index 98%
rename from .ci/bump-golang.yml
rename to .github/updatecli-bump-golang.yml
index 470c6f4c8d5..df5d8104048 100644
--- a/.ci/bump-golang.yml
+++ b/.github/updatecli-bump-golang.yml
@@ -1,3 +1,4 @@
+# update-cli configuration for automated go updates
 ---
 name: Bump golang-version to latest version
 
diff --git a/.github/workflows/bump-golang.yml b/.github/workflows/bump-golang.yml
index 30b8fbd89b3..34d488b9d72 100644
--- a/.github/workflows/bump-golang.yml
+++ b/.github/workflows/bump-golang.yml
@@ -4,7 +4,7 @@ name: bump-golang
 on:
   workflow_dispatch:
   schedule:
-    - cron: '0 20 * * 6'
+    - cron: "0 20 * * 6"
 
 permissions:
   contents: read
@@ -13,7 +13,6 @@ jobs:
   bump:
     runs-on: ubuntu-latest
     steps:
-
       - uses: actions/checkout@v3
 
       - uses: elastic/apm-pipeline-library/.github/actions/updatecli@current
@@ -21,4 +20,4 @@
           vaultUrl: ${{ secrets.VAULT_ADDR }}
           vaultRoleId: ${{ secrets.VAULT_ROLE_ID }}
           vaultSecretId: ${{ secrets.VAULT_SECRET_ID }}
-          pipeline: ./.ci/bump-golang.yml
+          pipeline: ./.github/updatecli-bump-golang.yml

From 08c51501035a2aaae1d52500db1f40f671d40815 Mon Sep 17 00:00:00 2001
From: Pavel Zorin
Date: Mon, 20 Nov 2023 15:25:55 +0100
Subject: [PATCH 13/21] Migrate check-ci step to buildkite (#3759)

* Migrate check-ci step to buildkite

* Remove check-ci stage from Jenkinsfile

* Migrate check-ci step to buildkite

* Migrate check-ci step to buildkite

* chmod check-ci.sh

* Removed package step
---
 .buildkite/pipeline.yml              |  9 +++++++++
 .buildkite/scripts/steps/check-ci.sh | 12 ++++++++++++
 2 files changed, 21 insertions(+)
 create mode 100755 .buildkite/scripts/steps/check-ci.sh

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 6c47d08bf92..f251bcaf81c 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -4,6 +4,15 @@ env:
   VAULT_PATH: "kv/ci-shared/observability-ingest/cloud/gcp"
   DOCKER_REGISTRY: "docker.elastic.co"
 steps:
+  - label: "check-ci"
+    key: "check-ci"
+    command: ".buildkite/scripts/steps/check-ci.sh"
+    agents:
+      provider: "gcp"
+      image: "family/core-ubuntu-2204"
+    retry:
+      manual:
+        allowed: true
   - group: "Unit tests"
     key: "unit-tests"
     steps:
diff --git a/.buildkite/scripts/steps/check-ci.sh b/.buildkite/scripts/steps/check-ci.sh
new file mode 100755
index 00000000000..46923d1b7ef
--- /dev/null
+++ b/.buildkite/scripts/steps/check-ci.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+source .buildkite/scripts/common.sh
+
+echo "--- Check CI"
+go version
+mage --version
+BEAT_VERSION=$(make get-version)
+echo "Beat version: $BEAT_VERSION"
+make check-ci
\ No newline at end of file

From d5f34dfd21a969f7422efb5e0f085d96d89d4059 Mon Sep 17 00:00:00 2001
From: Alex K <8418476+fearful-symmetry@users.noreply.github.com>
Date: Mon, 20 Nov 2023 08:00:44 -0800
Subject: [PATCH 14/21] Add serverless beats tests (#3658)

* cleaning up

* final bit of cleanup

* fix magefile, cleanup docs

* clean up 
errors, make linter happy * fix headers * fix fields in runner config * add dashboard checks * clean up, refactor * clean up * tinker with env vars * fix defaults in fixture * check binary name in test setup * allow ilm override in tests * fix filebeat tests, add cleanup * tinker with dashboards * fix ilm tests * use API keys for auth * add additional integration tests * remove beats-specific code * hack in serverless tests * tinker with tests * change env var naming * actually use correct provisioner name * tinker with buildkite again * fix things after refactor * fix buildkite * fix my bash scripts * my bash is a tad rusty * tinker with script hooks * not sure what ci role I broke * clean up es handlers * deal with recent refactor * fix my broken refactor * change url, see what happens * tinker with tests more * swap pipelines, see what happens * break apart beat runners * create test * add more tests * override tests in progress * finish overwrite tests * still adding tests * cleanup * fix env var names * fix filebeat setup, test names * fixing up filebeat * use templates, fix ES query * tinker with buildkite * still tinkering * still trying to get it to work * still trying to get it to work * add script, see what happens * tinkering with script * tinkering with script * trying to diagnose buildkite * still fighting with buildkite * still trying to get it to work * set workspace * change workspace * change workspace, again * change workspace, again * trying to debug buildkite * add debug statements * run mage twice * I swear I'll figure this out * potential script install issue? * use different condition for if block * fix potential wrong flag in setup scripts * change -p flag * fix workspace path, add tests,remove debug code * use proper env vars * fix mage call, directories * set working directories * now trying to get auth working * change name to make setup happy * disable cleanup * re-enable cleanup * run one test at a time * refactor, run filebeat * move function def * fix if blocks * fix test run, add support for auditbeat and packetbeat * add packetbeat support, run auditbeat tests * add serverless buildkite logic * add more documentation * fix up tests --- .buildkite/pipeline.yml | 12 + .buildkite/scripts/common.sh | 4 +- .buildkite/scripts/steps/beats_tests.sh | 70 +++ magefile.go | 27 + pkg/testing/tools/estools/elasticsearch.go | 2 +- testing/integration/beats_serverless_test.go | 628 +++++++++++++++++++ 6 files changed, 740 insertions(+), 3 deletions(-) create mode 100755 .buildkite/scripts/steps/beats_tests.sh create mode 100644 testing/integration/beats_serverless_test.go diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index f251bcaf81c..e7484aa3fe6 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -228,6 +228,18 @@ steps: provider: "gcp" machineType: "n1-standard-8" + - label: "Serverless Beats Tests" + key: "serverless-beats-integration-tests" + command: ".buildkite/scripts/steps/beats_tests.sh" + if: "build.env('CRON') == 'yes'" + agents: + provider: "gcp" + machineType: "n1-standard-8" + retry: + manual: + allowed: true + + - wait: ~ continue_on_failure: true - label: "Processing test results" diff --git a/.buildkite/scripts/common.sh b/.buildkite/scripts/common.sh index 16d32fb77ab..86c370ae360 100644 --- a/.buildkite/scripts/common.sh +++ b/.buildkite/scripts/common.sh @@ -53,7 +53,7 @@ getOSOptions() { # Wrapper function for executing mage mage() { go version - if ! [ -x "$(type -p mage | sed 's/mage is //g')" ]; + if ! 
[ -x "$(type -P mage | sed 's/mage is //g')" ]; then echo "installing mage ${SETUP_MAGE_VERSION}" make mage @@ -68,7 +68,7 @@ mage() { # Wrapper function for executing go go(){ # Search for the go in the Path - if ! [ -x "$(type -p go | sed 's/go is //g')" ]; + if ! [ -x "$(type -P go | sed 's/go is //g')" ]; then getOSOptions echo "installing golang "${GO_VERSION}" for "${AGENT_OS_NAME}/${AGENT_OS_ARCH}" " diff --git a/.buildkite/scripts/steps/beats_tests.sh b/.buildkite/scripts/steps/beats_tests.sh new file mode 100755 index 00000000000..b62bc947b6e --- /dev/null +++ b/.buildkite/scripts/steps/beats_tests.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +set -euo pipefail + +#========================= +# NOTE: This entire script is a temporary hack until we have buildkite set up on the beats repo. +# until then, we need some kind of serverless integration tests, hence this script, which just clones the beats repo, +# and runs the serverless integration suite against different beats +# After buildkite is set up on beats, this file/PR should be reverted. +#========================== + +source .buildkite/scripts/common.sh +STACK_PROVISIONER="${1:-"serverless"}" + +run_test_for_beat(){ + local beat_name=$1 + + #build + export WORKSPACE="build/beats/x-pack/${beat_name}" + SNAPSHOT=true PLATFORMS=linux/amd64 PACKAGES=tar.gz,zip mage package + + #run + export AGENT_BUILD_DIR="build/beats/x-pack/${beat_name}/build/distributions" + export WORKSPACE=$(pwd) + + set +e + TEST_INTEG_CLEAN_ON_EXIT=true TEST_PLATFORMS="linux/amd64" STACK_PROVISIONER="$STACK_PROVISIONER" SNAPSHOT=true mage integration:testBeatServerless $beat_name + TESTS_EXIT_STATUS=$? + set -e + + return $TESTS_EXIT_STATUS +} +#run mage before setup, since this will install go and mage +#the setup scripts will do a few things that assume we're running out of elastic-agent and will break things for beats, so run before we do actual setup +mage -l + +mkdir -p build +cd build + +git clone --filter=tree:0 git@github.com:elastic/beats.git +cd .. + +# export WORKSPACE=beats/x-pack/metricbeat + +# SNAPSHOT=true PLATFORMS=linux/amd64,windows/amd64 PACKAGES=tar.gz,zip mage package + + +# cd .. + +# export AGENT_BUILD_DIR=build/beats/x-pack/metricbeat/build/distributions +# export WORKSPACE=$(pwd) + +# set +e +# TEST_INTEG_CLEAN_ON_EXIT=true TEST_PLATFORMS="linux/amd64" STACK_PROVISIONER="$STACK_PROVISIONER" SNAPSHOT=true mage integration:testBeatServerless metricbeat +# TESTS_EXIT_STATUS=$? +# set -e + +# exit $TESTS_EXIT_STATUS + +echo "testing metricbeat..." +run_test_for_beat metricbeat + + + +echo "testing filebeat..." +run_test_for_beat filebeat + + + +echo "testing auditbeat..." 
+run_test_for_beat auditbeat
diff --git a/magefile.go b/magefile.go
index 6f29cd68991..fe320754b91 100644
--- a/magefile.go
+++ b/magefile.go
@@ -1567,6 +1567,33 @@ func (Integration) PrepareOnRemote() {
 	mg.Deps(mage.InstallGoTestTools)
 }
 
+// Run beat serverless tests
+func (Integration) TestBeatServerless(ctx context.Context, beatname string) error {
+	beatBuildPath := filepath.Join("..", "beats", "x-pack", beatname, "build", "distributions")
+	if os.Getenv("AGENT_BUILD_DIR") == "" {
+		err := os.Setenv("AGENT_BUILD_DIR", beatBuildPath)
+		if err != nil {
+			return fmt.Errorf("error setting build dir: %w", err)
+		}
+	}
+
+	// a bit of bypass logic; run as serverless by default
+	if os.Getenv("STACK_PROVISIONER") == "" {
+		err := os.Setenv("STACK_PROVISIONER", "serverless")
+		if err != nil {
+			return fmt.Errorf("error setting serverless stack var: %w", err)
+		}
+	} else if os.Getenv("STACK_PROVISIONER") == "stateful" {
+		fmt.Printf(">>> Warning: running TestBeatServerless as stateful\n")
+	}
+
+	err := os.Setenv("TEST_BINARY_NAME", beatname)
+	if err != nil {
+		return fmt.Errorf("error setting binary name: %w", err)
+	}
+	return integRunner(ctx, false, "TestBeatsServerless")
+}
+
 // TestOnRemote shouldn't be called locally (called on remote host to perform testing)
 func (Integration) TestOnRemote(ctx context.Context) error {
 	mg.Deps(Build.TestBinaries)
diff --git a/pkg/testing/tools/estools/elasticsearch.go b/pkg/testing/tools/estools/elasticsearch.go
index 304e917d7ee..1c85ed788f3 100644
--- a/pkg/testing/tools/estools/elasticsearch.go
+++ b/pkg/testing/tools/estools/elasticsearch.go
@@ -212,7 +212,7 @@ func GetLatestDocumentMatchingQuery(ctx context.Context, client elastictransport
 	queryRaw := map[string]interface{}{
 		"query": query,
 		"sort": map[string]interface{}{
-			"timestamp": "desc",
+			"@timestamp": "desc",
 		},
 		"size": 1,
 	}
diff --git a/testing/integration/beats_serverless_test.go b/testing/integration/beats_serverless_test.go
new file mode 100644
index 00000000000..57123e9142e
--- /dev/null
+++ b/testing/integration/beats_serverless_test.go
@@ -0,0 +1,628 @@
+// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+// or more contributor license agreements. Licensed under the Elastic License;
+// you may not use this file except in compliance with the Elastic License. 
+
+//go:build integration
+
+package integration
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/url"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+	"text/template"
+	"time"
+
+	"github.com/gofrs/uuid"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"github.com/stretchr/testify/suite"
+
+	"github.com/elastic/elastic-agent-libs/mapstr"
+	atesting "github.com/elastic/elastic-agent/pkg/testing"
+	"github.com/elastic/elastic-agent/pkg/testing/define"
+	"github.com/elastic/elastic-agent/pkg/testing/tools"
+	"github.com/elastic/elastic-agent/pkg/testing/tools/estools"
+)
+
+type BeatRunner struct {
+	suite.Suite
+	requirementsInfo *define.Info
+	agentFixture *atesting.Fixture
+
+	// connection info
+	ESHost string
+	user string
+	pass string
+	kibHost string
+
+	testUuid string
+	testbeatName string
+
+	skipCleanup bool
+}
+
+func TestBeatsServerless(t *testing.T) {
+	info := define.Require(t, define.Requirements{
+		OS: []define.OS{
+			{Type: define.Linux},
+		},
+		Stack: &define.Stack{},
+		Local: false,
+		Sudo: true,
+	})
+
+	suite.Run(t, &BeatRunner{requirementsInfo: info})
+}
+
+func (runner *BeatRunner) SetupSuite() {
+	runner.skipCleanup = false
+
+	runner.testbeatName = os.Getenv("TEST_BINARY_NAME")
+	if runner.testbeatName == "" {
+		runner.T().Fatalf("TEST_BINARY_NAME must be set")
+	}
+	if runner.testbeatName == "elastic-agent" {
+		runner.T().Skipf("tests must be run against a beat, not elastic-agent")
+	}
+
+	if runner.testbeatName != "filebeat" && runner.testbeatName != "metricbeat" && runner.testbeatName != "auditbeat" && runner.testbeatName != "packetbeat" {
+		runner.T().Skip("test only supports metricbeat, filebeat, auditbeat, or packetbeat")
+	}
+	runner.T().Logf("running serverless tests with %s", runner.testbeatName)
+
+	agentFixture, err := define.NewFixtureWithBinary(runner.T(), define.Version(), runner.testbeatName, "/home/ubuntu", atesting.WithRunLength(time.Minute*3), atesting.WithAdditionalArgs([]string{"-E", "output.elasticsearch.allow_older_versions=true"}))
+	runner.agentFixture = agentFixture
+	require.NoError(runner.T(), err)
+
+	// the require.* code will fail without these, so assume the values are non-nil
+	runner.ESHost = os.Getenv("ELASTICSEARCH_HOST")
+	runner.user = os.Getenv("ELASTICSEARCH_USERNAME")
+	runner.pass = os.Getenv("ELASTICSEARCH_PASSWORD")
+	runner.kibHost = os.Getenv("KIBANA_HOST")
+
+	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
+	defer cancel()
+
+	beatOutConfig := `
+output.elasticsearch:
+  hosts: ["{{.es_host}}"]
+  api_key: "{{.key_user}}:{{.key_pass}}"
+setup.kibana:
+  host: {{.kb_host}}
+processors:
+  - add_fields:
+      target: host
+      fields:
+        test-id: {{.test_id}}
+{{.beat_cfg}}
+`
+
+	mbCfg := `
+metricbeat.config.modules:
+  path: ${path.config}/modules.d/*.yml
+`
+
+	fbCfg := `
+filebeat.modules:
+  - module: system
+    syslog:
+      enabled: true
+    auth:
+      enabled: true
+filebeat.config.modules:
+  - modules: system
+    syslog:
+      enabled: true
+    auth:
+      enabled: true
+`
+	auditbeatCfg := `
+auditbeat.modules:
+
+- module: file_integrity
+  paths:
+  - /bin
+  - /usr/bin
+  - /sbin
+  - /usr/sbin
+  - /etc
+`
+
+	packetbeatCfg := `
+`
+
+	tmpl, err := template.New("config").Parse(beatOutConfig)
+	require.NoError(runner.T(), err)
+
+	apiResp, err := estools.CreateAPIKey(ctx, runner.requirementsInfo.ESClient, estools.APIKeyRequest{Name: "test-api-key", Expiration: "1d"})
+	require.NoError(runner.T(), err)
+
+	// beats likes to add standard ports to URLs that don't have them, and ESS will sometimes return 
a URL without a port, assuming :443 + // so try to fix that here + fixedKibanaHost := runner.kibHost + parsedKibana, err := url.Parse(runner.kibHost) + require.NoError(runner.T(), err) + if parsedKibana.Port() == "" { + fixedKibanaHost = fmt.Sprintf("%s:443", fixedKibanaHost) + } + + fixedESHost := runner.ESHost + parsedES, err := url.Parse(runner.ESHost) + require.NoError(runner.T(), err) + if parsedES.Port() == "" { + fixedESHost = fmt.Sprintf("%s:443", fixedESHost) + } + + runner.T().Logf("configuring beats with %s / %s", fixedESHost, fixedKibanaHost) + + testUuid, err := uuid.NewV4() + require.NoError(runner.T(), err) + runner.testUuid = testUuid.String() + + additionalCfg := mbCfg + if runner.testbeatName == "filebeat" { + additionalCfg = fbCfg + } else if runner.testbeatName == "auditbeat" { + additionalCfg = auditbeatCfg + } else if runner.testbeatName == "packetbeat" { + additionalCfg = packetbeatCfg + } + + tmpl_map := map[string]string{"es_host": fixedESHost, "key_user": apiResp.Id, "key_pass": apiResp.APIKey, "kb_host": fixedKibanaHost, "test_id": testUuid.String(), "beat_cfg": additionalCfg} + parsedCfg := bytes.Buffer{} + err = tmpl.Execute(&parsedCfg, tmpl_map) + require.NoError(runner.T(), err) + + err = runner.agentFixture.WriteFileToWorkDir(ctx, parsedCfg.String(), fmt.Sprintf("%s.yml", runner.testbeatName)) + require.NoError(runner.T(), err) +} + +// run the beat with default metricsets, ensure no errors in logs + data is ingested +func (runner *BeatRunner) TestRunAndCheckData() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute*4) + defer cancel() + + // in case there's already a running template, delete it, forcing the beat to re-install + runner.CleanupTemplates(ctx) + + err := runner.agentFixture.RunBeat(ctx) + require.NoError(runner.T(), err) + + docs, err := estools.GetLatestDocumentMatchingQuery(ctx, runner.requirementsInfo.ESClient, map[string]interface{}{ + "match": map[string]interface{}{ + "host.test-id": runner.testUuid, + }, + }, fmt.Sprintf("*%s*", runner.testbeatName)) + require.NoError(runner.T(), err) + require.NotEmpty(runner.T(), docs.Hits.Hits) +} + +// tests the [beat] setup --dashboards command +func (runner *BeatRunner) TestSetupDashboards() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute*3) //dashboards seem to take a while + defer cancel() + + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", runner.agentFixture.WorkDir(), "setup", "--dashboards"}) + assert.NoError(runner.T(), err) + runner.T().Logf("got response from dashboard setup: %s", string(resp)) + require.True(runner.T(), strings.Contains(string(resp), "Loaded dashboards")) + + dashList, err := tools.GetDashboards(ctx, runner.requirementsInfo.KibanaClient) + require.NoError(runner.T(), err) + + // interesting hack in cases where we don't have a clean environment + // check to see if any of the dashboards were created recently + found := false + for _, dash := range dashList { + if time.Since(dash.UpdatedAt) < time.Minute*5 { + found = true + break + } + } + require.True(runner.T(), found, fmt.Sprintf("could not find dashboard newer than 5 minutes, out of %d dashboards", len(dashList))) + + runner.Run("export dashboards", runner.SubtestExportDashboards) + // cleanup + if !runner.skipCleanup { + for _, dash := range dashList { + err = tools.DeleteDashboard(ctx, runner.requirementsInfo.KibanaClient, dash.ID) + if err != nil { + runner.T().Logf("WARNING: could not delete dashboards after test: %s", err) + break + } + } + } +} + 
+// tests the [beat] export dashboard command +func (runner *BeatRunner) SubtestExportDashboards() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute*2) + defer cancel() + outDir := runner.T().TempDir() + + dashlist, err := tools.GetDashboards(ctx, runner.requirementsInfo.KibanaClient) + require.NoError(runner.T(), err) + require.NotEmpty(runner.T(), dashlist) + + exportOut, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "export", + "dashboard", "--folder", outDir, "--id", dashlist[0].ID}) + + runner.T().Logf("got output: %s", exportOut) + assert.NoError(runner.T(), err) + + inFolder, err := os.ReadDir(filepath.Join(outDir, "/_meta/kibana/8/dashboard")) + require.NoError(runner.T(), err) + runner.T().Logf("got log contents: %#v", inFolder) + require.NotEmpty(runner.T(), inFolder) +} + +// NOTE for the below tests: the testing framework doesn't guarantee a new stack instance each time, +// which means we might be running against a stack where a previous test has already done setup. +// perhaps CI should run `mage integration:clean` first? + +// tests the [beat] setup --pipelines command +func (runner *BeatRunner) TestSetupPipelines() { + if runner.testbeatName != "filebeat" { + runner.T().Skip("pipelines only available on filebeat") + } + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + + defer func() { + // cleanup + if !runner.skipCleanup { + err := estools.DeletePipelines(ctx, runner.requirementsInfo.ESClient, "*filebeat*") + if err != nil { + runner.T().Logf("WARNING: could not clean up pipelines: %s", err) + } + } + + }() + + // need to actually enable something that has pipelines + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", runner.agentFixture.WorkDir(), + "setup", "--pipelines", "--modules", "apache", "-M", "apache.error.enabled=true", "-M", "apache.access.enabled=true"}) + assert.NoError(runner.T(), err) + + runner.T().Logf("got response from pipeline setup: %s", string(resp)) + + pipelines, err := estools.GetPipelines(ctx, runner.requirementsInfo.ESClient, "*filebeat*") + require.NoError(runner.T(), err) + require.NotEmpty(runner.T(), pipelines) + +} + +// test beat setup --index-management with ILM disabled +func (runner *BeatRunner) TestIndexManagementNoILM() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + defer func() { + runner.CleanupTemplates(ctx) + }() + + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "setup", + "--index-management", + "--E=setup.ilm.enabled=false"}) + runner.T().Logf("got response from management setup: %s", string(resp)) + assert.NoError(runner.T(), err) + // we should not print a warning if we've explicitly disabled ILM + assert.NotContains(runner.T(), string(resp), "not supported") + + tmpls, err := estools.GetIndexTemplatesForPattern(ctx, runner.requirementsInfo.ESClient, fmt.Sprintf("*%s*", runner.testbeatName)) + require.NoError(runner.T(), err) + for _, tmpl := range tmpls.IndexTemplates { + runner.T().Logf("got template: %s", tmpl.Name) + } + require.NotEmpty(runner.T(), tmpls.IndexTemplates) + + runner.Run("export templates", runner.SubtestExportTemplates) + runner.Run("export index patterns", runner.SubtestExportIndexPatterns) + +} + +// tests setup with all default settings +func (runner *BeatRunner) TestWithAllDefaults() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + 
defer func() { + runner.CleanupTemplates(ctx) + }() + + // pre-delete in case something else missed cleanup + runner.CleanupTemplates(ctx) + + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "setup", + "--index-management"}) + runner.T().Logf("got response from management setup: %s", string(resp)) + require.NoError(runner.T(), err) + + streams, err := estools.GetDataStreamsForPattern(ctx, runner.requirementsInfo.ESClient, fmt.Sprintf("%s*", runner.testbeatName)) + require.NoError(runner.T(), err) + + require.NotEmpty(runner.T(), streams.DataStreams) + +} + +// test the setup process with mismatching template and DSL names +func (runner *BeatRunner) TestCustomBadNames() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + + defer func() { + runner.CleanupTemplates(ctx) + }() + + resp, err := runner.agentFixture.Exec(ctx, []string{"-e", "--path.home", + runner.agentFixture.WorkDir(), + "setup", + "--index-management", + "--E=setup.dsl.enabled=true", "--E=setup.dsl.data_stream_pattern='custom-bad-name'", "--E=setup.template.name='custom-name'", "--E=setup.template.pattern='custom-name'"}) + runner.T().Logf("got response from management setup: %s", string(resp)) + require.NoError(runner.T(), err) + + require.True(runner.T(), strings.Contains(string(resp), "Additional updates & overwrites to this config will not work.")) + +} + +func (runner *BeatRunner) TestOverwriteWithCustomName() { + //an updated policy that has a different value than the default of 7d + updatedPolicy := mapstr.M{ + "data_retention": "1d", + } + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + defer func() { + runner.CleanupTemplates(ctx) + }() + + lctemp := runner.T().TempDir() + raw, err := json.MarshalIndent(updatedPolicy, "", " ") + require.NoError(runner.T(), err) + + lifecyclePath := filepath.Join(lctemp, "dsl_policy.json") + + err = os.WriteFile(lifecyclePath, raw, 0o744) + require.NoError(runner.T(), err) + + runner.CleanupTemplates(ctx) + + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "setup", + "--index-management", + "--E=setup.dsl.enabled=true", "--E=setup.dsl.data_stream_pattern='custom-name'", "--E=setup.template.name='custom-name'", "--E=setup.template.pattern='custom-name'"}) + runner.T().Logf("got response from management setup: %s", string(resp)) + require.NoError(runner.T(), err) + + runner.CheckDSLPolicy(ctx, "*custom-name*", "7d") + + resp, err = runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "setup", + "--index-management", + "--E=setup.dsl.enabled=true", "--E=setup.dsl.overwrite=true", "--E=setup.dsl.data_stream_pattern='custom-name'", + "--E=setup.template.name='custom-name'", "--E=setup.template.pattern='custom-name'", fmt.Sprintf("--E=setup.dsl.policy_file=%s", lifecyclePath)}) + runner.T().Logf("got response from management setup: %s", string(resp)) + require.NoError(runner.T(), err) + + runner.CheckDSLPolicy(ctx, "*custom-name*", "1d") + +} + +// TestWithCustomLifecyclePolicy uploads a custom DSL policy +func (runner *BeatRunner) TestWithCustomLifecyclePolicy() { + //create a custom policy file + dslPolicy := mapstr.M{ + "data_retention": "1d", + } + + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + defer func() { + runner.CleanupTemplates(ctx) + }() + + lctemp := runner.T().TempDir() + raw, err := json.MarshalIndent(dslPolicy, 
"", " ") + require.NoError(runner.T(), err) + + lifecyclePath := filepath.Join(lctemp, "dsl_policy.json") + + err = os.WriteFile(lifecyclePath, raw, 0o744) + require.NoError(runner.T(), err) + + runner.CleanupTemplates(ctx) + + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "setup", + "--index-management", + "--E=setup.dsl.enabled=true", fmt.Sprintf("--E=setup.dsl.policy_file=%s", lifecyclePath)}) + runner.T().Logf("got response from management setup: %s", string(resp)) + require.NoError(runner.T(), err) + + runner.CheckDSLPolicy(ctx, fmt.Sprintf("%s*", runner.testbeatName), "1d") + +} + +// tests beat setup --index-management with ILM explicitly set +// On serverless, this should fail. +func (runner *BeatRunner) TestIndexManagementILMEnabledFailure() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + info, err := estools.GetPing(ctx, runner.requirementsInfo.ESClient) + require.NoError(runner.T(), err) + + if info.Version.BuildFlavor != "serverless" { + runner.T().Skip("must run on serverless") + } + + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "setup", + "--index-management", + "--E=setup.ilm.enabled=true", "--E=setup.ilm.overwrite=true"}) + runner.T().Logf("got response from management setup: %s", string(resp)) + require.Error(runner.T(), err) + assert.Contains(runner.T(), string(resp), "error creating") +} + +// tests setup with both ILM and DSL enabled, should fail +func (runner *BeatRunner) TestBothLifecyclesEnabled() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "setup", + "--index-management", + "--E=setup.ilm.enabled=true", "--E=setup.dsl.enabled=true"}) + runner.T().Logf("got response from management setup: %s", string(resp)) + require.Error(runner.T(), err) +} + +// disable all lifecycle management, ensure it's actually disabled +func (runner *BeatRunner) TestAllLifecyclesDisabled() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + defer func() { + runner.CleanupTemplates(ctx) + }() + + runner.CleanupTemplates(ctx) + + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "setup", + "--index-management", + "--E=setup.ilm.enabled=false", "--E=setup.dsl.enabled=false"}) + runner.T().Logf("got response from management setup: %s", string(resp)) + require.NoError(runner.T(), err) + + // make sure we have data streams, but there's no lifecycles + streams, err := estools.GetDataStreamsForPattern(ctx, runner.requirementsInfo.ESClient, fmt.Sprintf("*%s*", runner.testbeatName)) + require.NoError(runner.T(), err) + + require.NotEmpty(runner.T(), streams.DataStreams, "found no datastreams") + foundPolicy := false + for _, stream := range streams.DataStreams { + if stream.Lifecycle.DataRetention != "" { + foundPolicy = true + break + } + } + require.False(runner.T(), foundPolicy, "Found a lifecycle policy despite disabling lifecycles. 
Found: %#v", streams) +} + +// the export command doesn't actually make a network connection, +// so this won't fail +func (runner *BeatRunner) TestExport() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + info, err := estools.GetPing(ctx, runner.requirementsInfo.ESClient) + require.NoError(runner.T(), err) + + if info.Version.BuildFlavor != "serverless" { + runner.T().Skip("must run on serverless") + } + + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "export", "ilm-policy", "--E=setup.ilm.enabled=true"}) + runner.T().Logf("got response from export: %s", string(resp)) + assert.NoError(runner.T(), err) + // check to see if we got a valid output + policy := map[string]interface{}{} + err = json.Unmarshal(resp, &policy) + require.NoError(runner.T(), err) + + require.NotEmpty(runner.T(), policy["policy"]) +} + +// tests beat export with DSL +func (runner *BeatRunner) TestExportDSL() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + resp, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "export", "ilm-policy", "--E=setup.dsl.enabled=true"}) + runner.T().Logf("got response from export: %s", string(resp)) + assert.NoError(runner.T(), err) + // check to see if we got a valid output + policy := map[string]interface{}{} + err = json.Unmarshal(resp, &policy) + require.NoError(runner.T(), err) + + require.NotEmpty(runner.T(), policy["data_retention"]) +} + +func (runner *BeatRunner) SubtestExportTemplates() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute*2) + defer cancel() + outDir := runner.T().TempDir() + + _, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "export", + "template", "--dir", outDir}) + assert.NoError(runner.T(), err) + + inFolder, err := os.ReadDir(filepath.Join(outDir, "/template")) + require.NoError(runner.T(), err) + runner.T().Logf("got log contents: %#v", inFolder) + require.NotEmpty(runner.T(), inFolder) +} + +func (runner *BeatRunner) SubtestExportIndexPatterns() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute*2) + defer cancel() + + rawPattern, err := runner.agentFixture.Exec(ctx, []string{"--path.home", + runner.agentFixture.WorkDir(), + "export", + "index-pattern"}) + assert.NoError(runner.T(), err) + + idxPattern := map[string]interface{}{} + + err = json.Unmarshal(rawPattern, &idxPattern) + require.NoError(runner.T(), err) + require.NotNil(runner.T(), idxPattern["attributes"]) +} + +// CheckDSLPolicy checks if we have a match for the given DSL policy given a template name and policy data_retention +func (runner *BeatRunner) CheckDSLPolicy(ctx context.Context, tmpl string, policy string) { + streams, err := estools.GetDataStreamsForPattern(ctx, runner.requirementsInfo.ESClient, tmpl) + require.NoError(runner.T(), err) + + foundCustom := false + for _, stream := range streams.DataStreams { + if stream.Lifecycle.DataRetention == policy { + foundCustom = true + break + } + } + + require.True(runner.T(), foundCustom, "did not find our lifecycle policy. 
Found: %#v", streams)
+}
+
+// CleanupTemplates removes any existing index templates and data streams
+func (runner *BeatRunner) CleanupTemplates(ctx context.Context) {
+	if !runner.skipCleanup {
+		_ = estools.DeleteIndexTemplatesDataStreams(ctx, runner.requirementsInfo.ESClient, fmt.Sprintf("%s*", runner.testbeatName))
+		_ = estools.DeleteIndexTemplatesDataStreams(ctx, runner.requirementsInfo.ESClient, "*custom-name*")
+	}
+}

From fa5de97768314d906ff7db74b450c3c78f405689 Mon Sep 17 00:00:00 2001
From: Shaunak Kashyap
Date: Thu, 23 Nov 2023 03:48:22 +0530
Subject: [PATCH 15/21] [Upgrade Details] Use `details.DownloadRate` type from
 elastic-agent-libs (#3793)

* Use type from elastic-agent-libs

* Updating SHA on elastic-agent-libs replace dependency

* Fix test

* Bumping up dependency on elastic-agent-libs

* Remove unused type
---
 NOTICE.txt                                        |  4 +-
 go.mod                                            |  2 +-
 go.sum                                            |  4 +-
 .../application/upgrade/details/details.go        | 47 ++-----------------
 .../upgrade/details/details_test.go               |  2 +-
 5 files changed, 9 insertions(+), 50 deletions(-)

diff --git a/NOTICE.txt b/NOTICE.txt
index 4bbca01b922..4e9b7cc65ae 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -1166,11 +1166,11 @@ SOFTWARE
 
 --------------------------------------------------------------------------------
 Dependency : github.com/elastic/elastic-agent-libs
-Version: v0.7.0
+Version: v0.7.2
 Licence type (autodetected): Apache-2.0
 --------------------------------------------------------------------------------
 
-Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-libs@v0.7.0/LICENSE:
+Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-libs@v0.7.2/LICENSE:
 
                                  Apache License
                            Version 2.0, January 2004
diff --git a/go.mod b/go.mod
index a0b30a7f3f1..2c550799de2 100644
--- a/go.mod
+++ b/go.mod
@@ -14,7 +14,7 @@ require (
 	github.com/elastic/e2e-testing v1.1.0
 	github.com/elastic/elastic-agent-autodiscover v0.6.5
 	github.com/elastic/elastic-agent-client/v7 v7.5.0
-	github.com/elastic/elastic-agent-libs v0.7.0
+	github.com/elastic/elastic-agent-libs v0.7.2
 	github.com/elastic/elastic-agent-system-metrics v0.8.0
 	github.com/elastic/elastic-transport-go/v8 v8.3.0
 	github.com/elastic/go-elasticsearch/v8 v8.10.1
diff --git a/go.sum b/go.sum
index ffcbb8e262a..f428a834747 100644
--- a/go.sum
+++ b/go.sum
@@ -781,8 +781,8 @@ github.com/elastic/elastic-agent-autodiscover v0.6.5 h1:5DeMpuNc8c/tN6HN0A4A2uOF
 github.com/elastic/elastic-agent-autodiscover v0.6.5/go.mod h1:chulyCAyZb/njMHgzkhC/yWnt8v/Y6eCRUhmFVnsA5o=
 github.com/elastic/elastic-agent-client/v7 v7.5.0 h1:niI3WQ+01Lnp2r5LxK8SyNhrPJe13vBiOkqrDRK2oTA=
 github.com/elastic/elastic-agent-client/v7 v7.5.0/go.mod h1:DYoX95xjC4BW/p2avyu724Qr2+hoUIz9eCU9CVS1d+0=
-github.com/elastic/elastic-agent-libs v0.7.0 h1:g/+Gzpn4ayXPFbfZsn5lGjbPR1TGqlVpshJVVUNJGlQ=
-github.com/elastic/elastic-agent-libs v0.7.0/go.mod h1:o+EySawBZGeYu49shJxerg2wRCimS1dhrD4As0MS700=
+github.com/elastic/elastic-agent-libs v0.7.2 h1:yT0hF0UAxJCdQqhHh6SFpgYrcpB10oFzPj8IaytPS2o=
+github.com/elastic/elastic-agent-libs v0.7.2/go.mod h1:pVBEElQJUO9mr4WStWNXuQGsJn54lcjAoYAHmsvBLBc=
 github.com/elastic/elastic-agent-system-metrics v0.8.0 h1:EsWbtd83JvnaqnL57bKS1E6GhOdemTRbxdFDcenR8zQ=
 github.com/elastic/elastic-agent-system-metrics v0.8.0/go.mod h1:9C1UEfj0P687HAzZepHszN6zXA+2tN2Lx3Osvq1zby8=
 github.com/elastic/elastic-integration-corpus-generator-tool v0.5.0/go.mod h1:uf9N86y+UACGybdEhZLpwZ93XHWVhsYZAA4c2T2v6YM=
diff --git a/internal/pkg/agent/application/upgrade/details/details.go 
b/internal/pkg/agent/application/upgrade/details/details.go index dae27923ecf..3522b181e59 100644 --- a/internal/pkg/agent/application/upgrade/details/details.go +++ b/internal/pkg/agent/application/upgrade/details/details.go @@ -5,20 +5,12 @@ package details import ( - "encoding/json" - "fmt" - "math" - "strings" "sync" "time" - "github.com/docker/go-units" + "github.com/elastic/elastic-agent-libs/upgrade/details" ) -// downloadRate is a float64 that can be safely marshalled to JSON -// when the value is Infinity. The rate is always in bytes/second units. -type downloadRate float64 - // Observer is a function that will be called with upgrade details type Observer func(details *Details) @@ -43,7 +35,7 @@ type Metadata struct { // DownloadRate is the rate, in bytes per second, at which the download // is progressing. - DownloadRate downloadRate `json:"download_rate,omitempty" yaml:"download_rate,omitempty"` + DownloadRate details.DownloadRate `json:"download_rate,omitempty" yaml:"download_rate,omitempty"` // FailedState is the state an upgrade was in if/when it failed. Use the // Fail() method of UpgradeDetails to correctly record details when @@ -93,7 +85,7 @@ func (d *Details) SetDownloadProgress(percent, rateBytesPerSecond float64) { defer d.mu.Unlock() d.Metadata.DownloadPercent = percent - d.Metadata.DownloadRate = downloadRate(rateBytesPerSecond) + d.Metadata.DownloadRate = details.DownloadRate(rateBytesPerSecond) d.notifyObservers() } @@ -184,36 +176,3 @@ func equalTimePointers(t, otherT *time.Time) bool { return t.Equal(*otherT) } - -func (dr *downloadRate) MarshalJSON() ([]byte, error) { - downloadRateBytesPerSecond := float64(*dr) - if math.IsInf(downloadRateBytesPerSecond, 0) { - return json.Marshal("+Inf bps") - } - - return json.Marshal( - fmt.Sprintf("%sps", units.HumanSizeWithPrecision(downloadRateBytesPerSecond, 2)), - ) -} - -func (dr *downloadRate) UnmarshalJSON(data []byte) error { - var downloadRateStr string - err := json.Unmarshal(data, &downloadRateStr) - if err != nil { - return err - } - - if downloadRateStr == "+Inf bps" { - *dr = downloadRate(math.Inf(1)) - return nil - } - - downloadRateStr = strings.TrimSuffix(downloadRateStr, "ps") - downloadRateBytesPerSecond, err := units.FromHumanSize(downloadRateStr) - if err != nil { - return err - } - - *dr = downloadRate(downloadRateBytesPerSecond) - return nil -} diff --git a/internal/pkg/agent/application/upgrade/details/details_test.go b/internal/pkg/agent/application/upgrade/details/details_test.go index 2ecd933e9f8..d1ade774e7b 100644 --- a/internal/pkg/agent/application/upgrade/details/details_test.go +++ b/internal/pkg/agent/application/upgrade/details/details_test.go @@ -80,7 +80,7 @@ func TestDetailsDownloadRateJSON(t *testing.T) { var unmarshalledDetails Details err = json.Unmarshal(data, &unmarshalledDetails) require.NoError(t, err) - require.Equal(t, float64(1800), float64(unmarshalledDetails.Metadata.DownloadRate)) + require.Equal(t, float64(1794), float64(unmarshalledDetails.Metadata.DownloadRate)) require.Equal(t, .8, unmarshalledDetails.Metadata.DownloadPercent) }) From 37f01b24273ad8f18bce46b8c7df8b35913be860 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 22 Nov 2023 20:10:23 -0500 Subject: [PATCH 16/21] Log some useful details during startup (#3781) * add some helpful log messages * Infof not Info --- internal/pkg/agent/application/application.go | 1 + pkg/component/runtime/manager.go | 1 + 2 files changed, 2 insertions(+) diff --git a/internal/pkg/agent/application/application.go 
b/internal/pkg/agent/application/application.go
index 1f8853120e3..c277382ae93 100644
--- a/internal/pkg/agent/application/application.go
+++ b/internal/pkg/agent/application/application.go
@@ -86,6 +86,7 @@ func New(
 		// monitoring is always disabled in testing mode
 		disableMonitoring = true
 	} else {
+		log.Infof("Loading baseline config from %v", pathConfigFile)
 		rawConfig, err = config.LoadFile(pathConfigFile)
 		if err != nil {
 			return nil, nil, nil, fmt.Errorf("failed to load configuration: %w", err)
diff --git a/pkg/component/runtime/manager.go b/pkg/component/runtime/manager.go
index 850349406d6..81fde1090b0 100644
--- a/pkg/component/runtime/manager.go
+++ b/pkg/component/runtime/manager.go
@@ -203,6 +203,7 @@ func (m *Manager) Run(ctx context.Context) error {
 	})
 
 	var server *grpc.Server
+	m.logger.Infof("Starting grpc control protocol listener on port %v with max_message_size %v", m.grpcConfig.Port, m.grpcConfig.MaxMsgSize)
 	if m.tracer != nil {
 		apmInterceptor := apmgrpc.NewUnaryServerInterceptor(apmgrpc.WithRecovery(), apmgrpc.WithTracer(m.tracer))
 		server = grpc.NewServer(

From a6e0c1aef848cd29632e3d205425cc312d1a634b Mon Sep 17 00:00:00 2001
From: Victor Martinez
Date: Thu, 23 Nov 2023 10:59:36 +0100
Subject: [PATCH 17/21] ironbank: use ubi:9.3 (#3787)

---
 dev-tools/packaging/packages.yml                              | 2 +-
 dev-tools/packaging/templates/ironbank/Dockerfile.tmpl        | 2 +-
 .../packaging/templates/ironbank/hardening_manifest.yaml.tmpl | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/dev-tools/packaging/packages.yml b/dev-tools/packaging/packages.yml
index ec7f7e659c0..597c610271c 100644
--- a/dev-tools/packaging/packages.yml
+++ b/dev-tools/packaging/packages.yml
@@ -427,7 +427,7 @@ shared:
   - &docker_arm_ubi_spec
     extra_vars:
       image_name: '{{.BeatName}}-ubi9'
-      from: 'registry.access.redhat.com/ubi9/ubi-minimal:9.2'
+      from: 'registry.access.redhat.com/ubi9/ubi-minimal:9.3'
 
   - &elastic_docker_spec
     extra_vars:
diff --git a/dev-tools/packaging/templates/ironbank/Dockerfile.tmpl b/dev-tools/packaging/templates/ironbank/Dockerfile.tmpl
index 8d4c4aa9990..58d05bab9e0 100644
--- a/dev-tools/packaging/templates/ironbank/Dockerfile.tmpl
+++ b/dev-tools/packaging/templates/ironbank/Dockerfile.tmpl
@@ -4,7 +4,7 @@
 ################################################################################
 ARG BASE_REGISTRY=registry1.dsop.io
 ARG BASE_IMAGE=redhat/ubi/ubi9
-ARG BASE_TAG=9.2
+ARG BASE_TAG=9.3
 
 FROM ${BASE_REGISTRY}/${BASE_IMAGE}:${BASE_TAG} as prep_files
 
diff --git a/dev-tools/packaging/templates/ironbank/hardening_manifest.yaml.tmpl b/dev-tools/packaging/templates/ironbank/hardening_manifest.yaml.tmpl
index 00be4eb62ae..94f4dba3408 100644
--- a/dev-tools/packaging/templates/ironbank/hardening_manifest.yaml.tmpl
+++ b/dev-tools/packaging/templates/ironbank/hardening_manifest.yaml.tmpl
@@ -14,7 +14,7 @@ tags:
 # Build args passed to Dockerfile ARGs
 args:
   BASE_IMAGE: "redhat/ubi/ubi9"
-  BASE_TAG: "9.2"
+  BASE_TAG: "9.3"
   ELASTIC_STACK: "{{ beat_version }}"
   ELASTIC_PRODUCT: "elastic-agent"
 
From fb869d62f9fcf858ad14639f161d9daa7413d14a Mon Sep 17 00:00:00 2001
From: Tiago Queiroz
Date: Thu, 23 Nov 2023 14:58:31 +0100
Subject: [PATCH 18/21] Limit batchID length. (#3786)

If the BatchID is longer than 63 characters, OGC will fail to
instantiate a GCP VM. This commit fixes that by using an MD5 hash of
the whole ID and ensuring the final ID is at most 63 characters long.
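
For illustration, here is a minimal standalone sketch of the scheme (the
helper name and the sample ID below are hypothetical; the real logic is the
createBatchID change in this patch): keep a prefix of the original ID and
append the 32-character hex MD5 digest, so any over-long ID collapses to a
stable, still-unique 63-character ID.

```go
package main

import (
	"crypto/md5"
	"fmt"
	"strings"
)

// shortenID mirrors the truncation scheme: IDs at or under the limit pass
// through unchanged; longer IDs keep a 30-character prefix, a "-", and the
// 32-character hex MD5 of the full ID (30 + 1 + 32 = 63).
func shortenID(id string) string {
	const maxIDLen = 63
	if len(id) > maxIDLen {
		hash := fmt.Sprintf("%x", md5.Sum([]byte(id))) // 32 hex characters
		id = id[:maxIDLen-len(hash)-1] + "-" + hash
	}
	return strings.ToLower(id)
}

func main() {
	long := strings.Repeat("ubuntu-2204-sudo-integration-", 3) // 87 characters
	short := shortenID(long)
	fmt.Println(len(short), short) // always prints 63 and a deterministic ID
}
```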
---
 pkg/testing/runner/runner.go | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/pkg/testing/runner/runner.go b/pkg/testing/runner/runner.go
index 860ad57215e..aab9b695a2b 100644
--- a/pkg/testing/runner/runner.go
+++ b/pkg/testing/runner/runner.go
@@ -7,6 +7,7 @@ package runner
 import (
 	"bytes"
 	"context"
+	"crypto/md5"
 	"errors"
 	"fmt"
 	"io"
@@ -15,6 +16,7 @@ import (
 	"strings"
 	"sync"
 	"time"
+	"unicode/utf8"
 
 	"gopkg.in/yaml.v2"
 
@@ -1019,6 +1021,16 @@ func createBatchID(batch OSBatch) string {
 			id += "-" + batch.Batch.SudoTests[0].Tests[0].Name
 		}
 	}
+
+	// The batchID needs to be at most 63 characters long otherwise
+	// OGC will fail to instantiate the VM.
+	maxIDLen := 63
+	if len(id) > maxIDLen {
+		hash := fmt.Sprintf("%x", md5.Sum([]byte(id)))
+		hashLen := utf8.RuneCountInString(hash)
+		id = id[:maxIDLen-hashLen-1] + "-" + hash
+	}
+
 	return strings.ToLower(id)
 }

From a14c51f043070c1fbdbd9c5c83307c5f34216b97 Mon Sep 17 00:00:00 2001
From: Pavel Zorin
Date: Thu, 23 Nov 2023 15:05:03 +0100
Subject: [PATCH 19/21] [CI] Strict unit test artifact paths. Removes
 TEST-go-unit-cov.xml from artifacts (#3780)

---
 .buildkite/pipeline.yml | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index e7484aa3fe6..ddd92c8d3d5 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -20,7 +20,8 @@ steps:
     key: "unit-tests-2204"
     command: ".buildkite/scripts/steps/unit-tests.sh"
     artifact_paths:
-      - "build/TEST-**"
+      - "build/TEST-go-unit.html"
+      - "build/TEST-go-unit.xml"
       - "build/diagnostics/*"
       - "coverage.out"
     agents:
@@ -34,7 +35,8 @@ steps:
     key: "unit-tests-2204-arm64"
    command: ".buildkite/scripts/steps/unit-tests.sh"
     artifact_paths:
-      - "build/TEST-**"
+      - "build/TEST-go-unit.html"
+      - "build/TEST-go-unit.xml"
       - "build/diagnostics/*"
       - "coverage.out"
     agents:
@@ -50,7 +52,8 @@ steps:
     key: "unit-tests-win2022"
     command: ".\\.buildkite\\scripts\\steps\\unit-tests.ps1"
     artifact_paths:
-      - "build/TEST-**"
+      - "build/TEST-go-unit.html"
+      - "build/TEST-go-unit.xml"
       - "build/diagnostics/*"
       - "coverage.out"
     agents:
@@ -67,7 +70,8 @@ steps:
     key: "unit-tests-win2016"
     command: ".\\.buildkite\\scripts\\steps\\unit-tests.ps1"
     artifact_paths:
-      - "build/TEST-**"
+      - "build/TEST-go-unit.html"
+      - "build/TEST-go-unit.xml"
       - "build/diagnostics/*"
       - "coverage.out"
     agents:
@@ -86,7 +90,8 @@ steps:
     command: ".buildkite/scripts/steps/unit-tests.sh"
     branches: main
     artifact_paths:
-      - "build/TEST-**"
+      - "build/TEST-go-unit.html"
+      - "build/TEST-go-unit.xml"
       - "build/diagnostics/*"
       - "coverage.out"
     agents:
@@ -100,7 +105,8 @@ steps:
     key: "unit-tests-macos-13"
     command: ".buildkite/scripts/steps/unit-tests.sh"
     artifact_paths:
-      - "build/TEST-**"
+      - "build/TEST-go-unit.html"
+      - "build/TEST-go-unit.xml"
       - "build/diagnostics/*"
       - "coverage.out"
     agents:
@@ -117,7 +123,8 @@ steps:
     key: "unit-tests-win10"
     command: ".\\.buildkite\\scripts\\steps\\unit-tests.ps1"
     artifact_paths:
-      - "build/TEST-**"
+      - "build/TEST-go-unit.html"
+      - "build/TEST-go-unit.xml"
       - "build/diagnostics/*"
       - "coverage.out"
     agents:
@@ -133,7 +140,8 @@ steps:
     key: "unit-tests-win11"
     command: ".\\.buildkite\\scripts\\steps\\unit-tests.ps1"
     artifact_paths:
-      - "build/TEST-**"
+      - "build/TEST-go-unit.html"
+      - "build/TEST-go-unit.xml"
       - "build/diagnostics/*"
       - "coverage.out"
     agents:
@@ -161,7 +169,7 @@ steps:
       unit-tests-win11
       "
     artifact_paths:
-      - "build/TEST-**"
+      - "build/TEST-go-unit.cov"
     agents:
       image: "golang:1.20.10"
    depends_on:
@@ -177,10 +185,6 @@ steps:
       K8S_VERSION: "v{{matrix.k8s_version}}"
       KIND_VERSION: "v0.20.0"
     command: ".buildkite/scripts/steps/k8s-tests.sh"
-    artifact_paths:
-      - "build/TEST-**"
-      - "build/diagnostics/*"
-      - "coverage.out"
     agents:
       provider: "gcp"
       image: "family/core-ubuntu-2204"

From 3e93536ce64a328ebf0f2cf92d6d59663c9c8393 Mon Sep 17 00:00:00 2001
From: Michael Wolf
Date: Thu, 23 Nov 2023 18:55:19 -0800
Subject: [PATCH 20/21] Change the default FLEET_INSECURE values in Kubernetes
 templates to false. (#3805)

The agent documentation states that the default value for FLEET_INSECURE
is false and that setting it to true is not recommended. This commit
therefore changes the Kubernetes templates to use the recommended value.

---
 .../base/elastic-agent-managed-daemonset.yaml            | 4 ++--
 .../base/elastic-agent-standalone-daemonset.yaml         | 2 +-
 .../base/elastic-agent-managed-daemonset.yaml            | 4 ++--
 .../extra/elastic-agent-managed-statefulset.yaml         | 4 ++--
 deploy/kubernetes/elastic-agent-managed-kubernetes.yaml  | 4 ++--
 .../elastic-agent-managed-daemonset.yaml                 | 4 ++--
 6 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-managed/base/elastic-agent-managed-daemonset.yaml b/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-managed/base/elastic-agent-managed-daemonset.yaml
index c529b15460b..c40326b22b6 100644
--- a/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-managed/base/elastic-agent-managed-daemonset.yaml
+++ b/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-managed/base/elastic-agent-managed-daemonset.yaml
@@ -37,7 +37,7 @@ spec:
           value: "1"
         # Set to true to communicate with Fleet with either insecure HTTP or unverified HTTPS
         - name: FLEET_INSECURE
-          value: "true"
+          value: "false"
         # Fleet Server URL to enroll the Elastic Agent into
         # FLEET_URL can be found in Kibana, go to Management > Fleet > Settings
         - name: FLEET_URL
@@ -62,7 +62,7 @@ spec:
           valueFrom:
             fieldRef:
              fieldPath: metadata.name
-        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac. 
+        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac.
         # For more info: https://www.elastic.co/guide/en/beats/metricbeat/current/add-host-metadata.html
         - name: ELASTIC_NETINFO
           value: "false"
diff --git a/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-standalone/base/elastic-agent-standalone-daemonset.yaml b/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-standalone/base/elastic-agent-standalone-daemonset.yaml
index 9cdcf8670d6..35dd5612674 100644
--- a/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-standalone/base/elastic-agent-standalone-daemonset.yaml
+++ b/deploy/kubernetes/elastic-agent-kustomize/default/elastic-agent-standalone/base/elastic-agent-standalone-daemonset.yaml
@@ -63,7 +63,7 @@ spec:
              fieldPath: metadata.name
          - name: STATE_PATH
            value: "/etc/elastic-agent"
-        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac. 
+        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac.
         # For more info: https://www.elastic.co/guide/en/beats/metricbeat/current/add-host-metadata.html
         - name: ELASTIC_NETINFO
           value: "false"
diff --git a/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-managed/base/elastic-agent-managed-daemonset.yaml b/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-managed/base/elastic-agent-managed-daemonset.yaml
index 6f5344825c0..d3b384e7a56 100644
--- a/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-managed/base/elastic-agent-managed-daemonset.yaml
+++ b/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-managed/base/elastic-agent-managed-daemonset.yaml
@@ -37,7 +37,7 @@ spec:
           value: "1"
         # Set to true to communicate with Fleet with either insecure HTTP or unverified HTTPS
         - name: FLEET_INSECURE
-          value: "true"
+          value: "false"
         # Fleet Server URL to enroll the Elastic Agent into
         # FLEET_URL can be found in Kibana, go to Management > Fleet > Settings
         - name: FLEET_URL
@@ -62,7 +62,7 @@ spec:
           valueFrom:
             fieldRef:
              fieldPath: metadata.name
-        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac. 
+        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac.
         # For more info: https://www.elastic.co/guide/en/beats/metricbeat/current/add-host-metadata.html
         - name: ELASTIC_NETINFO
           value: "false"
diff --git a/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-managed/extra/elastic-agent-managed-statefulset.yaml b/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-managed/extra/elastic-agent-managed-statefulset.yaml
index 3a934d02685..bc493635411 100644
--- a/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-managed/extra/elastic-agent-managed-statefulset.yaml
+++ b/deploy/kubernetes/elastic-agent-kustomize/ksm-autosharding/elastic-agent-managed/extra/elastic-agent-managed-statefulset.yaml
@@ -37,7 +37,7 @@ spec:
           value: "1"
         # Set to true to communicate with Fleet with either insecure HTTP or unverified HTTPS
         - name: FLEET_INSECURE
-          value: "true"
+          value: "false"
         # Fleet Server URL to enroll the Elastic Agent into
         # FLEET_URL can be found in Kibana, go to Management > Fleet > Settings
         - name: FLEET_URL
@@ -62,7 +62,7 @@ spec:
           valueFrom:
             fieldRef:
              fieldPath: metadata.name
-        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac. 
+        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac.
         # For more info: https://www.elastic.co/guide/en/beats/metricbeat/current/add-host-metadata.html
         - name: ELASTIC_NETINFO
           value: "false"
diff --git a/deploy/kubernetes/elastic-agent-managed-kubernetes.yaml b/deploy/kubernetes/elastic-agent-managed-kubernetes.yaml
index 44df212a629..0b9526f287d 100644
--- a/deploy/kubernetes/elastic-agent-managed-kubernetes.yaml
+++ b/deploy/kubernetes/elastic-agent-managed-kubernetes.yaml
@@ -37,7 +37,7 @@ spec:
           value: "1"
         # Set to true to communicate with Fleet with either insecure HTTP or unverified HTTPS
         - name: FLEET_INSECURE
-          value: "true"
+          value: "false"
         # Fleet Server URL to enroll the Elastic Agent into
         # FLEET_URL can be found in Kibana, go to Management > Fleet > Settings
         - name: FLEET_URL
@@ -62,7 +62,7 @@ spec:
           valueFrom:
             fieldRef:
              fieldPath: metadata.name
-        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac. 
+        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac.
         # For more info: https://www.elastic.co/guide/en/beats/metricbeat/current/add-host-metadata.html
         - name: ELASTIC_NETINFO
           value: "false"
diff --git a/deploy/kubernetes/elastic-agent-managed/elastic-agent-managed-daemonset.yaml b/deploy/kubernetes/elastic-agent-managed/elastic-agent-managed-daemonset.yaml
index 36d5afef3be..7a028b524c1 100644
--- a/deploy/kubernetes/elastic-agent-managed/elastic-agent-managed-daemonset.yaml
+++ b/deploy/kubernetes/elastic-agent-managed/elastic-agent-managed-daemonset.yaml
@@ -37,7 +37,7 @@ spec:
           value: "1"
         # Set to true to communicate with Fleet with either insecure HTTP or unverified HTTPS
         - name: FLEET_INSECURE
-          value: "true"
+          value: "false"
         # Fleet Server URL to enroll the Elastic Agent into
         # FLEET_URL can be found in Kibana, go to Management > Fleet > Settings
         - name: FLEET_URL
@@ -62,7 +62,7 @@ spec:
           valueFrom:
             fieldRef:
              fieldPath: metadata.name
-        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac. 
+        # The following ELASTIC_NETINFO:false variable will disable the netinfo.enabled option of add-host-metadata processor. This will remove fields host.ip and host.mac.
         # For more info: https://www.elastic.co/guide/en/beats/metricbeat/current/add-host-metadata.html
         - name: ELASTIC_NETINFO
           value: "false"

From 97e82175953af75fbe29fccc3cfcc5688a96dc8f Mon Sep 17 00:00:00 2001
From: Blake Rouse
Date: Fri, 24 Nov 2023 09:52:13 -0500
Subject: [PATCH 21/21] [Non-Root] Hide --unprivileged installation flag until
 supported (#3799)

* Hide --unprivileged.

* Fix lint.

---------

Co-authored-by: Pierre HILBERT
---
 internal/pkg/agent/cmd/install.go | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/internal/pkg/agent/cmd/install.go b/internal/pkg/agent/cmd/install.go
index 2b96dae2d7a..a1415d0e59b 100644
--- a/internal/pkg/agent/cmd/install.go
+++ b/internal/pkg/agent/cmd/install.go
@@ -47,7 +47,8 @@ would like the Agent to operate.
 	cmd.Flags().BoolP("force", "f", false, "Force overwrite the current installation and do not prompt for confirmation")
 	cmd.Flags().BoolP("non-interactive", "n", false, "Install Elastic Agent in non-interactive mode which will not prompt on missing parameters but fails instead.")
 	cmd.Flags().String(flagInstallBasePath, paths.DefaultBasePath, "The path where the Elastic Agent will be installed. It must be an absolute path.")
-	cmd.Flags().Bool(flagInstallUnprivileged, false, "Installed Elastic Agent will create an 'elastic-agent' user and run as that user.")
+	cmd.Flags().Bool(flagInstallUnprivileged, false, "Installed Elastic Agent will create an 'elastic-agent' user and run as that user. (experimental)")
+	_ = cmd.Flags().MarkHidden(flagInstallUnprivileged) // Hidden until fully supported
 
 	addEnrollFlags(cmd)
 
 	return cmd
@@ -77,6 +78,9 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error {
 	if unprivileged && runtime.GOOS != "linux" {
 		return fmt.Errorf("unable to perform install command, unprivileged is currently only supported on Linux")
 	}
+	if unprivileged {
+		fmt.Fprintln(streams.Out, "Unprivileged installation mode enabled; this is an experimental and currently unsupported feature.")
+	}
 
 	topPath := paths.InstallPath(basePath)
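A note on the `MarkHidden` pattern used above: hiding a flag only removes it
from `--help` output; it remains fully parseable, so early adopters can still
opt in. A minimal sketch of the pattern, assuming the spf13/cobra and pflag
APIs (the command and flag names mirror the patch, but the program itself is
illustrative):

```go
package main

import (
	"fmt"
	"os"

	"github.com/spf13/cobra"
)

func main() {
	cmd := &cobra.Command{
		Use: "install",
		RunE: func(cmd *cobra.Command, _ []string) error {
			unprivileged, err := cmd.Flags().GetBool("unprivileged")
			if err != nil {
				return err
			}
			if unprivileged {
				// Still honored even though it no longer appears in --help.
				fmt.Println("unprivileged mode enabled (experimental)")
			}
			return nil
		},
	}
	cmd.Flags().Bool("unprivileged", false, "Run as a dedicated unprivileged user. (experimental)")
	_ = cmd.Flags().MarkHidden("unprivileged") // hidden from help, still parseable
	if err := cmd.Execute(); err != nil {
		os.Exit(1)
	}
}
```

Because the flag stays registered, any automation already passing
`--unprivileged` keeps working while the help text stops advertising it.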