Skip to content

Commit

Permalink
Increase Addon TTL to 1 hour and reset status whenever the checksum changes (#86)
Browse files Browse the repository at this point in the history
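* Make `starttime` a required field of the Addon status
* Add a per-addon sync.WaitGroup so status updates for the same addon wait on each other
* Enhance logging
* Reset the lifecycle status and reason whenever the checksum changes
* Change the TTL from 180000 ms to 1 hour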

Signed-off-by: Kevin D <[email protected]>
  • Loading branch information
kevdowney authored Jun 9, 2021
1 parent 288676b commit 70e350e
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 17 deletions.
2 changes: 1 addition & 1 deletion api/v1alpha1/addon_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ type AddonStatus struct {
Lifecycle AddonStatusLifecycle `json:"lifecycle"`
Resources []ObjectStatus `json:"resources"`
Reason string `json:"reason"`
StartTime int64 `json:"starttime,omitempty"`
StartTime int64 `json:"starttime"`
}

// +kubebuilder:object:root=true
Expand Down
1 change: 1 addition & 0 deletions config/crd/bases/addonmgr.keikoproj.io_addons.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,7 @@ spec:
- lifecycle
- reason
- resources
- starttime
type: object
type: object
served: true
Expand Down
38 changes: 22 additions & 16 deletions controllers/addon_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"context"
"fmt"
"strings"
"sync"
"time"

"github.com/go-logr/logr"
Expand Down Expand Up @@ -52,7 +53,7 @@ import (
)

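// TTL is how long an addon may remain in the Pending state, measured from its status start time, before the installation is treated as expired.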
const TTL int64 = 180000
const TTL = time.Duration(1) * time.Hour // 1 hour

// Watched resources
var (
Expand All @@ -77,6 +78,7 @@ type AddonReconciler struct {
dynClient dynamic.Interface
generatedClient *kubernetes.Clientset
recorder record.EventRecorder
statusWGMap map[string]*sync.WaitGroup
}

// NewAddonReconciler returns an instance of AddonReconciler
Expand All @@ -89,6 +91,7 @@ func NewAddonReconciler(mgr manager.Manager, log logr.Logger) *AddonReconciler {
dynClient: dynamic.NewForConfigOrDie(mgr.GetConfig()),
generatedClient: kubernetes.NewForConfigOrDie(mgr.GetConfig()),
recorder: mgr.GetEventRecorderFor("addons"),
statusWGMap: map[string]*sync.WaitGroup{},
}
}

Expand Down Expand Up @@ -220,13 +223,15 @@ func (r *AddonReconciler) processAddon(ctx context.Context, req reconcile.Reques
// Resources list
instance.Status.Resources = make([]addonmgrv1alpha1.ObjectStatus, 0)

// Set ttl starttime if checksum has changed
if changedStatus {
// Set ttl starttime if checksum has changed
instance.Status.StartTime = common.GetCurretTimestamp()
}

// Clear out the reason
instance.Status.Reason = ""
// Clear out status and reason
instance.Status.Lifecycle.Prereqs = ""
instance.Status.Lifecycle.Installed = ""
instance.Status.Reason = ""
}

// Update status that we have started reconciling this addon.
if instance.Status.Lifecycle.Installed == "" {
Expand All @@ -236,8 +241,8 @@ func (r *AddonReconciler) processAddon(ctx context.Context, req reconcile.Reques
}

// Check if addon installation expired.
if instance.Status.Lifecycle.Installed == addonmgrv1alpha1.Pending && common.IsExpired(instance.Status.StartTime, TTL) {
reason := fmt.Sprintf("Addon %s/%s ttl expired", instance.Namespace, instance.Name)
if instance.Status.Lifecycle.Installed == addonmgrv1alpha1.Pending && common.IsExpired(instance.Status.StartTime, TTL.Milliseconds()) {
reason := fmt.Sprintf("Addon %s/%s ttl expired, starttime exceeded %s", instance.Namespace, instance.Name, TTL.String())
r.recorder.Event(instance, "Warning", "Failed", reason)
err := fmt.Errorf(reason)
log.Error(err, reason)
Expand All @@ -264,7 +269,6 @@ func (r *AddonReconciler) processAddon(ctx context.Context, req reconcile.Reques
reason := fmt.Sprintf("Addon %s/%s could not be finalized. %v", instance.Namespace, instance.Name, err)
r.recorder.Event(instance, "Warning", "Failed", reason)
instance.Status.Lifecycle.Installed = addonmgrv1alpha1.DeleteFailed
instance.Status.StartTime = 0
instance.Status.Reason = reason
log.Error(err, "Failed to finalize addon.")
return reconcile.Result{}, err
Expand All @@ -281,7 +285,6 @@ func (r *AddonReconciler) processAddon(ctx context.Context, req reconcile.Reques
// Record an event if addon is not valid
r.recorder.Event(instance, "Normal", "Pending", reason)
instance.Status.Lifecycle.Installed = addonmgrv1alpha1.Pending
instance.Status.StartTime = 0
instance.Status.Reason = reason

log.Info("Addon %s/%s is waiting on dependencies to be out of Pending state.", instance.Namespace, instance.Name)
Expand All @@ -297,7 +300,6 @@ func (r *AddonReconciler) processAddon(ctx context.Context, req reconcile.Reques
// Record an event if addon is not valid
r.recorder.Event(instance, "Warning", "Failed", reason)
instance.Status.Lifecycle.Installed = addonmgrv1alpha1.Failed
instance.Status.StartTime = 0
instance.Status.Reason = reason

log.Error(err, "Failed to validate addon.")
Expand All @@ -314,7 +316,6 @@ func (r *AddonReconciler) processAddon(ctx context.Context, req reconcile.Reques
r.recorder.Event(instance, "Warning", "Failed", reason)
log.Error(err, "Failed to add finalizer for addon.")
instance.Status.Lifecycle.Installed = addonmgrv1alpha1.Failed
instance.Status.StartTime = 0
instance.Status.Reason = reason
return reconcile.Result{}, err
}
Expand All @@ -340,7 +341,6 @@ func (r *AddonReconciler) processAddon(ctx context.Context, req reconcile.Reques
r.recorder.Event(instance, "Warning", "Failed", reason)
log.Error(err, "Addon failed to find deployed resources.")
instance.Status.Lifecycle.Installed = addonmgrv1alpha1.Failed
instance.Status.StartTime = 0
instance.Status.Reason = reason

return reconcile.Result{}, err
Expand Down Expand Up @@ -407,6 +407,16 @@ func (r *AddonReconciler) validateSecrets(ctx context.Context, addon *addonmgrv1
}

func (r *AddonReconciler) updateAddonStatus(ctx context.Context, log logr.Logger, addon *addonmgrv1alpha1.Addon) error {
addonName := types.NamespacedName{Name: addon.Name, Namespace: addon.Namespace}.String()
wg, ok := r.statusWGMap[addonName]
if !ok {
wg = &sync.WaitGroup{}
r.statusWGMap[addonName] = wg
}
// Wait to process addon updates until we have finished updating same addon
wg.Wait()
wg.Add(1)
defer wg.Done()
err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
return r.Status().Update(ctx, addon, &client.UpdateOptions{})
})
Expand Down Expand Up @@ -438,7 +448,6 @@ func (r *AddonReconciler) executePrereqAndInstall(ctx context.Context, log logr.
log.Error(err, "Addon prereqs workflow failed.")
// if prereqs failed, set install status to failed as well so that STATUS is updated
instance.Status.Lifecycle.Installed = addonmgrv1alpha1.Failed
instance.Status.StartTime = 0
instance.Status.Reason = reason

return err
Expand All @@ -452,7 +461,6 @@ func (r *AddonReconciler) executePrereqAndInstall(ctx context.Context, log logr.
r.recorder.Event(instance, "Warning", "Failed", reason)
// if prereqs failed, set install status to failed as well so that STATUS is updated
instance.Status.Lifecycle.Installed = addonmgrv1alpha1.Failed
instance.Status.StartTime = 0
instance.Status.Reason = reason

return fmt.Errorf(reason)
Expand All @@ -464,7 +472,6 @@ func (r *AddonReconciler) executePrereqAndInstall(ctx context.Context, log logr.
r.recorder.Event(instance, "Warning", "Failed", reason)
log.Error(err, "Addon could not validate secrets.")
instance.Status.Lifecycle.Installed = addonmgrv1alpha1.Failed
instance.Status.StartTime = 0
instance.Status.Reason = reason

return err
Expand All @@ -476,7 +483,6 @@ func (r *AddonReconciler) executePrereqAndInstall(ctx context.Context, log logr.
reason := fmt.Sprintf("Addon %s/%s could not be installed due to error. %v", instance.Namespace, instance.Name, err)
r.recorder.Event(instance, "Warning", "Failed", reason)
log.Error(err, "Addon install workflow failed.")
instance.Status.StartTime = 0
instance.Status.Reason = reason

return err
Expand Down

0 comments on commit 70e350e

Please sign in to comment.