Skip to content

Commit

Permalink
[Internal] Retry on 504 when calling the permission API (#4355)
Browse files Browse the repository at this point in the history
## Changes

This PR adds logic to retry failed get calls to the permissions API when
the error is a 504. This solution is meant to be temporary and will be
removed as soon as such retries are handled natively in the Databricks
Go SDK.

## Tests

Complete coverage of the added retrier.
  • Loading branch information
renaudhartert-db authored Dec 27, 2024
1 parent 151ed6d commit 1e5443d
Show file tree
Hide file tree
Showing 4 changed files with 144 additions and 5 deletions.
22 changes: 20 additions & 2 deletions common/retry.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@ package common

import (
"context"
"log"
"errors"
"regexp"

"github.com/databricks/databricks-sdk-go/apierr"
"github.com/databricks/databricks-sdk-go/logger"
"github.com/databricks/databricks-sdk-go/retries"
)

Expand All @@ -15,11 +17,27 @@ func RetryOnTimeout[T any](ctx context.Context, f func(context.Context) (*T, err
msg := err.Error()
isTimeout := timeoutRegex.MatchString(msg)
if isTimeout {
log.Printf("[DEBUG] Retrying due to timeout: %s", msg)
logger.Debugf(ctx, "Retrying due to timeout: %s", msg)
}
return isTimeout
}))
return r.Run(ctx, func(ctx context.Context) (*T, error) {
return f(ctx)
})
}

// RetryOn504 returns a [retries.Retrier] that calls the given method
// until it either succeeds or returns an error that is different from
// [apierr.ErrDeadlineExceeded].
func RetryOn504[T any](ctx context.Context, f func(context.Context) (*T, error)) (*T, error) {
r := retries.New[T](retries.WithTimeout(-1), retries.WithRetryFunc(func(err error) bool {
if !errors.Is(err, apierr.ErrDeadlineExceeded) {
return false
}
logger.Debugf(ctx, "Retrying on error 504")
return true
}))
return r.Run(ctx, func(ctx context.Context) (*T, error) {
return f(ctx)
})
}
85 changes: 85 additions & 0 deletions common/retry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"errors"
"testing"

"github.com/databricks/databricks-sdk-go/apierr"
"github.com/databricks/databricks-sdk-go/experimental/mocks"
"github.com/databricks/databricks-sdk-go/service/workspace"
"github.com/stretchr/testify/assert"
Expand Down Expand Up @@ -47,3 +48,87 @@ func TestRetryOnTimeout_NonRetriableError(t *testing.T) {
})
assert.ErrorIs(t, err, expected)
}

func TestRetryOn504_noError(t *testing.T) {
wantErr := error(nil)
wantRes := (*workspace.ObjectInfo)(nil)
wantCalls := 1

w := mocks.NewMockWorkspaceClient(t)
api := w.GetMockWorkspaceAPI().EXPECT()
api.GetStatusByPath(mock.Anything, mock.Anything).Return(wantRes, wantErr)

gotCalls := 0
gotRes, gotErr := RetryOn504(context.Background(), func(ctx context.Context) (*workspace.ObjectInfo, error) {
gotCalls += 1
return w.WorkspaceClient.Workspace.GetStatusByPath(ctx, "path")
})

assert.ErrorIs(t, gotErr, wantErr)
assert.Equal(t, gotRes, wantRes)
assert.Equal(t, gotCalls, wantCalls)
}

func TestRetryOn504_errorNot504(t *testing.T) {
wantErr := errors.New("test error")
wantRes := (*workspace.ObjectInfo)(nil)
wantCalls := 1

w := mocks.NewMockWorkspaceClient(t)
api := w.GetMockWorkspaceAPI().EXPECT()
api.GetStatusByPath(mock.Anything, mock.Anything).Return(wantRes, wantErr)

gotCalls := 0
gotRes, gotErr := RetryOn504(context.Background(), func(ctx context.Context) (*workspace.ObjectInfo, error) {
gotCalls += 1
return w.WorkspaceClient.Workspace.GetStatusByPath(ctx, "path")
})

assert.ErrorIs(t, gotErr, wantErr)
assert.Equal(t, gotRes, wantRes)
assert.Equal(t, gotCalls, wantCalls)
}

func TestRetryOn504_error504ThenFail(t *testing.T) {
wantErr := errors.New("test error")
wantRes := (*workspace.ObjectInfo)(nil)
wantCalls := 2

w := mocks.NewMockWorkspaceClient(t)
api := w.GetMockWorkspaceAPI().EXPECT()
call := api.GetStatusByPath(mock.Anything, mock.Anything).Return(nil, apierr.ErrDeadlineExceeded)
call.Repeatability = 1
api.GetStatusByPath(mock.Anything, mock.Anything).Return(wantRes, wantErr)

gotCalls := 0
gotRes, gotErr := RetryOn504(context.Background(), func(ctx context.Context) (*workspace.ObjectInfo, error) {
gotCalls++
return w.WorkspaceClient.Workspace.GetStatusByPath(ctx, "path")
})

assert.ErrorIs(t, gotErr, wantErr)
assert.Equal(t, gotRes, wantRes)
assert.Equal(t, gotCalls, wantCalls)
}

func TestRetryOn504_error504ThenSuccess(t *testing.T) {
wantErr := error(nil)
wantRes := &workspace.ObjectInfo{}
wantCalls := 2

w := mocks.NewMockWorkspaceClient(t)
api := w.GetMockWorkspaceAPI().EXPECT()
call := api.GetStatusByPath(mock.Anything, mock.Anything).Return(nil, apierr.ErrDeadlineExceeded)
call.Repeatability = 1
api.GetStatusByPath(mock.Anything, mock.Anything).Return(wantRes, wantErr)

gotCalls := 0
gotRes, gotErr := RetryOn504(context.Background(), func(ctx context.Context) (*workspace.ObjectInfo, error) {
gotCalls++
return w.WorkspaceClient.Workspace.GetStatusByPath(ctx, "path")
})

assert.ErrorIs(t, gotErr, wantErr)
assert.Equal(t, gotRes, wantRes)
assert.Equal(t, gotCalls, wantCalls)
}
12 changes: 9 additions & 3 deletions permissions/resource_permissions.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,16 @@ func (a PermissionsAPI) readRaw(objectID string, mapping resourcePermissions) (*
}
idParts := strings.Split(objectID, "/")
id := idParts[len(idParts)-1]
permissions, err := w.Permissions.Get(a.context, iam.GetPermissionRequest{
RequestObjectId: id,
RequestObjectType: mapping.requestObjectType,

// TODO: This a temporary measure to implement retry on 504 until this is
// supported natively in the Go SDK.
permissions, err := common.RetryOn504(a.context, func(ctx context.Context) (*iam.ObjectPermissions, error) {
return w.Permissions.Get(a.context, iam.GetPermissionRequest{
RequestObjectId: id,
RequestObjectType: mapping.requestObjectType,
})
})

var apiErr *apierr.APIError
// https://github.com/databricks/terraform-provider-databricks/issues/1227
// platform propagates INVALID_STATE error for auto-purged clusters in
Expand Down
30 changes: 30 additions & 0 deletions permissions/resource_permissions_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,36 @@ func TestResourcePermissionsRead_EmptyListResultsInRemoval(t *testing.T) {
}.ApplyNoError(t)
}

func TestResourcePermissionsRead_EmptyListResultsInRemovalWith504Errors(t *testing.T) {
qa.ResourceFixture{
MockWorkspaceClientFunc: func(mwc *mocks.MockWorkspaceClient) {
mwc.GetMockCurrentUserAPI().EXPECT().Me(mock.Anything).Return(&iam.User{UserName: "admin"}, nil)

req := iam.GetPermissionRequest{
RequestObjectId: "abc",
RequestObjectType: "clusters",
}

// Fail 3 times with a 504 error. These should be retried
// transparently.
call := mwc.GetMockPermissionsAPI().EXPECT().Get(mock.Anything, req).Return(nil, apierr.ErrDeadlineExceeded)
call.Repeatability = 3

mwc.GetMockPermissionsAPI().EXPECT().Get(mock.Anything, req).Return(&iam.ObjectPermissions{
ObjectId: "/clusters/abc",
ObjectType: "cluster",
}, nil)
},
Resource: ResourcePermissions(),
Read: true,
Removed: true,
InstanceState: map[string]string{
"cluster_id": "abc",
},
ID: "/clusters/abc",
}.ApplyNoError(t)
}

func TestResourcePermissionsDelete(t *testing.T) {
d, err := qa.ResourceFixture{
MockWorkspaceClientFunc: func(mwc *mocks.MockWorkspaceClient) {
Expand Down

0 comments on commit 1e5443d

Please sign in to comment.