Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion src/control/lib/control/system.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//
// (C) Copyright 2020-2024 Intel Corporation.
// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -1318,6 +1318,7 @@ func SystemRebuildManage(ctx context.Context, rpcClient UnaryInvoker, req *Syste
type SystemSelfHealEvalReq struct {
unaryRequest
msRequest
retryableRequest
}

// SystemSelfHealEvalResp contains the response.
Expand All @@ -1341,6 +1342,10 @@ func SystemSelfHealEval(ctx context.Context, rpcClient UnaryInvoker, req *System
req.setRPC(func(ctx context.Context, conn *grpc.ClientConn) (proto.Message, error) {
return mgmtpb.NewMgmtSvcClient(conn).SystemSelfHealEval(ctx, pbReq)
})
req.retryTestFn = func(err error, _ uint) bool {
return (system.IsUnavailable(err) || IsRetryableConnErr(err) ||
system.IsNotLeader(err) || system.IsNotReplica(err))
Comment on lines +1346 to +1347
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a change request now, but I think it would be good to generalize this in the future. We have different retryTestFns for a lot of commands where I think the circumstances under which we'd retry are largely the same. Couldn't a lot of MS commands be similarly affected by a leadership change?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mjmac may be able to provide guidance for a future change with regard to this. @kjacque can you approve this PR unless you have blocking requests please?

}

rpcClient.Debugf("DAOS system self-heal eval request: %s", pbUtil.Debug(pbReq))
ur, err := rpcClient.InvokeUnaryRPC(ctx, req)
Expand Down
81 changes: 80 additions & 1 deletion src/control/lib/control/system_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//
// (C) Copyright 2020-2024 Intel Corporation.
// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand All @@ -19,6 +19,8 @@ import (
mgmtpb "github.com/daos-stack/daos/src/control/common/proto/mgmt"
sharedpb "github.com/daos-stack/daos/src/control/common/proto/shared"
"github.com/daos-stack/daos/src/control/common/test"
"github.com/daos-stack/daos/src/control/fault"
"github.com/daos-stack/daos/src/control/fault/code"
"github.com/daos-stack/daos/src/control/lib/hostlist"
"github.com/daos-stack/daos/src/control/lib/ranklist"
"github.com/daos-stack/daos/src/control/logging"
Expand Down Expand Up @@ -2075,3 +2077,80 @@ func TestControl_SystemSelfHealEval(t *testing.T) {
})
}
}

func TestControl_SystemSelfHealEval_RetryableErrors(t *testing.T) {
for name, testErr := range map[string]error{
"system unavailable": system.ErrRaftUnavail,
"leader step-up": system.ErrLeaderStepUpInProgress,
"connection closed": FaultConnectionClosed(""),
"connection refused": FaultConnectionRefused(""),
"not leader": &system.ErrNotLeader{LeaderHint: "host1", Replicas: []string{"host2"}},
"not replica": &system.ErrNotReplica{Replicas: []string{"host1", "host2"}},
"data plane not started": &fault.Fault{Code: code.ServerDataPlaneNotStarted},
} {
t.Run(name, func(t *testing.T) {
log, buf := logging.NewTestLogger(name)
defer test.ShowBufferOnFailure(t, buf)

client := NewMockInvoker(log, &MockInvokerConfig{
UnaryResponseSet: []*UnaryResponse{
MockMSResponse("", testErr, nil),
MockMSResponse("", nil, &mgmtpb.DaosResp{}),
},
})

gotResp, gotErr := SystemSelfHealEval(test.Context(t), client, &SystemSelfHealEvalReq{})
if gotErr != nil {
t.Fatalf("unexpected error: %v", gotErr)
}

expResp := new(SystemSelfHealEvalResp)
if diff := cmp.Diff(expResp, gotResp, cmpopts.IgnoreUnexported(SystemSelfHealEvalResp{})); diff != "" {
t.Fatalf("unexpected response (-want, +got):\n%s\n", diff)
}
})
}
}

func TestControl_SystemSelfHealEval_NonRetryableErrors(t *testing.T) {
for name, tc := range map[string]struct {
testErr error
expErr error
}{
"system uninitialized": {
testErr: system.ErrUninitialized,
expErr: system.ErrUninitialized,
},
"generic error": {
testErr: errors.New("something went wrong"),
expErr: errors.New("something went wrong"),
},
"connection bad host": {
testErr: FaultConnectionBadHost("badhost"),
expErr: FaultConnectionBadHost("badhost"),
},
"connection no route": {
testErr: FaultConnectionNoRoute("10.0.0.1"),
expErr: FaultConnectionNoRoute("10.0.0.1"),
},
"member exists": {
testErr: system.ErrRankExists(1),
expErr: system.ErrRankExists(1),
},
} {
t.Run(name, func(t *testing.T) {
log, buf := logging.NewTestLogger(name)
defer test.ShowBufferOnFailure(t, buf)

client := NewMockInvoker(log, &MockInvokerConfig{
UnaryResponseSet: []*UnaryResponse{
MockMSResponse("", tc.testErr, nil),
MockMSResponse("", nil, &mgmtpb.DaosResp{}),
},
})

_, gotErr := SystemSelfHealEval(test.Context(t), client, &SystemSelfHealEvalReq{})
test.CmpErr(t, tc.expErr, gotErr)
})
}
}
10 changes: 7 additions & 3 deletions src/control/system/errors.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//
// (C) Copyright 2020-2024 Intel Corporation.
// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand All @@ -17,6 +17,8 @@ import (
"github.com/pkg/errors"

"github.com/daos-stack/daos/src/control/build"
"github.com/daos-stack/daos/src/control/fault"
"github.com/daos-stack/daos/src/control/fault/code"
"github.com/daos-stack/daos/src/control/lib/ranklist"
)

Expand All @@ -39,8 +41,10 @@ func IsUnavailable(err error) bool {
if err == nil {
return false
}
cause := errors.Cause(err).Error()
return strings.Contains(cause, ErrRaftUnavail.Error()) || strings.Contains(cause, ErrLeaderStepUpInProgress.Error())
cause := errors.Cause(err)
return strings.Contains(cause.Error(), ErrRaftUnavail.Error()) ||
strings.Contains(cause.Error(), ErrLeaderStepUpInProgress.Error()) ||
fault.IsFaultCode(cause, code.ServerDataPlaneNotStarted)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems reasonable to me, but @mjmac may have more context on why this function was set up this way to begin with, checking the strings instead of comparing errors. If the assumption is that server-side errors got flattened somehow, would we actually get the error in the form of a Fault? Or just a string representation of a Fault?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change works in that it addresses the issue and it makes sense that the other errors are defined in this file as sentinel errors rather than faults. Please approve unless there is anything else blocking.

}

// IsEmptyGroupMap returns a boolean indicating whether or not the
Expand Down
23 changes: 23 additions & 0 deletions src/control/system/errors_test.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//
// (C) Copyright 2024 Intel Corporation.
// (C) Copyright 2026 Hewlett Packard Enterprise Development LP
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand All @@ -12,6 +13,8 @@ import (
"github.com/pkg/errors"

"github.com/daos-stack/daos/src/control/common/test"
"github.com/daos-stack/daos/src/control/fault"
"github.com/daos-stack/daos/src/control/fault/code"
)

func TestSystem_Errors_IsNotReady(t *testing.T) {
Expand Down Expand Up @@ -79,12 +82,32 @@ func TestSystem_Errors_IsUnavailable(t *testing.T) {
err: ErrLeaderStepUpInProgress,
expResult: true,
},
"data plane not started": {
err: &fault.Fault{Code: code.ServerDataPlaneNotStarted},
expResult: true,
},
"wrapped data plane not started": {
err: errors.Wrap(&fault.Fault{Code: code.ServerDataPlaneNotStarted}, "wrapped error"),
expResult: true,
},
"uninitialized not unavailable": {
err: ErrUninitialized,
},
"something else": {
err: errors.New("something is wrong"),
},
"member exists not unavailable": {
err: ErrRankExists(1),
},
"member not found not unavailable": {
err: ErrMemberRankNotFound(1),
},
"pool not found not unavailable": {
err: ErrPoolRankNotFound(1),
},
"different fault code not unavailable": {
err: &fault.Fault{Code: code.ClientUnknown},
},
} {
t.Run(name, func(t *testing.T) {
test.AssertEqual(t, tc.expResult, IsUnavailable(tc.err), "")
Expand Down
Loading