Commit 58801cf

fix(enterprise): mark nodes from unhealthy coordinators as lost
Instead of removing the mappings of unhealthy coordinators entirely, mark them as lost. This prevents peers from disappearing from other peers' views if a coordinator misses a heartbeat.
1 parent 3ff9cef

File tree

2 files changed: +50 −5 lines

enterprise/tailnet/pgcoord.go

Lines changed: 10 additions & 3 deletions

@@ -1485,10 +1485,17 @@ func (h *heartbeats) filter(mappings []mapping) []mapping {
 		ok := m.coordinator == h.self
 		if !ok {
 			_, ok = h.coordinators[m.coordinator]
+			if !ok {
+				// If a mapping exists to a coordinator lost to heartbeats,
+				// still add the mapping as LOST. If a coordinator misses
+				// heartbeats but a client is still connected to it, this may be
+				// the only mapping available for it. Newer mappings will take
+				// precedence.
+				m.kind = proto.CoordinateResponse_PeerUpdate_LOST
+			}
 		}
-		if ok {
-			out = append(out, m)
-		}
+
+		out = append(out, m)
 	}
 	return out
 }
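For readability, here is a minimal sketch of how the whole filter method reads after this change. Only the hunk above is authoritative; the loop header and the initialization of out are assumptions filled in from the diff context, and the comments paraphrase the commit message.

```go
// Sketch only: the loop header and the initialization of `out` are assumed,
// not shown in the diff above.
func (h *heartbeats) filter(mappings []mapping) []mapping {
	out := make([]mapping, 0, len(mappings))
	for _, m := range mappings {
		ok := m.coordinator == h.self
		if !ok {
			_, ok = h.coordinators[m.coordinator]
			if !ok {
				// The coordinator has missed heartbeats, but this may be the
				// only mapping available for the peer, so keep it and mark it
				// LOST. Newer mappings will take precedence.
				m.kind = proto.CoordinateResponse_PeerUpdate_LOST
			}
		}
		// Every mapping is now kept; previously, mappings from coordinators
		// missing from h.coordinators were dropped here.
		out = append(out, m)
	}
	return out
}
```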

enterprise/tailnet/pgcoord_internal_test.go

Lines changed: 40 additions & 2 deletions

@@ -11,6 +11,7 @@ import (
 	"time"
 
 	"github.com/google/uuid"
+	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	"go.uber.org/mock/gomock"
 	"golang.org/x/xerrors"
@@ -33,9 +34,9 @@ import (
 // make update-golden-files
 var UpdateGoldenFiles = flag.Bool("update", false, "update .golden files")
 
-// TestHeartbeat_Cleanup is internal so that we can overwrite the cleanup period and not wait an hour for the timed
+// TestHeartbeats_Cleanup is internal so that we can overwrite the cleanup period and not wait an hour for the timed
 // cleanup.
-func TestHeartbeat_Cleanup(t *testing.T) {
+func TestHeartbeats_Cleanup(t *testing.T) {
 	t.Parallel()
 
 	ctrl := gomock.NewController(t)
@@ -78,6 +79,43 @@ func TestHeartbeat_Cleanup(t *testing.T) {
 	close(waitForCleanup)
 }
 
+func TestHeartbeats_LostCoordinator_MarkLost(t *testing.T) {
+	t.Parallel()
+
+	ctrl := gomock.NewController(t)
+	mStore := dbmock.NewMockStore(ctrl)
+
+	ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
+	defer cancel()
+	logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
+
+	coordID := uuid.New()
+
+	uut := &heartbeats{
+		ctx:           ctx,
+		logger:        logger,
+		store:         mStore,
+		cleanupPeriod: time.Millisecond,
+		coordinators: map[uuid.UUID]time.Time{
+			coordID: time.Now(),
+		},
+	}
+
+	mpngs := []mapping{{
+		peer:        uuid.New(),
+		coordinator: uuid.New(),
+		updatedAt:   time.Now(),
+		node:        &proto.Node{},
+		kind:        proto.CoordinateResponse_PeerUpdate_NODE,
+	}}
+
+	// Filter should still return the mapping without a coordinator, but marked
+	// as LOST.
+	got := uut.filter(mpngs)
+	require.Len(t, got, 1)
+	assert.Equal(t, proto.CoordinateResponse_PeerUpdate_LOST, got[0].kind)
+}
+
 // TestLostPeerCleanupQueries tests that our SQL queries to clean up lost peers do what we expect,
 // that is, clean up peers and associated tunnels that have been lost for over 24 hours.
 func TestLostPeerCleanupQueries(t *testing.T) {
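As a complement, a hedged sketch of an extra assertion one could append to the same test, not part of this commit: a mapping whose coordinator is still present in uut.coordinators (coordID above) should pass through filter unchanged, keeping its NODE kind. The names reuse the harness from TestHeartbeats_LostCoordinator_MarkLost.

```go
// Hypothetical follow-up check, reusing uut, coordID, and got from the test
// above: a mapping from a coordinator that is still heartbeating should come
// back unchanged rather than being marked LOST.
healthy := []mapping{{
	peer:        uuid.New(),
	coordinator: coordID, // present in uut.coordinators
	updatedAt:   time.Now(),
	node:        &proto.Node{},
	kind:        proto.CoordinateResponse_PeerUpdate_NODE,
}}
got = uut.filter(healthy)
require.Len(t, got, 1)
assert.Equal(t, proto.CoordinateResponse_PeerUpdate_NODE, got[0].kind)
```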
