Skip to content

Commit 7683f88

Browse files
committed
fix(uffd-tests): release harnessState.mu before blocking on Serve drain
Address review findings from PR #2519: * rpc_services_test.go: stopServe now snapshots and clears s.stop under the lock, then drops the lock before calling stop() (which blocks on <-done). Holding s.mu across the drain blocked every concurrent RPC handler that takes s.mu, and would deadlock the upcoming race tests where WaitFaultHeld needs s.mu while a worker is parked at a barrier waiting for a paired Pause to drain. * harness_child_test.go: guard the child's rpc conn close with a sync.Once so the deferred safety-net close (needed for early returns from rpc.Register) does not double-close after the explicit close that unblocks server.ServeCodec on the success path. Audit of the rest of rpc_services_test.go found no other lock-during-blocking patterns: releaseAllBarriers, Paging.States, and Barriers.{Install,WaitHeld,Release} already snapshot under the lock and call out unlocked; Lifecycle.{Bootstrap,Shutdown,Resume} only do non-blocking work under the lock (channel send via context.cancel, fdexit.New, goroutine spawn).
1 parent d264863 commit 7683f88

2 files changed

Lines changed: 21 additions & 6 deletions

File tree

packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"net/rpc"
77
"net/rpc/jsonrpc"
88
"os"
9+
"sync"
910
"testing"
1011
)
1112

@@ -38,7 +39,12 @@ func crossProcessServe() error {
3839
if err != nil {
3940
return fmt.Errorf("net.FileConn rpc: %w", err)
4041
}
41-
defer conn.Close()
42+
// Close once: the explicit close before <-codecDone unblocks
43+
// server.ServeCodec on the success path; the deferred close is the
44+
// safety net for early returns from the rpc.Register calls below.
45+
var closeConnOnce sync.Once
46+
closeConn := func() { closeConnOnce.Do(func() { _ = conn.Close() }) }
47+
defer closeConn()
4248

4349
state := newHarnessState(uffdFile.Fd())
4450

@@ -70,7 +76,7 @@ func crossProcessServe() error {
7076
state.releaseAllBarriers()
7177
state.stopServe()
7278

73-
_ = conn.Close()
79+
closeConn()
7480
<-codecDone
7581

7682
return nil

packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,20 @@ func (s *harnessState) startServeLocked() error {
7676
}
7777

7878
func (s *harnessState) stopServe() {
79+
// Snapshot s.stop and clear it under the lock, then drop the lock
80+
// before calling stop() — stop() blocks on <-done draining the Serve
81+
// goroutine, and any concurrent RPC handler that needs s.mu
82+
// (Barriers.registry, Paging.States/Resume, Lifecycle.Bootstrap/Shutdown)
83+
// would otherwise be blocked for the full drain. Required for the
84+
// upcoming barrier-race RPCs where WaitFaultHeld must run while a
85+
// worker parked at a barrier holds Pause's drain.
7986
s.mu.Lock()
80-
defer s.mu.Unlock()
81-
if s.stop != nil {
82-
s.stop()
83-
s.stop = nil
87+
stop := s.stop
88+
s.stop = nil
89+
s.mu.Unlock()
90+
91+
if stop != nil {
92+
stop()
8493
}
8594
}
8695

0 commit comments

Comments
 (0)