From ab71d89fe7443eb18813ebb53b410f79f989480e Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Tue, 21 Apr 2026 15:26:14 -0700 Subject: [PATCH 01/22] test(uffd): add cross-process scaffolding for gated and async ops Pre-work for the upcoming REMOVE event handling PR. Lands as a runtime no-op on main: no production code changes, no new UFFD features enabled. Test-helper additions (all *_test.go): - testConfig.gated and per-op `async bool` - operationModeRemove / ServePause / ServeResume / Sleep - testHandler.servePause / serveResume hooks driven by control pipes (fd 7/8) into the helper subprocess - helper subprocess pause/resume goroutine that tears down and restarts the UFFD serve loop on demand - executeAll() async path that fans out goroutines and joins on test context - expectRemoved + checkDirtiness handling for read/write/remove ordering - executeRemove via unix.MADV_DONTNEED - unregister() helper around UFFDIO_UNREGISTER for cleanup - pageStateEntries() now drains settleRequests then takes the pageTracker RLock directly so the snapshot stays correct for future writers (e.g. REMOVE) that don't go through settleRequests operationModeRemove and the gated/async paths are exercised only by the follow-up PR; here they are dead code at runtime, kept to minimise the diff of that PR. 
--- .../userfaultfd/cross_process_helpers_test.go | 152 ++++++++++++++---- .../uffd/userfaultfd/fd_helpers_test.go | 11 ++ .../sandbox/uffd/userfaultfd/helpers_test.go | 74 ++++++++- 3 files changed, 203 insertions(+), 34 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go index 5cc3fc527e..faf4ed0371 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go @@ -93,10 +93,6 @@ func configureCrossProcessTest(t *testing.T, tt testConfig) (*testHandler, error uffdFd, err := newFd(syscall.O_CLOEXEC | syscall.O_NONBLOCK) require.NoError(t, err) - t.Cleanup(func() { - uffdFd.close() - }) - err = configureApi(uffdFd, tt.pagesize) require.NoError(t, err) @@ -110,6 +106,9 @@ func configureCrossProcessTest(t *testing.T, tt testConfig) (*testHandler, error if tt.alwaysWP { cmd.Env = append(cmd.Env, "GO_ALWAYS_WP=1") } + if tt.gated { + cmd.Env = append(cmd.Env, "GO_GATED=1") + } dup, err := syscall.Dup(int(uffdFd)) require.NoError(t, err) @@ -153,12 +152,34 @@ func configureCrossProcessTest(t *testing.T, tt testConfig) (*testHandler, error readySignal <- struct{}{} }() - cmd.ExtraFiles = []*os.File{ + extraFiles := []*os.File{ uffdFile, contentReader, offsetsWriter, readyWriter, } + + var gateCmdWriter *os.File + var gateSyncReader *os.File + if tt.gated { + var gateCmdReader *os.File + gateCmdReader, gateCmdWriter, err = os.Pipe() + require.NoError(t, err) + + var gateSyncWriter *os.File + gateSyncReader, gateSyncWriter, err = os.Pipe() + require.NoError(t, err) + + t.Cleanup(func() { + gateCmdWriter.Close() + gateSyncReader.Close() + }) + + extraFiles = append(extraFiles, gateCmdReader) // fd 7 + extraFiles = append(extraFiles, gateSyncWriter) // fd 8 + } + + cmd.ExtraFiles = extraFiles cmd.Stdout = os.Stdout 
cmd.Stderr = os.Stderr @@ -169,6 +190,10 @@ func configureCrossProcessTest(t *testing.T, tt testConfig) (*testHandler, error offsetsWriter.Close() readyWriter.Close() uffdFile.Close() + if tt.gated { + extraFiles[4].Close() // gateCmdReader + extraFiles[5].Close() // gateSyncWriter + } t.Cleanup(func() { signalErr := cmd.Process.Signal(syscall.SIGUSR1) @@ -187,6 +212,12 @@ func configureCrossProcessTest(t *testing.T, tt testConfig) (*testHandler, error waitErr == nil, "unexpected error: %v", waitErr, ) + + // Unregister the uffd range before closing the fd. This is a + // no-op for the existing tests but is required by the upcoming + // REMOVE event tests so munmap doesn't block on un-acked events. + unregister(uffdFd, memoryStart, uint64(size)) + uffdFd.close() }) // pageStatesOnce asks the serving process for a snapshot of its pageTracker @@ -230,12 +261,31 @@ func configureCrossProcessTest(t *testing.T, tt testConfig) (*testHandler, error case <-readySignal: } - return &testHandler{ + h := &testHandler{ memoryArea: &memoryArea, pagesize: tt.pagesize, data: data, pageStatesOnce: pageStatesOnce, - }, nil + } + + if tt.gated { + h.servePause = func() error { + if _, err := gateCmdWriter.Write([]byte{'P'}); err != nil { + return err + } + var buf [1]byte + _, err := gateSyncReader.Read(buf[:]) + + return err + } + h.serveResume = func() error { + _, err := gateCmdWriter.Write([]byte{'R'}) + + return err + } + } + + return h, nil } // Secondary process, orchestrator in our case @@ -295,9 +345,6 @@ func crossProcessServe() error { }, }) - exitUffd := make(chan struct{}, 1) - defer close(exitUffd) - l, err := logger.NewDevelopmentLogger() if err != nil { return fmt.Errorf("exit creating logger: %w", err) @@ -353,39 +400,78 @@ func crossProcessServe() error { } defer fdExit.Close() + exitUffd := make(chan struct{}, 1) + go func() { - defer func() { - exitUffd <- struct{}{} - }() + defer func() { exitUffd <- struct{}{} }() serverErr := uffd.Serve(ctx, fdExit) if 
serverErr != nil { msg := fmt.Errorf("error serving: %w", serverErr) - fmt.Fprint(os.Stderr, msg.Error()) - cancel(msg) - - return } }() cleanup := func() { - err := fdExit.SignalExit() - if err != nil { - msg := fmt.Errorf("error signaling exit: %w", err) + fdExit.SignalExit() + <-exitUffd + } + defer func() { cleanup() }() - fmt.Fprint(os.Stderr, msg.Error()) + if os.Getenv("GO_GATED") == "1" { + gateCmdFile := os.NewFile(uintptr(7), "gate-cmd") + defer gateCmdFile.Close() - cancel(msg) + gateSyncFile := os.NewFile(uintptr(8), "gate-sync") + defer gateSyncFile.Close() + + startServe := func() func() { + newExit, fdErr := fdexit.New() + if fdErr != nil { + cancel(fmt.Errorf("error creating fd exit: %w", fdErr)) + + return func() {} + } + + done := make(chan struct{}) + go func() { + defer close(done) + if err := uffd.Serve(ctx, newExit); err != nil { + cancel(fmt.Errorf("error serving: %w", err)) + } + }() - return + return func() { + newExit.SignalExit() + <-done + newExit.Close() + } } - <-exitUffd - } + stopServe := func() { + cleanup() + } - defer cleanup() + go func() { + var buf [1]byte + for { + if _, err := gateCmdFile.Read(buf[:]); err != nil { + return + } + + switch buf[0] { + case 'P': + stopServe() + gateSyncFile.Write([]byte{1}) + case 'R': + newStop := startServe() + stopServe = newStop + cleanup = newStop + } + } + }() + } exitSignal := make(chan os.Signal, 1) signal.Notify(exitSignal, syscall.SIGUSR1) @@ -415,13 +501,19 @@ type pageStateEntry struct { } // pageStateEntries returns a snapshot of every tracked page and its state. -// It holds the settleRequests write lock so no in-flight faultPage worker -// can mutate the pageTracker while we iterate. +// We first take the settleRequests writer lock just as a fence to drain +// in-flight faultPage workers (which hold settleRequests.RLock()), then +// release it and re-lock pageTracker directly. This lets the snapshot stay +// correct even if future writers to pageTracker (e.g. 
REMOVE event +// handlers) don't go through settleRequests. func (u *Userfaultfd) pageStateEntries() ([]pageStateEntry, error) { u.settleRequests.Lock() - defer u.settleRequests.Unlock() + u.settleRequests.Unlock() //nolint:staticcheck // SA2001: intentional — drain in-flight faultPage workers. + + u.pageTracker.mu.RLock() + defer u.pageTracker.mu.RUnlock() - entries := make([]pageStateEntry, 0, len(u.pageTracker.m)) + var entries []pageStateEntry for addr, state := range u.pageTracker.m { offset, err := u.ma.GetOffset(addr) if err != nil { diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/fd_helpers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/fd_helpers_test.go index fbd98f1ce2..e28f83fe72 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/fd_helpers_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/fd_helpers_test.go @@ -41,6 +41,17 @@ func configureApi(f Fd, pagesize uint64) error { return nil } +func unregister(f Fd, addr uintptr, size uint64) error { + r := newUffdioRange(CULong(addr), CULong(size)) + + ret, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(f), UFFDIO_UNREGISTER, uintptr(unsafe.Pointer(&r))) + if errno != 0 { + return fmt.Errorf("UFFDIO_UNREGISTER ioctl failed: %w (ret=%d)", errno, ret) + } + + return nil +} + // mode: UFFDIO_REGISTER_MODE_WP|UFFDIO_REGISTER_MODE_MISSING // This is already called by the FC, but only with the UFFDIO_REGISTER_MODE_MISSING // We need to call it with UFFDIO_REGISTER_MODE_WP when we use both missing and wp diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go index 054aa9e333..48fed479eb 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go @@ -7,11 +7,13 @@ import ( "slices" "sync" "testing" + "time" "unsafe" "github.com/RoaringBitmap/roaring/v2" 
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "golang.org/x/sys/unix" "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/testutils" ) @@ -26,6 +28,8 @@ type testConfig struct { operations []operation // alwaysWP makes the handler copy with UFFDIO_COPY_MODE_WP for all faults. alwaysWP bool + // gated enables pause/resume control over the handler's serve loop. + gated bool } type operationMode uint32 @@ -33,12 +37,20 @@ type operationMode uint32 const ( operationModeRead operationMode = 1 << iota operationModeWrite + operationModeRemove + operationModeServePause + operationModeServeResume + // operationModeSleep pauses for a short duration to let async goroutines + // enter their blocking syscalls before proceeding. + operationModeSleep ) type operation struct { // Offset in bytes. Must be smaller than the (numberOfPages-1) * pagesize as it reads a page and it must be aligned to the pagesize from the testConfig. offset int64 mode operationMode + // async runs the operation in a background goroutine. + async bool } // handlerPageStates is a snapshot of the pageTracker grouped by state. It @@ -71,23 +83,51 @@ type testHandler struct { // pageStatesOnce returns a per-state snapshot of the handler's pageTracker. // It can only be called once. pageStatesOnce func() (handlerPageStates, error) - mutex sync.Mutex + // servePause and serveResume gate the UFFD event loop in the child process. + // Tests use them to deterministically drain a batch of events before + // more faults are processed. 
+ servePause func() error + serveResume func() error + mutex sync.Mutex } func (h *testHandler) executeAll(t *testing.T, operations []operation) { t.Helper() + var asyncErrors []chan error + for i, op := range operations { + if op.async { + errCh := make(chan error, 1) + asyncErrors = append(asyncErrors, errCh) + + go func() { + errCh <- h.executeOperation(t.Context(), op) + }() + + continue + } + err := h.executeOperation(t.Context(), op) require.NoError(t, err, "step %d: %v at offset %d", i, op.mode, op.offset) } + + for _, errCh := range asyncErrors { + select { + case err := <-errCh: + require.NoError(t, err, "async operation") + case <-t.Context().Done(): + t.Fatal("timed out waiting for async operation") + } + } } type pageExpectation uint8 const ( - expectClean pageExpectation = iota // read-only: present + WP set - expectDirty // written: present + WP cleared + expectClean pageExpectation = iota // read-only: present + WP set + expectDirty // written: present + WP cleared + expectRemoved // removed: not present ) func (h *testHandler) checkDirtiness(t *testing.T, operations []operation) { @@ -100,17 +140,25 @@ func (h *testHandler) checkDirtiness(t *testing.T, operations []operation) { memStart := uintptr(unsafe.Pointer(&(*h.memoryArea)[0])) // Track the final expected state per offset by replaying operations in order. + // A remove after a read/write makes the page not present. + // A read/write after a remove makes it present again. expected := make(map[uint]pageExpectation) for _, op := range operations { off := uint(op.offset) switch op.mode { case operationModeRead: - if _, seen := expected[off]; !seen { + curr, seen := expected[off] + // If we haven't seen this page before or the page + // has previously been removed then the page should be clean + // after this read operation. 
+ if !seen || curr == expectRemoved { expected[off] = expectClean } case operationModeWrite: expected[off] = expectDirty + case operationModeRemove: + expected[off] = expectRemoved } } @@ -119,6 +167,8 @@ func (h *testHandler) checkDirtiness(t *testing.T, operations []operation) { require.NoError(t, err, "pagemap read at offset %d", off) switch expect { + case expectRemoved: + assert.False(t, entry.IsPresent(), "removed page at offset %d should not be present", off) case expectDirty: assert.True(t, entry.IsPresent(), "written page at offset %d should be present", off) assert.False(t, entry.IsWriteProtected(), "written page at offset %d should be dirty", off) @@ -135,11 +185,27 @@ func (h *testHandler) executeOperation(ctx context.Context, op operation) error return h.executeRead(ctx, op) case operationModeWrite: return h.executeWrite(ctx, op) + case operationModeRemove: + return h.executeRemove(op) + case operationModeServePause: + return h.servePause() + case operationModeServeResume: + return h.serveResume() + case operationModeSleep: + time.Sleep(50 * time.Millisecond) + + return nil default: return fmt.Errorf("invalid operation mode: %d", op.mode) } } +func (h *testHandler) executeRemove(op operation) error { + page := (*h.memoryArea)[op.offset : op.offset+int64(h.pagesize)] + + return unix.Madvise(page, unix.MADV_DONTNEED) +} + func (h *testHandler) executeRead(ctx context.Context, op operation) error { readBytes := (*h.memoryArea)[op.offset : op.offset+int64(h.pagesize)] From b14a18caa769e453d94fb71144804d1209a7091d Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Tue, 21 Apr 2026 15:29:56 -0700 Subject: [PATCH 02/22] test(uffd): restore early uffd close cleanup, keep unregister late --- .../uffd/userfaultfd/cross_process_helpers_test.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go 
b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go index faf4ed0371..f46b3cfbd1 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go @@ -93,6 +93,10 @@ func configureCrossProcessTest(t *testing.T, tt testConfig) (*testHandler, error uffdFd, err := newFd(syscall.O_CLOEXEC | syscall.O_NONBLOCK) require.NoError(t, err) + t.Cleanup(func() { + uffdFd.close() + }) + err = configureApi(uffdFd, tt.pagesize) require.NoError(t, err) @@ -213,11 +217,11 @@ func configureCrossProcessTest(t *testing.T, tt testConfig) (*testHandler, error "unexpected error: %v", waitErr, ) - // Unregister the uffd range before closing the fd. This is a - // no-op for the existing tests but is required by the upcoming - // REMOVE event tests so munmap doesn't block on un-acked events. + // Unregister the uffd range before the fd-close cleanup runs. + // This is a no-op for the existing tests but is required by the + // upcoming REMOVE event tests so munmap doesn't block on + // un-acked events. unregister(uffdFd, memoryStart, uint64(size)) - uffdFd.close() }) // pageStatesOnce asks the serving process for a snapshot of its pageTracker From 620b1111ec8569fe7b19d51c6d08d891d6459173 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Tue, 21 Apr 2026 15:33:09 -0700 Subject: [PATCH 03/22] test(uffd): make gated cleanup idempotent to avoid pause-then-exit hang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex review caught a deadlock: in GO_GATED mode, handling 'P' replaced the stop closure with one that drained `exitUffd`, but the deferred final cleanup still pointed at the original closure. If the helper exited between 'P' and 'R' (e.g. 
parent test failure, ctx cancel, SIGUSR1), the defer ran the original cleanup again and blocked forever on `<-exitUffd` — the channel had already been drained and the original Serve goroutine had already exited. The hung helper would in turn hang the parent's cmd.Wait(). Replace the two-variable (cleanup / stopServe) layout with a single mutex-guarded `stopFn` that is reset to a no-op after firing, and have both the gated 'P' handler and the deferred final cleanup go through the same `stopServe` wrapper. 'R' now just installs a fresh `stopFn` that drains the new goroutine. --- .../userfaultfd/cross_process_helpers_test.go | 44 +++++++++++++------ 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go index f46b3cfbd1..308efb1b7c 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go @@ -19,6 +19,7 @@ import ( "slices" "strconv" "strings" + "sync" "syscall" "testing" @@ -417,11 +418,30 @@ func crossProcessServe() error { } }() - cleanup := func() { - fdExit.SignalExit() - <-exitUffd + // stopFn drains whichever Serve goroutine is currently running and + // is reset to a no-op once it has run. This makes both pause-then-exit + // (no resume in between) and pause-resume-pause-exit safe: every + // caller — gated 'P', gated 'R' replacing the previous stop, and the + // final defer below — sees a stop function that matches the goroutine + // actually running, and never blocks on an already-drained channel. 
+ var ( + stopMu sync.Mutex + stopFn = func() { + fdExit.SignalExit() + <-exitUffd + } + ) + + stopServe := func() { + stopMu.Lock() + fn := stopFn + stopFn = func() {} + stopMu.Unlock() + + fn() } - defer func() { cleanup() }() + + defer stopServe() if os.Getenv("GO_GATED") == "1" { gateCmdFile := os.NewFile(uintptr(7), "gate-cmd") @@ -430,12 +450,12 @@ func crossProcessServe() error { gateSyncFile := os.NewFile(uintptr(8), "gate-sync") defer gateSyncFile.Close() - startServe := func() func() { + startServe := func() { newExit, fdErr := fdexit.New() if fdErr != nil { cancel(fmt.Errorf("error creating fd exit: %w", fdErr)) - return func() {} + return } done := make(chan struct{}) @@ -446,15 +466,13 @@ func crossProcessServe() error { } }() - return func() { + stopMu.Lock() + stopFn = func() { newExit.SignalExit() <-done newExit.Close() } - } - - stopServe := func() { - cleanup() + stopMu.Unlock() } go func() { @@ -469,9 +487,7 @@ func crossProcessServe() error { stopServe() gateSyncFile.Write([]byte{1}) case 'R': - newStop := startServe() - stopServe = newStop - cleanup = newStop + startServe() } } }() From bbdc8f921fb39007611ec7631ebaf491f1b88009 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Tue, 21 Apr 2026 15:41:47 -0700 Subject: [PATCH 04/22] test(uffd): drop REMOVE-specific bits, keep only gated/async scaffolding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pull out everything that's not strictly needed for "gated pause/resume + async ops" so PR T is the smallest possible scaffolding diff and the follow-up REMOVE PR carries those pieces alongside the production code that actually needs them: - helpers_test.go: drop operationModeRemove, executeRemove, expectRemoved and the REMOVE-aware checkDirtiness branch — none of the PR T tests call them, and they only become meaningful once UFFD_FEATURE_EVENT_REMOVE is set in the follow-up. - fd_helpers_test.go: revert; drop unregister() helper. 
Without UFFD_FEATURE_EVENT_REMOVE, MADV_DONTNEED never queues UFFD events, so munmap doesn't block on un-acked events and there's nothing to unregister around. - cross_process_helpers_test.go: - drop unregister(uffdFd, ...) call in late cleanup (same reason as above). - revert pageStateEntries() to the simple settleRequests.Lock pattern; the rework was forward-looking for a REMOVE handler that doesn't go through settleRequests and is dead code on main. - keep cleanup → stopFn/stopServe restructure (mandatory for the gated tear-down/respawn path) but restore the original fdExit.SignalExit error handling that the previous version had silently dropped. - revert incidental cosmetic reformats (exitUffd placement, defer multi-line) so the diff is purely additive in the unchanged paths. --- .../userfaultfd/cross_process_helpers_test.go | 45 ++++++++++--------- .../uffd/userfaultfd/fd_helpers_test.go | 11 ----- .../sandbox/uffd/userfaultfd/helpers_test.go | 31 +++---------- 3 files changed, 30 insertions(+), 57 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go index 308efb1b7c..521851b1e9 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go @@ -217,12 +217,6 @@ func configureCrossProcessTest(t *testing.T, tt testConfig) (*testHandler, error waitErr == nil, "unexpected error: %v", waitErr, ) - - // Unregister the uffd range before the fd-close cleanup runs. - // This is a no-op for the existing tests but is required by the - // upcoming REMOVE event tests so munmap doesn't block on - // un-acked events. 
- unregister(uffdFd, memoryStart, uint64(size)) }) // pageStatesOnce asks the serving process for a snapshot of its pageTracker @@ -350,6 +344,9 @@ func crossProcessServe() error { }, }) + exitUffd := make(chan struct{}, 1) + defer close(exitUffd) + l, err := logger.NewDevelopmentLogger() if err != nil { return fmt.Errorf("exit creating logger: %w", err) @@ -405,16 +402,20 @@ func crossProcessServe() error { } defer fdExit.Close() - exitUffd := make(chan struct{}, 1) - go func() { - defer func() { exitUffd <- struct{}{} }() + defer func() { + exitUffd <- struct{}{} + }() serverErr := uffd.Serve(ctx, fdExit) if serverErr != nil { msg := fmt.Errorf("error serving: %w", serverErr) + fmt.Fprint(os.Stderr, msg.Error()) + cancel(msg) + + return } }() @@ -427,7 +428,17 @@ func crossProcessServe() error { var ( stopMu sync.Mutex stopFn = func() { - fdExit.SignalExit() + err := fdExit.SignalExit() + if err != nil { + msg := fmt.Errorf("error signaling exit: %w", err) + + fmt.Fprint(os.Stderr, msg.Error()) + + cancel(msg) + + return + } + <-exitUffd } ) @@ -521,19 +532,13 @@ type pageStateEntry struct { } // pageStateEntries returns a snapshot of every tracked page and its state. -// We first take the settleRequests writer lock just as a fence to drain -// in-flight faultPage workers (which hold settleRequests.RLock()), then -// release it and re-lock pageTracker directly. This lets the snapshot stay -// correct even if future writers to pageTracker (e.g. REMOVE event -// handlers) don't go through settleRequests. +// It holds the settleRequests write lock so no in-flight faultPage worker +// can mutate the pageTracker while we iterate. func (u *Userfaultfd) pageStateEntries() ([]pageStateEntry, error) { u.settleRequests.Lock() - u.settleRequests.Unlock() //nolint:staticcheck // SA2001: intentional — drain in-flight faultPage workers. 
- - u.pageTracker.mu.RLock() - defer u.pageTracker.mu.RUnlock() + defer u.settleRequests.Unlock() - var entries []pageStateEntry + entries := make([]pageStateEntry, 0, len(u.pageTracker.m)) for addr, state := range u.pageTracker.m { offset, err := u.ma.GetOffset(addr) if err != nil { diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/fd_helpers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/fd_helpers_test.go index e28f83fe72..fbd98f1ce2 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/fd_helpers_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/fd_helpers_test.go @@ -41,17 +41,6 @@ func configureApi(f Fd, pagesize uint64) error { return nil } -func unregister(f Fd, addr uintptr, size uint64) error { - r := newUffdioRange(CULong(addr), CULong(size)) - - ret, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(f), UFFDIO_UNREGISTER, uintptr(unsafe.Pointer(&r))) - if errno != 0 { - return fmt.Errorf("UFFDIO_UNREGISTER ioctl failed: %w (ret=%d)", errno, ret) - } - - return nil -} - // mode: UFFDIO_REGISTER_MODE_WP|UFFDIO_REGISTER_MODE_MISSING // This is already called by the FC, but only with the UFFDIO_REGISTER_MODE_MISSING // We need to call it with UFFDIO_REGISTER_MODE_WP when we use both missing and wp diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go index 48fed479eb..8d59f3b5ca 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go @@ -13,7 +13,6 @@ import ( "github.com/RoaringBitmap/roaring/v2" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "golang.org/x/sys/unix" "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/testutils" ) @@ -37,7 +36,6 @@ type operationMode uint32 const ( operationModeRead operationMode = 1 << iota operationModeWrite - operationModeRemove 
operationModeServePause operationModeServeResume // operationModeSleep pauses for a short duration to let async goroutines @@ -84,8 +82,8 @@ type testHandler struct { // It can only be called once. pageStatesOnce func() (handlerPageStates, error) // servePause and serveResume gate the UFFD event loop in the child process. - // Tests use them to deterministically drain a batch of events before - // more faults are processed. + // Tests use them to deterministically batch a sequence of UFFD events + // before more faults are processed. servePause func() error serveResume func() error mutex sync.Mutex @@ -125,9 +123,8 @@ func (h *testHandler) executeAll(t *testing.T, operations []operation) { type pageExpectation uint8 const ( - expectClean pageExpectation = iota // read-only: present + WP set - expectDirty // written: present + WP cleared - expectRemoved // removed: not present + expectClean pageExpectation = iota // read-only: present + WP set + expectDirty // written: present + WP cleared ) func (h *testHandler) checkDirtiness(t *testing.T, operations []operation) { @@ -140,25 +137,17 @@ func (h *testHandler) checkDirtiness(t *testing.T, operations []operation) { memStart := uintptr(unsafe.Pointer(&(*h.memoryArea)[0])) // Track the final expected state per offset by replaying operations in order. - // A remove after a read/write makes the page not present. - // A read/write after a remove makes it present again. expected := make(map[uint]pageExpectation) for _, op := range operations { off := uint(op.offset) switch op.mode { case operationModeRead: - curr, seen := expected[off] - // If we haven't seen this page before or the page - // has previously been removed then the page should be clean - // after this read operation. 
- if !seen || curr == expectRemoved { + if _, seen := expected[off]; !seen { expected[off] = expectClean } case operationModeWrite: expected[off] = expectDirty - case operationModeRemove: - expected[off] = expectRemoved } } @@ -167,8 +156,6 @@ func (h *testHandler) checkDirtiness(t *testing.T, operations []operation) { require.NoError(t, err, "pagemap read at offset %d", off) switch expect { - case expectRemoved: - assert.False(t, entry.IsPresent(), "removed page at offset %d should not be present", off) case expectDirty: assert.True(t, entry.IsPresent(), "written page at offset %d should be present", off) assert.False(t, entry.IsWriteProtected(), "written page at offset %d should be dirty", off) @@ -185,8 +172,6 @@ func (h *testHandler) executeOperation(ctx context.Context, op operation) error return h.executeRead(ctx, op) case operationModeWrite: return h.executeWrite(ctx, op) - case operationModeRemove: - return h.executeRemove(op) case operationModeServePause: return h.servePause() case operationModeServeResume: @@ -200,12 +185,6 @@ func (h *testHandler) executeOperation(ctx context.Context, op operation) error } } -func (h *testHandler) executeRemove(op operation) error { - page := (*h.memoryArea)[op.offset : op.offset+int64(h.pagesize)] - - return unix.Madvise(page, unix.MADV_DONTNEED) -} - func (h *testHandler) executeRead(ctx context.Context, op operation) error { readBytes := (*h.memoryArea)[op.offset : op.offset+int64(h.pagesize)] From c39d391711317108a8f63da45af3808370d34621 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Tue, 28 Apr 2026 17:14:05 -0700 Subject: [PATCH 05/22] refactor(uffd-tests): replace SIGUSR/env-var/pipe IPC with net/rpc/jsonrpc over unix socket Replace the parent<->child cross-process userfaultfd test harness's pile of single-purpose pipes (offsets, ready, gate-cmd, gate-sync) plus a SIGUSR1 shutdown signal and a SIGUSR2 page-state-snapshot trigger with one bidirectional Unix domain socket carrying stdlib net/rpc + 
net/rpc/jsonrpc. Per-call request IDs are handled by the standard library, so the parent can issue concurrent RPCs while a handler is parked (which the upcoming fault-barrier methods need). The only fd we still hand off out-of-band is the userfaultfd itself (kernel object, has to go through ExtraFiles); the initial source data is written to a temp file under t.TempDir() because base64-stuffing megabytes through JSON would be silly. Replaced surface: - 4 pipes + 2 signals -> 1 socket + 1 RPC channel - SIGUSR1 graceful exit -> Service.Shutdown - SIGUSR2 + offsets pipe + binary.Write -> Service.PageStates - ready pipe + ReadAll-on-close handshake -> Service.WaitReady - gate-cmd / gate-sync byte protocol -> Service.ServePause / Service.ServeResume Added on top of the same channel (used only by stacked race-tests PR): - Service.InstallFaultBarrier(addr, point) -> token: arms a deterministic barrier in the child's worker goroutine at one of two test-only hook points (beforeWorkerRLockHook, beforeFaultPageHook). - Service.WaitFaultHeld(token): blocks until the worker reaches the barrier - the RPC reply IS the rendezvous. - Service.ReleaseFault(token): lets the parked worker proceed. Production code change is intentionally minimal: two nil-by-default hook fields on Userfaultfd plus three nil-checked calls. Hooks are only assigned in the child's crossProcessServe wiring, so production callers see a single un-predictable load + branch per fault. 
--- .../sandbox/uffd/userfaultfd/async_wp_test.go | 2 +- .../userfaultfd/cross_process_helpers_test.go | 866 +++++++++++------- .../sandbox/uffd/userfaultfd/helpers_test.go | 26 +- .../sandbox/uffd/userfaultfd/missing_test.go | 8 +- .../uffd/userfaultfd/missing_write_test.go | 8 +- .../sandbox/uffd/userfaultfd/userfaultfd.go | 31 + 6 files changed, 580 insertions(+), 361 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/async_wp_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/async_wp_test.go index ab8941abea..f779f963c8 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/async_wp_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/async_wp_test.go @@ -289,7 +289,7 @@ func TestAsyncWriteProtection(t *testing.T) { t.Run(tt.name, func(t *testing.T) { t.Parallel() - h, err := configureCrossProcessTest(t, testConfig{ + h, err := configureCrossProcessTest(t.Context(), t, testConfig{ pagesize: tt.pagesize, numberOfPages: tt.numberOfPages, alwaysWP: tt.alwaysWP, diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go index 521851b1e9..fcb53a636c 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go @@ -1,29 +1,39 @@ package userfaultfd -// This tests is creating uffd in the main process and handling the page faults in another process. -// It prevents problems with Go mmap during testing (https://pojntfx.github.io/networked-linux-memsync/main.html#limitations) and also more accurately simulates what we do with Firecracker. -// These problems are not affecting Firecracker, because: -// 1. It is a different process that handles the page faults -// 2. 
Does not use garbage collection +// This test creates the userfaultfd in the parent test process and +// drives it from a child helper process. We do this so the actual +// page-fault handling runs in a process where we can fully control +// memory layout (no Go GC scanning / touching the registered region) +// — which mirrors how Firecracker uses UFFD in production. +// +// All parent ↔ child coordination — readiness, page-state queries, +// pause/resume, fault barriers, shutdown — flows over a single Unix +// domain socket using the standard-library net/rpc + jsonrpc codec. +// Each in-flight RPC runs in its own server-side goroutine, so a +// blocking handler (e.g. WaitFaultHeld) does not stall other RPCs. +// The only fd we still hand off out-of-band is the userfaultfd +// itself (kernel object, has to go through ExtraFiles); the initial +// source data is written to a temp file under t.TempDir() because +// base64-stuffing megabytes through the JSON envelope would be silly. import ( "context" "crypto/rand" - "encoding/binary" "errors" "fmt" - "io" + "net" + "net/rpc" + "net/rpc/jsonrpc" "os" "os/exec" - "os/signal" + "path/filepath" "slices" "strconv" - "strings" "sync" "syscall" "testing" + "time" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "golang.org/x/sys/unix" @@ -34,8 +44,8 @@ import ( "github.com/e2b-dev/infra/packages/shared/pkg/logger" ) -// MemorySlicer exposes byte slice via the Slicer interface. -// This is used for testing purposes. +// MemorySlicer exposes a byte slice via the Slicer interface. +// Test-only. 
type MemorySlicer struct { content []byte pagesize int64 @@ -44,10 +54,7 @@ type MemorySlicer struct { var _ block.Slicer = (*MemorySlicer)(nil) func NewMemorySlicer(content []byte, pagesize int64) *MemorySlicer { - return &MemorySlicer{ - content: content, - pagesize: pagesize, - } + return &MemorySlicer{content: content, pagesize: pagesize} } func (s *MemorySlicer) Slice(_ context.Context, offset, size int64) ([]byte, error) { @@ -68,9 +75,7 @@ func (s *MemorySlicer) BlockSize() int64 { func RandomPages(pagesize, numberOfPages uint64) *MemorySlicer { size := pagesize * numberOfPages - - n := int(size) - buf := make([]byte, n) + buf := make([]byte, int(size)) if _, err := rand.Read(buf); err != nil { panic(err) } @@ -78,8 +83,76 @@ func RandomPages(pagesize, numberOfPages uint64) *MemorySlicer { return NewMemorySlicer(buf, int64(pagesize)) } -// Main process, FC in our case -func configureCrossProcessTest(t *testing.T, tt testConfig) (*testHandler, error) { +// Env vars used by the child helper process. +const ( + envHelperFlag = "GO_TEST_HELPER_PROCESS" + envSocketPath = "GO_UFFD_SOCKET" + envContentPath = "GO_UFFD_CONTENT" + envMmapStart = "GO_UFFD_MMAP_START" + envMmapPagesize = "GO_UFFD_MMAP_PAGESIZE" + envMmapTotalSize = "GO_UFFD_MMAP_SIZE" + envAlwaysWP = "GO_UFFD_ALWAYS_WP" + envGated = "GO_UFFD_GATED" + // envBarriers gates the test-only worker hooks. Only race tests + // need them; for everyone else we leave the hook fields nil so + // the hot path stays a single nil-pointer load + branch. + envBarriers = "GO_UFFD_BARRIERS" +) + +// ---- RPC method types --------------------------------------------------- +// +// net/rpc requires methods of the form: +// +// func (s *Service) Method(args *ArgsT, reply *ReplyT) error +// +// where both args and reply are exported pointer types. For methods +// that take or return nothing meaningful we still need a type — Empty +// fills that role. 
+ +type Empty struct{} + +type PageStatesReply struct { + Entries []pageStateEntry +} + +type FaultBarrierArgs struct { + Addr uint64 + Point uint8 +} + +type FaultBarrierReply struct { + Token uint64 +} + +type TokenArgs struct { + Token uint64 +} + +// pageStateEntry is the wire format for PageStates RPC results. +type pageStateEntry struct { + State uint8 + Offset uint64 +} + +// ---- Parent side -------------------------------------------------------- + +// childForkMu serialises the cmd.Start() call across all parallel +// cross-process tests in this binary. Without it, the duplicated +// uffd fd we hand to one child via ExtraFiles is briefly visible in +// the parent's fd table while ANOTHER concurrent test calls fork() +// — so that other test's child inherits a uffd fd it does not own. +// The leaked fd keeps the original test's uffd kernel object alive +// after its owner closes its end and produces hard-to-diagnose +// -parallel-only deadlocks. +// +// Holding the mutex only across cmd.Start (which itself holds the +// process lock for the underlying syscall.ForkExec) is enough — by +// the time Start returns the dup'd fd is already mapped into fd 3 +// in the new child and we close it immediately in the parent below. +var childForkMu sync.Mutex + +// Main process, FC in our case. +func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) (*testHandler, error) { t.Helper() data := RandomPages(tt.pagesize, tt.numberOfPages) @@ -90,453 +163,414 @@ func configureCrossProcessTest(t *testing.T, tt testConfig) (*testHandler, error memoryArea, memoryStart, err := testutils.NewPageMmap(t, uint64(size), tt.pagesize) require.NoError(t, err) - // We can pass mapping nil as the serve is used only in the helper process. 
uffdFd, err := newFd(syscall.O_CLOEXEC | syscall.O_NONBLOCK) require.NoError(t, err) - t.Cleanup(func() { uffdFd.close() }) - err = configureApi(uffdFd, tt.pagesize) - require.NoError(t, err) + require.NoError(t, configureApi(uffdFd, tt.pagesize)) + require.NoError(t, register(uffdFd, memoryStart, uint64(size), UFFDIO_REGISTER_MODE_MISSING|UFFDIO_REGISTER_MODE_WP)) - err = register(uffdFd, memoryStart, uint64(size), UFFDIO_REGISTER_MODE_MISSING|UFFDIO_REGISTER_MODE_WP) + tmpDir := t.TempDir() + + contentPath := filepath.Join(tmpDir, "content.bin") + require.NoError(t, os.WriteFile(contentPath, data.Content(), 0o600)) + + socketPath := filepath.Join(tmpDir, "rpc.sock") + listenCfg := net.ListenConfig{} + listener, err := listenCfg.Listen(ctx, "unix", socketPath) require.NoError(t, err) - cmd := exec.CommandContext(t.Context(), os.Args[0], "-test.run=TestHelperServingProcess", "-test.timeout=0") - cmd.Env = append(os.Environ(), "GO_TEST_HELPER_PROCESS=1") - cmd.Env = append(cmd.Env, fmt.Sprintf("GO_MMAP_START=%d", memoryStart)) - cmd.Env = append(cmd.Env, fmt.Sprintf("GO_MMAP_PAGE_SIZE=%d", tt.pagesize)) + cmd := exec.CommandContext(ctx, os.Args[0], "-test.run=TestHelperServingProcess", "-test.timeout=0") + cmd.Env = append(os.Environ(), + envHelperFlag+"=1", + envSocketPath+"="+socketPath, + envContentPath+"="+contentPath, + fmt.Sprintf("%s=%d", envMmapStart, memoryStart), + fmt.Sprintf("%s=%d", envMmapPagesize, tt.pagesize), + fmt.Sprintf("%s=%d", envMmapTotalSize, size), + ) if tt.alwaysWP { - cmd.Env = append(cmd.Env, "GO_ALWAYS_WP=1") + cmd.Env = append(cmd.Env, envAlwaysWP+"=1") } if tt.gated { - cmd.Env = append(cmd.Env, "GO_GATED=1") + cmd.Env = append(cmd.Env, envGated+"=1") + } + if tt.barriers { + cmd.Env = append(cmd.Env, envBarriers+"=1") } - dup, err := syscall.Dup(int(uffdFd)) - require.NoError(t, err) + // We hand the uffd fd to the child via ExtraFiles. The child- + // side dup3 inside fork+exec clears CLOEXEC on the destination + // fd (i.e. 
fd 3 in the child) automatically, so the SOURCE fd + // in our parent should remain CLOEXEC — otherwise every other + // test fork()'d concurrently from this binary inherits a uffd + // it does not own and hangs surface at higher -parallel. + childForkMu.Lock() - // clear FD_CLOEXEC on the dup we pass across exec - _, err = unix.FcntlInt(uintptr(dup), unix.F_SETFD, 0) + dup, err := syscall.Dup(int(uffdFd)) require.NoError(t, err) + if _, err := unix.FcntlInt(uintptr(dup), unix.F_SETFD, unix.FD_CLOEXEC); err != nil { + childForkMu.Unlock() + require.NoError(t, err) + } uffdFile := os.NewFile(uintptr(dup), "uffd") + cmd.ExtraFiles = []*os.File{uffdFile} + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr - contentReader, contentWriter, err := os.Pipe() - require.NoError(t, err) - - go func() { - _, writeErr := contentWriter.Write(data.Content()) - assert.NoError(t, writeErr) - - closeErr := contentWriter.Close() - assert.NoError(t, closeErr) - }() - - offsetsReader, offsetsWriter, err := os.Pipe() - require.NoError(t, err) - - t.Cleanup(func() { - offsetsReader.Close() - }) - - readyReader, readyWriter, err := os.Pipe() - require.NoError(t, err) + startErr := cmd.Start() + uffdFile.Close() + childForkMu.Unlock() - t.Cleanup(func() { - readyReader.Close() - }) + require.NoError(t, startErr) - readySignal := make(chan struct{}, 1) + // Accept the child's connection. Tight deadline so a wedged + // child surfaces fast instead of hanging the test. 
+ type acceptResult struct { + conn net.Conn + err error + } + acceptCh := make(chan acceptResult, 1) go func() { - _, err := io.ReadAll(readyReader) - assert.NoError(t, err) - - readySignal <- struct{}{} + c, err := listener.Accept() + acceptCh <- acceptResult{conn: c, err: err} }() - extraFiles := []*os.File{ - uffdFile, - contentReader, - offsetsWriter, - readyWriter, + var conn net.Conn + select { + case res := <-acceptCh: + require.NoError(t, res.err) + conn = res.conn + case <-time.After(10 * time.Second): + listener.Close() + _ = cmd.Process.Kill() + _, _ = cmd.Process.Wait() + + return nil, errors.New("child did not connect within 10s") } + listener.Close() - var gateCmdWriter *os.File - var gateSyncReader *os.File - if tt.gated { - var gateCmdReader *os.File - gateCmdReader, gateCmdWriter, err = os.Pipe() - require.NoError(t, err) - - var gateSyncWriter *os.File - gateSyncReader, gateSyncWriter, err = os.Pipe() - require.NoError(t, err) + client := jsonrpc.NewClient(conn) - t.Cleanup(func() { - gateCmdWriter.Close() - gateSyncReader.Close() - }) - - extraFiles = append(extraFiles, gateCmdReader) // fd 7 - extraFiles = append(extraFiles, gateSyncWriter) // fd 8 + h := &testHandler{ + memoryArea: &memoryArea, + pagesize: tt.pagesize, + data: data, + client: client, + conn: conn, + cmd: cmd, } - cmd.ExtraFiles = extraFiles - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - - err = cmd.Start() - require.NoError(t, err) - - contentReader.Close() - offsetsWriter.Close() - readyWriter.Close() - uffdFile.Close() - if tt.gated { - extraFiles[4].Close() // gateCmdReader - extraFiles[5].Close() // gateSyncWriter - } + // WaitReady blocks on the child until its initial setup is done + // (uffd serve goroutine running, hooks installed). The RPC reply + // IS the readiness signal — no separate ready pipe / signal + // needed. 
+ require.NoError(t, h.client.Call("Service.WaitReady", &Empty{}, &Empty{})) t.Cleanup(func() { - signalErr := cmd.Process.Signal(syscall.SIGUSR1) - assert.NoError(t, signalErr) + // Best-effort graceful shutdown via RPC. If the child has + // already crashed the RPC will error and we fall back to + // killing the process below. + _ = h.client.Call("Service.Shutdown", &Empty{}, &Empty{}) + _ = client.Close() waitErr := cmd.Wait() - // It can be either nil, an ExitError, a context.Canceled error, or "signal: killed" - assert.True(t, - (waitErr != nil && func(err error) bool { - var exitErr *exec.ExitError - - return errors.As(err, &exitErr) - }(waitErr)) || - errors.Is(waitErr, context.Canceled) || - (waitErr != nil && strings.Contains(waitErr.Error(), "signal: killed")) || - waitErr == nil, - "unexpected error: %v", waitErr, - ) + if waitErr != nil { + var exitErr *exec.ExitError + if !errors.As(waitErr, &exitErr) { + t.Logf("helper process Wait: %v", waitErr) + } + } }) - // pageStatesOnce asks the serving process for a snapshot of its pageTracker - // and decodes it into a per-state view. It can only be called once. - pageStatesOnce := func() (handlerPageStates, error) { - err := cmd.Process.Signal(syscall.SIGUSR2) - if err != nil { - return handlerPageStates{}, err + if tt.gated { + h.servePause = func() error { + return h.client.Call("Service.ServePause", &Empty{}, &Empty{}) } - - var result handlerPageStates - - for { - var entry pageStateEntry - - // binary.Read uses the same field layout as binary.Write on - // the producer side (sum of fixed-size fields, no struct - // padding), so we never have to hard-code the wire size. 
- err := binary.Read(offsetsReader, binary.LittleEndian, &entry) - if errors.Is(err, io.EOF) { - break - } - - if err != nil { - return handlerPageStates{}, fmt.Errorf("decoding page state entry: %w", err) - } - - if pageState(entry.State) == faulted { - result.faulted = append(result.faulted, uint(entry.Offset)) - } + h.serveResume = func() error { + return h.client.Call("Service.ServeResume", &Empty{}, &Empty{}) } - - slices.Sort(result.faulted) - - return result, nil - } - - select { - case <-t.Context().Done(): - return nil, t.Context().Err() - case <-readySignal: } - h := &testHandler{ - memoryArea: &memoryArea, - pagesize: tt.pagesize, - data: data, - pageStatesOnce: pageStatesOnce, - } + h.pageStatesOnce = func() (handlerPageStates, error) { + var reply PageStatesReply + if err := h.client.Call("Service.PageStates", &Empty{}, &reply); err != nil { + return handlerPageStates{}, err + } - if tt.gated { - h.servePause = func() error { - if _, err := gateCmdWriter.Write([]byte{'P'}); err != nil { - return err + var states handlerPageStates + for _, e := range reply.Entries { + if pageState(e.State) == faulted { + states.faulted = append(states.faulted, uint(e.Offset)) } - var buf [1]byte - _, err := gateSyncReader.Read(buf[:]) - - return err } - h.serveResume = func() error { - _, err := gateCmdWriter.Write([]byte{'R'}) + slices.Sort(states.faulted) - return err - } + return states, nil } return h, nil } -// Secondary process, orchestrator in our case +// ---- Child side --------------------------------------------------------- + +// Secondary process, orchestrator in our case. 
func TestHelperServingProcess(t *testing.T) { t.Parallel() - if os.Getenv("GO_TEST_HELPER_PROCESS") != "1" { + if os.Getenv(envHelperFlag) != "1" { t.Skip("this is a helper process, skipping direct execution") } - err := crossProcessServe() - if err != nil { - fmt.Println("exit serving process", err) + if err := crossProcessServe(); err != nil { + fmt.Fprintln(os.Stderr, "exit serving process:", err) os.Exit(1) } os.Exit(0) } +// crossProcessServe wires up the child side: connects back to the +// parent socket, registers the RPC service, and runs uffd.Serve in a +// background goroutine that pause/resume RPCs can stop and restart. func crossProcessServe() error { - ctx, cancel := context.WithCancelCause(context.Background()) - defer cancel(nil) + socketPath := os.Getenv(envSocketPath) + if socketPath == "" { + return fmt.Errorf("missing %s", envSocketPath) + } - startRaw, err := strconv.Atoi(os.Getenv("GO_MMAP_START")) + dialer := net.Dialer{} + conn, err := dialer.DialContext(context.Background(), "unix", socketPath) if err != nil { - return fmt.Errorf("exit parsing mmap start: %w", err) + return fmt.Errorf("dial parent socket: %w", err) } + defer conn.Close() + startRaw, err := strconv.ParseUint(os.Getenv(envMmapStart), 10, 64) + if err != nil { + return fmt.Errorf("parse %s: %w", envMmapStart, err) + } memoryStart := uintptr(startRaw) - uffdFile := os.NewFile(uintptr(3), os.Getenv("GO_UFFD_FILE")) - defer uffdFile.Close() - - uffdFd := uffdFile.Fd() - - contentFile := os.NewFile(uintptr(4), "content") - defer contentFile.Close() + pagesize, err := strconv.ParseInt(os.Getenv(envMmapPagesize), 10, 64) + if err != nil { + return fmt.Errorf("parse %s: %w", envMmapPagesize, err) + } - content, err := io.ReadAll(contentFile) + totalSize, err := strconv.ParseInt(os.Getenv(envMmapTotalSize), 10, 64) if err != nil { - return fmt.Errorf("exit reading content: %w", err) + return fmt.Errorf("parse %s: %w", envMmapTotalSize, err) } - pageSize, err := 
strconv.ParseInt(os.Getenv("GO_MMAP_PAGE_SIZE"), 10, 64) + content, err := os.ReadFile(os.Getenv(envContentPath)) if err != nil { - return fmt.Errorf("exit parsing page size: %w", err) + return fmt.Errorf("read content: %w", err) + } + if int64(len(content)) != totalSize { + return fmt.Errorf("content size %d != expected %d", len(content), totalSize) } - data := NewMemorySlicer(content, pageSize) + data := NewMemorySlicer(content, pagesize) + + uffdFile := os.NewFile(uintptr(3), "uffd") + defer uffdFile.Close() + uffdFd := uffdFile.Fd() - m := memory.NewMapping([]memory.Region{ + mapping := memory.NewMapping([]memory.Region{ { BaseHostVirtAddr: memoryStart, - Size: uintptr(len(content)), + Size: uintptr(totalSize), Offset: 0, - PageSize: uintptr(pageSize), + PageSize: uintptr(pagesize), }, }) - exitUffd := make(chan struct{}, 1) - defer close(exitUffd) - l, err := logger.NewDevelopmentLogger() if err != nil { - return fmt.Errorf("exit creating logger: %w", err) + return fmt.Errorf("logger: %w", err) } - uffd, err := NewUserfaultfdFromFd(uffdFd, data, m, l) + uffd, err := NewUserfaultfdFromFd(uffdFd, data, mapping, l) if err != nil { - return fmt.Errorf("exit creating uffd: %w", err) + return fmt.Errorf("NewUserfaultfdFromFd: %w", err) } - if os.Getenv("GO_ALWAYS_WP") == "1" { + if os.Getenv(envAlwaysWP) == "1" { uffd.defaultCopyMode = UFFDIO_COPY_MODE_WP } - offsetsFile := os.NewFile(uintptr(5), "offsets") + br := newBarrierRegistry() - offsetsSignal := make(chan os.Signal, 1) - signal.Notify(offsetsSignal, syscall.SIGUSR2) - defer signal.Stop(offsetsSignal) + // Hooks are only wired up when the test asked for them (race + // tests). For everyone else we leave the fields nil so the hot + // path is a single nil-pointer load + branch. 
+ if os.Getenv(envBarriers) == "1" { + uffd.beforeWorkerRLockHook = br.hookFor(barrierBeforeRLock) + uffd.beforeFaultPageHook = br.hookFor(barrierBeforeFaultPage) + } - go func() { - defer offsetsFile.Close() - - for { - select { - case <-ctx.Done(): - return - case <-offsetsSignal: - entries, entriesErr := uffd.pageStateEntries() - if entriesErr != nil { - cancel(fmt.Errorf("error getting page state entries: %w", entriesErr)) - - return - } - - for _, entry := range entries { - writeErr := binary.Write(offsetsFile, binary.LittleEndian, entry) - if writeErr != nil { - cancel(fmt.Errorf("error writing page state entry: %w", writeErr)) - - return - } - } - - return - } - } - }() + gated := os.Getenv(envGated) == "1" - fdExit, err := fdexit.New() - if err != nil { - return fmt.Errorf("exit creating fd exit: %w", err) + svc := &Service{ + uffd: uffd, + br: br, + gated: gated, + shutdown: make(chan struct{}), + } + svc.startServe() + + server := rpc.NewServer() + if err := server.Register(svc); err != nil { + return fmt.Errorf("rpc Register: %w", err) } - defer fdExit.Close() + // Run the codec in a goroutine so we can react to Shutdown + // without depending on the codec returning. + codecDone := make(chan struct{}) go func() { - defer func() { - exitUffd <- struct{}{} - }() + defer close(codecDone) + server.ServeCodec(jsonrpc.NewServerCodec(conn)) + }() - serverErr := uffd.Serve(ctx, fdExit) - if serverErr != nil { - msg := fmt.Errorf("error serving: %w", serverErr) + select { + case <-svc.shutdown: + case <-codecDone: + } - fmt.Fprint(os.Stderr, msg.Error()) + // Release any still-parked barriers so the serve goroutine can + // finish, then stop the serve goroutine. + br.releaseAll() + svc.stopServe() - cancel(msg) + // Closing the conn is sufficient to unblock ServeCodec if it + // hasn't already returned. 
+ _ = conn.Close() + <-codecDone - return - } - }() + return nil +} - // stopFn drains whichever Serve goroutine is currently running and - // is reset to a no-op once it has run. This makes both pause-then-exit - // (no resume in between) and pause-resume-pause-exit safe: every - // caller — gated 'P', gated 'R' replacing the previous stop, and the - // final defer below — sees a stop function that matches the goroutine - // actually running, and never blocks on an already-drained channel. - var ( - stopMu sync.Mutex - stopFn = func() { - err := fdExit.SignalExit() - if err != nil { - msg := fmt.Errorf("error signaling exit: %w", err) +// Service is the RPC surface exposed to the parent. Methods follow +// net/rpc's required signature. +type Service struct { + uffd *Userfaultfd + br *barrierRegistry - fmt.Fprint(os.Stderr, msg.Error()) + gated bool - cancel(msg) + mu sync.Mutex + stop func() // currently active serve-stop function, nil if paused + shutdown chan struct{} + closed bool +} - return - } +func (s *Service) startServe() { + exit, err := fdexit.New() + if err != nil { + fmt.Fprintln(os.Stderr, "fdexit.New:", err) - <-exitUffd - } - ) + return + } - stopServe := func() { - stopMu.Lock() - fn := stopFn - stopFn = func() {} - stopMu.Unlock() + done := make(chan struct{}) + go func() { + defer close(done) + if err := s.uffd.Serve(context.Background(), exit); err != nil { + fmt.Fprintln(os.Stderr, "uffd.Serve:", err) + } + }() - fn() + s.stop = func() { + _ = exit.SignalExit() + <-done + exit.Close() } +} - defer stopServe() +func (s *Service) stopServe() { + s.mu.Lock() + defer s.mu.Unlock() + if s.stop != nil { + s.stop() + s.stop = nil + } +} - if os.Getenv("GO_GATED") == "1" { - gateCmdFile := os.NewFile(uintptr(7), "gate-cmd") - defer gateCmdFile.Close() +// WaitReady is a no-op handler whose successful reply is the +// readiness signal for the parent. 
+func (s *Service) WaitReady(_ *Empty, _ *Empty) error { + return nil +} - gateSyncFile := os.NewFile(uintptr(8), "gate-sync") - defer gateSyncFile.Close() +func (s *Service) PageStates(_ *Empty, reply *PageStatesReply) error { + entries, err := s.uffd.pageStateEntries() + if err != nil { + return err + } + reply.Entries = entries - startServe := func() { - newExit, fdErr := fdexit.New() - if fdErr != nil { - cancel(fmt.Errorf("error creating fd exit: %w", fdErr)) + return nil +} - return - } +func (s *Service) ServePause(_ *Empty, _ *Empty) error { + if !s.gated { + return errors.New("ServePause called on a non-gated handler") + } + s.stopServe() - done := make(chan struct{}) - go func() { - defer close(done) - if err := uffd.Serve(ctx, newExit); err != nil { - cancel(fmt.Errorf("error serving: %w", err)) - } - }() - - stopMu.Lock() - stopFn = func() { - newExit.SignalExit() - <-done - newExit.Close() - } - stopMu.Unlock() - } + return nil +} - go func() { - var buf [1]byte - for { - if _, err := gateCmdFile.Read(buf[:]); err != nil { - return - } - - switch buf[0] { - case 'P': - stopServe() - gateSyncFile.Write([]byte{1}) - case 'R': - startServe() - } - } - }() +func (s *Service) ServeResume(_ *Empty, _ *Empty) error { + if !s.gated { + return errors.New("ServeResume called on a non-gated handler") } + s.mu.Lock() + defer s.mu.Unlock() + s.startServe() - exitSignal := make(chan os.Signal, 1) - signal.Notify(exitSignal, syscall.SIGUSR1) - defer signal.Stop(exitSignal) + return nil +} - readyFile := os.NewFile(uintptr(6), "ready") +func (s *Service) InstallFaultBarrier(args *FaultBarrierArgs, reply *FaultBarrierReply) error { + reply.Token = s.br.install(uintptr(args.Addr), barrierPoint(args.Point)) - closeErr := readyFile.Close() - if closeErr != nil { - return fmt.Errorf("error closing ready file: %w", closeErr) - } + return nil +} - select { - case <-ctx.Done(): - return fmt.Errorf("context done: %w: %w", ctx.Err(), context.Cause(ctx)) - case <-exitSignal: - 
return nil - } +func (s *Service) WaitFaultHeld(args *TokenArgs, _ *Empty) error { + return s.br.waitArrived(context.Background(), args.Token) } -// pageStateEntry is the wire format used between the main test process -// and the serving helper process. State is emitted as a single byte so it -// can be written directly with binary.Write and decoded on the other side. -type pageStateEntry struct { - State uint8 - Offset uint64 +func (s *Service) ReleaseFault(args *TokenArgs, _ *Empty) error { + s.br.release(args.Token) + + return nil } -// pageStateEntries returns a snapshot of every tracked page and its state. -// It holds the settleRequests write lock so no in-flight faultPage worker -// can mutate the pageTracker while we iterate. +func (s *Service) Shutdown(_ *Empty, _ *Empty) error { + s.mu.Lock() + defer s.mu.Unlock() + if !s.closed { + s.closed = true + close(s.shutdown) + } + + return nil +} + +// pageStateEntries returns a snapshot of every tracked page and its +// state. It briefly takes settleRequests.Lock so no in-flight worker +// can mutate the pageTracker while we read it. func (u *Userfaultfd) pageStateEntries() ([]pageStateEntry, error) { u.settleRequests.Lock() - defer u.settleRequests.Unlock() + u.settleRequests.Unlock() //nolint:staticcheck // SA2001: intentional — settle the read locks. 
+ + u.pageTracker.mu.RLock() + defer u.pageTracker.mu.RUnlock() entries := make([]pageStateEntry, 0, len(u.pageTracker.m)) for addr, state := range u.pageTracker.m { @@ -544,9 +578,155 @@ func (u *Userfaultfd) pageStateEntries() ([]pageStateEntry, error) { if err != nil { return nil, fmt.Errorf("address %#x not in mapping: %w", addr, err) } - - entries = append(entries, pageStateEntry{uint8(state), uint64(offset)}) + entries = append(entries, pageStateEntry{State: uint8(state), Offset: uint64(offset)}) } return entries, nil } + +// ---- Barrier registry --------------------------------------------------- + +// barrierPoint identifies WHICH hook a barrier should park on. +type barrierPoint uint8 + +const ( + // barrierBeforeRLock parks the worker BEFORE settleRequests.RLock(), + // i.e. before it can read the page state. Use this when a parallel + // writer needs the write lock immediately because no worker holds + // the read lock. + barrierBeforeRLock barrierPoint = 1 + // barrierBeforeFaultPage parks the worker AFTER it has taken + // settleRequests.RLock, but BEFORE the actual UFFDIO_COPY syscall. + // Use this when a parent operation must still return even though + // a worker holds RLock. + barrierBeforeFaultPage barrierPoint = 2 +) + +// barrierRegistry is the child-process side of the barrier. The +// hooks installed on Userfaultfd consult this registry by addr+point +// to decide whether to park, and the RPC handlers manipulate it from +// the parent over the socket. 
+type barrierRegistry struct { + mu sync.Mutex + next uint64 + tokens map[uint64]*barrierSlot + byKey map[barrierKey]uint64 +} + +type barrierKey struct { + addr uintptr + point barrierPoint +} + +type barrierSlot struct { + addr uintptr + point barrierPoint + arrived chan struct{} + release chan struct{} + arrivedOnce sync.Once +} + +func newBarrierRegistry() *barrierRegistry { + return &barrierRegistry{ + tokens: make(map[uint64]*barrierSlot), + byKey: make(map[barrierKey]uint64), + } +} + +func (b *barrierRegistry) install(addr uintptr, point barrierPoint) uint64 { + b.mu.Lock() + defer b.mu.Unlock() + + b.next++ + token := b.next + slot := &barrierSlot{ + addr: addr, + point: point, + arrived: make(chan struct{}), + release: make(chan struct{}), + } + b.tokens[token] = slot + b.byKey[barrierKey{addr, point}] = token + + return token +} + +func (b *barrierRegistry) lookupByAddr(addr uintptr, point barrierPoint) *barrierSlot { + b.mu.Lock() + defer b.mu.Unlock() + + token, ok := b.byKey[barrierKey{addr, point}] + if !ok { + return nil + } + + return b.tokens[token] +} + +func (b *barrierRegistry) waitArrived(ctx context.Context, token uint64) error { + b.mu.Lock() + slot, ok := b.tokens[token] + b.mu.Unlock() + if !ok { + return fmt.Errorf("unknown barrier token %d", token) + } + + select { + case <-slot.arrived: + return nil + case <-ctx.Done(): + return ctx.Err() + } +} + +func (b *barrierRegistry) release(token uint64) { + b.mu.Lock() + slot, ok := b.tokens[token] + delete(b.tokens, token) + if ok { + delete(b.byKey, barrierKey{slot.addr, slot.point}) + } + b.mu.Unlock() + + if !ok { + return + } + + select { + case <-slot.release: + default: + close(slot.release) + } +} + +func (b *barrierRegistry) releaseAll() { + b.mu.Lock() + tokens := make([]uint64, 0, len(b.tokens)) + for t := range b.tokens { + tokens = append(tokens, t) + } + b.mu.Unlock() + + for _, t := range tokens { + b.release(t) + } +} + +// hookFor returns the function to assign to a Userfaultfd 
+// beforeXxxHook field. The returned function is a no-op for any +// (addr, point) pair that hasn't been Install'd, so non-targeted +// faults see no scheduling distortion. +func (b *barrierRegistry) hookFor(point barrierPoint) func(addr uintptr) { + return func(addr uintptr) { + slot := b.lookupByAddr(addr, point) + if slot == nil { + return + } + + slot.arrivedOnce.Do(func() { + close(slot.arrived) + }) + + <-slot.release + } +} diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go index 8d59f3b5ca..bf9932c258 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go @@ -4,6 +4,9 @@ import ( "bytes" "context" "fmt" + "io" + "net/rpc" + "os/exec" "slices" "sync" "testing" @@ -29,6 +32,10 @@ type testConfig struct { alwaysWP bool // gated enables pause/resume control over the handler's serve loop. gated bool + // barriers wires up the per-worker fault hooks in the child + // (used by race tests). Off by default so the worker hot path + // stays a single nil-pointer load + branch in non-race tests. + barriers bool } type operationMode uint32 @@ -63,13 +70,6 @@ type handlerPageStates struct { // allAccessed returns the sorted union of offsets that the handler touched // in any non-missing state. Tests that only care about "which pages did the // handler see" can compare directly against this. -// -// pageStatesOnce already returns each per-state slice sorted, and a page -// has exactly one state at a time in pageTracker, so the per-state slices -// are disjoint. Follow-up PRs that add more state-specific fields should -// sorted-merge them here instead of reaching for a bitset — byte offsets -// make poor bit indices (a single hugepage offset would force ~1.8 MB of -// backing storage). 
func (s handlerPageStates) allAccessed() []uint { return slices.Clone(s.faulted) } @@ -79,14 +79,22 @@ type testHandler struct { pagesize uint64 data *MemorySlicer // pageStatesOnce returns a per-state snapshot of the handler's pageTracker. - // It can only be called once. + // Backed by the PageStates RPC; callable any number of times. + // The "Once" suffix is kept for source-stability with the existing + // test sites. pageStatesOnce func() (handlerPageStates, error) // servePause and serveResume gate the UFFD event loop in the child process. // Tests use them to deterministically batch a sequence of UFFD events // before more faults are processed. servePause func() error serveResume func() error - mutex sync.Mutex + + // client is the RPC channel to the child helper process. + client *rpc.Client + conn io.Closer + cmd *exec.Cmd + + mutex sync.Mutex } func (h *testHandler) executeAll(t *testing.T, operations []operation) { diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_test.go index 6e99c865f9..fac233ad5f 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_test.go @@ -121,7 +121,7 @@ func TestMissing(t *testing.T) { t.Run(tt.name, func(t *testing.T) { t.Parallel() - h, err := configureCrossProcessTest(t, tt) + h, err := configureCrossProcessTest(t.Context(), t, tt) require.NoError(t, err) h.executeAll(t, tt.operations) @@ -148,7 +148,7 @@ func TestParallelMissing(t *testing.T) { numberOfPages: 2, } - h, err := configureCrossProcessTest(t, tt) + h, err := configureCrossProcessTest(t.Context(), t, tt) require.NoError(t, err) readOp := operation{ @@ -185,7 +185,7 @@ func TestParallelMissingWithPrefault(t *testing.T) { numberOfPages: 2, } - h, err := configureCrossProcessTest(t, tt) + h, err := configureCrossProcessTest(t.Context(), t, tt) require.NoError(t, err) readOp := 
operation{ @@ -225,7 +225,7 @@ func TestSerialMissing(t *testing.T) { numberOfPages: 2, } - h, err := configureCrossProcessTest(t, tt) + h, err := configureCrossProcessTest(t.Context(), t, tt) require.NoError(t, err) readOp := operation{ diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_write_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_write_test.go index 0a20b62f59..34e55f476a 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_write_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_write_test.go @@ -116,7 +116,7 @@ func TestMissingWrite(t *testing.T) { t.Run(tt.name, func(t *testing.T) { t.Parallel() - h, err := configureCrossProcessTest(t, tt) + h, err := configureCrossProcessTest(t.Context(), t, tt) require.NoError(t, err) h.executeAll(t, tt.operations) @@ -143,7 +143,7 @@ func TestParallelMissingWrite(t *testing.T) { numberOfPages: 2, } - h, err := configureCrossProcessTest(t, tt) + h, err := configureCrossProcessTest(t.Context(), t, tt) require.NoError(t, err) writeOp := operation{ @@ -180,7 +180,7 @@ func TestParallelMissingWriteWithPrefault(t *testing.T) { numberOfPages: 2, } - h, err := configureCrossProcessTest(t, tt) + h, err := configureCrossProcessTest(t.Context(), t, tt) require.NoError(t, err) writeOp := operation{ @@ -220,7 +220,7 @@ func TestSerialMissingWrite(t *testing.T) { numberOfPages: 2, } - h, err := configureCrossProcessTest(t, tt) + h, err := configureCrossProcessTest(t.Context(), t, tt) require.NoError(t, err) writeOp := operation{ diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go index 133af0f547..14feef0b0b 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go @@ -63,6 +63,25 @@ type Userfaultfd struct { // defaultCopyMode overrides the UFFDIO_COPY mode for 
all faults when non-zero. defaultCopyMode CULong + // Test-only synchronisation hooks. Both default to nil and the nil + // branch costs a single un-predictable load + branch in the hot path, + // so they are effectively free in production. They MUST only be set + // from _test.go files. They let tests park a worker goroutine at a + // known point so a racing event (REMOVE, MISSING) can be issued + // deterministically before the worker proceeds. + // + // - beforeWorkerRLockHook: called as the very first thing in the + // worker goroutine, BEFORE settleRequests.RLock(). At this point + // a test can hold the goroutine before it can claim the read lock, + // so a parallel writer can take the write lock immediately. + // + // - beforeFaultPageHook: called inside the worker AFTER RLock and + // BEFORE the actual UFFDIO_COPY/UFFDIO_ZEROPAGE syscall. Lets a + // test simulate a slow data fetch / in-flight COPY so a parent + // operation can race against an in-flight worker. + beforeWorkerRLockHook func(addr uintptr) + beforeFaultPageHook func(addr uintptr) + logger logger.Logger } @@ -250,6 +269,10 @@ func (u *Userfaultfd) Serve( // For the write to be executed, we first need to copy the page from the source to the guest memory. if flags&UFFD_PAGEFAULT_FLAG_WRITE != 0 { u.wg.Go(func() error { + if hook := u.beforeWorkerRLockHook; hook != nil { + hook(addr) + } + return u.faultPage(ctx, addr, offset, u.src, fdExit.SignalExit, block.Write) }) @@ -260,6 +283,10 @@ func (u *Userfaultfd) Serve( // If the event has no flags, it was a read to a missing page and we need to copy the page from the source to the guest memory. 
if flags == 0 { u.wg.Go(func() error { + if hook := u.beforeWorkerRLockHook; hook != nil { + hook(addr) + } + return u.faultPage(ctx, addr, offset, u.src, fdExit.SignalExit, block.Read) }) @@ -299,6 +326,10 @@ func (u *Userfaultfd) faultPage( u.settleRequests.RLock() defer u.settleRequests.RUnlock() + if hook := u.beforeFaultPageHook; hook != nil { + hook(addr) + } + defer func() { if r := recover(); r != nil { u.logger.Error(ctx, "UFFD serve panic", zap.Any("pagesize", u.pageSize), zap.Any("panic", r)) From 803ce4bca2f3d8cb9d5cf5089b851209a0f33b2d Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Fri, 1 May 2026 15:08:33 -0700 Subject: [PATCH 06/22] refactor(uffd): bundle test hooks into single atomic.Pointer[testHooks] Replace the two test-only func fields beforeWorkerRLockHook and beforeFaultPageHook on Userfaultfd with a single atomic.Pointer[testHooks] holding a struct of optional callbacks. Lets PR #2521's race tests add a new hook without touching the production struct, and shrinks the test-surface comment from 17 lines to 3. No production behavior change; hot-path cost is identical (one atomic load + nil check vs one nil-check per hook today). 
--- .../userfaultfd/cross_process_helpers_test.go | 8 ++- .../sandbox/uffd/userfaultfd/hooks_test.go | 8 +++ .../sandbox/uffd/userfaultfd/userfaultfd.go | 64 +++++++++++-------- 3 files changed, 50 insertions(+), 30 deletions(-) create mode 100644 packages/orchestrator/pkg/sandbox/uffd/userfaultfd/hooks_test.go diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go index 16ea0c5a75..c95f0840ed 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go @@ -414,11 +414,13 @@ func crossProcessServe() error { br := newBarrierRegistry() // Hooks are only wired up when the test asked for them (race - // tests). For everyone else we leave the fields nil so the hot + // tests). For everyone else we leave the bundle nil so the hot // path is a single nil-pointer load + branch. if os.Getenv(envBarriers) == "1" { - uffd.beforeWorkerRLockHook = br.hookFor(barrierBeforeRLock) - uffd.beforeFaultPageHook = br.hookFor(barrierBeforeFaultPage) + uffd.SetTestHooks(&testHooks{ + beforeWorkerRLock: br.hookFor(barrierBeforeRLock), + beforeFaultPage: br.hookFor(barrierBeforeFaultPage), + }) } gated := os.Getenv(envGated) == "1" diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/hooks_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/hooks_test.go new file mode 100644 index 0000000000..85384c9f1a --- /dev/null +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/hooks_test.go @@ -0,0 +1,8 @@ +package userfaultfd + +// SetTestHooks installs the given hooks atomically. Pass nil to clear. +// Only available in test builds (this file is _test.go), so production +// binaries cannot reach it. 
+func (u *Userfaultfd) SetTestHooks(h *testHooks) { + u.testHooks.Store(h) +} diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go index 14feef0b0b..94737507db 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go @@ -6,6 +6,7 @@ import ( "fmt" "math/rand" "sync" + "sync/atomic" "syscall" "time" "unsafe" @@ -63,28 +64,43 @@ type Userfaultfd struct { // defaultCopyMode overrides the UFFDIO_COPY mode for all faults when non-zero. defaultCopyMode CULong - // Test-only synchronisation hooks. Both default to nil and the nil - // branch costs a single un-predictable load + branch in the hot path, - // so they are effectively free in production. They MUST only be set - // from _test.go files. They let tests park a worker goroutine at a - // known point so a racing event (REMOVE, MISSING) can be issued - // deterministically before the worker proceeds. - // - // - beforeWorkerRLockHook: called as the very first thing in the - // worker goroutine, BEFORE settleRequests.RLock(). At this point - // a test can hold the goroutine before it can claim the read lock, - // so a parallel writer can take the write lock immediately. - // - // - beforeFaultPageHook: called inside the worker AFTER RLock and - // BEFORE the actual UFFDIO_COPY/UFFDIO_ZEROPAGE syscall. Lets a - // test simulate a slow data fetch / in-flight COPY so a parent - // operation can race against an in-flight worker. - beforeWorkerRLockHook func(addr uintptr) - beforeFaultPageHook func(addr uintptr) + // testHooks is nil in production; tests set it via SetTestHooks (defined in + // a _test.go file, so production binaries cannot reach the setter). All hook + // fields default to nil and the call sites no-op when unset. 
+ testHooks atomic.Pointer[testHooks] logger logger.Logger } +// testHooks bundles the optional test-only synchronisation callbacks. A nil +// callback means the corresponding call site is a no-op. Tests use these to +// park a worker goroutine at a known point so a racing event (REMOVE, MISSING) +// can be issued deterministically before the worker proceeds. +type testHooks struct { + // beforeWorkerRLock is called as the very first thing in the worker + // goroutine, BEFORE settleRequests.RLock(). Lets a test hold the + // goroutine before it can claim the read lock so a parallel writer can + // take the write lock immediately. + beforeWorkerRLock func(addr uintptr) + // beforeFaultPage is called inside the worker AFTER RLock and BEFORE the + // actual UFFDIO_COPY/UFFDIO_ZEROPAGE syscall. Lets a test simulate a + // slow data fetch / in-flight COPY so a parent operation can race + // against an in-flight worker. + beforeFaultPage func(addr uintptr) +} + +func (u *Userfaultfd) callBeforeWorkerRLock(addr uintptr) { + if h := u.testHooks.Load(); h != nil && h.beforeWorkerRLock != nil { + h.beforeWorkerRLock(addr) + } +} + +func (u *Userfaultfd) callBeforeFaultPage(addr uintptr) { + if h := u.testHooks.Load(); h != nil && h.beforeFaultPage != nil { + h.beforeFaultPage(addr) + } +} + // NewUserfaultfdFromFd creates a new userfaultfd instance with optional configuration. func NewUserfaultfdFromFd(fd uintptr, src block.Slicer, m *memory.Mapping, logger logger.Logger) (*Userfaultfd, error) { blockSize := src.BlockSize() @@ -269,9 +285,7 @@ func (u *Userfaultfd) Serve( // For the write to be executed, we first need to copy the page from the source to the guest memory. 
if flags&UFFD_PAGEFAULT_FLAG_WRITE != 0 { u.wg.Go(func() error { - if hook := u.beforeWorkerRLockHook; hook != nil { - hook(addr) - } + u.callBeforeWorkerRLock(addr) return u.faultPage(ctx, addr, offset, u.src, fdExit.SignalExit, block.Write) }) @@ -283,9 +297,7 @@ func (u *Userfaultfd) Serve( // If the event has no flags, it was a read to a missing page and we need to copy the page from the source to the guest memory. if flags == 0 { u.wg.Go(func() error { - if hook := u.beforeWorkerRLockHook; hook != nil { - hook(addr) - } + u.callBeforeWorkerRLock(addr) return u.faultPage(ctx, addr, offset, u.src, fdExit.SignalExit, block.Read) }) @@ -326,9 +338,7 @@ func (u *Userfaultfd) faultPage( u.settleRequests.RLock() defer u.settleRequests.RUnlock() - if hook := u.beforeFaultPageHook; hook != nil { - hook(addr) - } + u.callBeforeFaultPage(addr) defer func() { if r := recover(); r != nil { From 89b4747f48e8600106f3adf7e30235476a094a42 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Fri, 1 May 2026 15:48:01 -0700 Subject: [PATCH 07/22] refactor(uffd-tests): collapse cross-process harness side-channels into RPC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduce parent ↔ child side channels from {7 env config vars, 1 helper flag, 1 uffd ExtraFile, 1 content tmp file, 1 RPC socket} to {2 env vars, 1 uffd ExtraFile, 1 RPC socket}. Everything that was configured via env or the content file now flows through a single Lifecycle.Bootstrap RPC. 
Drops: - 7 config env vars (GO_UFFD_SOCKET kept; GO_UFFD_CONTENT, GO_UFFD_MMAP_*, GO_UFFD_ALWAYS_WP, GO_UFFD_GATED, GO_UFFD_BARRIERS removed) - the content.bin tmp file (passed as []byte in BootstrapArgs) - the testConfig.gated flag and its conditional RPC wiring - childForkMu — replaced with a single F_DUPFD_CLOEXEC fcntl that removes the dup-then-set-cloexec race window the mutex was guarding Splits the 752-line cross_process_helpers_test.go into six cohesive files: harness_{parent,child,client}_test.go for the wire/IPC plumbing and rpc_{lifecycle,paging,barriers}_test.go for the three RPC services. The Service god-struct becomes three small services multi-registered with net/rpc; the parent calls Lifecycle.Bootstrap, Paging.Pause, Barriers.Install, etc. via a typed *harnessClient wrapper. testHandler now holds a single *harnessClient instead of {client, conn, cmd} plus servePause/serveResume/pageStatesOnce function fields; pageStatesOnce becomes a regular method that delegates to the client. Test-only; no production code changes. go vet / golangci-lint / \`sudo go test -race\` all clean (×2). 
--- .../userfaultfd/cross_process_helpers_test.go | 752 ------------------ .../uffd/userfaultfd/harness_child_test.go | 100 +++ .../uffd/userfaultfd/harness_client_test.go | 81 ++ .../uffd/userfaultfd/harness_parent_test.go | 222 ++++++ .../sandbox/uffd/userfaultfd/helpers_test.go | 60 +- .../uffd/userfaultfd/rpc_barriers_test.go | 218 +++++ .../uffd/userfaultfd/rpc_lifecycle_test.go | 191 +++++ .../uffd/userfaultfd/rpc_paging_test.go | 80 ++ 8 files changed, 921 insertions(+), 783 deletions(-) delete mode 100644 packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go create mode 100644 packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go create mode 100644 packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_client_test.go create mode 100644 packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go create mode 100644 packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_barriers_test.go create mode 100644 packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_lifecycle_test.go create mode 100644 packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_paging_test.go diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go deleted file mode 100644 index c95f0840ed..0000000000 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/cross_process_helpers_test.go +++ /dev/null @@ -1,752 +0,0 @@ -package userfaultfd - -// This test creates the userfaultfd in the parent test process and -// drives it from a child helper process. We do this so the actual -// page-fault handling runs in a process where we can fully control -// memory layout (no Go GC scanning / touching the registered region) -// — which mirrors how Firecracker uses UFFD in production. 
-// -// All parent ↔ child coordination — readiness, page-state queries, -// pause/resume, fault barriers, shutdown — flows over a single Unix -// domain socket using the standard-library net/rpc + jsonrpc codec. -// Each in-flight RPC runs in its own server-side goroutine, so a -// blocking handler (e.g. WaitFaultHeld) does not stall other RPCs. -// The only fd we still hand off out-of-band is the userfaultfd -// itself (kernel object, has to go through ExtraFiles); the initial -// source data is written to a temp file under t.TempDir() because -// base64-stuffing megabytes through the JSON envelope would be silly. - -import ( - "context" - "crypto/rand" - "errors" - "fmt" - "net" - "net/rpc" - "net/rpc/jsonrpc" - "os" - "os/exec" - "path/filepath" - "slices" - "strconv" - "sync" - "syscall" - "testing" - "time" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "golang.org/x/sys/unix" - - "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/block" - "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/fdexit" - "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/memory" - "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/testutils" - "github.com/e2b-dev/infra/packages/shared/pkg/logger" -) - -// MemorySlicer exposes a byte slice via the Slicer interface. -// Test-only. 
-type MemorySlicer struct { - content []byte - pagesize int64 -} - -var _ block.Slicer = (*MemorySlicer)(nil) - -func NewMemorySlicer(content []byte, pagesize int64) *MemorySlicer { - return &MemorySlicer{content: content, pagesize: pagesize} -} - -func (s *MemorySlicer) Slice(_ context.Context, offset, size int64) ([]byte, error) { - return s.content[offset : offset+size], nil -} - -func (s *MemorySlicer) Size() (int64, error) { - return int64(len(s.content)), nil -} - -func (s *MemorySlicer) Content() []byte { - return s.content -} - -func (s *MemorySlicer) BlockSize() int64 { - return s.pagesize -} - -func RandomPages(pagesize, numberOfPages uint64) *MemorySlicer { - size := pagesize * numberOfPages - buf := make([]byte, int(size)) - if _, err := rand.Read(buf); err != nil { - panic(err) - } - - return NewMemorySlicer(buf, int64(pagesize)) -} - -// Env vars used by the child helper process. -const ( - envHelperFlag = "GO_TEST_HELPER_PROCESS" - envSocketPath = "GO_UFFD_SOCKET" - envContentPath = "GO_UFFD_CONTENT" - envMmapStart = "GO_UFFD_MMAP_START" - envMmapPagesize = "GO_UFFD_MMAP_PAGESIZE" - envMmapTotalSize = "GO_UFFD_MMAP_SIZE" - envAlwaysWP = "GO_UFFD_ALWAYS_WP" - envGated = "GO_UFFD_GATED" - // envBarriers gates the test-only worker hooks. Only race tests - // need them; for everyone else we leave the hook fields nil so - // the hot path stays a single nil-pointer load + branch. - envBarriers = "GO_UFFD_BARRIERS" -) - -// ---- RPC method types --------------------------------------------------- -// -// net/rpc requires methods of the form: -// -// func (s *Service) Method(args *ArgsT, reply *ReplyT) error -// -// where both args and reply are exported pointer types. For methods -// that take or return nothing meaningful we still need a type — Empty -// fills that role. 
- -type Empty struct{} - -type PageStatesReply struct { - Entries []pageStateEntry -} - -type FaultBarrierArgs struct { - Addr uint64 - Point uint8 -} - -type FaultBarrierReply struct { - Token uint64 -} - -type TokenArgs struct { - Token uint64 -} - -// pageStateEntry is the wire format for PageStates RPC results. -type pageStateEntry struct { - State uint8 - Offset uint64 -} - -// ---- Parent side -------------------------------------------------------- - -// childForkMu serialises the cmd.Start() call across all parallel -// cross-process tests in this binary. Without it, the duplicated -// uffd fd we hand to one child via ExtraFiles is briefly visible in -// the parent's fd table while ANOTHER concurrent test calls fork() -// — so that other test's child inherits a uffd fd it does not own. -// The leaked fd keeps the original test's uffd kernel object alive -// after its owner closes its end and produces hard-to-diagnose -// -parallel-only deadlocks. -// -// Holding the mutex only across cmd.Start (which itself holds the -// process lock for the underlying syscall.ForkExec) is enough — by -// the time Start returns the dup'd fd is already mapped into fd 3 -// in the new child and we close it immediately in the parent below. -var childForkMu sync.Mutex - -// Main process, FC in our case. 
-func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) (*testHandler, error) { - t.Helper() - - data := RandomPages(tt.pagesize, tt.numberOfPages) - - size, err := data.Size() - require.NoError(t, err) - - memoryArea, memoryStart, err := testutils.NewPageMmap(t, uint64(size), tt.pagesize) - require.NoError(t, err) - - uffdFd, err := newFd(syscall.O_CLOEXEC | syscall.O_NONBLOCK) - require.NoError(t, err) - t.Cleanup(func() { - uffdFd.close() - }) - - require.NoError(t, configureApi(uffdFd, tt.pagesize)) - require.NoError(t, register(uffdFd, memoryStart, uint64(size), UFFDIO_REGISTER_MODE_MISSING|UFFDIO_REGISTER_MODE_WP)) - - tmpDir := t.TempDir() - - contentPath := filepath.Join(tmpDir, "content.bin") - require.NoError(t, os.WriteFile(contentPath, data.Content(), 0o600)) - - socketPath := filepath.Join(tmpDir, "rpc.sock") - listenCfg := net.ListenConfig{} - listener, err := listenCfg.Listen(ctx, "unix", socketPath) - require.NoError(t, err) - - cmd := exec.CommandContext(ctx, os.Args[0], "-test.run=TestHelperServingProcess", "-test.timeout=0") - cmd.Env = append(os.Environ(), - envHelperFlag+"=1", - envSocketPath+"="+socketPath, - envContentPath+"="+contentPath, - fmt.Sprintf("%s=%d", envMmapStart, memoryStart), - fmt.Sprintf("%s=%d", envMmapPagesize, tt.pagesize), - fmt.Sprintf("%s=%d", envMmapTotalSize, size), - ) - if tt.alwaysWP { - cmd.Env = append(cmd.Env, envAlwaysWP+"=1") - } - if tt.gated { - cmd.Env = append(cmd.Env, envGated+"=1") - } - if tt.barriers { - cmd.Env = append(cmd.Env, envBarriers+"=1") - } - - // We hand the uffd fd to the child via ExtraFiles. The child- - // side dup3 inside fork+exec clears CLOEXEC on the destination - // fd (i.e. fd 3 in the child) automatically, so the SOURCE fd - // in our parent should remain CLOEXEC — otherwise every other - // test fork()'d concurrently from this binary inherits a uffd - // it does not own and hangs surface at higher -parallel. 
- childForkMu.Lock() - - dup, err := syscall.Dup(int(uffdFd)) - require.NoError(t, err) - if _, err := unix.FcntlInt(uintptr(dup), unix.F_SETFD, unix.FD_CLOEXEC); err != nil { - childForkMu.Unlock() - require.NoError(t, err) - } - - uffdFile := os.NewFile(uintptr(dup), "uffd") - cmd.ExtraFiles = []*os.File{uffdFile} - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - - startErr := cmd.Start() - uffdFile.Close() - childForkMu.Unlock() - - require.NoError(t, startErr) - - // Accept the child's connection. Tight deadline so a wedged - // child surfaces fast instead of hanging the test. - type acceptResult struct { - conn net.Conn - err error - } - acceptCh := make(chan acceptResult, 1) - go func() { - c, err := listener.Accept() - acceptCh <- acceptResult{conn: c, err: err} - }() - - var conn net.Conn - select { - case res := <-acceptCh: - require.NoError(t, res.err) - conn = res.conn - case <-time.After(10 * time.Second): - listener.Close() - _ = cmd.Process.Kill() - _, _ = cmd.Process.Wait() - - return nil, errors.New("child did not connect within 10s") - } - listener.Close() - - client := jsonrpc.NewClient(conn) - - h := &testHandler{ - memoryArea: &memoryArea, - pagesize: tt.pagesize, - data: data, - client: client, - conn: conn, - cmd: cmd, - } - - // WaitReady blocks on the child until its initial setup is done - // (uffd serve goroutine running, hooks installed). The RPC reply - // IS the readiness signal — no separate ready pipe / signal - // needed. - require.NoError(t, h.client.Call("Service.WaitReady", &Empty{}, &Empty{})) - - t.Cleanup(func() { - // Best-effort graceful shutdown via RPC. If the child has - // already crashed the RPC will error and we fall back to - // killing the process below. 
- _ = h.client.Call("Service.Shutdown", &Empty{}, &Empty{}) - _ = client.Close() - - waitErr := cmd.Wait() - if waitErr != nil { - var exitErr *exec.ExitError - if !errors.As(waitErr, &exitErr) { - t.Logf("helper process Wait: %v", waitErr) - } - } - - // Tear down the UFFD registration before the early uffdFd.close() - // cleanup runs. Today this is a no-op (no test enables - // UFFD_FEATURE_EVENT_REMOVE) but a follow-up that does will - // otherwise see munmap block on un-acked REMOVE events queued - // against the still-registered range. Cleanups run LIFO, so - // this fires before the close registered earlier. - assert.NoError(t, unregister(uffdFd, memoryStart, uint64(size))) - }) - - if tt.gated { - h.servePause = func() error { - return h.client.Call("Service.ServePause", &Empty{}, &Empty{}) - } - h.serveResume = func() error { - return h.client.Call("Service.ServeResume", &Empty{}, &Empty{}) - } - } - - h.pageStatesOnce = func() (handlerPageStates, error) { - var reply PageStatesReply - if err := h.client.Call("Service.PageStates", &Empty{}, &reply); err != nil { - return handlerPageStates{}, err - } - - var states handlerPageStates - for _, e := range reply.Entries { - if pageState(e.State) == faulted { - states.faulted = append(states.faulted, uint(e.Offset)) - } - } - slices.Sort(states.faulted) - - return states, nil - } - - return h, nil -} - -// ---- Child side --------------------------------------------------------- - -// Secondary process, orchestrator in our case. 
-func TestHelperServingProcess(t *testing.T) { - t.Parallel() - - if os.Getenv(envHelperFlag) != "1" { - t.Skip("this is a helper process, skipping direct execution") - } - - if err := crossProcessServe(); err != nil { - fmt.Fprintln(os.Stderr, "exit serving process:", err) - os.Exit(1) - } - - os.Exit(0) -} - -// crossProcessServe wires up the child side: connects back to the -// parent socket, registers the RPC service, and runs uffd.Serve in a -// background goroutine that pause/resume RPCs can stop and restart. -func crossProcessServe() error { - socketPath := os.Getenv(envSocketPath) - if socketPath == "" { - return fmt.Errorf("missing %s", envSocketPath) - } - - dialer := net.Dialer{} - conn, err := dialer.DialContext(context.Background(), "unix", socketPath) - if err != nil { - return fmt.Errorf("dial parent socket: %w", err) - } - defer conn.Close() - - startRaw, err := strconv.ParseUint(os.Getenv(envMmapStart), 10, 64) - if err != nil { - return fmt.Errorf("parse %s: %w", envMmapStart, err) - } - memoryStart := uintptr(startRaw) - - pagesize, err := strconv.ParseInt(os.Getenv(envMmapPagesize), 10, 64) - if err != nil { - return fmt.Errorf("parse %s: %w", envMmapPagesize, err) - } - - totalSize, err := strconv.ParseInt(os.Getenv(envMmapTotalSize), 10, 64) - if err != nil { - return fmt.Errorf("parse %s: %w", envMmapTotalSize, err) - } - - content, err := os.ReadFile(os.Getenv(envContentPath)) - if err != nil { - return fmt.Errorf("read content: %w", err) - } - if int64(len(content)) != totalSize { - return fmt.Errorf("content size %d != expected %d", len(content), totalSize) - } - - data := NewMemorySlicer(content, pagesize) - - uffdFile := os.NewFile(uintptr(3), "uffd") - defer uffdFile.Close() - uffdFd := uffdFile.Fd() - - mapping := memory.NewMapping([]memory.Region{ - { - BaseHostVirtAddr: memoryStart, - Size: uintptr(totalSize), - Offset: 0, - PageSize: uintptr(pagesize), - }, - }) - - l, err := logger.NewDevelopmentLogger() - if err != nil { - return 
fmt.Errorf("logger: %w", err) - } - - uffd, err := NewUserfaultfdFromFd(uffdFd, data, mapping, l) - if err != nil { - return fmt.Errorf("NewUserfaultfdFromFd: %w", err) - } - - if os.Getenv(envAlwaysWP) == "1" { - uffd.defaultCopyMode = UFFDIO_COPY_MODE_WP - } - - br := newBarrierRegistry() - - // Hooks are only wired up when the test asked for them (race - // tests). For everyone else we leave the bundle nil so the hot - // path is a single nil-pointer load + branch. - if os.Getenv(envBarriers) == "1" { - uffd.SetTestHooks(&testHooks{ - beforeWorkerRLock: br.hookFor(barrierBeforeRLock), - beforeFaultPage: br.hookFor(barrierBeforeFaultPage), - }) - } - - gated := os.Getenv(envGated) == "1" - - svc := &Service{ - uffd: uffd, - br: br, - gated: gated, - shutdown: make(chan struct{}), - } - svc.startServe() - - server := rpc.NewServer() - if err := server.Register(svc); err != nil { - return fmt.Errorf("rpc Register: %w", err) - } - - // Run the codec in a goroutine so we can react to Shutdown - // without depending on the codec returning. - codecDone := make(chan struct{}) - go func() { - defer close(codecDone) - server.ServeCodec(jsonrpc.NewServerCodec(conn)) - }() - - select { - case <-svc.shutdown: - case <-codecDone: - } - - // Release any still-parked barriers so the serve goroutine can - // finish, then stop the serve goroutine. - br.releaseAll() - svc.stopServe() - - // Closing the conn is sufficient to unblock ServeCodec if it - // hasn't already returned. - _ = conn.Close() - <-codecDone - - return nil -} - -// Service is the RPC surface exposed to the parent. Methods follow -// net/rpc's required signature. -type Service struct { - uffd *Userfaultfd - br *barrierRegistry - - gated bool - - mu sync.Mutex - stop func() // currently active serve-stop function, nil if paused - shutdown chan struct{} - closed bool -} - -// startServe spawns the uffd Serve goroutine and stores its stop fn. -// Caller must hold s.mu. 
Idempotent: if a serve goroutine is already -// running (s.stop != nil) this is a no-op so a stray duplicate -// ServeResume can't leak an untracked Serve goroutine and break later -// pauses. -func (s *Service) startServe() { - if s.stop != nil { - return - } - - exit, err := fdexit.New() - if err != nil { - fmt.Fprintln(os.Stderr, "fdexit.New:", err) - - return - } - - done := make(chan struct{}) - go func() { - defer close(done) - if err := s.uffd.Serve(context.Background(), exit); err != nil { - fmt.Fprintln(os.Stderr, "uffd.Serve:", err) - } - }() - - s.stop = func() { - _ = exit.SignalExit() - <-done - exit.Close() - } -} - -func (s *Service) stopServe() { - s.mu.Lock() - defer s.mu.Unlock() - if s.stop != nil { - s.stop() - s.stop = nil - } -} - -// WaitReady is a no-op handler whose successful reply is the -// readiness signal for the parent. -func (s *Service) WaitReady(_ *Empty, _ *Empty) error { - return nil -} - -func (s *Service) PageStates(_ *Empty, reply *PageStatesReply) error { - entries, err := s.uffd.pageStateEntries() - if err != nil { - return err - } - reply.Entries = entries - - return nil -} - -func (s *Service) ServePause(_ *Empty, _ *Empty) error { - if !s.gated { - return errors.New("ServePause called on a non-gated handler") - } - s.stopServe() - - return nil -} - -func (s *Service) ServeResume(_ *Empty, _ *Empty) error { - if !s.gated { - return errors.New("ServeResume called on a non-gated handler") - } - s.mu.Lock() - defer s.mu.Unlock() - s.startServe() - - return nil -} - -func (s *Service) InstallFaultBarrier(args *FaultBarrierArgs, reply *FaultBarrierReply) error { - reply.Token = s.br.install(uintptr(args.Addr), barrierPoint(args.Point)) - - return nil -} - -func (s *Service) WaitFaultHeld(args *TokenArgs, _ *Empty) error { - return s.br.waitArrived(context.Background(), args.Token) -} - -func (s *Service) ReleaseFault(args *TokenArgs, _ *Empty) error { - s.br.release(args.Token) - - return nil -} - -func (s *Service) Shutdown(_ 
*Empty, _ *Empty) error { - s.mu.Lock() - defer s.mu.Unlock() - if !s.closed { - s.closed = true - close(s.shutdown) - } - - return nil -} - -// pageStateEntries returns a snapshot of every tracked page and its -// state. It briefly takes settleRequests.Lock so no in-flight worker -// can mutate the pageTracker while we read it. -func (u *Userfaultfd) pageStateEntries() ([]pageStateEntry, error) { - u.settleRequests.Lock() - u.settleRequests.Unlock() //nolint:staticcheck // SA2001: intentional — settle the read locks. - - u.pageTracker.mu.RLock() - defer u.pageTracker.mu.RUnlock() - - entries := make([]pageStateEntry, 0, len(u.pageTracker.m)) - for addr, state := range u.pageTracker.m { - offset, err := u.ma.GetOffset(addr) - if err != nil { - return nil, fmt.Errorf("address %#x not in mapping: %w", addr, err) - } - entries = append(entries, pageStateEntry{State: uint8(state), Offset: uint64(offset)}) - } - - return entries, nil -} - -// ---- Barrier registry --------------------------------------------------- - -// barrierPoint identifies WHICH hook a barrier should park on. -type barrierPoint uint8 - -const ( - // barrierBeforeRLock parks the worker BEFORE settleRequests.RLock(), - // i.e. before it can read the page state. Use this when a parallel - // writer needs the write lock immediately because no worker holds - // the read lock. - barrierBeforeRLock barrierPoint = 1 - // barrierBeforeFaultPage parks the worker AFTER it has taken - // settleRequests.RLock, but BEFORE the actual UFFDIO_COPY syscall. - // Use this when a parent operation must still return even though - // a worker holds RLock. - barrierBeforeFaultPage barrierPoint = 2 -) - -// barrierRegistry is the child-process side of the barrier. The -// hooks installed on Userfaultfd consult this registry by addr+point -// to decide whether to park, and the RPC handlers manipulate it from -// the parent over the socket. 
-type barrierRegistry struct { - mu sync.Mutex - next uint64 - tokens map[uint64]*barrierSlot - byKey map[barrierKey]uint64 -} - -type barrierKey struct { - addr uintptr - point barrierPoint -} - -type barrierSlot struct { - addr uintptr - point barrierPoint - arrived chan struct{} - release chan struct{} - arrivedOnce sync.Once -} - -func newBarrierRegistry() *barrierRegistry { - return &barrierRegistry{ - tokens: make(map[uint64]*barrierSlot), - byKey: make(map[barrierKey]uint64), - } -} - -func (b *barrierRegistry) install(addr uintptr, point barrierPoint) uint64 { - b.mu.Lock() - defer b.mu.Unlock() - - b.next++ - token := b.next - slot := &barrierSlot{ - addr: addr, - point: point, - arrived: make(chan struct{}), - release: make(chan struct{}), - } - b.tokens[token] = slot - b.byKey[barrierKey{addr, point}] = token - - return token -} - -func (b *barrierRegistry) lookupByAddr(addr uintptr, point barrierPoint) *barrierSlot { - b.mu.Lock() - defer b.mu.Unlock() - - token, ok := b.byKey[barrierKey{addr, point}] - if !ok { - return nil - } - - return b.tokens[token] -} - -func (b *barrierRegistry) waitArrived(ctx context.Context, token uint64) error { - b.mu.Lock() - slot, ok := b.tokens[token] - b.mu.Unlock() - if !ok { - return fmt.Errorf("unknown barrier token %d", token) - } - - select { - case <-slot.arrived: - return nil - case <-ctx.Done(): - return ctx.Err() - } -} - -func (b *barrierRegistry) release(token uint64) { - b.mu.Lock() - slot, ok := b.tokens[token] - delete(b.tokens, token) - if ok { - delete(b.byKey, barrierKey{slot.addr, slot.point}) - } - b.mu.Unlock() - - if !ok { - return - } - - select { - case <-slot.release: - default: - close(slot.release) - } -} - -func (b *barrierRegistry) releaseAll() { - b.mu.Lock() - tokens := make([]uint64, 0, len(b.tokens)) - for t := range b.tokens { - tokens = append(tokens, t) - } - b.mu.Unlock() - - for _, t := range tokens { - b.release(t) - } -} - -// hookFor returns the function to assign to a Userfaultfd 
-// beforeXxxHook field. The returned function is a no-op for any -// (addr, point) pair that hasn't been Install'd, so non-targeted -// faults see no scheduling distortion. -func (b *barrierRegistry) hookFor(point barrierPoint) func(addr uintptr) { - return func(addr uintptr) { - slot := b.lookupByAddr(addr, point) - if slot == nil { - return - } - - slot.arrivedOnce.Do(func() { - close(slot.arrived) - }) - - <-slot.release - } -} diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go new file mode 100644 index 0000000000..b016bb1dfe --- /dev/null +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go @@ -0,0 +1,100 @@ +package userfaultfd + +// Child side of the cross-process UFFD test harness. The child is a +// re-execed copy of the test binary entered through +// TestHelperServingProcess; it dials the parent's rendezvous socket, +// registers the three RPC services that share a single +// *harnessState container, and serves JSON-RPC until the parent +// issues Lifecycle.Shutdown. All actual work (build mapping, +// construct *Userfaultfd, install hooks, run Serve) is driven by +// Lifecycle.Bootstrap rather than env vars or extra fds. + +import ( + "context" + "fmt" + "net" + "net/rpc" + "net/rpc/jsonrpc" + "os" + "testing" +) + +// TestHelperServingProcess is the entry point for the child helper +// process spawned by configureCrossProcessTest. The parent re-execs +// the test binary with envHelperFlag=1 and a socket path; this test +// hands off to crossProcessServe and exits with its result. 
+func TestHelperServingProcess(t *testing.T) { + t.Parallel() + + if os.Getenv(envHelperFlag) != "1" { + t.Skip("this is a helper process, skipping direct execution") + } + + if err := crossProcessServe(); err != nil { + fmt.Fprintln(os.Stderr, "exit serving process:", err) + os.Exit(1) + } + + os.Exit(0) +} + +// crossProcessServe wires up the child side: dial the parent socket, +// register the three RPC services that share a single harnessState, +// and run jsonrpc.ServeCodec until the parent shuts us down. +func crossProcessServe() error { + socketPath := os.Getenv(envSocketPath) + if socketPath == "" { + return fmt.Errorf("missing %s", envSocketPath) + } + + dialer := net.Dialer{} + conn, err := dialer.DialContext(context.Background(), "unix", socketPath) + if err != nil { + return fmt.Errorf("dial parent socket: %w", err) + } + defer conn.Close() + + // The parent handed us the userfaultfd via cmd.ExtraFiles; the + // child-side dup3 inside fork+exec lands it at fd 3 with CLOEXEC + // cleared automatically. + uffdFile := os.NewFile(uintptr(3), "uffd") + defer uffdFile.Close() + + state := newHarnessState(uffdFile.Fd()) + + server := rpc.NewServer() + if err := server.Register(&Lifecycle{state: state}); err != nil { + return fmt.Errorf("rpc Register Lifecycle: %w", err) + } + if err := server.Register(&Paging{state: state}); err != nil { + return fmt.Errorf("rpc Register Paging: %w", err) + } + if err := server.Register(&Barriers{state: state}); err != nil { + return fmt.Errorf("rpc Register Barriers: %w", err) + } + + // Run the codec in a goroutine so we can react to Shutdown + // without depending on the codec returning. + codecDone := make(chan struct{}) + go func() { + defer close(codecDone) + server.ServeCodec(jsonrpc.NewServerCodec(conn)) + }() + + select { + case <-state.shutdown: + case <-codecDone: + } + + // Release any still-parked barriers so the serve goroutine can + // finish, then stop the serve goroutine. 
+ state.releaseAllBarriers() + state.stopServe() + + // Closing the conn is sufficient to unblock ServeCodec if it + // hasn't already returned. + _ = conn.Close() + <-codecDone + + return nil +} diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_client_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_client_test.go new file mode 100644 index 0000000000..3a944d9b5b --- /dev/null +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_client_test.go @@ -0,0 +1,81 @@ +package userfaultfd + +// Typed client wrapper around the parent ↔ child JSON-RPC channel. +// Hides the method-name strings, the Empty placeholder pointers, and +// the wire-vs-domain type translation from the testHandler and from +// the call sites in individual tests. + +import ( + "io" + "net" + "net/rpc" + "net/rpc/jsonrpc" + "os/exec" +) + +type harnessClient struct { + rpc *rpc.Client + conn io.Closer + // cmd is retained so a future cleanup or diagnostic can reach + // the underlying process; today the test cleanup in the parent + // drives Wait directly. 
+ cmd *exec.Cmd +} + +func newHarnessClient(conn net.Conn, cmd *exec.Cmd) *harnessClient { + return &harnessClient{ + rpc: jsonrpc.NewClient(conn), + conn: conn, + cmd: cmd, + } +} + +func (c *harnessClient) Bootstrap(args BootstrapArgs) error { + return c.rpc.Call("Lifecycle.Bootstrap", &args, &BootstrapReply{}) +} + +func (c *harnessClient) WaitReady() error { + return c.rpc.Call("Lifecycle.WaitReady", &Empty{}, &Empty{}) +} + +func (c *harnessClient) Shutdown() error { + return c.rpc.Call("Lifecycle.Shutdown", &Empty{}, &Empty{}) +} + +func (c *harnessClient) Pause() error { + return c.rpc.Call("Paging.Pause", &Empty{}, &Empty{}) +} + +func (c *harnessClient) Resume() error { + return c.rpc.Call("Paging.Resume", &Empty{}, &Empty{}) +} + +func (c *harnessClient) PageStates() ([]pageStateEntry, error) { + var reply PageStatesReply + if err := c.rpc.Call("Paging.States", &Empty{}, &reply); err != nil { + return nil, err + } + + return reply.Entries, nil +} + +func (c *harnessClient) InstallBarrier(addr uint64, point barrierPoint) (uint64, error) { + var reply FaultBarrierReply + if err := c.rpc.Call("Barriers.Install", &FaultBarrierArgs{Addr: addr, Point: uint8(point)}, &reply); err != nil { + return 0, err + } + + return reply.Token, nil +} + +func (c *harnessClient) WaitFaultHeld(token uint64) error { + return c.rpc.Call("Barriers.WaitHeld", &TokenArgs{Token: token}, &Empty{}) +} + +func (c *harnessClient) ReleaseFault(token uint64) error { + return c.rpc.Call("Barriers.Release", &TokenArgs{Token: token}, &Empty{}) +} + +func (c *harnessClient) Close() error { + return c.conn.Close() +} diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go new file mode 100644 index 0000000000..eeb4a4fa20 --- /dev/null +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go @@ -0,0 +1,222 @@ +package userfaultfd + +// Parent side of the 
cross-process UFFD test harness. The parent +// owns the userfaultfd (created and registered against an mmap that +// lives in the parent's address space) and drives the in-VM page +// fault servicing logic from a child helper process. We re-exec the +// test binary as the child so the actual page-fault handling runs +// in a process where we can fully control memory layout (no Go GC +// scanning / touching the registered region) — which mirrors how +// Firecracker uses UFFD in production. +// +// The side channels between parent and child are intentionally +// minimal: a single env var flagging the child as the helper, an +// env var carrying the rendezvous socket path, the userfaultfd +// itself handed off via ExtraFiles (it's a kernel object, has to go +// through that), and the unix domain socket used for JSON-RPC. All +// configuration (mmap geometry, source content, feature toggles) +// flows over a single Lifecycle.Bootstrap RPC. + +import ( + "context" + "crypto/rand" + "errors" + "fmt" + "net" + "os" + "os/exec" + "path/filepath" + "syscall" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "golang.org/x/sys/unix" + + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/block" + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/testutils" +) + +// MemorySlicer exposes a byte slice via the Slicer interface. +// Test-only. 
+type MemorySlicer struct { + content []byte + pagesize int64 +} + +var _ block.Slicer = (*MemorySlicer)(nil) + +func NewMemorySlicer(content []byte, pagesize int64) *MemorySlicer { + return &MemorySlicer{content: content, pagesize: pagesize} +} + +func (s *MemorySlicer) Slice(_ context.Context, offset, size int64) ([]byte, error) { + return s.content[offset : offset+size], nil +} + +func (s *MemorySlicer) Size() (int64, error) { + return int64(len(s.content)), nil +} + +func (s *MemorySlicer) Content() []byte { + return s.content +} + +func (s *MemorySlicer) BlockSize() int64 { + return s.pagesize +} + +func RandomPages(pagesize, numberOfPages uint64) *MemorySlicer { + size := pagesize * numberOfPages + buf := make([]byte, int(size)) + if _, err := rand.Read(buf); err != nil { + panic(err) + } + + return NewMemorySlicer(buf, int64(pagesize)) +} + +// Env vars used by the child helper process. The set is deliberately +// small: anything else lives in BootstrapArgs and flows over RPC. +const ( + envHelperFlag = "GO_TEST_HELPER_PROCESS" + envSocketPath = "GO_UFFD_SOCKET" +) + +// configureCrossProcessTest spawns the helper child, hands it the +// userfaultfd, and drives initial setup via Lifecycle.Bootstrap. +// All subsequent test interaction goes through the returned +// testHandler's *harnessClient. 
+func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) (*testHandler, error) { + t.Helper() + + data := RandomPages(tt.pagesize, tt.numberOfPages) + + size, err := data.Size() + require.NoError(t, err) + + memoryArea, memoryStart, err := testutils.NewPageMmap(t, uint64(size), tt.pagesize) + require.NoError(t, err) + + uffdFd, err := newFd(syscall.O_CLOEXEC | syscall.O_NONBLOCK) + require.NoError(t, err) + t.Cleanup(func() { + uffdFd.close() + }) + + require.NoError(t, configureApi(uffdFd, tt.pagesize)) + require.NoError(t, register(uffdFd, memoryStart, uint64(size), UFFDIO_REGISTER_MODE_MISSING|UFFDIO_REGISTER_MODE_WP)) + + socketPath := filepath.Join(t.TempDir(), "rpc.sock") + listenCfg := net.ListenConfig{} + listener, err := listenCfg.Listen(ctx, "unix", socketPath) + require.NoError(t, err) + + cmd := exec.CommandContext(ctx, os.Args[0], "-test.run=TestHelperServingProcess", "-test.timeout=0") + cmd.Env = append(os.Environ(), + envHelperFlag+"=1", + envSocketPath+"="+socketPath, + ) + + // F_DUPFD_CLOEXEC duplicates uffdFd into a new fd that is born + // with CLOEXEC set, atomically. The previous incarnation used + // syscall.Dup followed by F_SETFD, which left a brief window + // where the dup'd fd was visible without CLOEXEC; under + // -parallel that window let other concurrent forks inherit a + // uffd they did not own and produced hard-to-diagnose, + // parallel-only deadlocks. Removing the window removes the + // need for the childForkMu serialising lock the previous + // version used to wrap cmd.Start. + dup, err := unix.FcntlInt(uintptr(uffdFd), unix.F_DUPFD_CLOEXEC, 0) + require.NoError(t, err) + + uffdFile := os.NewFile(uintptr(dup), "uffd") + cmd.ExtraFiles = []*os.File{uffdFile} + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + startErr := cmd.Start() + uffdFile.Close() + require.NoError(t, startErr) + + // Accept the child's connection. Tight deadline so a wedged + // child surfaces fast instead of hanging the test. 
+ type acceptResult struct { + conn net.Conn + err error + } + acceptCh := make(chan acceptResult, 1) + go func() { + c, err := listener.Accept() + acceptCh <- acceptResult{conn: c, err: err} + }() + + var conn net.Conn + select { + case res := <-acceptCh: + require.NoError(t, res.err) + conn = res.conn + case <-time.After(10 * time.Second): + listener.Close() + _ = cmd.Process.Kill() + _, _ = cmd.Process.Wait() + + return nil, errors.New("child did not connect within 10s") + } + listener.Close() + + client := newHarnessClient(conn, cmd) + + h := &testHandler{ + memoryArea: &memoryArea, + pagesize: tt.pagesize, + data: data, + client: client, + } + + if err := client.Bootstrap(BootstrapArgs{ + MmapStart: uint64(memoryStart), + Pagesize: int64(tt.pagesize), + TotalSize: size, + AlwaysWP: tt.alwaysWP, + Barriers: tt.barriers, + Content: data.Content(), + }); err != nil { + return nil, fmt.Errorf("Lifecycle.Bootstrap: %w", err) + } + + // WaitReady's successful reply IS the readiness signal. Bootstrap + // is synchronous, so today this is largely a smoke check, but + // the explicit RPC is kept so that future async-Bootstrap variants + // can hold the parent here without breaking the call site. + if err := client.WaitReady(); err != nil { + return nil, fmt.Errorf("Lifecycle.WaitReady: %w", err) + } + + t.Cleanup(func() { + // Best-effort graceful shutdown via RPC. If the child has + // already crashed the RPC will error and we fall back to + // killing the process below. + _ = client.Shutdown() + _ = client.Close() + + waitErr := cmd.Wait() + if waitErr != nil { + var exitErr *exec.ExitError + if !errors.As(waitErr, &exitErr) { + t.Logf("helper process Wait: %v", waitErr) + } + } + + // Tear down the UFFD registration before the early uffdFd.close() + // cleanup runs. 
Today this is a no-op (no test enables + // UFFD_FEATURE_EVENT_REMOVE) but a follow-up that does will + // otherwise see munmap block on un-acked REMOVE events queued + // against the still-registered range. Cleanups run LIFO, so + // this fires before the close registered earlier. + assert.NoError(t, unregister(uffdFd, memoryStart, uint64(size))) + }) + + return h, nil +} diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go index 43b615f314..9b56622a52 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go @@ -3,11 +3,7 @@ package userfaultfd import ( "bytes" "context" - "errors" "fmt" - "io" - "net/rpc" - "os/exec" "slices" "sync" "testing" @@ -31,8 +27,6 @@ type testConfig struct { operations []operation // alwaysWP makes the handler copy with UFFDIO_COPY_MODE_WP for all faults. alwaysWP bool - // gated enables pause/resume control over the handler's serve loop. - gated bool // barriers wires up the per-worker fault hooks in the child // (used by race tests). Off by default so the worker hot path // stays a single nil-pointer load + branch in non-race tests. @@ -79,25 +73,37 @@ type testHandler struct { memoryArea *[]byte pagesize uint64 data *MemorySlicer - // pageStatesOnce returns a per-state snapshot of the handler's pageTracker. - // Backed by the PageStates RPC; callable any number of times. - // The "Once" suffix is kept for source-stability with the existing - // test sites. - pageStatesOnce func() (handlerPageStates, error) - // servePause and serveResume gate the UFFD event loop in the child process. - // Tests use them to deterministically batch a sequence of UFFD events - // before more faults are processed. - servePause func() error - serveResume func() error - - // client is the RPC channel to the child helper process. 
- client *rpc.Client - conn io.Closer - cmd *exec.Cmd + + // client is the typed RPC channel to the child helper process. + // Pause/Resume/PageStates/etc. flow through it; tests should + // reach for the convenience methods on testHandler rather than + // poking at client directly. + client *harnessClient mutex sync.RWMutex } +// pageStatesOnce returns a per-state snapshot of the handler's +// pageTracker, fetched via the Paging.States RPC. The "Once" suffix +// is kept for source-stability with the existing test sites; the +// method is safely callable any number of times. +func (h *testHandler) pageStatesOnce() (handlerPageStates, error) { + entries, err := h.client.PageStates() + if err != nil { + return handlerPageStates{}, err + } + + var states handlerPageStates + for _, e := range entries { + if pageState(e.State) == faulted { + states.faulted = append(states.faulted, uint(e.Offset)) + } + } + slices.Sort(states.faulted) + + return states, nil +} + func (h *testHandler) executeAll(t *testing.T, operations []operation) { t.Helper() @@ -182,17 +188,9 @@ func (h *testHandler) executeOperation(ctx context.Context, op operation) error case operationModeWrite: return h.executeWrite(ctx, op) case operationModeServePause: - if h.servePause == nil { - return errors.New("operationModeServePause requires testConfig.gated = true") - } - - return h.servePause() + return h.client.Pause() case operationModeServeResume: - if h.serveResume == nil { - return errors.New("operationModeServeResume requires testConfig.gated = true") - } - - return h.serveResume() + return h.client.Resume() case operationModeSleep: time.Sleep(50 * time.Millisecond) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_barriers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_barriers_test.go new file mode 100644 index 0000000000..1a39d95201 --- /dev/null +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_barriers_test.go @@ -0,0 +1,218 @@ +package 
userfaultfd + +// Barriers RPC service plus the in-child barrierRegistry it +// manipulates. The hooks installed on Userfaultfd consult this +// registry by (addr, point) to decide whether to park a worker +// goroutine; the parent drives Install / WaitHeld / Release over RPC +// to deterministically race events against an in-flight worker. + +import ( + "context" + "errors" + "fmt" + "sync" +) + +type FaultBarrierArgs struct { + Addr uint64 + Point uint8 +} + +type FaultBarrierReply struct { + Token uint64 +} + +type TokenArgs struct { + Token uint64 +} + +// Barriers is the RPC service exposing the barrier registry to the +// parent. +type Barriers struct { + state *harnessState +} + +func (b *Barriers) Install(args *FaultBarrierArgs, reply *FaultBarrierReply) error { + br, err := b.registry() + if err != nil { + return err + } + reply.Token = br.install(uintptr(args.Addr), barrierPoint(args.Point)) + + return nil +} + +func (b *Barriers) WaitHeld(args *TokenArgs, _ *Empty) error { + br, err := b.registry() + if err != nil { + return err + } + + return br.waitArrived(context.Background(), args.Token) +} + +func (b *Barriers) Release(args *TokenArgs, _ *Empty) error { + br, err := b.registry() + if err != nil { + return err + } + br.release(args.Token) + + return nil +} + +func (b *Barriers) registry() (*barrierRegistry, error) { + b.state.mu.Lock() + br := b.state.br + b.state.mu.Unlock() + if br == nil { + return nil, errors.New("Barriers RPC called before Lifecycle.Bootstrap") + } + + return br, nil +} + +// barrierPoint identifies WHICH hook a barrier should park on. +type barrierPoint uint8 + +const ( + // barrierBeforeRLock parks the worker BEFORE settleRequests.RLock(), + // i.e. before it can read the page state. Use this when a parallel + // writer needs the write lock immediately because no worker holds + // the read lock. 
+ barrierBeforeRLock barrierPoint = 1 + // barrierBeforeFaultPage parks the worker AFTER it has taken + // settleRequests.RLock, but BEFORE the actual UFFDIO_COPY syscall. + // Use this when a parent operation must still return even though + // a worker holds RLock. + barrierBeforeFaultPage barrierPoint = 2 +) + +// barrierRegistry is the child-process side of the barrier. The +// hooks installed on Userfaultfd consult this registry by addr+point +// to decide whether to park, and the RPC handlers manipulate it from +// the parent over the socket. +type barrierRegistry struct { + mu sync.Mutex + next uint64 + tokens map[uint64]*barrierSlot + byKey map[barrierKey]uint64 +} + +type barrierKey struct { + addr uintptr + point barrierPoint +} + +type barrierSlot struct { + addr uintptr + point barrierPoint + arrived chan struct{} + release chan struct{} + arrivedOnce sync.Once +} + +func newBarrierRegistry() *barrierRegistry { + return &barrierRegistry{ + tokens: make(map[uint64]*barrierSlot), + byKey: make(map[barrierKey]uint64), + } +} + +func (b *barrierRegistry) install(addr uintptr, point barrierPoint) uint64 { + b.mu.Lock() + defer b.mu.Unlock() + + b.next++ + token := b.next + slot := &barrierSlot{ + addr: addr, + point: point, + arrived: make(chan struct{}), + release: make(chan struct{}), + } + b.tokens[token] = slot + b.byKey[barrierKey{addr, point}] = token + + return token +} + +func (b *barrierRegistry) lookupByAddr(addr uintptr, point barrierPoint) *barrierSlot { + b.mu.Lock() + defer b.mu.Unlock() + + token, ok := b.byKey[barrierKey{addr, point}] + if !ok { + return nil + } + + return b.tokens[token] +} + +func (b *barrierRegistry) waitArrived(ctx context.Context, token uint64) error { + b.mu.Lock() + slot, ok := b.tokens[token] + b.mu.Unlock() + if !ok { + return fmt.Errorf("unknown barrier token %d", token) + } + + select { + case <-slot.arrived: + return nil + case <-ctx.Done(): + return ctx.Err() + } +} + +func (b *barrierRegistry) release(token 
uint64) { + b.mu.Lock() + slot, ok := b.tokens[token] + delete(b.tokens, token) + if ok { + delete(b.byKey, barrierKey{slot.addr, slot.point}) + } + b.mu.Unlock() + + if !ok { + return + } + + select { + case <-slot.release: + default: + close(slot.release) + } +} + +func (b *barrierRegistry) releaseAll() { + b.mu.Lock() + tokens := make([]uint64, 0, len(b.tokens)) + for t := range b.tokens { + tokens = append(tokens, t) + } + b.mu.Unlock() + + for _, t := range tokens { + b.release(t) + } +} + +// hookFor returns the function to assign to a Userfaultfd +// beforeXxxHook field. The returned function is a no-op for any +// (addr, point) pair that hasn't been Install'd, so non-targeted +// faults see no scheduling distortion. +func (b *barrierRegistry) hookFor(point barrierPoint) func(addr uintptr) { + return func(addr uintptr) { + slot := b.lookupByAddr(addr, point) + if slot == nil { + return + } + + slot.arrivedOnce.Do(func() { + close(slot.arrived) + }) + + <-slot.release + } +} diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_lifecycle_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_lifecycle_test.go new file mode 100644 index 0000000000..2669394723 --- /dev/null +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_lifecycle_test.go @@ -0,0 +1,191 @@ +package userfaultfd + +// Lifecycle RPC service plus the harnessState container that the +// three RPC services share. Lifecycle.Bootstrap is the single point +// where the in-child *Userfaultfd is constructed; Paging and +// Barriers consume the resulting state via *harnessState. + +import ( + "context" + "fmt" + "os" + "sync" + + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/fdexit" + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/memory" + "github.com/e2b-dev/infra/packages/shared/pkg/logger" +) + +// Empty is the stand-in args/reply type for net/rpc methods that +// take or return nothing meaningful. 
net/rpc still requires both +// pointers to be exported. +type Empty struct{} + +// harnessState is the per-child container holding the resources +// created by Lifecycle.Bootstrap and consumed by Paging/Barriers. +// All three RPC services hold a *harnessState rather than +// duplicating fields. Mutable fields are guarded by mu. +type harnessState struct { + uffdFd uintptr + + mu sync.Mutex + uffd *Userfaultfd + br *barrierRegistry + stop func() // currently active serve-stop function, nil if paused + shutdown chan struct{} + closed bool +} + +func newHarnessState(uffdFd uintptr) *harnessState { + return &harnessState{ + uffdFd: uffdFd, + shutdown: make(chan struct{}), + } +} + +// startServeLocked spawns the uffd Serve goroutine and stores its +// stop fn. Caller must hold s.mu. Idempotent: if the serve goroutine +// is already running (s.stop != nil) this is a no-op so a stray +// duplicate Resume can't leak an untracked Serve goroutine and break +// later pauses. +func (s *harnessState) startServeLocked() { + if s.stop != nil { + return + } + + exit, err := fdexit.New() + if err != nil { + fmt.Fprintln(os.Stderr, "fdexit.New:", err) + + return + } + + uffd := s.uffd + done := make(chan struct{}) + go func() { + defer close(done) + if err := uffd.Serve(context.Background(), exit); err != nil { + fmt.Fprintln(os.Stderr, "uffd.Serve:", err) + } + }() + + s.stop = func() { + _ = exit.SignalExit() + <-done + exit.Close() + } +} + +func (s *harnessState) stopServe() { + s.mu.Lock() + defer s.mu.Unlock() + if s.stop != nil { + s.stop() + s.stop = nil + } +} + +func (s *harnessState) releaseAllBarriers() { + s.mu.Lock() + br := s.br + s.mu.Unlock() + if br != nil { + br.releaseAll() + } +} + +// BootstrapArgs is the single message the parent sends to drive +// child initialisation. Everything that used to flow over env vars +// or the content tmp file now lives in this struct, base64-encoded +// by the JSON-RPC codec for byte slices. 
For typical test sizes +// (≤1MB) the encoding overhead is irrelevant; if a future test needs +// >10MB of source content, that PR can revisit and add chunking. +type BootstrapArgs struct { + MmapStart uint64 + Pagesize int64 + TotalSize int64 + AlwaysWP bool + // Barriers gates the test-only worker hooks. Off by default so + // the worker hot path stays a single nil-pointer load + branch + // in non-race tests. + Barriers bool + Content []byte +} + +type BootstrapReply struct{} + +// Lifecycle owns the boot/shutdown of the in-child uffd. Bootstrap +// builds the *Userfaultfd, optionally installs the test hooks, and +// kicks off the initial Serve goroutine. Shutdown signals the +// crossProcessServe loop to exit; the goroutine + barrier registry +// are torn down there so we don't have to hold any mutex while +// joining the serve goroutine. +type Lifecycle struct { + state *harnessState +} + +func (l *Lifecycle) Bootstrap(args *BootstrapArgs, _ *BootstrapReply) error { + if int64(len(args.Content)) != args.TotalSize { + return fmt.Errorf("content size %d != expected %d", len(args.Content), args.TotalSize) + } + + data := NewMemorySlicer(args.Content, args.Pagesize) + + mapping := memory.NewMapping([]memory.Region{ + { + BaseHostVirtAddr: uintptr(args.MmapStart), + Size: uintptr(args.TotalSize), + Offset: 0, + PageSize: uintptr(args.Pagesize), + }, + }) + + log, err := logger.NewDevelopmentLogger() + if err != nil { + return fmt.Errorf("logger: %w", err) + } + + uffd, err := NewUserfaultfdFromFd(l.state.uffdFd, data, mapping, log) + if err != nil { + return fmt.Errorf("NewUserfaultfdFromFd: %w", err) + } + + if args.AlwaysWP { + uffd.defaultCopyMode = UFFDIO_COPY_MODE_WP + } + + br := newBarrierRegistry() + if args.Barriers { + uffd.SetTestHooks(&testHooks{ + beforeWorkerRLock: br.hookFor(barrierBeforeRLock), + beforeFaultPage: br.hookFor(barrierBeforeFaultPage), + }) + } + + l.state.mu.Lock() + defer l.state.mu.Unlock() + l.state.uffd = uffd + l.state.br = br + 
l.state.startServeLocked() + + return nil +} + +// WaitReady is a no-op: Bootstrap is synchronous, so its successful +// reply already implies readiness. WaitReady is kept as a separate +// RPC so that an async-Bootstrap variant can be slotted in later +// without touching every call site. +func (l *Lifecycle) WaitReady(_ *Empty, _ *Empty) error { + return nil +} + +func (l *Lifecycle) Shutdown(_ *Empty, _ *Empty) error { + l.state.mu.Lock() + defer l.state.mu.Unlock() + if !l.state.closed { + l.state.closed = true + close(l.state.shutdown) + } + + return nil +} diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_paging_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_paging_test.go new file mode 100644 index 0000000000..a4bb7d9fad --- /dev/null +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_paging_test.go @@ -0,0 +1,80 @@ +package userfaultfd + +// Paging RPC service. Exposes the per-page state snapshot consumed +// by tests asserting which offsets the handler observed, plus the +// pause/resume handles used by gated tests to drain the kernel's +// UFFD event queue deterministically. + +import ( + "errors" + "fmt" +) + +// pageStateEntry is the wire format for the Paging.States RPC. +type pageStateEntry struct { + State uint8 + Offset uint64 +} + +type PageStatesReply struct { + Entries []pageStateEntry +} + +// Paging is the RPC service exposing page-state introspection and +// the gated-serve pause/resume controls. 
+type Paging struct { + state *harnessState +} + +func (p *Paging) States(_ *Empty, reply *PageStatesReply) error { + p.state.mu.Lock() + uffd := p.state.uffd + p.state.mu.Unlock() + if uffd == nil { + return errors.New("Paging.States called before Lifecycle.Bootstrap") + } + + entries, err := uffd.pageStateEntries() + if err != nil { + return err + } + reply.Entries = entries + + return nil +} + +func (p *Paging) Pause(_ *Empty, _ *Empty) error { + p.state.stopServe() + + return nil +} + +func (p *Paging) Resume(_ *Empty, _ *Empty) error { + p.state.mu.Lock() + defer p.state.mu.Unlock() + p.state.startServeLocked() + + return nil +} + +// pageStateEntries returns a snapshot of every tracked page and its +// state. It briefly takes settleRequests.Lock so no in-flight worker +// can mutate the pageTracker while we read it. +func (u *Userfaultfd) pageStateEntries() ([]pageStateEntry, error) { + u.settleRequests.Lock() + u.settleRequests.Unlock() //nolint:staticcheck // SA2001: intentional — settle the read locks. + + u.pageTracker.mu.RLock() + defer u.pageTracker.mu.RUnlock() + + entries := make([]pageStateEntry, 0, len(u.pageTracker.m)) + for addr, state := range u.pageTracker.m { + offset, err := u.ma.GetOffset(addr) + if err != nil { + return nil, fmt.Errorf("address %#x not in mapping: %w", addr, err) + } + entries = append(entries, pageStateEntry{State: uint8(state), Offset: uint64(offset)}) + } + + return entries, nil +} From 0b4c5905ac7537c6c81d6c45175a3d05a90dd9d3 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Fri, 1 May 2026 16:00:31 -0700 Subject: [PATCH 08/22] refactor(uffd-tests): pass rpc socket via socketpair fd, drop env path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace net.Listen/Accept/Dial over a tmp Unix socket with a socketpair(2) fd pair handed to the child via cmd.ExtraFiles. 
Removes: - envSocketPath ("GO_UFFD_SOCKET") — only envHelperFlag remains - net.ListenConfig.Listen on a t.TempDir() path - listener.Accept with its 10s timeout select race - the temp socket path inode under t.TempDir() Side channels are now {GO_TEST_HELPER_PROCESS env var, uffd fd at fd 3, rpc socket fd at fd 4}. --- .../uffd/userfaultfd/harness_child_test.go | 50 +++++------ .../uffd/userfaultfd/harness_parent_test.go | 82 ++++++++----------- 2 files changed, 57 insertions(+), 75 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go index b016bb1dfe..71fcde1fa8 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go @@ -2,15 +2,15 @@ package userfaultfd // Child side of the cross-process UFFD test harness. The child is a // re-execed copy of the test binary entered through -// TestHelperServingProcess; it dials the parent's rendezvous socket, -// registers the three RPC services that share a single -// *harnessState container, and serves JSON-RPC until the parent -// issues Lifecycle.Shutdown. All actual work (build mapping, -// construct *Userfaultfd, install hooks, run Serve) is driven by -// Lifecycle.Bootstrap rather than env vars or extra fds. +// TestHelperServingProcess; it adopts the inherited rpc socket fd +// (fd 4, the parent's socketpair half), registers the three RPC +// services that share a single *harnessState container, and serves +// JSON-RPC until the parent issues Lifecycle.Shutdown. All actual +// work (build mapping, construct *Userfaultfd, install hooks, run +// Serve) is driven by Lifecycle.Bootstrap rather than env vars or +// extra fds. 
import ( - "context" "fmt" "net" "net/rpc" @@ -21,8 +21,9 @@ import ( // TestHelperServingProcess is the entry point for the child helper // process spawned by configureCrossProcessTest. The parent re-execs -// the test binary with envHelperFlag=1 and a socket path; this test -// hands off to crossProcessServe and exits with its result. +// the test binary with envHelperFlag=1 and the uffd / rpc fds in +// ExtraFiles; this test hands off to crossProcessServe and exits +// with its result. func TestHelperServingProcess(t *testing.T) { t.Parallel() @@ -38,28 +39,27 @@ func TestHelperServingProcess(t *testing.T) { os.Exit(0) } -// crossProcessServe wires up the child side: dial the parent socket, -// register the three RPC services that share a single harnessState, -// and run jsonrpc.ServeCodec until the parent shuts us down. +// crossProcessServe wires up the child side: adopt the inherited +// rpc socket fd, register the three RPC services that share a single +// harnessState, and run jsonrpc.ServeCodec until the parent shuts +// us down. func crossProcessServe() error { - socketPath := os.Getenv(envSocketPath) - if socketPath == "" { - return fmt.Errorf("missing %s", envSocketPath) - } + // The parent handed us two fds via cmd.ExtraFiles; the child-side + // dup3 inside fork+exec lands them at fd 3 (uffd) and fd 4 (rpc + // socketpair half) with CLOEXEC cleared automatically. + uffdFile := os.NewFile(uintptr(3), "uffd") + defer uffdFile.Close() - dialer := net.Dialer{} - conn, err := dialer.DialContext(context.Background(), "unix", socketPath) + rpcFile := os.NewFile(uintptr(4), "rpc") + conn, err := net.FileConn(rpcFile) + // FileConn dups the underlying fd; close rpcFile in both the + // success and error paths so we don't leak an fd. 
+ rpcFile.Close() if err != nil { - return fmt.Errorf("dial parent socket: %w", err) + return fmt.Errorf("net.FileConn rpc: %w", err) } defer conn.Close() - // The parent handed us the userfaultfd via cmd.ExtraFiles; the - // child-side dup3 inside fork+exec lands it at fd 3 with CLOEXEC - // cleared automatically. - uffdFile := os.NewFile(uintptr(3), "uffd") - defer uffdFile.Close() - state := newHarnessState(uffdFile.Fd()) server := rpc.NewServer() diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go index eeb4a4fa20..72bec240e5 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go @@ -10,10 +10,9 @@ package userfaultfd // Firecracker uses UFFD in production. // // The side channels between parent and child are intentionally -// minimal: a single env var flagging the child as the helper, an -// env var carrying the rendezvous socket path, the userfaultfd -// itself handed off via ExtraFiles (it's a kernel object, has to go -// through that), and the unix domain socket used for JSON-RPC. All +// minimal: a single env var flagging the child as the helper, the +// userfaultfd handed off via ExtraFiles at fd 3, and a socketpair(2) +// half handed off via ExtraFiles at fd 4 carrying JSON-RPC. All // configuration (mmap geometry, source content, feature toggles) // flows over a single Lifecycle.Bootstrap RPC. @@ -25,10 +24,8 @@ import ( "net" "os" "os/exec" - "path/filepath" "syscall" "testing" - "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -77,12 +74,9 @@ func RandomPages(pagesize, numberOfPages uint64) *MemorySlicer { return NewMemorySlicer(buf, int64(pagesize)) } -// Env vars used by the child helper process. The set is deliberately -// small: anything else lives in BootstrapArgs and flows over RPC. 
-const ( - envHelperFlag = "GO_TEST_HELPER_PROCESS" - envSocketPath = "GO_UFFD_SOCKET" -) +// Env var used by the child helper process. The set is deliberately +// minimal: anything else lives in BootstrapArgs and flows over RPC. +const envHelperFlag = "GO_TEST_HELPER_PROCESS" // configureCrossProcessTest spawns the helper child, hands it the // userfaultfd, and drives initial setup via Lifecycle.Bootstrap. @@ -108,16 +102,8 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) require.NoError(t, configureApi(uffdFd, tt.pagesize)) require.NoError(t, register(uffdFd, memoryStart, uint64(size), UFFDIO_REGISTER_MODE_MISSING|UFFDIO_REGISTER_MODE_WP)) - socketPath := filepath.Join(t.TempDir(), "rpc.sock") - listenCfg := net.ListenConfig{} - listener, err := listenCfg.Listen(ctx, "unix", socketPath) - require.NoError(t, err) - cmd := exec.CommandContext(ctx, os.Args[0], "-test.run=TestHelperServingProcess", "-test.timeout=0") - cmd.Env = append(os.Environ(), - envHelperFlag+"=1", - envSocketPath+"="+socketPath, - ) + cmd.Env = append(os.Environ(), envHelperFlag+"=1") // F_DUPFD_CLOEXEC duplicates uffdFd into a new fd that is born // with CLOEXEC set, atomically. The previous incarnation used @@ -130,43 +116,39 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) // version used to wrap cmd.Start. dup, err := unix.FcntlInt(uintptr(uffdFd), unix.F_DUPFD_CLOEXEC, 0) require.NoError(t, err) - uffdFile := os.NewFile(uintptr(dup), "uffd") - cmd.ExtraFiles = []*os.File{uffdFile} + + // socketpair gives us a connected AF_UNIX/SOCK_STREAM pair in one + // syscall, no listener / accept / dial / temp inode required. + // SOCK_CLOEXEC on the pair is the same rationale as F_DUPFD_CLOEXEC + // on the uffd fd above: both ends are born CLOEXEC so a concurrent + // fork in another goroutine cannot leak them. 
cmd.ExtraFiles will + // clear CLOEXEC on the destination fd in the child via dup3 inside + // ForkExec, which is exactly what we want for fd 4. + fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) + require.NoError(t, err) + parentEnd := os.NewFile(uintptr(fds[0]), "rpc-parent") + childEnd := os.NewFile(uintptr(fds[1]), "rpc-child") + + cmd.ExtraFiles = []*os.File{uffdFile, childEnd} cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr startErr := cmd.Start() uffdFile.Close() - require.NoError(t, startErr) - - // Accept the child's connection. Tight deadline so a wedged - // child surfaces fast instead of hanging the test. - type acceptResult struct { - conn net.Conn - err error - } - acceptCh := make(chan acceptResult, 1) - go func() { - c, err := listener.Accept() - acceptCh <- acceptResult{conn: c, err: err} - }() - - var conn net.Conn - select { - case res := <-acceptCh: - require.NoError(t, res.err) - conn = res.conn - case <-time.After(10 * time.Second): - listener.Close() - _ = cmd.Process.Kill() - _, _ = cmd.Process.Wait() - - return nil, errors.New("child did not connect within 10s") + childEnd.Close() + if startErr != nil { + parentEnd.Close() + require.NoError(t, startErr) } - listener.Close() - client := newHarnessClient(conn, cmd) + // FileConn dups the underlying fd, so closing parentEnd after is + // both correct and necessary to avoid leaking the original fd. 
+ parentConn, err := net.FileConn(parentEnd) + parentEnd.Close() + require.NoError(t, err) + + client := newHarnessClient(parentConn, cmd) h := &testHandler{ memoryArea: &memoryArea, From a55bc31941b0127027e2ac86b3a8b573421919c1 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Fri, 1 May 2026 16:38:05 -0700 Subject: [PATCH 09/22] =?UTF-8?q?test(uffd):=20rename=20pageStatesOnce=20?= =?UTF-8?q?=E2=86=92=20pageStates?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous Once-suffix was kept temporarily for call-site stability; the method has always been callable any number of times. Rename the method and update the eight call sites. --- .../pkg/sandbox/uffd/userfaultfd/helpers_test.go | 8 +++----- .../pkg/sandbox/uffd/userfaultfd/missing_test.go | 8 ++++---- .../pkg/sandbox/uffd/userfaultfd/missing_write_test.go | 8 ++++---- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go index 9b56622a52..d3d0dc21bd 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go @@ -83,11 +83,9 @@ type testHandler struct { mutex sync.RWMutex } -// pageStatesOnce returns a per-state snapshot of the handler's -// pageTracker, fetched via the Paging.States RPC. The "Once" suffix -// is kept for source-stability with the existing test sites; the -// method is safely callable any number of times. -func (h *testHandler) pageStatesOnce() (handlerPageStates, error) { +// pageStates returns a per-state snapshot of the handler's +// pageTracker, fetched via the Paging.States RPC. 
+func (h *testHandler) pageStates() (handlerPageStates, error) { entries, err := h.client.PageStates() if err != nil { return handlerPageStates{}, err diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_test.go index fac233ad5f..989915bfd3 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_test.go @@ -128,7 +128,7 @@ func TestMissing(t *testing.T) { expectedAccessedOffsets := getOperationsOffsets(tt.operations, operationModeRead|operationModeWrite) - states, err := h.pageStatesOnce() + states, err := h.pageStates() require.NoError(t, err) assert.Equal(t, expectedAccessedOffsets, states.allAccessed(), "checking which pages were faulted") @@ -169,7 +169,7 @@ func TestParallelMissing(t *testing.T) { expectedAccessedOffsets := getOperationsOffsets([]operation{readOp}, operationModeRead) - states, err := h.pageStatesOnce() + states, err := h.pageStates() require.NoError(t, err) assert.Equal(t, expectedAccessedOffsets, states.allAccessed(), "checking which pages were faulted") @@ -209,7 +209,7 @@ func TestParallelMissingWithPrefault(t *testing.T) { expectedAccessedOffsets := getOperationsOffsets([]operation{readOp}, operationModeRead) - states, err := h.pageStatesOnce() + states, err := h.pageStates() require.NoError(t, err) assert.Equal(t, expectedAccessedOffsets, states.allAccessed(), "checking which pages were faulted") @@ -240,7 +240,7 @@ func TestSerialMissing(t *testing.T) { expectedAccessedOffsets := getOperationsOffsets([]operation{readOp}, operationModeRead) - states, err := h.pageStatesOnce() + states, err := h.pageStates() require.NoError(t, err) assert.Equal(t, expectedAccessedOffsets, states.allAccessed(), "checking which pages were faulted") diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_write_test.go 
b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_write_test.go index 34e55f476a..caa835cbdb 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_write_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_write_test.go @@ -123,7 +123,7 @@ func TestMissingWrite(t *testing.T) { expectedAccessedOffsets := getOperationsOffsets(tt.operations, operationModeRead|operationModeWrite) - states, err := h.pageStatesOnce() + states, err := h.pageStates() require.NoError(t, err) assert.Equal(t, expectedAccessedOffsets, states.allAccessed(), "checking which pages were faulted") @@ -164,7 +164,7 @@ func TestParallelMissingWrite(t *testing.T) { expectedAccessedOffsets := getOperationsOffsets([]operation{writeOp}, operationModeRead|operationModeWrite) - states, err := h.pageStatesOnce() + states, err := h.pageStates() require.NoError(t, err) assert.Equal(t, expectedAccessedOffsets, states.allAccessed(), "checking which pages were faulted") @@ -204,7 +204,7 @@ func TestParallelMissingWriteWithPrefault(t *testing.T) { expectedAccessedOffsets := getOperationsOffsets([]operation{writeOp}, operationModeRead|operationModeWrite) - states, err := h.pageStatesOnce() + states, err := h.pageStates() require.NoError(t, err) assert.Equal(t, expectedAccessedOffsets, states.allAccessed(), "checking which pages were faulted") @@ -235,7 +235,7 @@ func TestSerialMissingWrite(t *testing.T) { expectedAccessedOffsets := getOperationsOffsets([]operation{writeOp}, operationModeRead|operationModeWrite) - states, err := h.pageStatesOnce() + states, err := h.pageStates() require.NoError(t, err) assert.Equal(t, expectedAccessedOffsets, states.allAccessed(), "checking which pages were faulted") From aa1d10c9b58daeedd6c10d4251d0b4e1b2918125 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Fri, 1 May 2026 16:43:08 -0700 Subject: [PATCH 10/22] refactor(uffd-tests): extract rpc wire types/client/barrier registry to internal/rpcharness MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the parts of the test harness that don't touch *Userfaultfd internals into a new internal subpackage: packages/.../uffd/userfaultfd/internal/rpcharness/ wire.go — Bootstrap/PageStates/FaultBarrier/Token args+replies client.go — typed Client wrapping *rpc.Client (Bootstrap, Pause, Resume, PageStates, InstallBarrier, WaitFaultHeld, ReleaseFault, Shutdown, Close) barriers.go — Registry + Point + HookFor The RPC service implementations stay as _test.go in userfaultfd (they need access to unexported pageStateEntries, settleRequests, pageTracker, defaultCopyMode etc.). They become smaller — wire types no longer locally defined; barrier registry consumed via interface boundary; testHandler holds a single *rpcharness.Client instead of {*rpc.Client, servePause, serveResume, pageStates} func fields. internal/ scoping ensures the testutil package can never leak into a production import path. Test-only. --- .../uffd/userfaultfd/harness_client_test.go | 81 ----- .../uffd/userfaultfd/harness_parent_test.go | 7 +- .../sandbox/uffd/userfaultfd/helpers_test.go | 3 +- .../internal/rpcharness/barriers.go | 161 ++++++++++ .../userfaultfd/internal/rpcharness/client.go | 76 +++++ .../userfaultfd/internal/rpcharness/wire.go | 63 ++++ .../uffd/userfaultfd/rpc_barriers_test.go | 218 -------------- .../uffd/userfaultfd/rpc_lifecycle_test.go | 191 ------------ .../uffd/userfaultfd/rpc_paging_test.go | 80 ----- .../uffd/userfaultfd/rpc_services_test.go | 285 ++++++++++++++++++ 10 files changed, 591 insertions(+), 574 deletions(-) delete mode 100644 packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_client_test.go create mode 100644 packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/barriers.go create mode 100644 packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/client.go create mode 100644 packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/wire.go 
delete mode 100644 packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_barriers_test.go delete mode 100644 packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_lifecycle_test.go delete mode 100644 packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_paging_test.go create mode 100644 packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_client_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_client_test.go deleted file mode 100644 index 3a944d9b5b..0000000000 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_client_test.go +++ /dev/null @@ -1,81 +0,0 @@ -package userfaultfd - -// Typed client wrapper around the parent ↔ child JSON-RPC channel. -// Hides the method-name strings, the Empty placeholder pointers, and -// the wire-vs-domain type translation from the testHandler and from -// the call sites in individual tests. - -import ( - "io" - "net" - "net/rpc" - "net/rpc/jsonrpc" - "os/exec" -) - -type harnessClient struct { - rpc *rpc.Client - conn io.Closer - // cmd is retained so a future cleanup or diagnostic can reach - // the underlying process; today the test cleanup in the parent - // drives Wait directly. 
- cmd *exec.Cmd -} - -func newHarnessClient(conn net.Conn, cmd *exec.Cmd) *harnessClient { - return &harnessClient{ - rpc: jsonrpc.NewClient(conn), - conn: conn, - cmd: cmd, - } -} - -func (c *harnessClient) Bootstrap(args BootstrapArgs) error { - return c.rpc.Call("Lifecycle.Bootstrap", &args, &BootstrapReply{}) -} - -func (c *harnessClient) WaitReady() error { - return c.rpc.Call("Lifecycle.WaitReady", &Empty{}, &Empty{}) -} - -func (c *harnessClient) Shutdown() error { - return c.rpc.Call("Lifecycle.Shutdown", &Empty{}, &Empty{}) -} - -func (c *harnessClient) Pause() error { - return c.rpc.Call("Paging.Pause", &Empty{}, &Empty{}) -} - -func (c *harnessClient) Resume() error { - return c.rpc.Call("Paging.Resume", &Empty{}, &Empty{}) -} - -func (c *harnessClient) PageStates() ([]pageStateEntry, error) { - var reply PageStatesReply - if err := c.rpc.Call("Paging.States", &Empty{}, &reply); err != nil { - return nil, err - } - - return reply.Entries, nil -} - -func (c *harnessClient) InstallBarrier(addr uint64, point barrierPoint) (uint64, error) { - var reply FaultBarrierReply - if err := c.rpc.Call("Barriers.Install", &FaultBarrierArgs{Addr: addr, Point: uint8(point)}, &reply); err != nil { - return 0, err - } - - return reply.Token, nil -} - -func (c *harnessClient) WaitFaultHeld(token uint64) error { - return c.rpc.Call("Barriers.WaitHeld", &TokenArgs{Token: token}, &Empty{}) -} - -func (c *harnessClient) ReleaseFault(token uint64) error { - return c.rpc.Call("Barriers.Release", &TokenArgs{Token: token}, &Empty{}) -} - -func (c *harnessClient) Close() error { - return c.conn.Close() -} diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go index 72bec240e5..8027558ab4 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go @@ -33,6 +33,7 @@ import 
( "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/block" "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/testutils" + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness" ) // MemorySlicer exposes a byte slice via the Slicer interface. @@ -81,7 +82,7 @@ const envHelperFlag = "GO_TEST_HELPER_PROCESS" // configureCrossProcessTest spawns the helper child, hands it the // userfaultfd, and drives initial setup via Lifecycle.Bootstrap. // All subsequent test interaction goes through the returned -// testHandler's *harnessClient. +// testHandler's *rpcharness.Client. func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) (*testHandler, error) { t.Helper() @@ -148,7 +149,7 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) parentEnd.Close() require.NoError(t, err) - client := newHarnessClient(parentConn, cmd) + client := rpcharness.NewClient(parentConn) h := &testHandler{ memoryArea: &memoryArea, @@ -157,7 +158,7 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) client: client, } - if err := client.Bootstrap(BootstrapArgs{ + if err := client.Bootstrap(rpcharness.BootstrapArgs{ MmapStart: uint64(memoryStart), Pagesize: int64(tt.pagesize), TotalSize: size, diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go index d3d0dc21bd..06a58f9ee5 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go @@ -15,6 +15,7 @@ import ( "github.com/stretchr/testify/require" "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/testutils" + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness" ) type testConfig struct { @@ -78,7 +79,7 @@ type testHandler struct { // 
Pause/Resume/PageStates/etc. flow through it; tests should // reach for the convenience methods on testHandler rather than // poking at client directly. - client *harnessClient + client *rpcharness.Client mutex sync.RWMutex } diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/barriers.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/barriers.go new file mode 100644 index 0000000000..4771a0f3a1 --- /dev/null +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/barriers.go @@ -0,0 +1,161 @@ +package rpcharness + +import ( + "context" + "fmt" + "sync" +) + +// Point identifies WHICH worker hook a barrier should park on. +type Point uint8 + +const ( + // BeforeRLock parks the worker BEFORE settleRequests.RLock(), + // i.e. before it can read the page state. Use this when a parallel + // writer needs the write lock immediately because no worker holds + // the read lock. + BeforeRLock Point = 1 + // BeforeFaultPage parks the worker AFTER it has taken + // settleRequests.RLock, but BEFORE the actual UFFDIO_COPY syscall. + // Use this when a parent operation must still return even though a + // worker holds RLock. + BeforeFaultPage Point = 2 +) + +// Registry is the child-process side of the barrier mechanism. The +// hooks installed on *Userfaultfd consult this registry by addr+point +// to decide whether to park, and the Barriers RPC handlers manipulate +// it from the parent over the socket. 
+type Registry struct { + mu sync.Mutex + next uint64 + tokens map[uint64]*slot + byKey map[key]uint64 +} + +type key struct { + addr uintptr + point Point +} + +type slot struct { + addr uintptr + point Point + arrived chan struct{} + release chan struct{} + arrivedOnce sync.Once +} + +func NewRegistry() *Registry { + return &Registry{ + tokens: make(map[uint64]*slot), + byKey: make(map[key]uint64), + } +} + +// Install registers a barrier at (addr, point) and returns the opaque +// token used by subsequent WaitArrived/Release calls. +func (r *Registry) Install(addr uintptr, point Point) uint64 { + r.mu.Lock() + defer r.mu.Unlock() + + r.next++ + token := r.next + s := &slot{ + addr: addr, + point: point, + arrived: make(chan struct{}), + release: make(chan struct{}), + } + r.tokens[token] = s + r.byKey[key{addr, point}] = token + + return token +} + +func (r *Registry) lookupByAddr(addr uintptr, point Point) *slot { + r.mu.Lock() + defer r.mu.Unlock() + + token, ok := r.byKey[key{addr, point}] + if !ok { + return nil + } + + return r.tokens[token] +} + +// WaitArrived blocks until the worker hook for token has reached the +// barrier point, or until ctx is cancelled. +func (r *Registry) WaitArrived(ctx context.Context, token uint64) error { + r.mu.Lock() + s, ok := r.tokens[token] + r.mu.Unlock() + if !ok { + return fmt.Errorf("unknown barrier token %d", token) + } + + select { + case <-s.arrived: + return nil + case <-ctx.Done(): + return ctx.Err() + } +} + +// Release frees the barrier identified by token, allowing any parked +// worker to proceed. Releasing an unknown token is a no-op. +func (r *Registry) Release(token uint64) { + r.mu.Lock() + s, ok := r.tokens[token] + delete(r.tokens, token) + if ok { + delete(r.byKey, key{s.addr, s.point}) + } + r.mu.Unlock() + + if !ok { + return + } + + select { + case <-s.release: + default: + close(s.release) + } +} + +// ReleaseAll releases every still-installed barrier. 
Called during +// child shutdown so that any parked worker can finish before the +// serve goroutine is joined. +func (r *Registry) ReleaseAll() { + r.mu.Lock() + tokens := make([]uint64, 0, len(r.tokens)) + for t := range r.tokens { + tokens = append(tokens, t) + } + r.mu.Unlock() + + for _, t := range tokens { + r.Release(t) + } +} + +// HookFor returns the function to assign to the matching test-hook +// field on *Userfaultfd. The returned function is a no-op for any +// (addr, point) pair that hasn't been Install'd, so non-targeted +// faults see no scheduling distortion. +func (r *Registry) HookFor(point Point) func(addr uintptr) { + return func(addr uintptr) { + s := r.lookupByAddr(addr, point) + if s == nil { + return + } + + s.arrivedOnce.Do(func() { + close(s.arrived) + }) + + <-s.release + } +} diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/client.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/client.go new file mode 100644 index 0000000000..6c2b3a934c --- /dev/null +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/client.go @@ -0,0 +1,76 @@ +package rpcharness + +import ( + "io" + "net/rpc" + "net/rpc/jsonrpc" +) + +// Client is the typed parent-side wrapper around the JSON-RPC channel +// to the child helper process. Hides method-name strings, the Empty +// placeholder pointers, and the wire-vs-domain type translation from +// every call site. +type Client struct { + rpc *rpc.Client + conn io.Closer +} + +// NewClient wraps an already-connected duplex stream (typically one +// half of a socketpair handed off via cmd.ExtraFiles) in a typed +// client. Closing the returned Client closes the underlying conn. 
+func NewClient(conn io.ReadWriteCloser) *Client { + return &Client{ + rpc: jsonrpc.NewClient(conn), + conn: conn, + } +} + +func (c *Client) Bootstrap(args BootstrapArgs) error { + return c.rpc.Call("Lifecycle.Bootstrap", &args, &BootstrapReply{}) +} + +func (c *Client) WaitReady() error { + return c.rpc.Call("Lifecycle.WaitReady", &Empty{}, &Empty{}) +} + +func (c *Client) Shutdown() error { + return c.rpc.Call("Lifecycle.Shutdown", &Empty{}, &Empty{}) +} + +func (c *Client) Pause() error { + return c.rpc.Call("Paging.Pause", &Empty{}, &Empty{}) +} + +func (c *Client) Resume() error { + return c.rpc.Call("Paging.Resume", &Empty{}, &Empty{}) +} + +func (c *Client) PageStates() ([]PageStateEntry, error) { + var reply PageStatesReply + if err := c.rpc.Call("Paging.States", &Empty{}, &reply); err != nil { + return nil, err + } + + return reply.Entries, nil +} + +func (c *Client) InstallBarrier(addr uintptr, point Point) (uint64, error) { + var reply FaultBarrierReply + if err := c.rpc.Call("Barriers.Install", &FaultBarrierArgs{Addr: uint64(addr), Point: uint8(point)}, &reply); err != nil { + return 0, err + } + + return reply.Token, nil +} + +func (c *Client) WaitFaultHeld(token uint64) error { + return c.rpc.Call("Barriers.WaitHeld", &TokenArgs{Token: token}, &Empty{}) +} + +func (c *Client) ReleaseFault(token uint64) error { + return c.rpc.Call("Barriers.Release", &TokenArgs{Token: token}, &Empty{}) +} + +func (c *Client) Close() error { + return c.conn.Close() +} diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/wire.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/wire.go new file mode 100644 index 0000000000..172f5282f7 --- /dev/null +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/wire.go @@ -0,0 +1,63 @@ +// Package rpcharness provides the wire types, typed RPC client, and +// barrier registry shared between the parent and child halves of the +// userfaultfd test harness. 
It deliberately knows nothing about +// *Userfaultfd internals so it can sit in a sibling internal package +// and never get pulled into a production import path. +package rpcharness + +// Empty is the stand-in args/reply type for net/rpc methods that take +// or return nothing meaningful. net/rpc still requires both pointers +// to be exported. +type Empty struct{} + +// BootstrapArgs is the single message the parent sends to drive +// child initialisation. Everything that used to flow over env vars or +// the content tmp file lives in this struct, base64-encoded by the +// JSON-RPC codec for byte slices. For typical test sizes (≤1MB) the +// encoding overhead is irrelevant; if a future test needs >10MB of +// source content, that PR can revisit and add chunking. +type BootstrapArgs struct { + MmapStart uint64 + Pagesize int64 + TotalSize int64 + AlwaysWP bool + // Barriers gates the test-only worker hooks. Off by default so + // the worker hot path stays a single nil-pointer load + branch + // in non-race tests. + Barriers bool + Content []byte +} + +// BootstrapReply is empty today; kept as a named type so a future +// reply field can be added without touching every call site. +type BootstrapReply struct{} + +// PageStateEntry is the wire format for the Paging.States RPC. +// State is the raw uint8 value of the parent package's pageState +// enum; the parent translates back at the boundary. +type PageStateEntry struct { + State uint8 + Offset uint64 +} + +// PageStatesReply carries the snapshot returned by Paging.States. +type PageStatesReply struct { + Entries []PageStateEntry +} + +// FaultBarrierArgs requests installation of a barrier at (Addr, Point). +type FaultBarrierArgs struct { + Addr uint64 + Point uint8 +} + +// FaultBarrierReply carries the opaque token used by subsequent +// WaitHeld / Release calls. +type FaultBarrierReply struct { + Token uint64 +} + +// TokenArgs carries the opaque token returned by FaultBarrierReply. 
+type TokenArgs struct { + Token uint64 +} diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_barriers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_barriers_test.go deleted file mode 100644 index 1a39d95201..0000000000 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_barriers_test.go +++ /dev/null @@ -1,218 +0,0 @@ -package userfaultfd - -// Barriers RPC service plus the in-child barrierRegistry it -// manipulates. The hooks installed on Userfaultfd consult this -// registry by (addr, point) to decide whether to park a worker -// goroutine; the parent drives Install / WaitHeld / Release over RPC -// to deterministically race events against an in-flight worker. - -import ( - "context" - "errors" - "fmt" - "sync" -) - -type FaultBarrierArgs struct { - Addr uint64 - Point uint8 -} - -type FaultBarrierReply struct { - Token uint64 -} - -type TokenArgs struct { - Token uint64 -} - -// Barriers is the RPC service exposing the barrier registry to the -// parent. -type Barriers struct { - state *harnessState -} - -func (b *Barriers) Install(args *FaultBarrierArgs, reply *FaultBarrierReply) error { - br, err := b.registry() - if err != nil { - return err - } - reply.Token = br.install(uintptr(args.Addr), barrierPoint(args.Point)) - - return nil -} - -func (b *Barriers) WaitHeld(args *TokenArgs, _ *Empty) error { - br, err := b.registry() - if err != nil { - return err - } - - return br.waitArrived(context.Background(), args.Token) -} - -func (b *Barriers) Release(args *TokenArgs, _ *Empty) error { - br, err := b.registry() - if err != nil { - return err - } - br.release(args.Token) - - return nil -} - -func (b *Barriers) registry() (*barrierRegistry, error) { - b.state.mu.Lock() - br := b.state.br - b.state.mu.Unlock() - if br == nil { - return nil, errors.New("Barriers RPC called before Lifecycle.Bootstrap") - } - - return br, nil -} - -// barrierPoint identifies WHICH hook a barrier should park on. 
-type barrierPoint uint8 - -const ( - // barrierBeforeRLock parks the worker BEFORE settleRequests.RLock(), - // i.e. before it can read the page state. Use this when a parallel - // writer needs the write lock immediately because no worker holds - // the read lock. - barrierBeforeRLock barrierPoint = 1 - // barrierBeforeFaultPage parks the worker AFTER it has taken - // settleRequests.RLock, but BEFORE the actual UFFDIO_COPY syscall. - // Use this when a parent operation must still return even though - // a worker holds RLock. - barrierBeforeFaultPage barrierPoint = 2 -) - -// barrierRegistry is the child-process side of the barrier. The -// hooks installed on Userfaultfd consult this registry by addr+point -// to decide whether to park, and the RPC handlers manipulate it from -// the parent over the socket. -type barrierRegistry struct { - mu sync.Mutex - next uint64 - tokens map[uint64]*barrierSlot - byKey map[barrierKey]uint64 -} - -type barrierKey struct { - addr uintptr - point barrierPoint -} - -type barrierSlot struct { - addr uintptr - point barrierPoint - arrived chan struct{} - release chan struct{} - arrivedOnce sync.Once -} - -func newBarrierRegistry() *barrierRegistry { - return &barrierRegistry{ - tokens: make(map[uint64]*barrierSlot), - byKey: make(map[barrierKey]uint64), - } -} - -func (b *barrierRegistry) install(addr uintptr, point barrierPoint) uint64 { - b.mu.Lock() - defer b.mu.Unlock() - - b.next++ - token := b.next - slot := &barrierSlot{ - addr: addr, - point: point, - arrived: make(chan struct{}), - release: make(chan struct{}), - } - b.tokens[token] = slot - b.byKey[barrierKey{addr, point}] = token - - return token -} - -func (b *barrierRegistry) lookupByAddr(addr uintptr, point barrierPoint) *barrierSlot { - b.mu.Lock() - defer b.mu.Unlock() - - token, ok := b.byKey[barrierKey{addr, point}] - if !ok { - return nil - } - - return b.tokens[token] -} - -func (b *barrierRegistry) waitArrived(ctx context.Context, token uint64) error { - 
b.mu.Lock() - slot, ok := b.tokens[token] - b.mu.Unlock() - if !ok { - return fmt.Errorf("unknown barrier token %d", token) - } - - select { - case <-slot.arrived: - return nil - case <-ctx.Done(): - return ctx.Err() - } -} - -func (b *barrierRegistry) release(token uint64) { - b.mu.Lock() - slot, ok := b.tokens[token] - delete(b.tokens, token) - if ok { - delete(b.byKey, barrierKey{slot.addr, slot.point}) - } - b.mu.Unlock() - - if !ok { - return - } - - select { - case <-slot.release: - default: - close(slot.release) - } -} - -func (b *barrierRegistry) releaseAll() { - b.mu.Lock() - tokens := make([]uint64, 0, len(b.tokens)) - for t := range b.tokens { - tokens = append(tokens, t) - } - b.mu.Unlock() - - for _, t := range tokens { - b.release(t) - } -} - -// hookFor returns the function to assign to a Userfaultfd -// beforeXxxHook field. The returned function is a no-op for any -// (addr, point) pair that hasn't been Install'd, so non-targeted -// faults see no scheduling distortion. -func (b *barrierRegistry) hookFor(point barrierPoint) func(addr uintptr) { - return func(addr uintptr) { - slot := b.lookupByAddr(addr, point) - if slot == nil { - return - } - - slot.arrivedOnce.Do(func() { - close(slot.arrived) - }) - - <-slot.release - } -} diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_lifecycle_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_lifecycle_test.go deleted file mode 100644 index 2669394723..0000000000 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_lifecycle_test.go +++ /dev/null @@ -1,191 +0,0 @@ -package userfaultfd - -// Lifecycle RPC service plus the harnessState container that the -// three RPC services share. Lifecycle.Bootstrap is the single point -// where the in-child *Userfaultfd is constructed; Paging and -// Barriers consume the resulting state via *harnessState. 
- -import ( - "context" - "fmt" - "os" - "sync" - - "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/fdexit" - "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/memory" - "github.com/e2b-dev/infra/packages/shared/pkg/logger" -) - -// Empty is the stand-in args/reply type for net/rpc methods that -// take or return nothing meaningful. net/rpc still requires both -// pointers to be exported. -type Empty struct{} - -// harnessState is the per-child container holding the resources -// created by Lifecycle.Bootstrap and consumed by Paging/Barriers. -// All three RPC services hold a *harnessState rather than -// duplicating fields. Mutable fields are guarded by mu. -type harnessState struct { - uffdFd uintptr - - mu sync.Mutex - uffd *Userfaultfd - br *barrierRegistry - stop func() // currently active serve-stop function, nil if paused - shutdown chan struct{} - closed bool -} - -func newHarnessState(uffdFd uintptr) *harnessState { - return &harnessState{ - uffdFd: uffdFd, - shutdown: make(chan struct{}), - } -} - -// startServeLocked spawns the uffd Serve goroutine and stores its -// stop fn. Caller must hold s.mu. Idempotent: if the serve goroutine -// is already running (s.stop != nil) this is a no-op so a stray -// duplicate Resume can't leak an untracked Serve goroutine and break -// later pauses. 
-func (s *harnessState) startServeLocked() { - if s.stop != nil { - return - } - - exit, err := fdexit.New() - if err != nil { - fmt.Fprintln(os.Stderr, "fdexit.New:", err) - - return - } - - uffd := s.uffd - done := make(chan struct{}) - go func() { - defer close(done) - if err := uffd.Serve(context.Background(), exit); err != nil { - fmt.Fprintln(os.Stderr, "uffd.Serve:", err) - } - }() - - s.stop = func() { - _ = exit.SignalExit() - <-done - exit.Close() - } -} - -func (s *harnessState) stopServe() { - s.mu.Lock() - defer s.mu.Unlock() - if s.stop != nil { - s.stop() - s.stop = nil - } -} - -func (s *harnessState) releaseAllBarriers() { - s.mu.Lock() - br := s.br - s.mu.Unlock() - if br != nil { - br.releaseAll() - } -} - -// BootstrapArgs is the single message the parent sends to drive -// child initialisation. Everything that used to flow over env vars -// or the content tmp file now lives in this struct, base64-encoded -// by the JSON-RPC codec for byte slices. For typical test sizes -// (≤1MB) the encoding overhead is irrelevant; if a future test needs -// >10MB of source content, that PR can revisit and add chunking. -type BootstrapArgs struct { - MmapStart uint64 - Pagesize int64 - TotalSize int64 - AlwaysWP bool - // Barriers gates the test-only worker hooks. Off by default so - // the worker hot path stays a single nil-pointer load + branch - // in non-race tests. - Barriers bool - Content []byte -} - -type BootstrapReply struct{} - -// Lifecycle owns the boot/shutdown of the in-child uffd. Bootstrap -// builds the *Userfaultfd, optionally installs the test hooks, and -// kicks off the initial Serve goroutine. Shutdown signals the -// crossProcessServe loop to exit; the goroutine + barrier registry -// are torn down there so we don't have to hold any mutex while -// joining the serve goroutine. 
-type Lifecycle struct { - state *harnessState -} - -func (l *Lifecycle) Bootstrap(args *BootstrapArgs, _ *BootstrapReply) error { - if int64(len(args.Content)) != args.TotalSize { - return fmt.Errorf("content size %d != expected %d", len(args.Content), args.TotalSize) - } - - data := NewMemorySlicer(args.Content, args.Pagesize) - - mapping := memory.NewMapping([]memory.Region{ - { - BaseHostVirtAddr: uintptr(args.MmapStart), - Size: uintptr(args.TotalSize), - Offset: 0, - PageSize: uintptr(args.Pagesize), - }, - }) - - log, err := logger.NewDevelopmentLogger() - if err != nil { - return fmt.Errorf("logger: %w", err) - } - - uffd, err := NewUserfaultfdFromFd(l.state.uffdFd, data, mapping, log) - if err != nil { - return fmt.Errorf("NewUserfaultfdFromFd: %w", err) - } - - if args.AlwaysWP { - uffd.defaultCopyMode = UFFDIO_COPY_MODE_WP - } - - br := newBarrierRegistry() - if args.Barriers { - uffd.SetTestHooks(&testHooks{ - beforeWorkerRLock: br.hookFor(barrierBeforeRLock), - beforeFaultPage: br.hookFor(barrierBeforeFaultPage), - }) - } - - l.state.mu.Lock() - defer l.state.mu.Unlock() - l.state.uffd = uffd - l.state.br = br - l.state.startServeLocked() - - return nil -} - -// WaitReady is a no-op: Bootstrap is synchronous, so its successful -// reply already implies readiness. WaitReady is kept as a separate -// RPC so that an async-Bootstrap variant can be slotted in later -// without touching every call site. 
-func (l *Lifecycle) WaitReady(_ *Empty, _ *Empty) error { - return nil -} - -func (l *Lifecycle) Shutdown(_ *Empty, _ *Empty) error { - l.state.mu.Lock() - defer l.state.mu.Unlock() - if !l.state.closed { - l.state.closed = true - close(l.state.shutdown) - } - - return nil -} diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_paging_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_paging_test.go deleted file mode 100644 index a4bb7d9fad..0000000000 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_paging_test.go +++ /dev/null @@ -1,80 +0,0 @@ -package userfaultfd - -// Paging RPC service. Exposes the per-page state snapshot consumed -// by tests asserting which offsets the handler observed, plus the -// pause/resume handles used by gated tests to drain the kernel's -// UFFD event queue deterministically. - -import ( - "errors" - "fmt" -) - -// pageStateEntry is the wire format for the Paging.States RPC. -type pageStateEntry struct { - State uint8 - Offset uint64 -} - -type PageStatesReply struct { - Entries []pageStateEntry -} - -// Paging is the RPC service exposing page-state introspection and -// the gated-serve pause/resume controls. -type Paging struct { - state *harnessState -} - -func (p *Paging) States(_ *Empty, reply *PageStatesReply) error { - p.state.mu.Lock() - uffd := p.state.uffd - p.state.mu.Unlock() - if uffd == nil { - return errors.New("Paging.States called before Lifecycle.Bootstrap") - } - - entries, err := uffd.pageStateEntries() - if err != nil { - return err - } - reply.Entries = entries - - return nil -} - -func (p *Paging) Pause(_ *Empty, _ *Empty) error { - p.state.stopServe() - - return nil -} - -func (p *Paging) Resume(_ *Empty, _ *Empty) error { - p.state.mu.Lock() - defer p.state.mu.Unlock() - p.state.startServeLocked() - - return nil -} - -// pageStateEntries returns a snapshot of every tracked page and its -// state. 
It briefly takes settleRequests.Lock so no in-flight worker -// can mutate the pageTracker while we read it. -func (u *Userfaultfd) pageStateEntries() ([]pageStateEntry, error) { - u.settleRequests.Lock() - u.settleRequests.Unlock() //nolint:staticcheck // SA2001: intentional — settle the read locks. - - u.pageTracker.mu.RLock() - defer u.pageTracker.mu.RUnlock() - - entries := make([]pageStateEntry, 0, len(u.pageTracker.m)) - for addr, state := range u.pageTracker.m { - offset, err := u.ma.GetOffset(addr) - if err != nil { - return nil, fmt.Errorf("address %#x not in mapping: %w", addr, err) - } - entries = append(entries, pageStateEntry{State: uint8(state), Offset: uint64(offset)}) - } - - return entries, nil -} diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go new file mode 100644 index 0000000000..9d2b638574 --- /dev/null +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go @@ -0,0 +1,285 @@ +package userfaultfd + +// RPC service implementations for the cross-process UFFD test +// harness. These live in _test.go (rather than the sibling +// internal/rpcharness package) because they need access to the +// unexported pageState / pageStateEntries / settleRequests / +// pageTracker / defaultCopyMode internals on *Userfaultfd. The wire +// types, typed Client, and barrier registry they consume are exported +// from internal/rpcharness so they cannot leak into a production +// import path. 
+// +// Three services are registered against the same *harnessState: +// +// Lifecycle.Bootstrap / WaitReady / Shutdown +// Paging.States / Pause / Resume +// Barriers.Install / WaitHeld / Release + +import ( + "context" + "errors" + "fmt" + "os" + "sync" + + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/fdexit" + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/memory" + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness" + "github.com/e2b-dev/infra/packages/shared/pkg/logger" +) + +// harnessState is the per-child container holding the resources +// created by Lifecycle.Bootstrap and consumed by Paging/Barriers. +// All three RPC services hold a *harnessState rather than duplicating +// fields. Mutable fields are guarded by mu. +type harnessState struct { + uffdFd uintptr + + mu sync.Mutex + uffd *Userfaultfd + br *rpcharness.Registry + stop func() // currently active serve-stop function, nil if paused + shutdown chan struct{} + closed bool +} + +func newHarnessState(uffdFd uintptr) *harnessState { + return &harnessState{ + uffdFd: uffdFd, + shutdown: make(chan struct{}), + } +} + +// startServeLocked spawns the uffd Serve goroutine and stores its +// stop fn. Caller must hold s.mu. Idempotent: if the serve goroutine +// is already running (s.stop != nil) this is a no-op so a stray +// duplicate Resume can't leak an untracked Serve goroutine and break +// later pauses. 
+func (s *harnessState) startServeLocked() { + if s.stop != nil { + return + } + + exit, err := fdexit.New() + if err != nil { + fmt.Fprintln(os.Stderr, "fdexit.New:", err) + + return + } + + uffd := s.uffd + done := make(chan struct{}) + go func() { + defer close(done) + if err := uffd.Serve(context.Background(), exit); err != nil { + fmt.Fprintln(os.Stderr, "uffd.Serve:", err) + } + }() + + s.stop = func() { + _ = exit.SignalExit() + <-done + exit.Close() + } +} + +func (s *harnessState) stopServe() { + s.mu.Lock() + defer s.mu.Unlock() + if s.stop != nil { + s.stop() + s.stop = nil + } +} + +func (s *harnessState) releaseAllBarriers() { + s.mu.Lock() + br := s.br + s.mu.Unlock() + if br != nil { + br.ReleaseAll() + } +} + +// Lifecycle owns the boot/shutdown of the in-child uffd. Bootstrap +// builds the *Userfaultfd, optionally installs the test hooks, and +// kicks off the initial Serve goroutine. Shutdown signals the +// crossProcessServe loop to exit; the goroutine + barrier registry +// are torn down there so we don't have to hold any mutex while +// joining the serve goroutine. 
+type Lifecycle struct { + state *harnessState +} + +func (l *Lifecycle) Bootstrap(args *rpcharness.BootstrapArgs, _ *rpcharness.BootstrapReply) error { + if int64(len(args.Content)) != args.TotalSize { + return fmt.Errorf("content size %d != expected %d", len(args.Content), args.TotalSize) + } + + data := NewMemorySlicer(args.Content, args.Pagesize) + + mapping := memory.NewMapping([]memory.Region{ + { + BaseHostVirtAddr: uintptr(args.MmapStart), + Size: uintptr(args.TotalSize), + Offset: 0, + PageSize: uintptr(args.Pagesize), + }, + }) + + log, err := logger.NewDevelopmentLogger() + if err != nil { + return fmt.Errorf("logger: %w", err) + } + + uffd, err := NewUserfaultfdFromFd(l.state.uffdFd, data, mapping, log) + if err != nil { + return fmt.Errorf("NewUserfaultfdFromFd: %w", err) + } + + if args.AlwaysWP { + uffd.defaultCopyMode = UFFDIO_COPY_MODE_WP + } + + br := rpcharness.NewRegistry() + if args.Barriers { + uffd.SetTestHooks(&testHooks{ + beforeWorkerRLock: br.HookFor(rpcharness.BeforeRLock), + beforeFaultPage: br.HookFor(rpcharness.BeforeFaultPage), + }) + } + + l.state.mu.Lock() + defer l.state.mu.Unlock() + l.state.uffd = uffd + l.state.br = br + l.state.startServeLocked() + + return nil +} + +// WaitReady is a no-op: Bootstrap is synchronous, so its successful +// reply already implies readiness. WaitReady is kept as a separate +// RPC so that an async-Bootstrap variant can be slotted in later +// without touching every call site. +func (l *Lifecycle) WaitReady(_ *rpcharness.Empty, _ *rpcharness.Empty) error { + return nil +} + +func (l *Lifecycle) Shutdown(_ *rpcharness.Empty, _ *rpcharness.Empty) error { + l.state.mu.Lock() + defer l.state.mu.Unlock() + if !l.state.closed { + l.state.closed = true + close(l.state.shutdown) + } + + return nil +} + +// Paging is the RPC service exposing page-state introspection and +// the gated-serve pause/resume controls. 
+type Paging struct { + state *harnessState +} + +func (p *Paging) States(_ *rpcharness.Empty, reply *rpcharness.PageStatesReply) error { + p.state.mu.Lock() + uffd := p.state.uffd + p.state.mu.Unlock() + if uffd == nil { + return errors.New("Paging.States called before Lifecycle.Bootstrap") + } + + entries, err := uffd.pageStateEntries() + if err != nil { + return err + } + reply.Entries = entries + + return nil +} + +func (p *Paging) Pause(_ *rpcharness.Empty, _ *rpcharness.Empty) error { + p.state.stopServe() + + return nil +} + +func (p *Paging) Resume(_ *rpcharness.Empty, _ *rpcharness.Empty) error { + p.state.mu.Lock() + defer p.state.mu.Unlock() + p.state.startServeLocked() + + return nil +} + +// pageStateEntries returns a snapshot of every tracked page and its +// state, translated into the rpcharness wire format. Briefly takes +// settleRequests.Lock so no in-flight worker can mutate the +// pageTracker while we read it. +func (u *Userfaultfd) pageStateEntries() ([]rpcharness.PageStateEntry, error) { + u.settleRequests.Lock() + u.settleRequests.Unlock() //nolint:staticcheck // SA2001: intentional — settle the read locks. + + u.pageTracker.mu.RLock() + defer u.pageTracker.mu.RUnlock() + + entries := make([]rpcharness.PageStateEntry, 0, len(u.pageTracker.m)) + for addr, state := range u.pageTracker.m { + offset, err := u.ma.GetOffset(addr) + if err != nil { + return nil, fmt.Errorf("address %#x not in mapping: %w", addr, err) + } + entries = append(entries, rpcharness.PageStateEntry{State: uint8(state), Offset: uint64(offset)}) + } + + return entries, nil +} + +// Barriers is the RPC service exposing the rpcharness.Registry to the +// parent. It's a thin wrapper so all the locking/lifecycle logic stays +// in rpcharness. 
+type Barriers struct { + state *harnessState +} + +func (b *Barriers) Install(args *rpcharness.FaultBarrierArgs, reply *rpcharness.FaultBarrierReply) error { + br, err := b.registry() + if err != nil { + return err + } + reply.Token = br.Install(uintptr(args.Addr), rpcharness.Point(args.Point)) + + return nil +} + +func (b *Barriers) WaitHeld(args *rpcharness.TokenArgs, _ *rpcharness.Empty) error { + br, err := b.registry() + if err != nil { + return err + } + + return br.WaitArrived(context.Background(), args.Token) +} + +func (b *Barriers) Release(args *rpcharness.TokenArgs, _ *rpcharness.Empty) error { + br, err := b.registry() + if err != nil { + return err + } + br.Release(args.Token) + + return nil +} + +func (b *Barriers) registry() (*rpcharness.Registry, error) { + b.state.mu.Lock() + br := b.state.br + b.state.mu.Unlock() + if br == nil { + return nil, errors.New("Barriers RPC called before Lifecycle.Bootstrap") + } + + return br, nil +} From 1d0e876f2f69d1dd9109bcdeb1d4ca69dfcfcc0f Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Fri, 1 May 2026 18:25:29 -0700 Subject: [PATCH 11/22] refactor(uffd): collapse test-only hooks into single phase-dispatched fn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Production userfaultfd.go drops the testHooks struct, the two callBefore* methods, and their multi-line test-explaining comments. What remains is a single atomic.Pointer[func(uintptr, faultPhase)] field plus inline call sites — the floor without build tags. * rpcharness.Registry.HookFor(point) collapses into Hook(): one closure dispatches by (addr, point) instead of binding the point in the closure. * Bootstrap wires a single SetTestFaultHook with an explicit faultPhase → rpcharness.Point switch, since the two enums use different numeric values intentionally. 
--- .../sandbox/uffd/userfaultfd/hooks_test.go | 15 ++++-- .../internal/rpcharness/barriers.go | 11 ++-- .../uffd/userfaultfd/rpc_services_test.go | 15 ++++-- .../sandbox/uffd/userfaultfd/userfaultfd.go | 53 +++++++------------ 4 files changed, 46 insertions(+), 48 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/hooks_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/hooks_test.go index 85384c9f1a..bb5dc70f41 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/hooks_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/hooks_test.go @@ -1,8 +1,13 @@ package userfaultfd -// SetTestHooks installs the given hooks atomically. Pass nil to clear. -// Only available in test builds (this file is _test.go), so production -// binaries cannot reach it. -func (u *Userfaultfd) SetTestHooks(h *testHooks) { - u.testHooks.Store(h) +// SetTestFaultHook installs the per-fault test hook atomically. Pass nil to +// clear. Defined here (in a _test.go file) so production binaries cannot +// reach it. +func (u *Userfaultfd) SetTestFaultHook(h func(uintptr, faultPhase)) { + if h == nil { + u.testFaultHook.Store(nil) + + return + } + u.testFaultHook.Store(&h) } diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/barriers.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/barriers.go index 4771a0f3a1..2b02a44028 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/barriers.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/barriers.go @@ -141,12 +141,11 @@ func (r *Registry) ReleaseAll() { } } -// HookFor returns the function to assign to the matching test-hook -// field on *Userfaultfd. The returned function is a no-op for any -// (addr, point) pair that hasn't been Install'd, so non-targeted -// faults see no scheduling distortion. 
-func (r *Registry) HookFor(point Point) func(addr uintptr) { - return func(addr uintptr) { +// Hook returns the function tests assign as the per-fault hook on +// *Userfaultfd. The returned closure dispatches by (addr, point): +// pages/points that haven't been Install'd see no scheduling distortion. +func (r *Registry) Hook() func(addr uintptr, point Point) { + return func(addr uintptr, point Point) { s := r.lookupByAddr(addr, point) if s == nil { return diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go index 9d2b638574..4691ce6910 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go @@ -143,9 +143,18 @@ func (l *Lifecycle) Bootstrap(args *rpcharness.BootstrapArgs, _ *rpcharness.Boot br := rpcharness.NewRegistry() if args.Barriers { - uffd.SetTestHooks(&testHooks{ - beforeWorkerRLock: br.HookFor(rpcharness.BeforeRLock), - beforeFaultPage: br.HookFor(rpcharness.BeforeFaultPage), + hook := br.Hook() + uffd.SetTestFaultHook(func(addr uintptr, p faultPhase) { + var point rpcharness.Point + switch p { + case faultPhaseBeforeRLock: + point = rpcharness.BeforeRLock + case faultPhaseBeforeFaultPage: + point = rpcharness.BeforeFaultPage + default: + return + } + hook(addr, point) }) } diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go index 94737507db..72a970797f 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go @@ -64,42 +64,21 @@ type Userfaultfd struct { // defaultCopyMode overrides the UFFDIO_COPY mode for all faults when non-zero. 
defaultCopyMode CULong - // testHooks is nil in production; tests set it via SetTestHooks (defined in - // a _test.go file, so production binaries cannot reach the setter). All hook - // fields default to nil and the call sites no-op when unset. - testHooks atomic.Pointer[testHooks] + // installed only by SetTestFaultHook in test builds; nil in production. + testFaultHook atomic.Pointer[func(uintptr, faultPhase)] logger logger.Logger } -// testHooks bundles the optional test-only synchronisation callbacks. A nil -// callback means the corresponding call site is a no-op. Tests use these to -// park a worker goroutine at a known point so a racing event (REMOVE, MISSING) -// can be issued deterministically before the worker proceeds. -type testHooks struct { - // beforeWorkerRLock is called as the very first thing in the worker - // goroutine, BEFORE settleRequests.RLock(). Lets a test hold the - // goroutine before it can claim the read lock so a parallel writer can - // take the write lock immediately. - beforeWorkerRLock func(addr uintptr) - // beforeFaultPage is called inside the worker AFTER RLock and BEFORE the - // actual UFFDIO_COPY/UFFDIO_ZEROPAGE syscall. Lets a test simulate a - // slow data fetch / in-flight COPY so a parent operation can race - // against an in-flight worker. - beforeFaultPage func(addr uintptr) -} +// faultPhase identifies WHEN inside the worker the (test-only) fault hook is +// invoked. Production builds never install a hook (testFaultHook is nil); the +// per-fault overhead is then a single atomic load + nil check per call site. 
+type faultPhase uint8 -func (u *Userfaultfd) callBeforeWorkerRLock(addr uintptr) { - if h := u.testHooks.Load(); h != nil && h.beforeWorkerRLock != nil { - h.beforeWorkerRLock(addr) - } -} - -func (u *Userfaultfd) callBeforeFaultPage(addr uintptr) { - if h := u.testHooks.Load(); h != nil && h.beforeFaultPage != nil { - h.beforeFaultPage(addr) - } -} +const ( + faultPhaseBeforeRLock faultPhase = iota + faultPhaseBeforeFaultPage +) // NewUserfaultfdFromFd creates a new userfaultfd instance with optional configuration. func NewUserfaultfdFromFd(fd uintptr, src block.Slicer, m *memory.Mapping, logger logger.Logger) (*Userfaultfd, error) { @@ -285,7 +264,9 @@ func (u *Userfaultfd) Serve( // For the write to be executed, we first need to copy the page from the source to the guest memory. if flags&UFFD_PAGEFAULT_FLAG_WRITE != 0 { u.wg.Go(func() error { - u.callBeforeWorkerRLock(addr) + if h := u.testFaultHook.Load(); h != nil { + (*h)(addr, faultPhaseBeforeRLock) + } return u.faultPage(ctx, addr, offset, u.src, fdExit.SignalExit, block.Write) }) @@ -297,7 +278,9 @@ func (u *Userfaultfd) Serve( // If the event has no flags, it was a read to a missing page and we need to copy the page from the source to the guest memory. 
if flags == 0 { u.wg.Go(func() error { - u.callBeforeWorkerRLock(addr) + if h := u.testFaultHook.Load(); h != nil { + (*h)(addr, faultPhaseBeforeRLock) + } return u.faultPage(ctx, addr, offset, u.src, fdExit.SignalExit, block.Read) }) @@ -338,7 +321,9 @@ func (u *Userfaultfd) faultPage( u.settleRequests.RLock() defer u.settleRequests.RUnlock() - u.callBeforeFaultPage(addr) + if h := u.testFaultHook.Load(); h != nil { + (*h)(addr, faultPhaseBeforeFaultPage) + } defer func() { if r := recover(); r != nil { From 729917b304881d2026f18d88aca0cfa8cb3ad97c Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Fri, 1 May 2026 18:47:53 -0700 Subject: [PATCH 12/22] =?UTF-8?q?chore(uffd-tests):=20tighten=20harness=20?= =?UTF-8?q?diff=20=E2=80=94=20trim=20comments,=20drop=20dead=20code,=20ren?= =?UTF-8?q?ame=20NewUserfaultfdFromFd=20=E2=86=92=20NewFromFd?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pure cleanup with two intentional behavior-equivalent tweaks: - Flatten Paging.Resume to call state.startServe(), the locked wrapper around startServeLocked, mirroring the Pause/stopServe pattern. - Align rpcharness.Point iota with userfaultfd.faultPhase iota (both now 0/1) so the test fault hook can pass values through with a numeric cast instead of a switch. The renamed constructor keeps "wraps an existing fd" intent visible at the call site: every caller today wraps an fd received over a socket or via ExtraFiles; nobody asks the constructor to also run the userfaultfd(2) syscall and register the region. 
--- .../orchestrator/pkg/sandbox/uffd/uffd.go | 2 +- .../uffd/userfaultfd/harness_child_test.go | 31 +------ .../uffd/userfaultfd/harness_parent_test.go | 71 +++------------ .../sandbox/uffd/userfaultfd/helpers_test.go | 19 ++-- .../sandbox/uffd/userfaultfd/hooks_test.go | 5 +- .../internal/rpcharness/barriers.go | 39 ++++----- .../userfaultfd/internal/rpcharness/client.go | 9 +- .../userfaultfd/internal/rpcharness/wire.go | 21 +---- .../uffd/userfaultfd/rpc_services_test.go | 86 +++++++------------ .../sandbox/uffd/userfaultfd/userfaultfd.go | 11 +-- 10 files changed, 88 insertions(+), 206 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/uffd.go b/packages/orchestrator/pkg/sandbox/uffd/uffd.go index 5fe3e5370b..4e60af28cb 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/uffd.go +++ b/packages/orchestrator/pkg/sandbox/uffd/uffd.go @@ -164,7 +164,7 @@ func (u *Uffd) handle(ctx context.Context, sandboxId string, fdExit *fdexit.FdEx m := memory.NewMapping(regions) - uffd, err := userfaultfd.NewUserfaultfdFromFd( + uffd, err := userfaultfd.NewFromFd( uintptr(fds[0]), u.memfile, m, diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go index 71fcde1fa8..07582b3740 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go @@ -1,15 +1,5 @@ package userfaultfd -// Child side of the cross-process UFFD test harness. The child is a -// re-execed copy of the test binary entered through -// TestHelperServingProcess; it adopts the inherited rpc socket fd -// (fd 4, the parent's socketpair half), registers the three RPC -// services that share a single *harnessState container, and serves -// JSON-RPC until the parent issues Lifecycle.Shutdown. 
All actual -// work (build mapping, construct *Userfaultfd, install hooks, run -// Serve) is driven by Lifecycle.Bootstrap rather than env vars or -// extra fds. - import ( "fmt" "net" @@ -20,10 +10,7 @@ import ( ) // TestHelperServingProcess is the entry point for the child helper -// process spawned by configureCrossProcessTest. The parent re-execs -// the test binary with envHelperFlag=1 and the uffd / rpc fds in -// ExtraFiles; this test hands off to crossProcessServe and exits -// with its result. +// process spawned by configureCrossProcessTest. func TestHelperServingProcess(t *testing.T) { t.Parallel() @@ -39,21 +26,14 @@ func TestHelperServingProcess(t *testing.T) { os.Exit(0) } -// crossProcessServe wires up the child side: adopt the inherited -// rpc socket fd, register the three RPC services that share a single -// harnessState, and run jsonrpc.ServeCodec until the parent shuts -// us down. func crossProcessServe() error { - // The parent handed us two fds via cmd.ExtraFiles; the child-side - // dup3 inside fork+exec lands them at fd 3 (uffd) and fd 4 (rpc - // socketpair half) with CLOEXEC cleared automatically. + // fork+exec dup3's the parent's ExtraFiles to fd 3 (uffd) and fd 4 + // (rpc socketpair half) with CLOEXEC cleared. uffdFile := os.NewFile(uintptr(3), "uffd") defer uffdFile.Close() rpcFile := os.NewFile(uintptr(4), "rpc") conn, err := net.FileConn(rpcFile) - // FileConn dups the underlying fd; close rpcFile in both the - // success and error paths so we don't leak an fd. rpcFile.Close() if err != nil { return fmt.Errorf("net.FileConn rpc: %w", err) @@ -86,13 +66,10 @@ func crossProcessServe() error { case <-codecDone: } - // Release any still-parked barriers so the serve goroutine can - // finish, then stop the serve goroutine. + // Release parked barriers so the serve goroutine can drain. state.releaseAllBarriers() state.stopServe() - // Closing the conn is sufficient to unblock ServeCodec if it - // hasn't already returned. 
_ = conn.Close() <-codecDone diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go index 8027558ab4..8557b95b3f 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go @@ -1,21 +1,5 @@ package userfaultfd -// Parent side of the cross-process UFFD test harness. The parent -// owns the userfaultfd (created and registered against an mmap that -// lives in the parent's address space) and drives the in-VM page -// fault servicing logic from a child helper process. We re-exec the -// test binary as the child so the actual page-fault handling runs -// in a process where we can fully control memory layout (no Go GC -// scanning / touching the registered region) — which mirrors how -// Firecracker uses UFFD in production. -// -// The side channels between parent and child are intentionally -// minimal: a single env var flagging the child as the helper, the -// userfaultfd handed off via ExtraFiles at fd 3, and a socketpair(2) -// half handed off via ExtraFiles at fd 4 carrying JSON-RPC. All -// configuration (mmap geometry, source content, feature toggles) -// flows over a single Lifecycle.Bootstrap RPC. - import ( "context" "crypto/rand" @@ -37,7 +21,6 @@ import ( ) // MemorySlicer exposes a byte slice via the Slicer interface. -// Test-only. type MemorySlicer struct { content []byte pagesize int64 @@ -75,14 +58,8 @@ func RandomPages(pagesize, numberOfPages uint64) *MemorySlicer { return NewMemorySlicer(buf, int64(pagesize)) } -// Env var used by the child helper process. The set is deliberately -// minimal: anything else lives in BootstrapArgs and flows over RPC. const envHelperFlag = "GO_TEST_HELPER_PROCESS" -// configureCrossProcessTest spawns the helper child, hands it the -// userfaultfd, and drives initial setup via Lifecycle.Bootstrap. 
-// All subsequent test interaction goes through the returned -// testHandler's *rpcharness.Client. func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) (*testHandler, error) { t.Helper() @@ -96,9 +73,6 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) uffdFd, err := newFd(syscall.O_CLOEXEC | syscall.O_NONBLOCK) require.NoError(t, err) - t.Cleanup(func() { - uffdFd.close() - }) require.NoError(t, configureApi(uffdFd, tt.pagesize)) require.NoError(t, register(uffdFd, memoryStart, uint64(size), UFFDIO_REGISTER_MODE_MISSING|UFFDIO_REGISTER_MODE_WP)) @@ -106,26 +80,16 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) cmd := exec.CommandContext(ctx, os.Args[0], "-test.run=TestHelperServingProcess", "-test.timeout=0") cmd.Env = append(os.Environ(), envHelperFlag+"=1") - // F_DUPFD_CLOEXEC duplicates uffdFd into a new fd that is born - // with CLOEXEC set, atomically. The previous incarnation used - // syscall.Dup followed by F_SETFD, which left a brief window - // where the dup'd fd was visible without CLOEXEC; under - // -parallel that window let other concurrent forks inherit a - // uffd they did not own and produced hard-to-diagnose, - // parallel-only deadlocks. Removing the window removes the - // need for the childForkMu serialising lock the previous - // version used to wrap cmd.Start. + // F_DUPFD_CLOEXEC dup's atomically with CLOEXEC set, so a concurrent + // fork in another goroutine cannot inherit the dup'd fd before we + // hand it off via ExtraFiles. dup, err := unix.FcntlInt(uintptr(uffdFd), unix.F_DUPFD_CLOEXEC, 0) require.NoError(t, err) uffdFile := os.NewFile(uintptr(dup), "uffd") - // socketpair gives us a connected AF_UNIX/SOCK_STREAM pair in one - // syscall, no listener / accept / dial / temp inode required. 
- // SOCK_CLOEXEC on the pair is the same rationale as F_DUPFD_CLOEXEC - // on the uffd fd above: both ends are born CLOEXEC so a concurrent - // fork in another goroutine cannot leak them. cmd.ExtraFiles will - // clear CLOEXEC on the destination fd in the child via dup3 inside - // ForkExec, which is exactly what we want for fd 4. + // Socketpair gives a connected AF_UNIX pair in one syscall, with both + // ends born CLOEXEC; cmd.ExtraFiles clears CLOEXEC on the child end + // inside ForkExec. fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) require.NoError(t, err) parentEnd := os.NewFile(uintptr(fds[0]), "rpc-parent") @@ -143,8 +107,7 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) require.NoError(t, startErr) } - // FileConn dups the underlying fd, so closing parentEnd after is - // both correct and necessary to avoid leaking the original fd. + // FileConn dups the underlying fd; close parentEnd to avoid leaking it. parentConn, err := net.FileConn(parentEnd) parentEnd.Close() require.NoError(t, err) @@ -169,18 +132,14 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) return nil, fmt.Errorf("Lifecycle.Bootstrap: %w", err) } - // WaitReady's successful reply IS the readiness signal. Bootstrap - // is synchronous, so today this is largely a smoke check, but - // the explicit RPC is kept so that future async-Bootstrap variants - // can hold the parent here without breaking the call site. + // Bootstrap is synchronous, so its reply already implies readiness; + // WaitReady is kept as a separate RPC so an async-Bootstrap variant + // can hold the parent here without touching call sites. if err := client.WaitReady(); err != nil { return nil, fmt.Errorf("Lifecycle.WaitReady: %w", err) } t.Cleanup(func() { - // Best-effort graceful shutdown via RPC. If the child has - // already crashed the RPC will error and we fall back to - // killing the process below. 
_ = client.Shutdown() _ = client.Close() @@ -192,13 +151,11 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) } } - // Tear down the UFFD registration before the early uffdFd.close() - // cleanup runs. Today this is a no-op (no test enables - // UFFD_FEATURE_EVENT_REMOVE) but a follow-up that does will - // otherwise see munmap block on un-acked REMOVE events queued - // against the still-registered range. Cleanups run LIFO, so - // this fires before the close registered earlier. + // Unregister before close so a future test that enables + // UFFD_FEATURE_EVENT_REMOVE does not see munmap block on + // un-acked REMOVE events queued against the registered range. assert.NoError(t, unregister(uffdFd, memoryStart, uint64(size))) + uffdFd.close() }) return h, nil diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go index 06a58f9ee5..0be882b7c5 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go @@ -54,18 +54,15 @@ type operation struct { async bool } -// handlerPageStates is a snapshot of the pageTracker grouped by state. It -// lets tests assert on the set of pages that the handler observed in each -// state, rather than a flat list of "accessed" offsets. Follow-up PRs can -// add more state-specific fields (e.g. removed) without touching the +// handlerPageStates groups a pageTracker snapshot by state so follow-up +// PRs can add more per-state fields (e.g. removed) without touching // existing call sites. type handlerPageStates struct { faulted []uint } -// allAccessed returns the sorted union of offsets that the handler touched -// in any non-missing state. Tests that only care about "which pages did the -// handler see" can compare directly against this. 
+// allAccessed returns the sorted offsets the handler touched in any +// non-missing state. func (s handlerPageStates) allAccessed() []uint { return slices.Clone(s.faulted) } @@ -76,16 +73,14 @@ type testHandler struct { data *MemorySlicer // client is the typed RPC channel to the child helper process. - // Pause/Resume/PageStates/etc. flow through it; tests should - // reach for the convenience methods on testHandler rather than - // poking at client directly. + // Tests should prefer the convenience methods on testHandler. client *rpcharness.Client mutex sync.RWMutex } -// pageStates returns a per-state snapshot of the handler's -// pageTracker, fetched via the Paging.States RPC. +// pageStates fetches a per-state snapshot of the child's pageTracker +// via the Paging.States RPC. func (h *testHandler) pageStates() (handlerPageStates, error) { entries, err := h.client.PageStates() if err != nil { diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/hooks_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/hooks_test.go index bb5dc70f41..5a6e81c7ca 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/hooks_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/hooks_test.go @@ -1,8 +1,7 @@ package userfaultfd -// SetTestFaultHook installs the per-fault test hook atomically. Pass nil to -// clear. Defined here (in a _test.go file) so production binaries cannot -// reach it. +// SetTestFaultHook installs the per-fault hook atomically; pass nil to +// clear. Lives in a _test.go file so production binaries cannot link it. 
func (u *Userfaultfd) SetTestFaultHook(h func(uintptr, faultPhase)) { if h == nil { u.testFaultHook.Store(nil) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/barriers.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/barriers.go index 2b02a44028..bdd2d3aabb 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/barriers.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/barriers.go @@ -6,26 +6,24 @@ import ( "sync" ) -// Point identifies WHICH worker hook a barrier should park on. +// Point identifies WHICH worker hook a barrier should park on. Values +// must match the parent package's faultPhase iota so the test hook can +// pass them through with a numeric cast. type Point uint8 const ( - // BeforeRLock parks the worker BEFORE settleRequests.RLock(), - // i.e. before it can read the page state. Use this when a parallel - // writer needs the write lock immediately because no worker holds - // the read lock. - BeforeRLock Point = 1 - // BeforeFaultPage parks the worker AFTER it has taken - // settleRequests.RLock, but BEFORE the actual UFFDIO_COPY syscall. - // Use this when a parent operation must still return even though a - // worker holds RLock. - BeforeFaultPage Point = 2 + // BeforeRLock parks the worker BEFORE settleRequests.RLock(), so a + // parallel writer can take the write lock immediately. + BeforeRLock Point = iota + // BeforeFaultPage parks the worker AFTER settleRequests.RLock but + // BEFORE the UFFDIO_COPY syscall, so a parent operation must still + // return even though a worker holds RLock. + BeforeFaultPage ) // Registry is the child-process side of the barrier mechanism. The -// hooks installed on *Userfaultfd consult this registry by addr+point -// to decide whether to park, and the Barriers RPC handlers manipulate -// it from the parent over the socket. 
+// per-fault hook on *Userfaultfd consults it by (addr, point) to decide +// whether to park. type Registry struct { mu sync.Mutex next uint64 @@ -53,8 +51,7 @@ func NewRegistry() *Registry { } } -// Install registers a barrier at (addr, point) and returns the opaque -// token used by subsequent WaitArrived/Release calls. +// Install registers a barrier at (addr, point) and returns its token. func (r *Registry) Install(addr uintptr, point Point) uint64 { r.mu.Lock() defer r.mu.Unlock() @@ -125,9 +122,8 @@ func (r *Registry) Release(token uint64) { } } -// ReleaseAll releases every still-installed barrier. Called during -// child shutdown so that any parked worker can finish before the -// serve goroutine is joined. +// ReleaseAll releases every still-installed barrier so any parked +// worker can finish before the child's serve goroutine is joined. func (r *Registry) ReleaseAll() { r.mu.Lock() tokens := make([]uint64, 0, len(r.tokens)) @@ -141,9 +137,8 @@ func (r *Registry) ReleaseAll() { } } -// Hook returns the function tests assign as the per-fault hook on -// *Userfaultfd. The returned closure dispatches by (addr, point): -// pages/points that haven't been Install'd see no scheduling distortion. +// Hook returns the per-fault hook tests install on *Userfaultfd. Faults +// at (addr, point) pairs without an Install'd slot are no-ops. 
func (r *Registry) Hook() func(addr uintptr, point Point) { return func(addr uintptr, point Point) { s := r.lookupByAddr(addr, point) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/client.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/client.go index 6c2b3a934c..a52449040c 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/client.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/client.go @@ -7,17 +7,16 @@ import ( ) // Client is the typed parent-side wrapper around the JSON-RPC channel -// to the child helper process. Hides method-name strings, the Empty -// placeholder pointers, and the wire-vs-domain type translation from -// every call site. +// to the child helper process. It hides method-name strings, the Empty +// placeholder pointers, and the wire-vs-domain type translation. type Client struct { rpc *rpc.Client conn io.Closer } // NewClient wraps an already-connected duplex stream (typically one -// half of a socketpair handed off via cmd.ExtraFiles) in a typed -// client. Closing the returned Client closes the underlying conn. +// half of a socketpair handed off via cmd.ExtraFiles). Closing the +// returned Client closes the underlying conn. func NewClient(conn io.ReadWriteCloser) *Client { return &Client{ rpc: jsonrpc.NewClient(conn), diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/wire.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/wire.go index 172f5282f7..365b7b264e 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/wire.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/wire.go @@ -5,17 +5,10 @@ // and never get pulled into a production import path. package rpcharness -// Empty is the stand-in args/reply type for net/rpc methods that take -// or return nothing meaningful. 
net/rpc still requires both pointers -// to be exported. +// Empty stands in for net/rpc methods that take or return nothing; +// net/rpc still requires both args and reply to be exported pointers. type Empty struct{} -// BootstrapArgs is the single message the parent sends to drive -// child initialisation. Everything that used to flow over env vars or -// the content tmp file lives in this struct, base64-encoded by the -// JSON-RPC codec for byte slices. For typical test sizes (≤1MB) the -// encoding overhead is irrelevant; if a future test needs >10MB of -// source content, that PR can revisit and add chunking. type BootstrapArgs struct { MmapStart uint64 Pagesize int64 @@ -28,36 +21,28 @@ type BootstrapArgs struct { Content []byte } -// BootstrapReply is empty today; kept as a named type so a future -// reply field can be added without touching every call site. type BootstrapReply struct{} -// PageStateEntry is the wire format for the Paging.States RPC. -// State is the raw uint8 value of the parent package's pageState +// PageStateEntry is the wire form of the parent package's pageState // enum; the parent translates back at the boundary. type PageStateEntry struct { State uint8 Offset uint64 } -// PageStatesReply carries the snapshot returned by Paging.States. type PageStatesReply struct { Entries []PageStateEntry } -// FaultBarrierArgs requests installation of a barrier at (Addr, Point). type FaultBarrierArgs struct { Addr uint64 Point uint8 } -// FaultBarrierReply carries the opaque token used by subsequent -// WaitHeld / Release calls. type FaultBarrierReply struct { Token uint64 } -// TokenArgs carries the opaque token returned by FaultBarrierReply. 
type TokenArgs struct { Token uint64 } diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go index 4691ce6910..016b08224f 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go @@ -1,19 +1,8 @@ package userfaultfd -// RPC service implementations for the cross-process UFFD test -// harness. These live in _test.go (rather than the sibling -// internal/rpcharness package) because they need access to the -// unexported pageState / pageStateEntries / settleRequests / -// pageTracker / defaultCopyMode internals on *Userfaultfd. The wire -// types, typed Client, and barrier registry they consume are exported -// from internal/rpcharness so they cannot leak into a production -// import path. -// -// Three services are registered against the same *harnessState: -// -// Lifecycle.Bootstrap / WaitReady / Shutdown -// Paging.States / Pause / Resume -// Barriers.Install / WaitHeld / Release +// RPC service implementations for the cross-process UFFD test harness. +// These live in _test.go (rather than internal/rpcharness) because they +// need access to unexported *Userfaultfd internals. import ( "context" @@ -28,17 +17,15 @@ import ( "github.com/e2b-dev/infra/packages/shared/pkg/logger" ) -// harnessState is the per-child container holding the resources -// created by Lifecycle.Bootstrap and consumed by Paging/Barriers. -// All three RPC services hold a *harnessState rather than duplicating -// fields. Mutable fields are guarded by mu. +// harnessState is the per-child container shared by Lifecycle / Paging / +// Barriers RPCs. Mutable fields are guarded by mu. 
type harnessState struct { uffdFd uintptr mu sync.Mutex uffd *Userfaultfd br *rpcharness.Registry - stop func() // currently active serve-stop function, nil if paused + stop func() // serve-stop fn; nil when paused shutdown chan struct{} closed bool } @@ -51,10 +38,9 @@ func newHarnessState(uffdFd uintptr) *harnessState { } // startServeLocked spawns the uffd Serve goroutine and stores its -// stop fn. Caller must hold s.mu. Idempotent: if the serve goroutine -// is already running (s.stop != nil) this is a no-op so a stray -// duplicate Resume can't leak an untracked Serve goroutine and break -// later pauses. +// stop fn. Caller must hold s.mu. Idempotent: if a serve goroutine is +// already running (s.stop != nil), a stray duplicate Resume cannot +// leak an untracked Serve goroutine and break later pauses. func (s *harnessState) startServeLocked() { if s.stop != nil { return @@ -83,6 +69,12 @@ func (s *harnessState) startServeLocked() { } } +func (s *harnessState) startServe() { + s.mu.Lock() + defer s.mu.Unlock() + s.startServeLocked() +} + func (s *harnessState) stopServe() { s.mu.Lock() defer s.mu.Unlock() @@ -101,12 +93,9 @@ func (s *harnessState) releaseAllBarriers() { } } -// Lifecycle owns the boot/shutdown of the in-child uffd. Bootstrap -// builds the *Userfaultfd, optionally installs the test hooks, and -// kicks off the initial Serve goroutine. Shutdown signals the -// crossProcessServe loop to exit; the goroutine + barrier registry -// are torn down there so we don't have to hold any mutex while -// joining the serve goroutine. +// Lifecycle owns boot/shutdown of the in-child uffd. Shutdown signals +// crossProcessServe to exit, where the serve goroutine and barrier +// registry are torn down outside any mutex. 
type Lifecycle struct { state *harnessState } @@ -132,9 +121,9 @@ func (l *Lifecycle) Bootstrap(args *rpcharness.BootstrapArgs, _ *rpcharness.Boot return fmt.Errorf("logger: %w", err) } - uffd, err := NewUserfaultfdFromFd(l.state.uffdFd, data, mapping, log) + uffd, err := NewFromFd(l.state.uffdFd, data, mapping, log) if err != nil { - return fmt.Errorf("NewUserfaultfdFromFd: %w", err) + return fmt.Errorf("NewFromFd: %w", err) } if args.AlwaysWP { @@ -145,16 +134,7 @@ func (l *Lifecycle) Bootstrap(args *rpcharness.BootstrapArgs, _ *rpcharness.Boot if args.Barriers { hook := br.Hook() uffd.SetTestFaultHook(func(addr uintptr, p faultPhase) { - var point rpcharness.Point - switch p { - case faultPhaseBeforeRLock: - point = rpcharness.BeforeRLock - case faultPhaseBeforeFaultPage: - point = rpcharness.BeforeFaultPage - default: - return - } - hook(addr, point) + hook(addr, rpcharness.Point(p)) }) } @@ -167,10 +147,9 @@ func (l *Lifecycle) Bootstrap(args *rpcharness.BootstrapArgs, _ *rpcharness.Boot return nil } -// WaitReady is a no-op: Bootstrap is synchronous, so its successful -// reply already implies readiness. WaitReady is kept as a separate -// RPC so that an async-Bootstrap variant can be slotted in later -// without touching every call site. +// WaitReady is a no-op today: Bootstrap is synchronous so its reply +// already implies readiness. Kept as a separate RPC so an async +// Bootstrap variant can hold the parent here without changing callers. func (l *Lifecycle) WaitReady(_ *rpcharness.Empty, _ *rpcharness.Empty) error { return nil } @@ -186,8 +165,7 @@ func (l *Lifecycle) Shutdown(_ *rpcharness.Empty, _ *rpcharness.Empty) error { return nil } -// Paging is the RPC service exposing page-state introspection and -// the gated-serve pause/resume controls. +// Paging exposes page-state introspection and pause/resume controls. 
type Paging struct { state *harnessState } @@ -216,17 +194,14 @@ func (p *Paging) Pause(_ *rpcharness.Empty, _ *rpcharness.Empty) error { } func (p *Paging) Resume(_ *rpcharness.Empty, _ *rpcharness.Empty) error { - p.state.mu.Lock() - defer p.state.mu.Unlock() - p.state.startServeLocked() + p.state.startServe() return nil } // pageStateEntries returns a snapshot of every tracked page and its -// state, translated into the rpcharness wire format. Briefly takes -// settleRequests.Lock so no in-flight worker can mutate the -// pageTracker while we read it. +// state, in rpcharness wire format. Briefly takes settleRequests.Lock +// so no in-flight worker can mutate pageTracker while we read it. func (u *Userfaultfd) pageStateEntries() ([]rpcharness.PageStateEntry, error) { u.settleRequests.Lock() u.settleRequests.Unlock() //nolint:staticcheck // SA2001: intentional — settle the read locks. @@ -246,9 +221,8 @@ func (u *Userfaultfd) pageStateEntries() ([]rpcharness.PageStateEntry, error) { return entries, nil } -// Barriers is the RPC service exposing the rpcharness.Registry to the -// parent. It's a thin wrapper so all the locking/lifecycle logic stays -// in rpcharness. +// Barriers is the thin RPC wrapper exposing rpcharness.Registry to +// the parent; locking and lifecycle live in rpcharness. type Barriers struct { state *harnessState } diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go index 72a970797f..dfcc1471b9 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go @@ -70,9 +70,9 @@ type Userfaultfd struct { logger logger.Logger } -// faultPhase identifies WHEN inside the worker the (test-only) fault hook is -// invoked. Production builds never install a hook (testFaultHook is nil); the -// per-fault overhead is then a single atomic load + nil check per call site. 
+// faultPhase identifies WHEN inside the worker the test-only fault +// hook is invoked. testFaultHook is nil in production, so the +// per-fault cost is a single atomic load + nil check per call site. type faultPhase uint8 const ( @@ -80,8 +80,9 @@ const ( faultPhaseBeforeFaultPage ) -// NewUserfaultfdFromFd creates a new userfaultfd instance with optional configuration. -func NewUserfaultfdFromFd(fd uintptr, src block.Slicer, m *memory.Mapping, logger logger.Logger) (*Userfaultfd, error) { +// NewFromFd wraps an already-open userfaultfd file descriptor. The caller +// is responsible for creating, configuring, and registering the fd. +func NewFromFd(fd uintptr, src block.Slicer, m *memory.Mapping, logger logger.Logger) (*Userfaultfd, error) { blockSize := src.BlockSize() for _, region := range m.Regions { From 687f5c36dd43047afebf2eb48457b61668a6fffd Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Fri, 1 May 2026 18:51:50 -0700 Subject: [PATCH 13/22] =?UTF-8?q?refactor(uffd):=20revert=20NewUserfaultfd?= =?UTF-8?q?FromFd=20=E2=86=92=20NewFromFd=20rename?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Keep the public symbol unchanged across all call sites so the PR review focuses on the substantive harness simplification rather than a name change that doesn't buy anything load-bearing. 
--- packages/orchestrator/pkg/sandbox/uffd/uffd.go | 2 +- .../pkg/sandbox/uffd/userfaultfd/rpc_services_test.go | 4 ++-- .../orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go | 5 ++--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/uffd.go b/packages/orchestrator/pkg/sandbox/uffd/uffd.go index 4e60af28cb..5fe3e5370b 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/uffd.go +++ b/packages/orchestrator/pkg/sandbox/uffd/uffd.go @@ -164,7 +164,7 @@ func (u *Uffd) handle(ctx context.Context, sandboxId string, fdExit *fdexit.FdEx m := memory.NewMapping(regions) - uffd, err := userfaultfd.NewFromFd( + uffd, err := userfaultfd.NewUserfaultfdFromFd( uintptr(fds[0]), u.memfile, m, diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go index 016b08224f..d3d1d2b0bc 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go @@ -121,9 +121,9 @@ func (l *Lifecycle) Bootstrap(args *rpcharness.BootstrapArgs, _ *rpcharness.Boot return fmt.Errorf("logger: %w", err) } - uffd, err := NewFromFd(l.state.uffdFd, data, mapping, log) + uffd, err := NewUserfaultfdFromFd(l.state.uffdFd, data, mapping, log) if err != nil { - return fmt.Errorf("NewFromFd: %w", err) + return fmt.Errorf("NewUserfaultfdFromFd: %w", err) } if args.AlwaysWP { diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go index dfcc1471b9..aeb2c7a5d7 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go @@ -80,9 +80,8 @@ const ( faultPhaseBeforeFaultPage ) -// NewFromFd wraps an already-open userfaultfd file descriptor. 
The caller -// is responsible for creating, configuring, and registering the fd. -func NewFromFd(fd uintptr, src block.Slicer, m *memory.Mapping, logger logger.Logger) (*Userfaultfd, error) { +// NewUserfaultfdFromFd creates a new userfaultfd instance with optional configuration. +func NewUserfaultfdFromFd(fd uintptr, src block.Slicer, m *memory.Mapping, logger logger.Logger) (*Userfaultfd, error) { blockSize := src.BlockSize() for _, region := range m.Regions { From ee0186b5b653310d3b7a23ba44e87cffafd3d45c Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Fri, 1 May 2026 18:56:39 -0700 Subject: [PATCH 14/22] fix(uffd-tests): propagate startServe errors and reap child on bootstrap failure Address two Cursor Bugbot findings on #2519: * harness_parent_test.go: register the helper-process cleanup immediately after rpcharness.NewClient(parentConn) so a failure in client.Bootstrap or client.WaitReady still reaps the child and closes the rpc socket. * rpc_services_test.go: startServeLocked now returns error; Lifecycle.Bootstrap and Paging.Resume propagate it instead of swallowing fdexit.New failures to stderr while telling the parent the serve loop is up. 
--- .../uffd/userfaultfd/harness_parent_test.go | 38 +++++++++---------- .../uffd/userfaultfd/rpc_services_test.go | 22 ++++------- 2 files changed, 27 insertions(+), 33 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go index 8557b95b3f..8ee8c7f8b9 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go @@ -114,6 +114,25 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) client := rpcharness.NewClient(parentConn) + t.Cleanup(func() { + _ = client.Shutdown() + _ = client.Close() + + waitErr := cmd.Wait() + if waitErr != nil { + var exitErr *exec.ExitError + if !errors.As(waitErr, &exitErr) { + t.Logf("helper process Wait: %v", waitErr) + } + } + + // Unregister before close so a future test that enables + // UFFD_FEATURE_EVENT_REMOVE does not see munmap block on + // un-acked REMOVE events queued against the registered range. + assert.NoError(t, unregister(uffdFd, memoryStart, uint64(size))) + uffdFd.close() + }) + h := &testHandler{ memoryArea: &memoryArea, pagesize: tt.pagesize, @@ -139,24 +158,5 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) return nil, fmt.Errorf("Lifecycle.WaitReady: %w", err) } - t.Cleanup(func() { - _ = client.Shutdown() - _ = client.Close() - - waitErr := cmd.Wait() - if waitErr != nil { - var exitErr *exec.ExitError - if !errors.As(waitErr, &exitErr) { - t.Logf("helper process Wait: %v", waitErr) - } - } - - // Unregister before close so a future test that enables - // UFFD_FEATURE_EVENT_REMOVE does not see munmap block on - // un-acked REMOVE events queued against the registered range. 
- assert.NoError(t, unregister(uffdFd, memoryStart, uint64(size))) - uffdFd.close() - }) - return h, nil } diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go index d3d1d2b0bc..badf5cef1a 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go @@ -41,16 +41,14 @@ func newHarnessState(uffdFd uintptr) *harnessState { // stop fn. Caller must hold s.mu. Idempotent: if a serve goroutine is // already running (s.stop != nil), a stray duplicate Resume cannot // leak an untracked Serve goroutine and break later pauses. -func (s *harnessState) startServeLocked() { +func (s *harnessState) startServeLocked() error { if s.stop != nil { - return + return nil } exit, err := fdexit.New() if err != nil { - fmt.Fprintln(os.Stderr, "fdexit.New:", err) - - return + return fmt.Errorf("fdexit.New: %w", err) } uffd := s.uffd @@ -67,12 +65,8 @@ func (s *harnessState) startServeLocked() { <-done exit.Close() } -} -func (s *harnessState) startServe() { - s.mu.Lock() - defer s.mu.Unlock() - s.startServeLocked() + return nil } func (s *harnessState) stopServe() { @@ -142,9 +136,8 @@ func (l *Lifecycle) Bootstrap(args *rpcharness.BootstrapArgs, _ *rpcharness.Boot defer l.state.mu.Unlock() l.state.uffd = uffd l.state.br = br - l.state.startServeLocked() - return nil + return l.state.startServeLocked() } // WaitReady is a no-op today: Bootstrap is synchronous so its reply @@ -194,9 +187,10 @@ func (p *Paging) Pause(_ *rpcharness.Empty, _ *rpcharness.Empty) error { } func (p *Paging) Resume(_ *rpcharness.Empty, _ *rpcharness.Empty) error { - p.state.startServe() + p.state.mu.Lock() + defer p.state.mu.Unlock() - return nil + return p.state.startServeLocked() } // pageStateEntries returns a snapshot of every tracked page and its From 61ab59a64d9aefc9358b293ea7797626b513d04d Mon Sep 17 
00:00:00 2001 From: ValentaTomas Date: Fri, 1 May 2026 23:25:51 -0700 Subject: [PATCH 15/22] fix(uffd-tests): hold settleRequests.Lock for entire pageStateEntries read Mirror PrefetchData's drain-and-hold pattern. The previous Lock+immediate- Unlock left a window where new workers could enter faultPage and mutate pageTracker between the unlock and the pageTracker.mu.RLock acquisition, breaking the "settled snapshot" intent the comment claimed. Resolves Cursor Bugbot finding on rpc_services_test.go. --- .../pkg/sandbox/uffd/userfaultfd/rpc_services_test.go | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go index badf5cef1a..5d0aa92913 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go @@ -193,15 +193,12 @@ func (p *Paging) Resume(_ *rpcharness.Empty, _ *rpcharness.Empty) error { return p.state.startServeLocked() } -// pageStateEntries returns a snapshot of every tracked page and its -// state, in rpcharness wire format. Briefly takes settleRequests.Lock -// so no in-flight worker can mutate pageTracker while we read it. +// pageStateEntries returns a snapshot of every tracked page and its state in +// rpcharness wire format. Holds settleRequests.Lock for the duration so no +// fault worker is in flight; mirrors PrefetchData. func (u *Userfaultfd) pageStateEntries() ([]rpcharness.PageStateEntry, error) { u.settleRequests.Lock() - u.settleRequests.Unlock() //nolint:staticcheck // SA2001: intentional — settle the read locks. 
- - u.pageTracker.mu.RLock() - defer u.pageTracker.mu.RUnlock() + defer u.settleRequests.Unlock() entries := make([]rpcharness.PageStateEntry, 0, len(u.pageTracker.m)) for addr, state := range u.pageTracker.m { From 96df5394abdaa5b06bde07d46b319361896af04d Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Fri, 1 May 2026 23:29:15 -0700 Subject: [PATCH 16/22] =?UTF-8?q?refactor(uffd-tests):=20move=20internal/r?= =?UTF-8?q?pcharness=20=E2=86=92=20testutils/testharness?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sibling of the existing testutils helpers; rename + move in one step. The path now makes the test-only origin obvious. Drops internal/ scoping but no production caller exists or would want this package. --- .../testharness}/barriers.go | 2 +- .../testharness}/client.go | 2 +- .../testharness}/wire.go | 8 ++-- .../uffd/userfaultfd/harness_parent_test.go | 6 +-- .../sandbox/uffd/userfaultfd/helpers_test.go | 4 +- .../uffd/userfaultfd/rpc_services_test.go | 44 +++++++++---------- 6 files changed, 33 insertions(+), 33 deletions(-) rename packages/orchestrator/pkg/sandbox/uffd/{userfaultfd/internal/rpcharness => testutils/testharness}/barriers.go (99%) rename packages/orchestrator/pkg/sandbox/uffd/{userfaultfd/internal/rpcharness => testutils/testharness}/client.go (98%) rename packages/orchestrator/pkg/sandbox/uffd/{userfaultfd/internal/rpcharness => testutils/testharness}/wire.go (81%) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/barriers.go b/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/barriers.go similarity index 99% rename from packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/barriers.go rename to packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/barriers.go index bdd2d3aabb..e256a676c6 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/barriers.go +++ 
b/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/barriers.go @@ -1,4 +1,4 @@ -package rpcharness +package testharness import ( "context" diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/client.go b/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/client.go similarity index 98% rename from packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/client.go rename to packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/client.go index a52449040c..666b5926a3 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/client.go +++ b/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/client.go @@ -1,4 +1,4 @@ -package rpcharness +package testharness import ( "io" diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/wire.go b/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/wire.go similarity index 81% rename from packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/wire.go rename to packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/wire.go index 365b7b264e..b76a76d59f 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness/wire.go +++ b/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/wire.go @@ -1,9 +1,9 @@ -// Package rpcharness provides the wire types, typed RPC client, and +// Package testharness provides the wire types, typed RPC client, and // barrier registry shared between the parent and child halves of the // userfaultfd test harness. It deliberately knows nothing about -// *Userfaultfd internals so it can sit in a sibling internal package -// and never get pulled into a production import path. -package rpcharness +// *Userfaultfd internals so it can sit alongside the other uffd test +// utilities and never get pulled into a production import path. 
+package testharness // Empty stands in for net/rpc methods that take or return nothing; // net/rpc still requires both args and reply to be exported pointers. diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go index 8ee8c7f8b9..5ca0b35dd9 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go @@ -17,7 +17,7 @@ import ( "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/block" "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/testutils" - "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness" + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness" ) // MemorySlicer exposes a byte slice via the Slicer interface. @@ -112,7 +112,7 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) parentEnd.Close() require.NoError(t, err) - client := rpcharness.NewClient(parentConn) + client := testharness.NewClient(parentConn) t.Cleanup(func() { _ = client.Shutdown() @@ -140,7 +140,7 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) client: client, } - if err := client.Bootstrap(rpcharness.BootstrapArgs{ + if err := client.Bootstrap(testharness.BootstrapArgs{ MmapStart: uint64(memoryStart), Pagesize: int64(tt.pagesize), TotalSize: size, diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go index 0be882b7c5..4e861ea361 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go @@ -15,7 +15,7 @@ import ( "github.com/stretchr/testify/require" "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/testutils" 
- "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness" + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness" ) type testConfig struct { @@ -74,7 +74,7 @@ type testHandler struct { // client is the typed RPC channel to the child helper process. // Tests should prefer the convenience methods on testHandler. - client *rpcharness.Client + client *testharness.Client mutex sync.RWMutex } diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go index 5d0aa92913..788d58bb9e 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go @@ -1,7 +1,7 @@ package userfaultfd // RPC service implementations for the cross-process UFFD test harness. -// These live in _test.go (rather than internal/rpcharness) because they +// These live in _test.go (rather than testutils/testharness) because they // need access to unexported *Userfaultfd internals. 
import ( @@ -13,7 +13,7 @@ import ( "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/fdexit" "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/memory" - "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/internal/rpcharness" + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness" "github.com/e2b-dev/infra/packages/shared/pkg/logger" ) @@ -24,7 +24,7 @@ type harnessState struct { mu sync.Mutex uffd *Userfaultfd - br *rpcharness.Registry + br *testharness.Registry stop func() // serve-stop fn; nil when paused shutdown chan struct{} closed bool @@ -94,7 +94,7 @@ type Lifecycle struct { state *harnessState } -func (l *Lifecycle) Bootstrap(args *rpcharness.BootstrapArgs, _ *rpcharness.BootstrapReply) error { +func (l *Lifecycle) Bootstrap(args *testharness.BootstrapArgs, _ *testharness.BootstrapReply) error { if int64(len(args.Content)) != args.TotalSize { return fmt.Errorf("content size %d != expected %d", len(args.Content), args.TotalSize) } @@ -124,11 +124,11 @@ func (l *Lifecycle) Bootstrap(args *rpcharness.BootstrapArgs, _ *rpcharness.Boot uffd.defaultCopyMode = UFFDIO_COPY_MODE_WP } - br := rpcharness.NewRegistry() + br := testharness.NewRegistry() if args.Barriers { hook := br.Hook() uffd.SetTestFaultHook(func(addr uintptr, p faultPhase) { - hook(addr, rpcharness.Point(p)) + hook(addr, testharness.Point(p)) }) } @@ -143,11 +143,11 @@ func (l *Lifecycle) Bootstrap(args *rpcharness.BootstrapArgs, _ *rpcharness.Boot // WaitReady is a no-op today: Bootstrap is synchronous so its reply // already implies readiness. Kept as a separate RPC so an async // Bootstrap variant can hold the parent here without changing callers. 
-func (l *Lifecycle) WaitReady(_ *rpcharness.Empty, _ *rpcharness.Empty) error { +func (l *Lifecycle) WaitReady(_ *testharness.Empty, _ *testharness.Empty) error { return nil } -func (l *Lifecycle) Shutdown(_ *rpcharness.Empty, _ *rpcharness.Empty) error { +func (l *Lifecycle) Shutdown(_ *testharness.Empty, _ *testharness.Empty) error { l.state.mu.Lock() defer l.state.mu.Unlock() if !l.state.closed { @@ -163,7 +163,7 @@ type Paging struct { state *harnessState } -func (p *Paging) States(_ *rpcharness.Empty, reply *rpcharness.PageStatesReply) error { +func (p *Paging) States(_ *testharness.Empty, reply *testharness.PageStatesReply) error { p.state.mu.Lock() uffd := p.state.uffd p.state.mu.Unlock() @@ -180,13 +180,13 @@ func (p *Paging) States(_ *rpcharness.Empty, reply *rpcharness.PageStatesReply) return nil } -func (p *Paging) Pause(_ *rpcharness.Empty, _ *rpcharness.Empty) error { +func (p *Paging) Pause(_ *testharness.Empty, _ *testharness.Empty) error { p.state.stopServe() return nil } -func (p *Paging) Resume(_ *rpcharness.Empty, _ *rpcharness.Empty) error { +func (p *Paging) Resume(_ *testharness.Empty, _ *testharness.Empty) error { p.state.mu.Lock() defer p.state.mu.Unlock() @@ -194,41 +194,41 @@ func (p *Paging) Resume(_ *rpcharness.Empty, _ *rpcharness.Empty) error { } // pageStateEntries returns a snapshot of every tracked page and its state in -// rpcharness wire format. Holds settleRequests.Lock for the duration so no +// testharness wire format. Holds settleRequests.Lock for the duration so no // fault worker is in flight; mirrors PrefetchData. 
-func (u *Userfaultfd) pageStateEntries() ([]rpcharness.PageStateEntry, error) { +func (u *Userfaultfd) pageStateEntries() ([]testharness.PageStateEntry, error) { u.settleRequests.Lock() defer u.settleRequests.Unlock() - entries := make([]rpcharness.PageStateEntry, 0, len(u.pageTracker.m)) + entries := make([]testharness.PageStateEntry, 0, len(u.pageTracker.m)) for addr, state := range u.pageTracker.m { offset, err := u.ma.GetOffset(addr) if err != nil { return nil, fmt.Errorf("address %#x not in mapping: %w", addr, err) } - entries = append(entries, rpcharness.PageStateEntry{State: uint8(state), Offset: uint64(offset)}) + entries = append(entries, testharness.PageStateEntry{State: uint8(state), Offset: uint64(offset)}) } return entries, nil } -// Barriers is the thin RPC wrapper exposing rpcharness.Registry to -// the parent; locking and lifecycle live in rpcharness. +// Barriers is the thin RPC wrapper exposing testharness.Registry to +// the parent; locking and lifecycle live in testharness. 
type Barriers struct { state *harnessState } -func (b *Barriers) Install(args *rpcharness.FaultBarrierArgs, reply *rpcharness.FaultBarrierReply) error { +func (b *Barriers) Install(args *testharness.FaultBarrierArgs, reply *testharness.FaultBarrierReply) error { br, err := b.registry() if err != nil { return err } - reply.Token = br.Install(uintptr(args.Addr), rpcharness.Point(args.Point)) + reply.Token = br.Install(uintptr(args.Addr), testharness.Point(args.Point)) return nil } -func (b *Barriers) WaitHeld(args *rpcharness.TokenArgs, _ *rpcharness.Empty) error { +func (b *Barriers) WaitHeld(args *testharness.TokenArgs, _ *testharness.Empty) error { br, err := b.registry() if err != nil { return err @@ -237,7 +237,7 @@ func (b *Barriers) WaitHeld(args *rpcharness.TokenArgs, _ *rpcharness.Empty) err return br.WaitArrived(context.Background(), args.Token) } -func (b *Barriers) Release(args *rpcharness.TokenArgs, _ *rpcharness.Empty) error { +func (b *Barriers) Release(args *testharness.TokenArgs, _ *testharness.Empty) error { br, err := b.registry() if err != nil { return err @@ -247,7 +247,7 @@ func (b *Barriers) Release(args *rpcharness.TokenArgs, _ *rpcharness.Empty) erro return nil } -func (b *Barriers) registry() (*rpcharness.Registry, error) { +func (b *Barriers) registry() (*testharness.Registry, error) { b.state.mu.Lock() br := b.state.br b.state.mu.Unlock() From eb294e2338fe0137b9fb96f072c8e56dcdbc5187 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Fri, 1 May 2026 23:37:40 -0700 Subject: [PATCH 17/22] fix(uffd-tests): reap child on FileConn failure; require Barriers opt-in Address two ChatGPT-Codex-Connector findings on commit ee0186b: * harness_parent_test.go: register helper cleanup immediately after cmd.Start succeeds. The closure picks the graceful (client.Shutdown) or hard (cmd.Process.Kill) path depending on whether the rpc client has been constructed yet, so a net.FileConn failure no longer leaks the child or the uffd registration. 
* rpc_services_test.go: only create the barrier Registry when args.Barriers is true. With Barriers=false, Barriers.Install used to succeed and Barriers.WaitHeld then blocked forever; now it errors fast via the existing nil check, with a clearer message. --- .../uffd/userfaultfd/harness_parent_test.go | 28 +++++++++++++------ .../uffd/userfaultfd/rpc_services_test.go | 5 ++-- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go index 5ca0b35dd9..b67ca33ef3 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go @@ -107,16 +107,20 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) require.NoError(t, startErr) } - // FileConn dups the underlying fd; close parentEnd to avoid leaking it. - parentConn, err := net.FileConn(parentEnd) - parentEnd.Close() - require.NoError(t, err) - - client := testharness.NewClient(parentConn) - + // Register teardown immediately so any failure between here and the + // final return still reaps the helper process and unregisters the + // userfaultfd. client is captured by reference so the closure can + // pick the graceful (Shutdown) or hard (Kill) path depending on + // how far init progressed. 
+ var client *testharness.Client t.Cleanup(func() { - _ = client.Shutdown() - _ = client.Close() + if client != nil { + _ = client.Shutdown() + _ = client.Close() + } else { + _ = cmd.Process.Kill() + } + _ = parentEnd.Close() // idempotent; FileConn dups the underlying fd waitErr := cmd.Wait() if waitErr != nil { @@ -133,6 +137,12 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) uffdFd.close() }) + parentConn, err := net.FileConn(parentEnd) + parentEnd.Close() + require.NoError(t, err) + + client = testharness.NewClient(parentConn) + h := &testHandler{ memoryArea: &memoryArea, pagesize: tt.pagesize, diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go index 788d58bb9e..50baabf768 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go @@ -124,8 +124,9 @@ func (l *Lifecycle) Bootstrap(args *testharness.BootstrapArgs, _ *testharness.Bo uffd.defaultCopyMode = UFFDIO_COPY_MODE_WP } - br := testharness.NewRegistry() + var br *testharness.Registry if args.Barriers { + br = testharness.NewRegistry() hook := br.Hook() uffd.SetTestFaultHook(func(addr uintptr, p faultPhase) { hook(addr, testharness.Point(p)) @@ -252,7 +253,7 @@ func (b *Barriers) registry() (*testharness.Registry, error) { br := b.state.br b.state.mu.Unlock() if br == nil { - return nil, errors.New("Barriers RPC called before Lifecycle.Bootstrap") + return nil, errors.New("Barriers RPC requires args.Barriers=true at Bootstrap") } return br, nil From 93e2727e9157ae40c4ec0efe978ccdc5bef39430 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Sat, 2 May 2026 00:05:33 -0700 Subject: [PATCH 18/22] fix(uffd-tests): cleanup at acquisition, shutdown-aware WaitHeld, defensive locking, Release token-reuse Address four review findings on PR #2519, all real bugs introduced by 
the harness rewrite: * harness_parent_test.go: register the kernel uffd close, the unregister, and the child-reap cleanups at their points of acquisition. Failures in configureApi/register/FcntlInt/Socketpair/ cmd.Start no longer leak fds or zombie processes. * rpc_services_test.go: replace the harnessState.shutdown chan with a shutdownCtx; Barriers.WaitHeld now derives its ctx from it, so a misconfigured install or a child crash during a fault no longer parks the rpc handler forever (and SIGKILLs the test binary via Go's -timeout reaper). * rpc_services_test.go: re-add pageTracker.mu.RLock under the settleRequests.Lock in pageStateEntries. settleRequests alone is sufficient today but PR 2's REMOVE handler will write pageTracker.m without going through settleRequests; the inner RLock makes the snapshot defensible against that change. * testutils/testharness/barriers.go: Registry.Release now compare-and-deletes byKey, so an earlier token's release no longer clobbers a later Install for the same (addr, point) and the later WaitArrived no longer blocks forever. 
--- .../uffd/testutils/testharness/barriers.go | 8 ++++- .../uffd/userfaultfd/harness_child_test.go | 2 +- .../uffd/userfaultfd/harness_parent_test.go | 24 +++++++------ .../uffd/userfaultfd/rpc_services_test.go | 35 ++++++++++++------- 4 files changed, 44 insertions(+), 25 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/barriers.go b/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/barriers.go index e256a676c6..60cc68b9ac 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/barriers.go +++ b/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/barriers.go @@ -107,7 +107,13 @@ func (r *Registry) Release(token uint64) { s, ok := r.tokens[token] delete(r.tokens, token) if ok { - delete(r.byKey, key{s.addr, s.point}) + // A later Install at the same (addr, point) overwrites byKey; + // do not clobber that newer mapping when releasing this older + // token. + k := key{s.addr, s.point} + if r.byKey[k] == token { + delete(r.byKey, k) + } } r.mu.Unlock() diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go index 07582b3740..b6909757a4 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go @@ -62,7 +62,7 @@ func crossProcessServe() error { }() select { - case <-state.shutdown: + case <-state.ctx.Done(): case <-codecDone: } diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go index b67ca33ef3..33f636716e 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go @@ -73,9 +73,17 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) uffdFd, 
err := newFd(syscall.O_CLOEXEC | syscall.O_NONBLOCK) require.NoError(t, err) + t.Cleanup(func() { uffdFd.close() }) require.NoError(t, configureApi(uffdFd, tt.pagesize)) require.NoError(t, register(uffdFd, memoryStart, uint64(size), UFFDIO_REGISTER_MODE_MISSING|UFFDIO_REGISTER_MODE_WP)) + t.Cleanup(func() { + // Unregister before the close cleanup runs (LIFO). A future test + // that enables UFFD_FEATURE_EVENT_REMOVE would otherwise see + // munmap block on un-acked REMOVE events queued against a + // still-registered range. + assert.NoError(t, unregister(uffdFd, memoryStart, uint64(size))) + }) cmd := exec.CommandContext(ctx, os.Args[0], "-test.run=TestHelperServingProcess", "-test.timeout=0") cmd.Env = append(os.Environ(), envHelperFlag+"=1") @@ -107,11 +115,11 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) require.NoError(t, startErr) } - // Register teardown immediately so any failure between here and the - // final return still reaps the helper process and unregisters the - // userfaultfd. client is captured by reference so the closure can - // pick the graceful (Shutdown) or hard (Kill) path depending on - // how far init progressed. + // child-reap cleanup. client is captured by reference: nil → + // cmd.Process.Kill, non-nil → graceful Shutdown via RPC. LIFO + // ordering with the cleanups above gives child-reap → unregister → + // close, i.e. stop the child's Serve loop before tearing down + // kernel state. var client *testharness.Client t.Cleanup(func() { if client != nil { @@ -129,12 +137,6 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) t.Logf("helper process Wait: %v", waitErr) } } - - // Unregister before close so a future test that enables - // UFFD_FEATURE_EVENT_REMOVE does not see munmap block on - // un-acked REMOVE events queued against the registered range. 
- assert.NoError(t, unregister(uffdFd, memoryStart, uint64(size))) - uffdFd.close() }) parentConn, err := net.FileConn(parentEnd) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go index 50baabf768..d162e922ee 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go @@ -19,21 +19,27 @@ import ( // harnessState is the per-child container shared by Lifecycle / Paging / // Barriers RPCs. Mutable fields are guarded by mu. +// +//nolint:containedctx // shutdown-aware ctx is shared with RPC handlers; lifetime is the child process. type harnessState struct { uffdFd uintptr - mu sync.Mutex - uffd *Userfaultfd - br *testharness.Registry - stop func() // serve-stop fn; nil when paused - shutdown chan struct{} - closed bool + mu sync.Mutex + uffd *Userfaultfd + br *testharness.Registry + stop func() // serve-stop fn; nil when paused + ctx context.Context + cancel context.CancelFunc + closed bool } func newHarnessState(uffdFd uintptr) *harnessState { + ctx, cancel := context.WithCancel(context.Background()) + return &harnessState{ - uffdFd: uffdFd, - shutdown: make(chan struct{}), + uffdFd: uffdFd, + ctx: ctx, + cancel: cancel, } } @@ -153,7 +159,7 @@ func (l *Lifecycle) Shutdown(_ *testharness.Empty, _ *testharness.Empty) error { defer l.state.mu.Unlock() if !l.state.closed { l.state.closed = true - close(l.state.shutdown) + l.state.cancel() } return nil @@ -195,12 +201,17 @@ func (p *Paging) Resume(_ *testharness.Empty, _ *testharness.Empty) error { } // pageStateEntries returns a snapshot of every tracked page and its state in -// testharness wire format. Holds settleRequests.Lock for the duration so no -// fault worker is in flight; mirrors PrefetchData. +// testharness wire format. 
Holds settleRequests.Lock to fence fault workers +// (mirrors PrefetchData) plus pageTracker.mu.RLock defensively, so a future +// writer that mutates pageTracker.m without going through settleRequests +// (e.g. a REMOVE event handler) cannot race this snapshot. func (u *Userfaultfd) pageStateEntries() ([]testharness.PageStateEntry, error) { u.settleRequests.Lock() defer u.settleRequests.Unlock() + u.pageTracker.mu.RLock() + defer u.pageTracker.mu.RUnlock() + entries := make([]testharness.PageStateEntry, 0, len(u.pageTracker.m)) for addr, state := range u.pageTracker.m { offset, err := u.ma.GetOffset(addr) @@ -235,7 +246,7 @@ func (b *Barriers) WaitHeld(args *testharness.TokenArgs, _ *testharness.Empty) e return err } - return br.WaitArrived(context.Background(), args.Token) + return br.WaitArrived(b.state.ctx, args.Token) } func (b *Barriers) Release(args *testharness.TokenArgs, _ *testharness.Empty) error { From 4681ab2085a628a7008fba00181b25d42f9f2cab Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Sat, 2 May 2026 00:18:29 -0700 Subject: [PATCH 19/22] test(uffd): restore TestParallelMissing/Write parallelOperations to 1_000_000 Reverts the 1M -> 10K reduction made in #2461. Intermittent failures under 1M are bugs to be hunted, not silenced by lowering the load. The other parallel/serial variants in these files have legitimately been at 10K since their introduction in November 2025; only the two TestParallelMissing(/Write) cases were reduced. 
--- .../orchestrator/pkg/sandbox/uffd/userfaultfd/missing_test.go | 2 +- .../pkg/sandbox/uffd/userfaultfd/missing_write_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_test.go index 989915bfd3..d2ae6c2484 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_test.go @@ -141,7 +141,7 @@ func TestMissing(t *testing.T) { func TestParallelMissing(t *testing.T) { t.Parallel() - parallelOperations := 10_000 + parallelOperations := 1_000_000 tt := testConfig{ pagesize: header.PageSize, diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_write_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_write_test.go index caa835cbdb..184eb7f28a 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_write_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_write_test.go @@ -136,7 +136,7 @@ func TestMissingWrite(t *testing.T) { func TestParallelMissingWrite(t *testing.T) { t.Parallel() - parallelOperations := 10_000 + parallelOperations := 1_000_000 tt := testConfig{ pagesize: header.PageSize, From d264863defe387c146af5934be115f78d8a88ab9 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Sat, 2 May 2026 00:35:27 -0700 Subject: [PATCH 20/22] test(uffd): restore TestSerial{Missing,MissingWrite} and TestParallelMissingWriteWithPrefault loads to 1_000_000 Continues the regression sweep started in 4681ab208. The original PR #1415 (88c396093, Nov 3 2025) introduced these tests at 1_000_000 operations. PR #1450 (6ee2ebb63, Nov 7 2025, "Modify UFFD tests to run serve loop in a separate process") cut them all to 10_000 in the same commit that switched from in-process to cross-process tests, with no commit message justification. 
The cross-process refactor itself does not require lower iteration counts: the per-iteration work is still in-process memory access; only the page-state snapshot RPC is cross-process and is called once per test. Reverts: - TestSerialMissing: 10_000 -> 1_000_000 - TestSerialMissingWrite: 10_000 -> 1_000_000 - TestParallelMissingWriteWithPrefault: 10_000 -> 1_000_000 Per-test wall time on this branch (sudo go test, no -race): - TestSerialMissing ~3s - TestSerialMissingWrite ~3s - TestParallelMissingWriteWithPrefault ~2s TestParallelMissingWithPrefault stays at 10_000: it was only ever parallelOperations := 10 in #1415, then bumped to 10_000 in #1450, so 10_000 is already above its historical baseline. --- .../orchestrator/pkg/sandbox/uffd/userfaultfd/missing_test.go | 2 +- .../pkg/sandbox/uffd/userfaultfd/missing_write_test.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_test.go index d2ae6c2484..6aeb8d1244 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_test.go @@ -218,7 +218,7 @@ func TestParallelMissingWithPrefault(t *testing.T) { func TestSerialMissing(t *testing.T) { t.Parallel() - serialOperations := 10_000 + serialOperations := 1_000_000 tt := testConfig{ pagesize: header.PageSize, diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_write_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_write_test.go index 184eb7f28a..ff9ca621d5 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_write_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/missing_write_test.go @@ -173,7 +173,7 @@ func TestParallelMissingWrite(t *testing.T) { func TestParallelMissingWriteWithPrefault(t *testing.T) { t.Parallel() - parallelOperations := 10_000 + 
parallelOperations := 1_000_000 tt := testConfig{ pagesize: header.PageSize, @@ -213,7 +213,7 @@ func TestParallelMissingWriteWithPrefault(t *testing.T) { func TestSerialMissingWrite(t *testing.T) { t.Parallel() - serialOperations := 10_000 + serialOperations := 1_000_000 tt := testConfig{ pagesize: header.PageSize, From 7683f88d7e4330d69391e50e7d9dc1ee751ed2a3 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Sat, 2 May 2026 00:45:17 -0700 Subject: [PATCH 21/22] fix(uffd-tests): release harnessState.mu before blocking on Serve drain Address review findings from PR #2519: * rpc_services_test.go: stopServe now snapshots and clears s.stop under the lock, then drops the lock before calling stop() (which blocks on <-done). Holding s.mu across the drain blocked every concurrent RPC handler that takes s.mu, and would deadlock the upcoming race tests where WaitFaultHeld needs s.mu while a worker is parked at a barrier waiting for a paired Pause to drain. * harness_child_test.go: guard the child's rpc conn close with a sync.Once so the deferred safety-net close (needed for early returns from rpc.Register) does not double-close after the explicit close that unblocks server.ServeCodec on the success path. Audit of the rest of rpc_services_test.go found no other lock-during-blocking patterns: releaseAllBarriers, Paging.States, and Barriers.{Install,WaitHeld,Release} already snapshot under the lock and call out unlocked; Lifecycle.{Bootstrap,Shutdown,Resume} only do non-blocking work under the lock (channel send via context.cancel, fdexit.New, goroutine spawn). 
--- .../uffd/userfaultfd/harness_child_test.go | 10 ++++++++-- .../uffd/userfaultfd/rpc_services_test.go | 17 +++++++++++++---- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go index b6909757a4..155245a172 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go @@ -6,6 +6,7 @@ import ( "net/rpc" "net/rpc/jsonrpc" "os" + "sync" "testing" ) @@ -38,7 +39,12 @@ func crossProcessServe() error { if err != nil { return fmt.Errorf("net.FileConn rpc: %w", err) } - defer conn.Close() + // Close once: the explicit close before <-codecDone unblocks + // server.ServeCodec on the success path; the deferred close is the + // safety net for early returns from the rpc.Register calls below. + var closeConnOnce sync.Once + closeConn := func() { closeConnOnce.Do(func() { _ = conn.Close() }) } + defer closeConn() state := newHarnessState(uffdFile.Fd()) @@ -70,7 +76,7 @@ func crossProcessServe() error { state.releaseAllBarriers() state.stopServe() - _ = conn.Close() + closeConn() <-codecDone return nil diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go index d162e922ee..1d45bb9051 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go @@ -76,11 +76,20 @@ func (s *harnessState) startServeLocked() error { } func (s *harnessState) stopServe() { + // Snapshot s.stop and clear it under the lock, then drop the lock + // before calling stop() — stop() blocks on <-done draining the Serve + // goroutine, and any concurrent RPC handler that needs s.mu + // (Barriers.registry, Paging.States/Resume, 
Lifecycle.Bootstrap/Shutdown) + // would otherwise be blocked for the full drain. Required for the + // upcoming barrier-race RPCs where WaitFaultHeld must run while a + // worker parked at a barrier holds Pause's drain. s.mu.Lock() - defer s.mu.Unlock() - if s.stop != nil { - s.stop() - s.stop = nil + stop := s.stop + s.stop = nil + s.mu.Unlock() + + if stop != nil { + stop() } } From fa7d2eddf15ce0a83305f7d9a54964488380e1c1 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Sat, 2 May 2026 01:10:47 -0700 Subject: [PATCH 22/22] chore(uffd-tests): drop non-load-bearing comments and unreferenced helpers Pure cleanup; no behavior change. - trim file-header and method docs across testharness/ + harness_*_test.go - inline releaseAllBarriers (single caller) - preserve drain-fence and REMOVE-defensive WHYs on pageStateEntries --- .../uffd/testutils/testharness/barriers.go | 34 ++++-------- .../uffd/testutils/testharness/client.go | 6 +- .../uffd/testutils/testharness/wire.go | 15 ++--- .../uffd/userfaultfd/harness_child_test.go | 21 ++++--- .../uffd/userfaultfd/harness_parent_test.go | 26 ++------- .../sandbox/uffd/userfaultfd/helpers_test.go | 19 +------ .../uffd/userfaultfd/rpc_services_test.go | 55 +++++-------------- .../sandbox/uffd/userfaultfd/userfaultfd.go | 6 +- 8 files changed, 53 insertions(+), 129 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/barriers.go b/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/barriers.go index 60cc68b9ac..6094f4b74f 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/barriers.go +++ b/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/barriers.go @@ -6,24 +6,18 @@ import ( "sync" ) -// Point identifies WHICH worker hook a barrier should park on. Values -// must match the parent package's faultPhase iota so the test hook can -// pass them through with a numeric cast. +// Point identifies which worker hook to park on. 
Values must match the +// parent package's faultPhase iota so the hook can cast across. type Point uint8 const ( - // BeforeRLock parks the worker BEFORE settleRequests.RLock(), so a - // parallel writer can take the write lock immediately. + // BeforeRLock parks before settleRequests.RLock. BeforeRLock Point = iota - // BeforeFaultPage parks the worker AFTER settleRequests.RLock but - // BEFORE the UFFDIO_COPY syscall, so a parent operation must still - // return even though a worker holds RLock. + // BeforeFaultPage parks after settleRequests.RLock, before UFFDIO_COPY. BeforeFaultPage ) -// Registry is the child-process side of the barrier mechanism. The -// per-fault hook on *Userfaultfd consults it by (addr, point) to decide -// whether to park. +// Registry is the child-side barrier store consulted by the per-fault hook. type Registry struct { mu sync.Mutex next uint64 @@ -51,7 +45,6 @@ func NewRegistry() *Registry { } } -// Install registers a barrier at (addr, point) and returns its token. func (r *Registry) Install(addr uintptr, point Point) uint64 { r.mu.Lock() defer r.mu.Unlock() @@ -82,8 +75,6 @@ func (r *Registry) lookupByAddr(addr uintptr, point Point) *slot { return r.tokens[token] } -// WaitArrived blocks until the worker hook for token has reached the -// barrier point, or until ctx is cancelled. func (r *Registry) WaitArrived(ctx context.Context, token uint64) error { r.mu.Lock() s, ok := r.tokens[token] @@ -100,16 +91,14 @@ func (r *Registry) WaitArrived(ctx context.Context, token uint64) error { } } -// Release frees the barrier identified by token, allowing any parked -// worker to proceed. Releasing an unknown token is a no-op. +// Release frees the barrier; unknown token is a no-op. func (r *Registry) Release(token uint64) { r.mu.Lock() s, ok := r.tokens[token] delete(r.tokens, token) if ok { - // A later Install at the same (addr, point) overwrites byKey; - // do not clobber that newer mapping when releasing this older - // token. 
+ // A later Install at this key overwrites byKey; only delete if + // it still maps to this token. k := key{s.addr, s.point} if r.byKey[k] == token { delete(r.byKey, k) @@ -128,8 +117,7 @@ func (r *Registry) Release(token uint64) { } } -// ReleaseAll releases every still-installed barrier so any parked -// worker can finish before the child's serve goroutine is joined. +// ReleaseAll releases every still-installed barrier. func (r *Registry) ReleaseAll() { r.mu.Lock() tokens := make([]uint64, 0, len(r.tokens)) @@ -143,8 +131,8 @@ func (r *Registry) ReleaseAll() { } } -// Hook returns the per-fault hook tests install on *Userfaultfd. Faults -// at (addr, point) pairs without an Install'd slot are no-ops. +// Hook returns the per-fault hook to install on *Userfaultfd; faults +// without an installed slot are no-ops. func (r *Registry) Hook() func(addr uintptr, point Point) { return func(addr uintptr, point Point) { s := r.lookupByAddr(addr, point) diff --git a/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/client.go b/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/client.go index 666b5926a3..ad1f0a6a05 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/client.go +++ b/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/client.go @@ -7,15 +7,13 @@ import ( ) // Client is the typed parent-side wrapper around the JSON-RPC channel -// to the child helper process. It hides method-name strings, the Empty -// placeholder pointers, and the wire-vs-domain type translation. +// to the child helper process. type Client struct { rpc *rpc.Client conn io.Closer } -// NewClient wraps an already-connected duplex stream (typically one -// half of a socketpair handed off via cmd.ExtraFiles). Closing the +// NewClient wraps an already-connected duplex stream. Closing the // returned Client closes the underlying conn. 
func NewClient(conn io.ReadWriteCloser) *Client { return &Client{ diff --git a/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/wire.go b/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/wire.go index b76a76d59f..7a475b775d 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/wire.go +++ b/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness/wire.go @@ -1,12 +1,10 @@ // Package testharness provides the wire types, typed RPC client, and // barrier registry shared between the parent and child halves of the -// userfaultfd test harness. It deliberately knows nothing about -// *Userfaultfd internals so it can sit alongside the other uffd test -// utilities and never get pulled into a production import path. +// userfaultfd test harness. package testharness -// Empty stands in for net/rpc methods that take or return nothing; -// net/rpc still requires both args and reply to be exported pointers. +// Empty is the placeholder for net/rpc methods that take or return +// nothing; net/rpc still requires both args and reply pointers. type Empty struct{} type BootstrapArgs struct { @@ -14,17 +12,14 @@ type BootstrapArgs struct { Pagesize int64 TotalSize int64 AlwaysWP bool - // Barriers gates the test-only worker hooks. Off by default so - // the worker hot path stays a single nil-pointer load + branch - // in non-race tests. + // Barriers gates the test-only worker hooks (off by default). Barriers bool Content []byte } type BootstrapReply struct{} -// PageStateEntry is the wire form of the parent package's pageState -// enum; the parent translates back at the boundary. +// PageStateEntry is the wire form of the parent package's pageState enum. 
type PageStateEntry struct { State uint8 Offset uint64 diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go index 155245a172..899a2f2b95 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_child_test.go @@ -10,8 +10,6 @@ import ( "testing" ) -// TestHelperServingProcess is the entry point for the child helper -// process spawned by configureCrossProcessTest. func TestHelperServingProcess(t *testing.T) { t.Parallel() @@ -28,8 +26,7 @@ func TestHelperServingProcess(t *testing.T) { } func crossProcessServe() error { - // fork+exec dup3's the parent's ExtraFiles to fd 3 (uffd) and fd 4 - // (rpc socketpair half) with CLOEXEC cleared. + // fork+exec dup3's the parent's ExtraFiles to fd 3 (uffd) and fd 4 (rpc). uffdFile := os.NewFile(uintptr(3), "uffd") defer uffdFile.Close() @@ -39,9 +36,8 @@ func crossProcessServe() error { if err != nil { return fmt.Errorf("net.FileConn rpc: %w", err) } - // Close once: the explicit close before <-codecDone unblocks - // server.ServeCodec on the success path; the deferred close is the - // safety net for early returns from the rpc.Register calls below. + // Explicit close before <-codecDone unblocks ServeCodec on the success + // path; the deferred close is the safety net for early returns. var closeConnOnce sync.Once closeConn := func() { closeConnOnce.Do(func() { _ = conn.Close() }) } defer closeConn() @@ -59,8 +55,7 @@ func crossProcessServe() error { return fmt.Errorf("rpc Register Barriers: %w", err) } - // Run the codec in a goroutine so we can react to Shutdown - // without depending on the codec returning. + // Run the codec in a goroutine so Shutdown can unblock us via ctx. 
codecDone := make(chan struct{}) go func() { defer close(codecDone) @@ -72,8 +67,12 @@ func crossProcessServe() error { case <-codecDone: } - // Release parked barriers so the serve goroutine can drain. - state.releaseAllBarriers() + state.mu.Lock() + br := state.br + state.mu.Unlock() + if br != nil { + br.ReleaseAll() + } state.stopServe() closeConn() diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go index 33f636716e..01baa10f4f 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go @@ -20,7 +20,6 @@ import ( "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness" ) -// MemorySlicer exposes a byte slice via the Slicer interface. type MemorySlicer struct { content []byte pagesize int64 @@ -78,26 +77,21 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) require.NoError(t, configureApi(uffdFd, tt.pagesize)) require.NoError(t, register(uffdFd, memoryStart, uint64(size), UFFDIO_REGISTER_MODE_MISSING|UFFDIO_REGISTER_MODE_WP)) t.Cleanup(func() { - // Unregister before the close cleanup runs (LIFO). A future test - // that enables UFFD_FEATURE_EVENT_REMOVE would otherwise see - // munmap block on un-acked REMOVE events queued against a - // still-registered range. + // Unregister before close (LIFO): a future test enabling + // UFFD_FEATURE_EVENT_REMOVE would otherwise see munmap block on + // un-acked REMOVE events against a still-registered range. 
assert.NoError(t, unregister(uffdFd, memoryStart, uint64(size))) }) cmd := exec.CommandContext(ctx, os.Args[0], "-test.run=TestHelperServingProcess", "-test.timeout=0") cmd.Env = append(os.Environ(), envHelperFlag+"=1") - // F_DUPFD_CLOEXEC dup's atomically with CLOEXEC set, so a concurrent - // fork in another goroutine cannot inherit the dup'd fd before we - // hand it off via ExtraFiles. + // F_DUPFD_CLOEXEC dup's atomically with CLOEXEC set; a concurrent + // fork cannot inherit the fd before we hand it off via ExtraFiles. dup, err := unix.FcntlInt(uintptr(uffdFd), unix.F_DUPFD_CLOEXEC, 0) require.NoError(t, err) uffdFile := os.NewFile(uintptr(dup), "uffd") - // Socketpair gives a connected AF_UNIX pair in one syscall, with both - // ends born CLOEXEC; cmd.ExtraFiles clears CLOEXEC on the child end - // inside ForkExec. fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) require.NoError(t, err) parentEnd := os.NewFile(uintptr(fds[0]), "rpc-parent") @@ -115,11 +109,6 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) require.NoError(t, startErr) } - // child-reap cleanup. client is captured by reference: nil → - // cmd.Process.Kill, non-nil → graceful Shutdown via RPC. LIFO - // ordering with the cleanups above gives child-reap → unregister → - // close, i.e. stop the child's Serve loop before tearing down - // kernel state. 
var client *testharness.Client t.Cleanup(func() { if client != nil { @@ -128,7 +117,7 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) } else { _ = cmd.Process.Kill() } - _ = parentEnd.Close() // idempotent; FileConn dups the underlying fd + _ = parentEnd.Close() waitErr := cmd.Wait() if waitErr != nil { @@ -163,9 +152,6 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) return nil, fmt.Errorf("Lifecycle.Bootstrap: %w", err) } - // Bootstrap is synchronous, so its reply already implies readiness; - // WaitReady is kept as a separate RPC so an async-Bootstrap variant - // can hold the parent here without touching call sites. if err := client.WaitReady(); err != nil { return nil, fmt.Errorf("Lifecycle.WaitReady: %w", err) } diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go index 4e861ea361..443e5cef83 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go @@ -28,9 +28,7 @@ type testConfig struct { operations []operation // alwaysWP makes the handler copy with UFFDIO_COPY_MODE_WP for all faults. alwaysWP bool - // barriers wires up the per-worker fault hooks in the child - // (used by race tests). Off by default so the worker hot path - // stays a single nil-pointer load + branch in non-race tests. + // barriers enables the per-worker fault hook (race tests only). barriers bool } @@ -54,15 +52,10 @@ type operation struct { async bool } -// handlerPageStates groups a pageTracker snapshot by state so follow-up -// PRs can add more per-state fields (e.g. removed) without touching -// existing call sites. type handlerPageStates struct { faulted []uint } -// allAccessed returns the sorted offsets the handler touched in any -// non-missing state. 
func (s handlerPageStates) allAccessed() []uint { return slices.Clone(s.faulted) } @@ -71,16 +64,10 @@ type testHandler struct { memoryArea *[]byte pagesize uint64 data *MemorySlicer - - // client is the typed RPC channel to the child helper process. - // Tests should prefer the convenience methods on testHandler. - client *testharness.Client - - mutex sync.RWMutex + client *testharness.Client + mutex sync.RWMutex } -// pageStates fetches a per-state snapshot of the child's pageTracker -// via the Paging.States RPC. func (h *testHandler) pageStates() (handlerPageStates, error) { entries, err := h.client.PageStates() if err != nil { diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go index 1d45bb9051..380e9c1d2f 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go @@ -1,8 +1,7 @@ package userfaultfd -// RPC service implementations for the cross-process UFFD test harness. -// These live in _test.go (rather than testutils/testharness) because they -// need access to unexported *Userfaultfd internals. +// RPC service implementations for the cross-process UFFD test harness; +// in _test.go because they need *Userfaultfd internals. import ( "context" @@ -17,10 +16,7 @@ import ( "github.com/e2b-dev/infra/packages/shared/pkg/logger" ) -// harnessState is the per-child container shared by Lifecycle / Paging / -// Barriers RPCs. Mutable fields are guarded by mu. -// -//nolint:containedctx // shutdown-aware ctx is shared with RPC handlers; lifetime is the child process. +//nolint:containedctx // shutdown-aware ctx shared with RPC handlers; lifetime is the child process. 
type harnessState struct { uffdFd uintptr @@ -43,10 +39,8 @@ func newHarnessState(uffdFd uintptr) *harnessState { } } -// startServeLocked spawns the uffd Serve goroutine and stores its -// stop fn. Caller must hold s.mu. Idempotent: if a serve goroutine is -// already running (s.stop != nil), a stray duplicate Resume cannot -// leak an untracked Serve goroutine and break later pauses. +// startServeLocked is idempotent so a stray duplicate Resume cannot +// leak an untracked Serve goroutine. Caller must hold s.mu. func (s *harnessState) startServeLocked() error { if s.stop != nil { return nil @@ -76,13 +70,9 @@ func (s *harnessState) startServeLocked() error { } func (s *harnessState) stopServe() { - // Snapshot s.stop and clear it under the lock, then drop the lock - // before calling stop() — stop() blocks on <-done draining the Serve - // goroutine, and any concurrent RPC handler that needs s.mu - // (Barriers.registry, Paging.States/Resume, Lifecycle.Bootstrap/Shutdown) - // would otherwise be blocked for the full drain. Required for the - // upcoming barrier-race RPCs where WaitFaultHeld must run while a - // worker parked at a barrier holds Pause's drain. + // Drop s.mu before stop() — stop() blocks on the Serve drain, and any + // concurrent RPC handler needing s.mu (e.g. WaitFaultHeld during a + // parked barrier) would otherwise stall until the drain completes. s.mu.Lock() stop := s.stop s.stop = nil @@ -93,18 +83,6 @@ func (s *harnessState) stopServe() { } } -func (s *harnessState) releaseAllBarriers() { - s.mu.Lock() - br := s.br - s.mu.Unlock() - if br != nil { - br.ReleaseAll() - } -} - -// Lifecycle owns boot/shutdown of the in-child uffd. Shutdown signals -// crossProcessServe to exit, where the serve goroutine and barrier -// registry are torn down outside any mutex. 
type Lifecycle struct { state *harnessState } @@ -156,9 +134,8 @@ func (l *Lifecycle) Bootstrap(args *testharness.BootstrapArgs, _ *testharness.Bo return l.state.startServeLocked() } -// WaitReady is a no-op today: Bootstrap is synchronous so its reply -// already implies readiness. Kept as a separate RPC so an async -// Bootstrap variant can hold the parent here without changing callers. +// WaitReady is a no-op today (Bootstrap is synchronous); kept as a separate +// RPC so an async-Bootstrap variant can hold the parent here unchanged. func (l *Lifecycle) WaitReady(_ *testharness.Empty, _ *testharness.Empty) error { return nil } @@ -174,7 +151,6 @@ func (l *Lifecycle) Shutdown(_ *testharness.Empty, _ *testharness.Empty) error { return nil } -// Paging exposes page-state introspection and pause/resume controls. type Paging struct { state *harnessState } @@ -209,11 +185,10 @@ func (p *Paging) Resume(_ *testharness.Empty, _ *testharness.Empty) error { return p.state.startServeLocked() } -// pageStateEntries returns a snapshot of every tracked page and its state in -// testharness wire format. Holds settleRequests.Lock to fence fault workers -// (mirrors PrefetchData) plus pageTracker.mu.RLock defensively, so a future -// writer that mutates pageTracker.m without going through settleRequests -// (e.g. a REMOVE event handler) cannot race this snapshot. +// pageStateEntries returns a wire-format snapshot of pageTracker. +// settleRequests.Lock drains fault workers (mirrors PrefetchData); +// pageTracker.mu.RLock is defensive against a future REMOVE writer +// that mutates pageTracker.m outside settleRequests. 
func (u *Userfaultfd) pageStateEntries() ([]testharness.PageStateEntry, error) { u.settleRequests.Lock() defer u.settleRequests.Unlock() @@ -233,8 +208,6 @@ func (u *Userfaultfd) pageStateEntries() ([]testharness.PageStateEntry, error) { return entries, nil } -// Barriers is the thin RPC wrapper exposing testharness.Registry to -// the parent; locking and lifecycle live in testharness. type Barriers struct { state *harnessState } diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go index aeb2c7a5d7..43b9fee492 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go @@ -64,15 +64,13 @@ type Userfaultfd struct { // defaultCopyMode overrides the UFFDIO_COPY mode for all faults when non-zero. defaultCopyMode CULong - // installed only by SetTestFaultHook in test builds; nil in production. + // testFaultHook is set only by SetTestFaultHook in test builds. testFaultHook atomic.Pointer[func(uintptr, faultPhase)] logger logger.Logger } -// faultPhase identifies WHEN inside the worker the test-only fault -// hook is invoked. testFaultHook is nil in production, so the -// per-fault cost is a single atomic load + nil check per call site. +// faultPhase identifies the worker fault hook call site (test-only). type faultPhase uint8 const (