Skip to content

Commit 9ce0941

Browse files
committed
feat(uffd): add UFFD_EVENT_REMOVE handling with matrix tests
Production: - UFFDIO_REGISTER_MODE_REMOVE is requested so the kernel reports MADV_DONTNEED'd pages via UFFD_EVENT_REMOVE. - Userfaultfd.Serve splits read events into removes + pagefaults, drains the REMOVE batch under settleRequests.Lock (calling pageTracker.SetRange(.., removed) with BlockIdx-computed indices), then dispatches the pagefault batch. - Worker dispatch switches on pageTracker.Get(idx): faulted -> short-circuit, removed -> zero-fill (source = nil), missing -> copy from u.src. The state read happens inside the worker under settleRequests.RLock so a concurrent REMOVE can't slip between the read and the install. - faultPage gains zero-fill paths for source == nil (4K read = DONTWAKE zero + WP + wake; 4K write = zero + wake; hugepage = copy(EmptyHugePage)) and returns (handled, err) so the worker can defer UFFDIO_COPY EAGAIN back into a deferredFaults queue. - wakeupPipe + deferredFaults wake the poll loop when a worker defers, so a deferred fault doesn't sit waiting for an unrelated UFFD event. The received uffd fd is marked FD_CLOEXEC. - Prefault short-circuits for faulted || removed. Tests: - testConfig gains removeEnabled; the parent unregisters the UFFD region on cleanup when REMOVE is on so munmap doesn't block on un-acked events. - Page-state wire format exposes removed via helpers_test.go. - operationModeRemove + executeRemove (madvise MADV_DONTNEED). - runMatrix wraps every existing generic test in remove-off and remove-on subtests so the no-REMOVE path (still used by production templates) stays covered while the new path is exercised. The matrix-level t.Parallel() is intentionally omitted to cap peak concurrency in CI. - remove_test.go: TestRemove, TestRemoveThenFault, TestRemoveThenWriteGated, TestWriteThenRemoveGated. Gated tests are //nolint:tparallel — a paused gated handler keeps a faulting goroutine suspended in the kernel pagefault path; a STW GC pause from a parallel test would wait forever for that goroutine to reach a safe point. - race_test.go: deterministic stale-source / madvise-deadlock / faulted-short-circuit regressions, serialised, with the FD_CLOEXEC and UFFDIO_COPY-EAGAIN fixes covered.
1 parent bf4fc62 commit 9ce0941

13 files changed

Lines changed: 1491 additions & 365 deletions

File tree

packages/orchestrator/pkg/sandbox/uffd/userfaultfd/async_wp_test.go

Lines changed: 26 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -289,38 +289,42 @@ func TestAsyncWriteProtection(t *testing.T) {
289289
t.Run(tt.name, func(t *testing.T) {
290290
t.Parallel()
291291

292-
h, err := configureCrossProcessTest(t.Context(), t, testConfig{
292+
runMatrix(t, testConfig{
293293
pagesize: tt.pagesize,
294294
numberOfPages: tt.numberOfPages,
295295
alwaysWP: tt.alwaysWP,
296-
})
297-
require.NoError(t, err)
296+
}, func(t *testing.T, cfg testConfig) {
297+
t.Helper()
298+
299+
h, err := configureCrossProcessTest(t.Context(), t, cfg)
300+
require.NoError(t, err)
298301

299-
h.executeAll(t, tt.operations)
302+
h.executeAll(t, tt.operations)
300303

301-
pagemap, err := testutils.NewPagemapReader()
302-
require.NoError(t, err)
303-
defer pagemap.Close()
304+
pagemap, err := testutils.NewPagemapReader()
305+
require.NoError(t, err)
306+
defer pagemap.Close()
304307

305-
memStart := uintptr(unsafe.Pointer(&(*h.memoryArea)[0]))
308+
memStart := uintptr(unsafe.Pointer(&(*h.memoryArea)[0]))
306309

307-
for _, p := range tt.expectedDirty {
308-
addr := memStart + uintptr(p)*uintptr(tt.pagesize)
309-
entry, err := pagemap.ReadEntry(addr)
310-
require.NoError(t, err, "pagemap read for dirty page %d", p)
310+
for _, p := range tt.expectedDirty {
311+
addr := memStart + uintptr(p)*uintptr(tt.pagesize)
312+
entry, err := pagemap.ReadEntry(addr)
313+
require.NoError(t, err, "pagemap read for dirty page %d", p)
311314

312-
assert.True(t, entry.IsPresent(), "dirty page %d should be present", p)
313-
assert.False(t, entry.IsWriteProtected(), "dirty page %d should have WP cleared", p)
314-
}
315+
assert.True(t, entry.IsPresent(), "dirty page %d should be present", p)
316+
assert.False(t, entry.IsWriteProtected(), "dirty page %d should have WP cleared", p)
317+
}
315318

316-
for _, p := range tt.expectedClean {
317-
addr := memStart + uintptr(p)*uintptr(tt.pagesize)
318-
entry, err := pagemap.ReadEntry(addr)
319-
require.NoError(t, err, "pagemap read for clean page %d", p)
319+
for _, p := range tt.expectedClean {
320+
addr := memStart + uintptr(p)*uintptr(tt.pagesize)
321+
entry, err := pagemap.ReadEntry(addr)
322+
require.NoError(t, err, "pagemap read for clean page %d", p)
320323

321-
assert.True(t, entry.IsPresent(), "clean page %d should be present", p)
322-
assert.True(t, entry.IsWriteProtected(), "clean page %d should have WP set", p)
323-
}
324+
assert.True(t, entry.IsPresent(), "clean page %d should be present", p)
325+
assert.True(t, entry.IsWriteProtected(), "clean page %d should have WP set", p)
326+
}
327+
})
324328
})
325329
}
326330
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
package userfaultfd
2+
3+
import "sync"
4+
5+
// deferredFaults collects pagefaults that returned EAGAIN so they get
6+
// retried on the next poll iteration. Safe for concurrent push.
7+
type deferredFaults struct {
8+
mu sync.Mutex
9+
pf []*UffdPagefault
10+
}
11+
12+
func (d *deferredFaults) push(pf *UffdPagefault) {
13+
d.mu.Lock()
14+
d.pf = append(d.pf, pf)
15+
d.mu.Unlock()
16+
}
17+
18+
func (d *deferredFaults) drain() []*UffdPagefault {
19+
d.mu.Lock()
20+
out := d.pf
21+
d.pf = nil
22+
d.mu.Unlock()
23+
24+
return out
25+
}

packages/orchestrator/pkg/sandbox/uffd/userfaultfd/fd.go

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -146,18 +146,29 @@ func getPagefaultAddress(pagefault *UffdPagefault) uintptr {
146146
// Fd is a helper type that wraps uffd fd.
147147
type Fd uintptr
148148

149-
// mode: UFFDIO_COPY_MODE_WP
150-
// When we use both missing and wp, we need to use UFFDIO_COPY_MODE_WP, otherwise copying would unprotect the page
149+
// copy requires UFFDIO_COPY_MODE_WP when both MISSING and WP tracking are active.
151150
func (f Fd) copy(addr, pagesize uintptr, data []byte, mode CULong) error {
152151
cpy := newUffdioCopy(data, CULong(addr)&^CULong(pagesize-1), CULong(pagesize), mode, 0)
153152

154153
if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(f), UFFDIO_COPY, uintptr(unsafe.Pointer(&cpy))); errno != 0 {
155154
return errno
156155
}
157156

158-
// Check if the copied size matches the requested pagesize
159-
if cpy.copy != CLong(pagesize) {
160-
return fmt.Errorf("UFFDIO_COPY copied %d bytes, expected %d", cpy.copy, pagesize)
157+
return classifyCopyResult(int64(cpy.copy), int64(pagesize))
158+
}
159+
160+
// classifyCopyResult turns the UFFDIO_COPY cpy.copy field into a Go error.
161+
// The kernel encodes a negated errno on failure (most commonly -EAGAIN when
162+
// mmap_changing is set), and a short positive count when the copy was
163+
// preempted mid-page (e.g. hugetlb). Both partial outcomes surface as
164+
// EAGAIN so the caller drops the fault and lets the kernel redeliver.
165+
func classifyCopyResult(bytesCopied, pagesize int64) error {
166+
if bytesCopied < 0 {
167+
return syscall.Errno(-bytesCopied)
168+
}
169+
170+
if bytesCopied != pagesize {
171+
return syscall.EAGAIN
161172
}
162173

163174
return nil
@@ -170,7 +181,6 @@ func (f Fd) zero(addr, pagesize uintptr, mode CULong) error {
170181
return errno
171182
}
172183

173-
// Check if the bytes actually zeroed out by the kernel match the page size
174184
if zero.zeropage != CLong(pagesize) {
175185
return fmt.Errorf("UFFDIO_ZEROPAGE copied %d bytes, expected %d", zero.zeropage, pagesize)
176186
}

packages/orchestrator/pkg/sandbox/uffd/userfaultfd/fd_helpers_test.go

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,7 @@ import (
88
"github.com/e2b-dev/infra/packages/shared/pkg/storage/header"
99
)
1010

11-
// Used for testing.
12-
// flags: syscall.O_CLOEXEC|syscall.O_NONBLOCK
11+
// Used for testing (flags: syscall.O_CLOEXEC|syscall.O_NONBLOCK).
1312
func newFd(flags uintptr) (Fd, error) {
1413
uffd, _, errno := syscall.Syscall(NR_userfaultfd, flags, 0, 0)
1514
if errno != 0 {
@@ -19,18 +18,20 @@ func newFd(flags uintptr) (Fd, error) {
1918
return Fd(uffd), nil
2019
}
2120

22-
// Used for testing
23-
// features: UFFD_FEATURE_MISSING_HUGETLBFS
24-
// This is already called by the FC
25-
func configureApi(f Fd, pagesize uint64) error {
21+
// configureApi is used for testing. The caller (FC in production) sets
22+
// UFFD_FEATURE_MISSING_HUGETLBFS only when hugepages are in use.
23+
func configureApi(f Fd, pagesize uint64, removeEnabled bool) error {
2624
var features CULong
2725

28-
// Only set the hugepage feature if we're using hugepages
26+
// Only set the hugepage feature if we're using hugepages.
2927
if pagesize == header.HugepageSize {
3028
features |= UFFD_FEATURE_MISSING_HUGETLBFS
3129
}
3230

3331
features |= UFFD_FEATURE_WP_ASYNC
32+
if removeEnabled {
33+
features |= UFFD_FEATURE_EVENT_REMOVE
34+
}
3435

3536
api := newUffdioAPI(UFFD_API, features)
3637
ret, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(f), UFFDIO_API, uintptr(unsafe.Pointer(&api)))
@@ -42,9 +43,9 @@ func configureApi(f Fd, pagesize uint64) error {
4243
}
4344

4445
// unregister tears down the UFFD registration over [addr, addr+size).
45-
// Used in test cleanup so that any in-flight REMOVE events the kernel
46-
// may have queued (once UFFD_FEATURE_EVENT_REMOVE is enabled in a
47-
// follow-up) don't keep munmap blocked on un-acked events.
46+
// Used in test cleanup so in-flight REMOVE events queued by the kernel
47+
// (configureApi enables UFFD_FEATURE_EVENT_REMOVE when removeEnabled)
48+
// don't keep munmap blocked on un-acked events.
4849
func unregister(f Fd, addr uintptr, size uint64) error {
4950
r := newUffdioRange(CULong(addr), CULong(size))
5051

@@ -56,9 +57,8 @@ func unregister(f Fd, addr uintptr, size uint64) error {
5657
return nil
5758
}
5859

59-
// mode: UFFDIO_REGISTER_MODE_WP|UFFDIO_REGISTER_MODE_MISSING
60-
// This is already called by the FC, but only with the UFFDIO_REGISTER_MODE_MISSING
61-
// We need to call it with UFFDIO_REGISTER_MODE_WP when we use both missing and wp
60+
// register is used for testing. FC uses UFFDIO_REGISTER_MODE_MISSING; we add
61+
// UFFDIO_REGISTER_MODE_WP when both missing and write-protection are needed.
6262
func register(f Fd, addr uintptr, size uint64, mode CULong) error {
6363
register := newUffdioRegister(CULong(addr), CULong(size), mode)
6464

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
package userfaultfd
2+
3+
import (
4+
"syscall"
5+
"testing"
6+
7+
"github.com/stretchr/testify/assert"
8+
)
9+
10+
// TestClassifyCopyResult pins the kernel partial-copy convention used by
11+
// UFFDIO_COPY: negative cpy.copy carries negated errno; a short positive
12+
// copy is treated as EAGAIN so the caller drops the fault and redelivers.
13+
func TestClassifyCopyResult(t *testing.T) {
14+
t.Parallel()
15+
16+
const fourKi = int64(4096)
17+
const twoMi = int64(2 * 1024 * 1024)
18+
19+
tests := []struct {
20+
name string
21+
bytesCopied int64
22+
pagesize int64
23+
wantErr error
24+
wantEAGAIN bool
25+
}{
26+
{
27+
name: "full 4k copy succeeds",
28+
bytesCopied: fourKi,
29+
pagesize: fourKi,
30+
wantErr: nil,
31+
},
32+
{
33+
name: "kernel convention -EAGAIN surfaces as EAGAIN",
34+
bytesCopied: -int64(syscall.EAGAIN),
35+
pagesize: fourKi,
36+
wantEAGAIN: true,
37+
},
38+
{
39+
name: "zero bytes copied surfaces as EAGAIN (matches Firecracker bytes_copied==0)",
40+
bytesCopied: 0,
41+
pagesize: fourKi,
42+
wantEAGAIN: true,
43+
},
44+
{
45+
name: "partial positive copy on hugepage surfaces as EAGAIN",
46+
bytesCopied: fourKi,
47+
pagesize: twoMi,
48+
wantEAGAIN: true,
49+
},
50+
{
51+
name: "kernel convention -EFAULT surfaces as EFAULT (still fatal upstream)",
52+
bytesCopied: -int64(syscall.EFAULT),
53+
pagesize: fourKi,
54+
wantErr: syscall.EFAULT,
55+
},
56+
}
57+
58+
for _, tc := range tests {
59+
t.Run(tc.name, func(t *testing.T) {
60+
t.Parallel()
61+
62+
err := classifyCopyResult(tc.bytesCopied, tc.pagesize)
63+
if tc.wantEAGAIN {
64+
assert.ErrorIs(t, err, syscall.EAGAIN)
65+
66+
return
67+
}
68+
assert.Equal(t, tc.wantErr, err)
69+
})
70+
}
71+
}

packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig)
6464

6565
data := RandomPages(tt.pagesize, tt.numberOfPages)
6666

67+
if tt.sourcePatcher != nil {
68+
tt.sourcePatcher(data.Content())
69+
}
70+
6771
size, err := data.Size()
6872
require.NoError(t, err)
6973

@@ -74,7 +78,7 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig)
7478
require.NoError(t, err)
7579
t.Cleanup(func() { uffdFd.close() })
7680

77-
require.NoError(t, configureApi(uffdFd, tt.pagesize))
81+
require.NoError(t, configureApi(uffdFd, tt.pagesize, tt.removeEnabled))
7882
require.NoError(t, register(uffdFd, memoryStart, uint64(size), UFFDIO_REGISTER_MODE_MISSING|UFFDIO_REGISTER_MODE_WP))
7983
t.Cleanup(func() {
8084
// Unregister before close (LIFO): a future test enabling

0 commit comments

Comments
 (0)