@@ -15,28 +15,16 @@ import (
1515 "github.com/e2b-dev/infra/packages/shared/pkg/storage/header"
1616)
1717
18- // raceHappyPathBudget bounds every race test in this file. The whole
19- // point of these tests is that they detect a regression as a fast,
20- // targeted assertion rather than as a CI -timeout 30m hang. None of
21- // these tests should approach this budget on a healthy build.
22- const raceHappyPathBudget = 5 * time .Second
23-
24- // barrierArrivalDeadline is how long the test will wait for a worker
25- // to reach an installed barrier. The hook fires the first thing in
26- // the worker goroutine, so on a healthy build it's a sub-millisecond
27- // rendezvous over the unix-socket RPC. Anything approaching this
28- // deadline means the handler dispatch is wedged.
29- const barrierArrivalDeadline = 2 * time .Second
30-
31- // madviseBudget is how long we allow MADV_DONTNEED to spend in the
32- // kernel after we've parked a worker mid-handler. The fix guarantees
33- // madvise unblocks as soon as the handler drains the REMOVE event
34- // from the uffd fd, regardless of any worker holding RLock —
35- // readEvents requires no lock.
36- const madviseBudget = 2 * time .Second
37-
38- // withRaceContext bounds a single race test to raceHappyPathBudget,
39- // failing with a clear "deadlock" message if the budget is exceeded.
18+ // Bounded budgets so a regression surfaces as a fast assertion, not a
19+ // 30-minute CI hang. madviseBudget is the load-bearing one: madvise must
20+ // return as soon as the handler drains the REMOVE event, which requires
21+ // no lock — coupling readEvents to settleRequests would push us past it.
22+ const (
23+ raceHappyPathBudget = 5 * time .Second
24+ barrierArrivalDeadline = 2 * time .Second
25+ madviseBudget = 2 * time .Second
26+ )
27+
4028func withRaceContext (t * testing.T , body func (ctx context.Context )) {
4129 t .Helper ()
4230
@@ -56,29 +44,17 @@ func withRaceContext(t *testing.T, body func(ctx context.Context)) {
5644 }
5745}
5846
59- // TestStaleSourceRaceMissingAndRemove is the deterministic regression
60- // test for the production fix in Serve():
61- //
62- // - Pre-fix the parent serve loop captured `state == missing` and
63- // `source = u.src` BEFORE handing the work to a worker goroutine.
64- // A REMOVE event for the same page that arrived between then and
65- // the worker actually running would silently leave the worker
66- // with a stale `source = u.src` snapshot, which it would then
67- // UFFDIO_COPY into the page that the kernel had just unmapped.
47+ // TestStaleSourceRaceMissingAndRemove deterministically reproduces the
48+ // stale-source race: a worker that captured state == missing in the
49+ // parent loop must not UFFDIO_COPY u.src after a concurrent REMOVE has
50+ // transitioned the page to removed. The test plants a non-zero
51+ // sentinel into source data, parks the worker at faultPhaseBeforeRLock,
52+ // fires MADV_DONTNEED on the same page, releases the worker, and
53+ // asserts the resulting page is zero-filled (regression: page[0]
54+ // equals the sentinel).
6855//
69- // - Post-fix the worker reads pageTracker state INSIDE the
70- // goroutine, under settleRequests.RLock, atomically with the
71- // decision of which `source` to use.
72- //
73- // The test installs a barrierBeforeRLock on page X (so the worker
74- // for X parks before it can read state), triggers a MISSING-write
75- // fault on X from the parent, waits for the worker to park, fires
76- // MADV_DONTNEED on X (which can take settleRequests.Lock immediately
77- // — no worker holds RLock), and then releases the worker. After
78- // release the worker, post-fix, observes state=removed under RLock
79- // and zero-faults; pre-fix it would have UFFDIO_COPY'd the planted
80- // sentinel byte from u.src. A direct read of the page contents
81- // distinguishes the two outcomes deterministically.
56+ // Both variants fail until the fix in #2512 lands — the failure is
57+ // intentional and demonstrates the stale-source bug.
8258func TestStaleSourceRaceMissingAndRemove (t * testing.T ) {
8359 t .Parallel ()
8460
@@ -95,12 +71,6 @@ func TestStaleSourceRaceMissingAndRemove(t *testing.T) {
9571 t .Parallel ()
9672
9773 withRaceContext (t , func (ctx context.Context ) {
98- // Plant a deterministic, non-zero sentinel as the
99- // first byte of the source data for the page we'll
100- // race on. Pre-fix, the worker would UFFDIO_COPY this
101- // sentinel into the page after the REMOVE has already
102- // unmapped it. Post-fix the worker reads
103- // state == removed under RLock and zero-fills.
10474 const sentinel = byte (0xC3 )
10575 const pageIdx = 1
10676 pageOffset := int64 (pageIdx ) * int64 (tt .pagesize )
@@ -124,69 +94,43 @@ func TestStaleSourceRaceMissingAndRemove(t *testing.T) {
12494 token , err := h .installFaultBarrier (ctx , addr , faultPhaseBeforeRLock )
12595 require .NoError (t , err )
12696
127- // Trigger a READ fault (NOT a write — a write would
128- // overwrite the very byte we want to inspect to
129- // distinguish the two outcomes). h.executeRead does
130- // the touch + content check; we run it in a goroutine
131- // because it blocks on the fault until we release the
132- // barrier.
97+ // READ, not write: a write would overwrite the byte
98+ // we read below to distinguish the two outcomes.
13399 readErrCh := make (chan error , 1 )
134100 go func () {
135101 readErrCh <- h .executeRead (ctx , operation {offset : pageOffset , mode : operationModeRead })
136102 }()
137103
138- // Wait for the worker for `addr` to park at the
139- // pre-RLock barrier.
140104 waitCtx , waitCancel := context .WithTimeout (ctx , barrierArrivalDeadline )
141105 err = h .waitFaultHeld (waitCtx , token )
142106 waitCancel ()
143107 require .NoError (t , err , "worker for page %d (addr %#x) did not park at barrier" , pageIdx , addr )
144108
145- // Fire MADV_DONTNEED on the same page from the
146- // parent. The serve loop can take Lock immediately
147- // because the parked worker has not yet acquired
148- // RLock.
149109 err = h .executeRemove (operation {offset : pageOffset , mode : operationModeRemove })
150110 require .NoError (t , err , "MADV_DONTNEED on page %d did not return — handler dispatch wedged" , pageIdx )
151111
152- // Wait for the handler to commit setState(removed).
153- // A tight poll loop with a hard deadline is used
154- // rather than a sleep — the transition is
155- // microseconds in the happy path.
156112 require .NoError (t , waitForState (ctx , h , uint64 (pageOffset ), removed , barrierArrivalDeadline ),
157113 "handler did not transition page %d to `removed` after MADV_DONTNEED" , pageIdx )
158114
159- // Release the parked worker. Post-fix it will
160- // observe state == removed and zero-fault; pre-fix
161- // it would proceed with the captured stale source.
162115 require .NoError (t , h .releaseFault (ctx , token ))
163116
164117 select {
165- case err := <- readErrCh :
166- // Pre-fix: executeRead's bytes.Equal succeeds
167- // (page contains src bytes), so err == nil but
168- // the page is observably wrong. Post-fix:
169- // bytes.Equal fails (page is zero-filled), so
170- // err != nil. We use the page-content assertion
171- // below instead of relying on this side-channel.
172- _ = err
118+ case <- readErrCh :
119+ // Pre-fix the read sees src bytes (err == nil); post-fix
120+ // it sees zeros (err != nil). The page-content assertion
121+ // below is the bug-detection path; the read just
122+ // completes the fault.
173123 case <- ctx .Done ():
174124 t .Fatalf ("read of page %d did not unblock after barrier release" , pageIdx )
175125 }
176126
177- // THE bug-detection assertion: post-fix the page
178- // MUST be zero-filled. Pre-fix the worker
179- // UFFDIO_COPY'd the planted sentinel.
180127 page := (* h .memoryArea )[pageOffset : pageOffset + int64 (tt .pagesize )]
181128 assert .Equalf (t , byte (0 ), page [0 ],
182- "page %d first byte: want 0 (post-fix zero-fault for `removed` state ), got %#x — " +
183- "if this equals the sentinel %#x, the worker used a stale ` source = u.src` snapshot (regression)" ,
129+ "page %d first byte: want 0 (zero-fault for `removed`), got %#x — " +
130+ "if this equals the sentinel %#x, the worker used a stale source (regression)" ,
184131 pageIdx , page [0 ], sentinel ,
185132 )
186133
187- // Sanity: verify with /proc/self/pagemap that the
188- // page is in fact present after the racing read was
189- // served (worker re-mapped it as zero).
190134 pagemap , err := testutils .NewPagemapReader ()
191135 require .NoError (t , err )
192136 defer pagemap .Close ()
@@ -198,21 +142,11 @@ func TestStaleSourceRaceMissingAndRemove(t *testing.T) {
198142 }
199143}
200144
201- // TestNoMadviseDeadlockWithInflightCopy is a liveness regression test
202- // for the user-visible symptom that originally surfaced the stale-
203- // source race: the orchestrator's parent madvise(MADV_DONTNEED)
204- // blocking forever because the UFFD handler loop was wedged behind a
205- // worker.
206- //
207- // The harness parks the worker AFTER it has taken settleRequests.RLock
208- // AND captured `source` (i.e. as if its UFFDIO_COPY was in flight).
209- // From the parent we then issue MADV_DONTNEED on the same page and
210- // require that madvise returns within `madviseBudget`. madvise
211- // unblocks as soon as the handler's readEvents drains the REMOVE
212- // event, and readEvents requires no lock — so any future change that
213- // accidentally couples readEvents to settleRequests fails this test
214- // at the `madviseBudget` boundary instead of as a 30-minute CI
215- // timeout.
145+ // TestNoMadviseDeadlockWithInflightCopy is the liveness guard for
146+ // MADV_DONTNEED while a worker holds settleRequests.RLock. madvise
147+ // must return within madviseBudget because readEvents drains REMOVE
148+ // events without taking any lock — any future change that couples
149+ // readEvents to settleRequests fails this test at the budget boundary.
216150func TestNoMadviseDeadlockWithInflightCopy (t * testing.T ) {
217151 t .Parallel ()
218152
@@ -258,10 +192,7 @@ func TestNoMadviseDeadlockWithInflightCopy(t *testing.T) {
258192 waitCancel ()
259193 require .NoError (t , err , "worker for page %d (addr %#x) did not park at pre-COPY barrier" , pageIdx , addr )
260194
261- // Worker is parked AFTER RLock. Issue MADV_DONTNEED
262- // on the same page from the parent. The handler's
263- // readEvents must drain the REMOVE event (so madvise
264- // returns) even while the worker holds RLock.
195+ // Worker is parked holding RLock; madvise must still complete.
265196 madviseDone := make (chan error , 1 )
266197 go func () {
267198 madviseDone <- unix .Madvise ((* h .memoryArea )[pageOffset :pageOffset + int64 (tt .pagesize )], unix .MADV_DONTNEED )
@@ -291,18 +222,13 @@ func TestNoMadviseDeadlockWithInflightCopy(t *testing.T) {
291222 }
292223}
293224
294- // TestFaultedShortCircuitOrdering uses the gated harness to
295- // deterministically queue a WRITE pagefault for a fresh page AND a
296- // REMOVE for an already-faulted page in the SAME serve-loop
297- // iteration. After resume, the post-batch state is asserted: the
298- // REMOVE'd page is `removed` and the racing-write page is `faulted`.
299- //
300- // Both pre-fix and post-fix code reach the same end state for this
301- // scenario (REMOVE batch runs before the pagefault dispatch loop in
302- // every Serve iteration). This test guards the batch-processing
303- // invariant itself: any future change that, for example, dispatched
304- // pagefaults before draining REMOVEs would fail this test as a
305- // concrete state-mismatch assertion rather than a 30-minute hang.
225+ // TestFaultedShortCircuitOrdering is an end-state sanity check for a
226+ // REMOVE + PAGEFAULT batch on disjoint pages: page 0 (already faulted)
227+ // is REMOVE'd, page 1 (missing) gets a write fault, and after resume
228+ // page 0 must be `removed` and page 1 must be `faulted`. The two
229+ // orderings happen to commute on disjoint pages, so this test does
230+ // not by itself prove drain-order; same-page ordering is covered by
231+ // TestStaleSourceRaceMissingAndRemove.
306232//
307233//nolint:paralleltest,tparallel // serialised: a paused gated handler keeps a faulting goroutine suspended in the kernel pagefault path; a STW GC pause from another parallel test would wait forever for that goroutine to reach a safe point.
308234func TestFaultedShortCircuitOrdering (t * testing.T ) {
@@ -355,11 +281,7 @@ func TestFaultedShortCircuitOrdering(t *testing.T) {
355281}
356282
357283// waitForState polls the child's PageStates RPC until the page at
358- // the given offset reaches `want` or `deadline` elapses. Each RPC
359- // round-trip is microseconds-to-low-milliseconds; we yield with a
360- // small sleep between polls so the harness doesn't burn an entire
361- // CPU on tight-loop encoding while the rest of the suite is also
362- // running cross-process tests.
284+ // `offset` reaches `want` or `deadline` elapses.
363285func waitForState (ctx context.Context , h * testHandler , offset uint64 , want pageState , deadline time.Duration ) error {
364286 const pollInterval = 1 * time .Millisecond
365287
@@ -376,6 +298,8 @@ func waitForState(ctx context.Context, h *testHandler, offset uint64, want pageS
376298 bucket = states .removed
377299 case faulted :
378300 bucket = states .faulted
301+ default :
302+ return fmt .Errorf ("waitForState: unsupported want=%d" , want )
379303 }
380304
381305 for _ , off := range bucket {
0 commit comments