Skip to content

Commit db978d4

Browse files
committed
fix(uffd): close read-vs-apply race with separate readSerial lock
A worker holding settleRequests.RLock must never block readEvents, because madvise(MADV_DONTNEED) blocks the producer until userspace reads the UFFD_EVENT_REMOVE — and the producer can be the FC balloon thread that other syscalls depend on. Use a dedicated readSerial mutex (not settleRequests) to serialize serve-loop iterations with snapshot-time Export, while keeping the existing settleRequests discipline (workers RLock, REMOVE batch Lock) intact so readEvents remains lock-free relative to workers. Restores liveness for TestNoMadviseDeadlockWithInflightCopy while closing the read-vs-apply race that motivated the prior buggy commit (345f7e9, now amended).
1 parent d725d87 commit db978d4

1 file changed

Lines changed: 45 additions & 29 deletions

File tree

packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go

Lines changed: 45 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,14 @@ type Userfaultfd struct {
6868
// RLock for the lookup→install→SetRange sequence; the REMOVE batch takes
6969
// Lock so a concurrent worker can't overwrite a removed state.
7070
settleRequests sync.RWMutex
71+
72+
// readSerial serializes serve-loop iterations (read+apply) with snapshot-time
73+
// Export. Workers do NOT touch this lock — it must remain disjoint from
74+
// settleRequests so readEvents can always drain the kernel UFFD queue and
75+
// unblock madvise even when settleRequests.RLock is held by an in-flight
76+
// worker. See TestNoMadviseDeadlockWithInflightCopy.
77+
readSerial sync.Mutex
78+
7179
prefetchTracker *block.PrefetchTracker
7280

7381
// defaultCopyMode overrides UFFDIO_COPY mode for all faults when non-zero.
@@ -262,46 +270,54 @@ func (u *Userfaultfd) Serve(
262270
var pagefaults []*UffdPagefault
263271

264272
if hasEvent(uffdFd.Revents, unix.POLLIN) {
273+
// readSerial keeps Export from interleaving between read and
274+
// SetRange(removed) in the same serve-loop iteration. It does
275+
// NOT couple readEvents to settleRequests — workers don't take
276+
// readSerial, so a worker holding settleRequests.RLock can
277+
// never block this read. See TestNoMadviseDeadlockWithInflightCopy.
278+
u.readSerial.Lock()
279+
265280
var err error
266281
removes, pagefaults, err = u.readEvents(ctx)
267282
if err != nil {
283+
u.readSerial.Unlock()
268284
u.logger.Error(ctx, "uffd: read error", zap.Error(err))
269285

270286
return fmt.Errorf("failed to read: %w", err)
271287
}
272-
} else {
273-
noDataCounter.Increase("POLLIN")
274-
}
275-
276-
// REMOVE batch under write lock so an in-flight worker's SetRange(faulted)
277-
// can't overwrite the removed state we are about to install.
278-
if len(removes) > 0 {
279-
u.settleRequests.Lock()
280-
for _, rm := range removes {
281-
// rm.start (inclusive) and rm.end (exclusive) are page-aligned
282-
// to u.pageSize for the registered VMA (UFFD invariant), so
283-
// startOff is a multiple of pageSize and length is an integer
284-
// number of pages — both divisions below are exact, and
285-
// SetRange's half-open [startIdx, endIdx) lines up with the
286-
// half-open [rm.start, rm.end).
287-
startOff, err := u.ma.GetOffset(uintptr(rm.start))
288-
if err != nil {
289-
u.logger.Error(ctx, "UFFD REMOVE: failed to map start address",
290-
zap.Uintptr("start", uintptr(rm.start)), zap.Error(err))
291288

292-
continue
293-
}
289+
if len(removes) > 0 {
290+
u.settleRequests.Lock()
291+
for _, rm := range removes {
292+
// rm.start (inclusive) and rm.end (exclusive) are page-aligned
293+
// to u.pageSize for the registered VMA (UFFD invariant), so
294+
// startOff is a multiple of pageSize and length is an integer
295+
// number of pages — both divisions below are exact, and
296+
// SetRange's half-open [startIdx, endIdx) lines up with the
297+
// half-open [rm.start, rm.end).
298+
startOff, err := u.ma.GetOffset(uintptr(rm.start))
299+
if err != nil {
300+
u.logger.Error(ctx, "UFFD REMOVE: failed to map start address",
301+
zap.Uintptr("start", uintptr(rm.start)), zap.Error(err))
302+
303+
continue
304+
}
294305

295-
startIdx := uint64(header.BlockIdx(startOff, int64(u.pageSize)))
296-
endIdx := startIdx + uint64(rm.end-rm.start)/uint64(u.pageSize)
297-
if err := u.pageTracker.SetRange(startIdx, endIdx, removed); err != nil {
298-
// Programming bug only — keep draining the batch so
299-
// the remaining ranges still get marked.
300-
u.logger.Error(ctx, "UFFD REMOVE: pageTracker SetRange error",
301-
zap.Uint64("start_idx", startIdx), zap.Uint64("end_idx", endIdx), zap.Error(err))
306+
startIdx := uint64(header.BlockIdx(startOff, int64(u.pageSize)))
307+
endIdx := startIdx + uint64(rm.end-rm.start)/uint64(u.pageSize)
308+
if err := u.pageTracker.SetRange(startIdx, endIdx, removed); err != nil {
309+
// Programming bug only — keep draining the batch so
310+
// the remaining ranges still get marked.
311+
u.logger.Error(ctx, "UFFD REMOVE: pageTracker SetRange error",
312+
zap.Uint64("start_idx", startIdx), zap.Uint64("end_idx", endIdx), zap.Error(err))
313+
}
302314
}
315+
u.settleRequests.Unlock()
303316
}
304-
u.settleRequests.Unlock()
317+
318+
u.readSerial.Unlock()
319+
} else {
320+
noDataCounter.Increase("POLLIN")
305321
}
306322

307323
pagefaults = append(deferred.drain(), pagefaults...)

0 commit comments

Comments
 (0)