@@ -68,6 +68,14 @@ type Userfaultfd struct {
6868 // RLock for the lookup→install→SetRange sequence; the REMOVE batch takes
6969 // Lock so a concurrent worker can't overwrite a removed state.
7070 settleRequests sync.RWMutex
71+
72+ // readSerial serializes serve-loop iterations (read+apply) with snapshot-time
73+ // Export. Workers do NOT touch this lock — it must remain disjoint from
74+ // settleRequests so readEvents can always drain the kernel UFFD queue and
75+ // unblock madvise even when settleRequests.RLock is held by an in-flight
76+ // worker. See TestNoMadviseDeadlockWithInflightCopy.
77+ readSerial sync.Mutex
78+
7179 prefetchTracker * block.PrefetchTracker
7280
7381 // defaultCopyMode overrides UFFDIO_COPY mode for all faults when non-zero.
@@ -262,46 +270,54 @@ func (u *Userfaultfd) Serve(
262270 var pagefaults []* UffdPagefault
263271
264272 if hasEvent (uffdFd .Revents , unix .POLLIN ) {
273+ // readSerial keeps Export from interleaving between read and
274+ // SetRange(removed) in the same serve-loop iteration. It does
275+ // NOT couple readEvents to settleRequests — workers don't take
276+ // readSerial, so a worker holding settleRequests.RLock can
277+ // never block this read. See TestNoMadviseDeadlockWithInflightCopy.
278+ u .readSerial .Lock ()
279+
265280 var err error
266281 removes , pagefaults , err = u .readEvents (ctx )
267282 if err != nil {
283+ u .readSerial .Unlock ()
268284 u .logger .Error (ctx , "uffd: read error" , zap .Error (err ))
269285
270286 return fmt .Errorf ("failed to read: %w" , err )
271287 }
272- } else {
273- noDataCounter .Increase ("POLLIN" )
274- }
275-
276- // REMOVE batch under write lock so an in-flight worker's SetRange(faulted)
277- // can't overwrite the removed state we are about to install.
278- if len (removes ) > 0 {
279- u .settleRequests .Lock ()
280- for _ , rm := range removes {
281- // rm.start (inclusive) and rm.end (exclusive) are page-aligned
282- // to u.pageSize for the registered VMA (UFFD invariant), so
283- // startOff is a multiple of pageSize and length is an integer
284- // number of pages — both divisions below are exact, and
285- // SetRange's half-open [startIdx, endIdx) lines up with the
286- // half-open [rm.start, rm.end).
287- startOff , err := u .ma .GetOffset (uintptr (rm .start ))
288- if err != nil {
289- u .logger .Error (ctx , "UFFD REMOVE: failed to map start address" ,
290- zap .Uintptr ("start" , uintptr (rm .start )), zap .Error (err ))
291288
292- continue
293- }
289+ if len (removes ) > 0 {
290+ u .settleRequests .Lock ()
291+ for _ , rm := range removes {
292+ // rm.start (inclusive) and rm.end (exclusive) are page-aligned
293+ // to u.pageSize for the registered VMA (UFFD invariant), so
294+ // startOff is a multiple of pageSize and length is an integer
295+ // number of pages — both divisions below are exact, and
296+ // SetRange's half-open [startIdx, endIdx) lines up with the
297+ // half-open [rm.start, rm.end).
298+ startOff , err := u .ma .GetOffset (uintptr (rm .start ))
299+ if err != nil {
300+ u .logger .Error (ctx , "UFFD REMOVE: failed to map start address" ,
301+ zap .Uintptr ("start" , uintptr (rm .start )), zap .Error (err ))
302+
303+ continue
304+ }
294305
295- startIdx := uint64 (header .BlockIdx (startOff , int64 (u .pageSize )))
296- endIdx := startIdx + uint64 (rm .end - rm .start )/ uint64 (u .pageSize )
297- if err := u .pageTracker .SetRange (startIdx , endIdx , removed ); err != nil {
298- // Programming bug only — keep draining the batch so
299- // the remaining ranges still get marked.
300- u .logger .Error (ctx , "UFFD REMOVE: pageTracker SetRange error" ,
301- zap .Uint64 ("start_idx" , startIdx ), zap .Uint64 ("end_idx" , endIdx ), zap .Error (err ))
306+ startIdx := uint64 (header .BlockIdx (startOff , int64 (u .pageSize )))
307+ endIdx := startIdx + uint64 (rm .end - rm .start )/ uint64 (u .pageSize )
308+ if err := u .pageTracker .SetRange (startIdx , endIdx , removed ); err != nil {
309+ // Programming bug only — keep draining the batch so
310+ // the remaining ranges still get marked.
311+ u .logger .Error (ctx , "UFFD REMOVE: pageTracker SetRange error" ,
312+ zap .Uint64 ("start_idx" , startIdx ), zap .Uint64 ("end_idx" , endIdx ), zap .Error (err ))
313+ }
302314 }
315+ u .settleRequests .Unlock ()
303316 }
304- u .settleRequests .Unlock ()
317+
318+ u .readSerial .Unlock ()
319+ } else {
320+ noDataCounter .Increase ("POLLIN" )
305321 }
306322
307323 pagefaults = append (deferred .drain (), pagefaults ... )
0 commit comments