Skip to content

Commit 1255416

Browse files
committed
fix(uffd): treat UFFDIO_COPY EAGAIN and partial copies as soft, let kernel redeliver
Folds audit findings #1 and #7 into one commit since they share the same error arm in faultPage. The kernel surfaces concurrent mm churn (e.g. balloon-driven madvise(MADV_DONTNEED), mremap, fork against the same mm) through UFFDIO_COPY in two distinct ways: as an EAGAIN errno from the syscall, or — once UFFD_FEATURE_EVENT_REMOVE is enabled — through the partial-copy convention where the syscall returns 0 and cpy.copy carries either -EAGAIN or 0..pagesize. Hugetlb pages can also surface a positive short copy if a fault preempts the operation mid-page (#7). Pre-#2520 the latter path went through fmt.Errorf("UFFDIO_COPY copied N bytes...") and fell into the catch-all writeErr != nil arm — which calls onFailure() / fdExit.SignalExit(), tears the uffd serve loop down, and crashes the sandbox the moment the guest touches an unmapped page. The pre-existing errno-EAGAIN soft handler covered only the syscall errno path. Move the partial-copy classification into a small helper so both surfaces collapse onto the existing EAGAIN-returning-(false, nil) branch in faultPage. No retry budget — matches Firecracker's reference handler in src/firecracker/examples/uffd/uffd_utils.rs (Err(PartiallyCopied(n)) if n == 0 || n == -EAGAIN ⇒ return false). Add a uffd.copy_eagain span attribute for observability. Tests: unit-test classifyCopyResult directly. faultPage doesn't expose an Fd seam to mock UFFDIO_COPY without an interface refactor that would materially expand the diff; per the audit's "smallest pragmatic test" guidance the classifier covers the new branching and the existing cross-process matrix tests cover the integration path.
1 parent efa01a5 commit 1255416

3 files changed

Lines changed: 106 additions & 6 deletions

File tree

packages/orchestrator/pkg/sandbox/uffd/userfaultfd/fd.go

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -155,9 +155,26 @@ func (f Fd) copy(addr, pagesize uintptr, data []byte, mode CULong) error {
155155
return errno
156156
}
157157

158-
// Check if the copied size matches the requested pagesize
159-
if cpy.copy != CLong(pagesize) {
160-
return fmt.Errorf("UFFDIO_COPY copied %d bytes, expected %d", cpy.copy, pagesize)
158+
return classifyCopyResult(int64(cpy.copy), int64(pagesize))
159+
}
160+
161+
// classifyCopyResult interprets the bytes-copied field returned by a
162+
// UFFDIO_COPY ioctl whose syscall returned no errno. The kernel uses a
163+
// partial-copy convention: a negative value carries the negated errno of
164+
// the underlying failure (notably -EAGAIN when mmap_changing was set
165+
// mid-copy by a concurrent madvise(MADV_DONTNEED), mremap, or fork); a
166+
// positive value less than pagesize means the kernel made partial
167+
// progress (e.g. a hugetlb fault preempted the copy mid-page). Both
168+
// cases are surfaced as a soft errno so the caller drops the fault and
169+
// lets the kernel redeliver once the racing operation settles. Matches
170+
// Firecracker's reference handler in src/firecracker/examples/uffd/uffd_utils.rs.
171+
func classifyCopyResult(bytesCopied, pagesize int64) error {
172+
if bytesCopied < 0 {
173+
return syscall.Errno(-bytesCopied)
174+
}
175+
176+
if bytesCopied != pagesize {
177+
return syscall.EAGAIN
161178
}
162179

163180
return nil
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
package userfaultfd
2+
3+
import (
4+
"syscall"
5+
"testing"
6+
7+
"github.com/stretchr/testify/assert"
8+
)
9+
10+
// TestClassifyCopyResult covers UFFD audit findings #1 and #7. It pins the
11+
// kernel partial-copy convention used by UFFDIO_COPY: when the syscall
12+
// returns no errno, cpy.copy carries either the bytes copied or a negative
13+
// -errno. Both EAGAIN-surfacing paths and any short positive copy must be
14+
// classified as a soft errno so the caller drops the fault and lets the
15+
// kernel redeliver instead of tearing the sandbox down.
16+
//
17+
// Mocking faultPage end-to-end would require an interface seam over Fd
18+
// that this PR is too small to introduce; per the audit's "smallest
19+
// pragmatic test" guidance we test the extracted classifier directly and
20+
// rely on the existing cross-process matrix tests for integration coverage.
21+
func TestClassifyCopyResult(t *testing.T) {
22+
t.Parallel()
23+
24+
const fourKi = int64(4096)
25+
const twoMi = int64(2 * 1024 * 1024)
26+
27+
tests := []struct {
28+
name string
29+
bytesCopied int64
30+
pagesize int64
31+
wantErr error
32+
wantEAGAIN bool
33+
}{
34+
{
35+
name: "full 4k copy succeeds",
36+
bytesCopied: fourKi,
37+
pagesize: fourKi,
38+
wantErr: nil,
39+
},
40+
{
41+
name: "kernel convention -EAGAIN surfaces as EAGAIN",
42+
bytesCopied: -int64(syscall.EAGAIN),
43+
pagesize: fourKi,
44+
wantEAGAIN: true,
45+
},
46+
{
47+
name: "zero bytes copied surfaces as EAGAIN (matches Firecracker bytes_copied==0)",
48+
bytesCopied: 0,
49+
pagesize: fourKi,
50+
wantEAGAIN: true,
51+
},
52+
{
53+
name: "partial positive copy on hugepage surfaces as EAGAIN",
54+
bytesCopied: fourKi,
55+
pagesize: twoMi,
56+
wantEAGAIN: true,
57+
},
58+
{
59+
name: "kernel convention -EFAULT surfaces as EFAULT (still fatal upstream)",
60+
bytesCopied: -int64(syscall.EFAULT),
61+
pagesize: fourKi,
62+
wantErr: syscall.EFAULT,
63+
},
64+
}
65+
66+
for _, tc := range tests {
67+
t.Run(tc.name, func(t *testing.T) {
68+
t.Parallel()
69+
70+
err := classifyCopyResult(tc.bytesCopied, tc.pagesize)
71+
if tc.wantEAGAIN {
72+
assert.ErrorIs(t, err, syscall.EAGAIN)
73+
74+
return
75+
}
76+
assert.Equal(t, tc.wantErr, err)
77+
})
78+
}
79+
}

packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -527,9 +527,13 @@ func (u *Userfaultfd) faultPage(
527527
}
528528

529529
if errors.Is(writeErr, unix.EAGAIN) {
530-
// This happens when a remove event arrives in the UFFD file descriptor while
531-
// we are trying to copy()/zero() a page. We need to read all the events from
532-
// file descriptor and try again.
530+
// Kernel sets mmap_changing during a concurrent madvise(MADV_DONTNEED),
531+
// mremap, or fork against the same mm and surfaces it as EAGAIN — either
532+
// directly via errno or via the UFFDIO_COPY partial-copy convention
533+
// (cpy.copy == -EAGAIN, or 0 <= cpy.copy < pagesize for hugetlb faults
534+
// preempted mid-page; see classifyCopyResult). Drop the fault and let
535+
// the kernel redeliver it once the racing operation settles.
536+
span.SetAttributes(attribute.Bool("uffd.copy_eagain", true))
533537
u.logger.Debug(ctx, "UFFD page write EAGAIN, deferring", zap.Uintptr("addr", addr))
534538

535539
return false, nil

0 commit comments

Comments
 (0)