Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,12 @@ func (h *testHandler) executeRead(ctx context.Context, op operation) error {

readBytes := (*h.memoryArea)[op.offset : op.offset+int64(h.pagesize)]

// bytes.Equal is the first access to uffd-managed memory, triggering the pagefault.
// MADV_POPULATE_READ faults the page in via syscall, so the goroutine sits
// in _Gsyscall while the kernel waits for UFFDIO_COPY — STW can preempt.
if err := unix.Madvise(readBytes, unix.MADV_POPULATE_READ); err != nil {
return fmt.Errorf("madvise POPULATE_READ at offset %d: %w", op.offset, err)
}

if !bytes.Equal(readBytes, expectedBytes) {
idx, want, got := testutils.FirstDifferentByte(readBytes, expectedBytes)

Expand All @@ -331,7 +336,14 @@ func (h *testHandler) executeWrite(ctx context.Context, op operation) error {
h.mutex.Lock()
defer h.mutex.Unlock()

n := copy((*h.memoryArea)[op.offset:op.offset+int64(h.pagesize)], bytesToWrite)
writeBytes := (*h.memoryArea)[op.offset : op.offset+int64(h.pagesize)]

// MADV_POPULATE_WRITE: see executeRead for why we fault via syscall.
if err := unix.Madvise(writeBytes, unix.MADV_POPULATE_WRITE); err != nil {
return fmt.Errorf("madvise POPULATE_WRITE at offset %d: %w", op.offset, err)
}

n := copy(writeBytes, bytesToWrite)
if n != int(h.pagesize) {
return fmt.Errorf("copy length mismatch: want %d, got %d", h.pagesize, n)
}
Expand Down
31 changes: 22 additions & 9 deletions packages/orchestrator/pkg/sandbox/uffd/userfaultfd/race_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,9 @@ func withRaceContext(t *testing.T, body func(ctx context.Context)) {
//
// Both variants fail until the fix in #2512 lands — the failure is
// intentional and demonstrates the stale-source bug.
//
//nolint:paralleltest,tparallel // serialised: a paused gated handler keeps a faulting goroutine suspended in the kernel pagefault path; a STW GC pause from another parallel test would wait forever for that goroutine to reach a safe point.
func TestStaleSourceRaceMissingAndRemove(t *testing.T) {
t.Parallel()

tests := []struct {
name string
pagesize uint64
Expand All @@ -67,7 +67,9 @@ func TestStaleSourceRaceMissingAndRemove(t *testing.T) {
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { //nolint:paralleltest // see test-level comment
t.Run(tt.name, func(t *testing.T) {
t.Parallel()

withRaceContext(t, func(ctx context.Context) {
const sentinel = byte(0xC3)
const pageIdx = 1
Expand Down Expand Up @@ -123,6 +125,13 @@ func TestStaleSourceRaceMissingAndRemove(t *testing.T) {
}

page := (*h.memoryArea)[pageOffset : pageOffset+int64(tt.pagesize)]

// MADV_POPULATE_READ resolves the page via the kernel
// fault path while this goroutine sits in _Gsyscall, so
// the subsequent direct load can never fault and can
// never deadlock a concurrent STW.
err = unix.Madvise(page, unix.MADV_POPULATE_READ)
require.NoError(t, err, "madvise POPULATE_READ at offset %d", pageOffset)
assert.Equalf(t, byte(0), page[0],
"page %d first byte: want 0 (zero-fault for `removed`), got %#x — "+
"if this equals the sentinel %#x, the worker used a stale source (regression)",
Expand All @@ -145,9 +154,9 @@ func TestStaleSourceRaceMissingAndRemove(t *testing.T) {
// must return within madviseBudget because readEvents drains REMOVE
// events without taking any lock — any future change that couples
// readEvents to settleRequests fails this test at the budget boundary.
//
//nolint:paralleltest,tparallel // same reason — see TestFaultedShortCircuitOrdering
func TestNoMadviseDeadlockWithInflightCopy(t *testing.T) {
t.Parallel()

tests := []struct {
name string
pagesize uint64
Expand All @@ -157,7 +166,9 @@ func TestNoMadviseDeadlockWithInflightCopy(t *testing.T) {
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { //nolint:paralleltest // see test-level comment
t.Run(tt.name, func(t *testing.T) {
t.Parallel()

withRaceContext(t, func(ctx context.Context) {
cfg := testConfig{
pagesize: tt.pagesize,
Expand Down Expand Up @@ -225,9 +236,9 @@ func TestNoMadviseDeadlockWithInflightCopy(t *testing.T) {
// orderings happen to commute on disjoint pages, so this test does
// not by itself prove drain-order; same-page ordering is covered by
// TestStaleSourceRaceMissingAndRemove.
//
//nolint:paralleltest,tparallel // serialised: a paused gated handler keeps a faulting goroutine suspended in the kernel pagefault path; a STW GC pause from another parallel test would wait forever for that goroutine to reach a safe point.
func TestFaultedShortCircuitOrdering(t *testing.T) {
t.Parallel()

tests := []struct {
name string
pagesize uint64
Expand All @@ -237,7 +248,9 @@ func TestFaultedShortCircuitOrdering(t *testing.T) {
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { //nolint:paralleltest // see test-level comment
t.Run(tt.name, func(t *testing.T) {
t.Parallel()

withRaceContext(t, func(ctx context.Context) {
cfg := testConfig{
pagesize: tt.pagesize,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,9 @@ func TestRemoveThenFault(t *testing.T) {
// succeeds without faulting because MADV_DONTNEED blocks (waiting for ack)
// and doesn't unmap the page until the handler processes the event.
// When the handler resumes, it only sees the REMOVE — no MISSING fault.
//
//nolint:paralleltest,tparallel // serialised: a paused gated handler keeps a faulting goroutine suspended in the kernel pagefault path; a STW GC pause from another parallel test would wait forever for that goroutine to reach a safe point.
func TestRemoveThenWriteGated(t *testing.T) {
t.Parallel()

tests := []testConfig{
{
name: "4k gated remove with concurrent write",
Expand Down Expand Up @@ -203,7 +203,9 @@ func TestRemoveThenWriteGated(t *testing.T) {
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { //nolint:paralleltest // see test-level comment
t.Run(tt.name, func(t *testing.T) {
t.Parallel()

h, err := configureCrossProcessTest(t.Context(), t, tt)
require.NoError(t, err)

Expand All @@ -226,9 +228,9 @@ func TestRemoveThenWriteGated(t *testing.T) {
// was queued first. The write to a missing page triggers MISSING (queued first),
// then MADV_DONTNEED triggers REMOVE (queued second). When the handler resumes,
// it processes REMOVE first, then MISSING — the write is not skipped.
//
//nolint:paralleltest,tparallel // serialised: see TestRemoveThenWriteGated for STW deadlock rationale.
func TestWriteThenRemoveGated(t *testing.T) {
t.Parallel()

tests := []testConfig{
{
name: "4k write then remove in same batch",
Expand Down Expand Up @@ -267,7 +269,9 @@ func TestWriteThenRemoveGated(t *testing.T) {
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { //nolint:paralleltest // see test-level comment
t.Run(tt.name, func(t *testing.T) {
t.Parallel()

h, err := configureCrossProcessTest(t.Context(), t, tt)
require.NoError(t, err)

Expand Down
Loading