Skip to content

Commit bf4fc62

Browse files
committed
refactor(uffd): swap pageTracker for block.StateTracker, add removed state
Replace the map-based pageTracker with block.StateTracker[pageState], a roaring-bitmap-backed tracker with O(1) range ops. pageState gains a third value, removed, which is wired at the type level but not yet written anywhere -- #2520 adds the REMOVE-event handler that produces it. Page indices are computed at the call site via header.BlockIdx. pageStateEntries is updated to iterate the exported bitmaps so the cross-process test harness keeps working. Inline the 3-line pageState enum into userfaultfd.go and drop the dedicated page_tracker.go now that pageTracker is gone. Convert block.StateTracker's NewStateTracker / SetRange API from panics to errors. Distinct-state validation and unsupported-state checks now return fmt.Errorf descriptors; the userfaultfd-side init propagates the constructor error through NewUserfaultfdFromFd, and the SetRange call in the worker path logs and continues since these errors only fire on programming bugs.
1 parent 949a4ec commit bf4fc62

5 files changed

Lines changed: 86 additions & 81 deletions

File tree

packages/orchestrator/pkg/sandbox/block/state_tracker.go

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@ type StateTracker[S comparable] struct {
1717

1818
// NewStateTracker requires three distinct states. Duplicates are a
1919
// programming error — the switch in SetRange would silently favour the
20-
// first matching case and corrupt bitmap state — so we panic at
20+
// first matching case and corrupt bitmap state — so we reject them at
2121
// construction rather than defer the bug to a later SetRange call.
22-
func NewStateTracker[S comparable](defaultState, a, b S) *StateTracker[S] {
22+
func NewStateTracker[S comparable](defaultState, a, b S) (*StateTracker[S], error) {
2323
if defaultState == a || defaultState == b || a == b {
24-
panic(fmt.Sprintf("block.NewStateTracker: states must be distinct (default=%v a=%v b=%v)", defaultState, a, b))
24+
return nil, fmt.Errorf("block.NewStateTracker: states must be distinct (default=%v a=%v b=%v)", defaultState, a, b)
2525
}
2626

2727
return &StateTracker[S]{
@@ -30,15 +30,15 @@ func NewStateTracker[S comparable](defaultState, a, b S) *StateTracker[S] {
3030
b: b,
3131
bmA: roaring.New(),
3232
bmB: roaring.New(),
33-
}
33+
}, nil
3434
}
3535

3636
// SetRange takes uint64 because roaring's range API allows end = 1<<32
3737
// (the half-open upper bound of a 32-bit bitmap); Get stays uint32 since
3838
// no 33-bit value can ever be a bitmap member.
39-
func (t *StateTracker[S]) SetRange(start, end uint64, state S) {
39+
func (t *StateTracker[S]) SetRange(start, end uint64, state S) error {
4040
if end <= start {
41-
return
41+
return nil
4242
}
4343

4444
t.mu.Lock()
@@ -58,9 +58,11 @@ func (t *StateTracker[S]) SetRange(start, end uint64, state S) {
5858
// S is constrained only to comparable, so the compiler can't
5959
// prove exhaustiveness. A silent no-op here would hide a
6060
// programming error (caller added a state but forgot to wire
61-
// it); panic makes it fail fast in tests.
62-
panic(fmt.Sprintf("block.StateTracker.SetRange: unsupported state %v (only default=%v a=%v b=%v allowed)", state, t.defaultState, t.a, t.b))
61+
// it); surfacing an error makes it fail fast in tests.
62+
return fmt.Errorf("block.StateTracker.SetRange: unsupported state %v (only default=%v a=%v b=%v allowed)", state, t.defaultState, t.a, t.b)
6363
}
64+
65+
return nil
6466
}
6567

6668
func (t *StateTracker[S]) Export() (a, b *roaring.Bitmap) {

packages/orchestrator/pkg/sandbox/block/state_tracker_test.go

Lines changed: 36 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"testing"
55

66
"github.com/stretchr/testify/assert"
7+
"github.com/stretchr/testify/require"
78
)
89

910
type ts uint8
@@ -23,44 +24,47 @@ func TestStateTracker(t *testing.T) {
2324

2425
t.Run("initial state is default", func(t *testing.T) {
2526
t.Parallel()
26-
s := NewStateTracker(tsDefault, tsA, tsB)
27+
s, err := NewStateTracker(tsDefault, tsA, tsB)
28+
require.NoError(t, err)
2729
assert.Equal(t, tsDefault, s.Get(0))
2830
assert.Equal(t, tsDefault, s.Get(123))
2931
})
3032

3133
t.Run("transitions", func(t *testing.T) {
3234
t.Parallel()
33-
s := NewStateTracker(tsDefault, tsA, tsB)
35+
s, err := NewStateTracker(tsDefault, tsA, tsB)
36+
require.NoError(t, err)
3437

35-
s.SetRange(0, 1, tsA)
38+
require.NoError(t, s.SetRange(0, 1, tsA))
3639
assert.Equal(t, tsA, s.Get(0))
3740

38-
s.SetRange(0, 1, tsB)
41+
require.NoError(t, s.SetRange(0, 1, tsB))
3942
assert.Equal(t, tsB, s.Get(0), "a→b should flip the page")
4043
bmA, bmB := s.Export()
4144
assert.False(t, bmA.Contains(0), "a→b must clear bmA")
4245
assert.True(t, bmB.Contains(0), "a→b must add to bmB")
4346

44-
s.SetRange(0, 1, tsA)
47+
require.NoError(t, s.SetRange(0, 1, tsA))
4548
assert.Equal(t, tsA, s.Get(0), "b→a should flip back")
4649

47-
s.SetRange(0, 1, tsDefault)
50+
require.NoError(t, s.SetRange(0, 1, tsDefault))
4851
assert.Equal(t, tsDefault, s.Get(0), "→default must clear")
4952
bmA, bmB = s.Export()
5053
assert.False(t, bmA.Contains(0))
5154
assert.False(t, bmB.Contains(0))
5255

53-
s.SetRange(0, 1, tsA)
54-
s.SetRange(0, 1, tsA)
56+
require.NoError(t, s.SetRange(0, 1, tsA))
57+
require.NoError(t, s.SetRange(0, 1, tsA))
5558
assert.Equal(t, tsA, s.Get(0), "a→a is idempotent")
5659
})
5760

5861
t.Run("partial overlap moves only the overlapping pages", func(t *testing.T) {
5962
t.Parallel()
60-
s := NewStateTracker(tsDefault, tsA, tsB)
63+
s, err := NewStateTracker(tsDefault, tsA, tsB)
64+
require.NoError(t, err)
6165

62-
s.SetRange(0, 10, tsA)
63-
s.SetRange(3, 7, tsB)
66+
require.NoError(t, s.SetRange(0, 10, tsA))
67+
require.NoError(t, s.SetRange(3, 7, tsB))
6468

6569
for i := range uint32(3) {
6670
assert.Equal(t, tsA, s.Get(i), "page %d outside overlap stays a", i)
@@ -77,38 +81,45 @@ func TestStateTracker(t *testing.T) {
7781

7882
t.Run("empty and inverted ranges are no-ops", func(t *testing.T) {
7983
t.Parallel()
80-
s := NewStateTracker(tsDefault, tsA, tsB)
84+
s, err := NewStateTracker(tsDefault, tsA, tsB)
85+
require.NoError(t, err)
8186

82-
s.SetRange(5, 5, tsA)
83-
s.SetRange(7, 3, tsB)
87+
require.NoError(t, s.SetRange(5, 5, tsA))
88+
require.NoError(t, s.SetRange(7, 3, tsB))
8489
bmA, bmB := s.Export()
8590
assert.True(t, bmA.IsEmpty())
8691
assert.True(t, bmB.IsEmpty())
8792
})
8893

8994
t.Run("Export returns clones, not aliases", func(t *testing.T) {
9095
t.Parallel()
91-
s := NewStateTracker(tsDefault, tsA, tsB)
92-
s.SetRange(0, 1, tsA)
96+
s, err := NewStateTracker(tsDefault, tsA, tsB)
97+
require.NoError(t, err)
98+
require.NoError(t, s.SetRange(0, 1, tsA))
9399

94100
bmA, _ := s.Export()
95101
bmA.Add(99)
96102

97103
assert.Equal(t, tsDefault, s.Get(99), "mutating a clone must not leak into the tracker")
98104
})
99105

100-
t.Run("NewStateTracker rejects non-distinct states", func(t *testing.T) {
106+
t.Run("NewStateTracker errors on non-distinct states", func(t *testing.T) {
101107
t.Parallel()
102-
assert.Panics(t, func() { NewStateTracker(tsA, tsA, tsB) }, "default == a must panic")
103-
assert.Panics(t, func() { NewStateTracker(tsA, tsB, tsB) }, "a == b must panic")
104-
assert.Panics(t, func() { NewStateTracker(tsA, tsB, tsA) }, "default == b must panic")
105-
assert.Panics(t, func() { NewStateTracker(tsA, tsA, tsA) }, "all-equal must panic")
108+
_, err := NewStateTracker(tsA, tsA, tsB)
109+
require.Error(t, err, "default == a must error")
110+
_, err = NewStateTracker(tsA, tsB, tsB)
111+
require.Error(t, err, "a == b must error")
112+
_, err = NewStateTracker(tsA, tsB, tsA)
113+
require.Error(t, err, "default == b must error")
114+
_, err = NewStateTracker(tsA, tsA, tsA)
115+
require.Error(t, err, "all-equal must error")
106116
})
107117

108-
t.Run("SetRange panics on unsupported state", func(t *testing.T) {
118+
t.Run("SetRange errors on unsupported state", func(t *testing.T) {
109119
t.Parallel()
110-
s := NewStateTracker(tsDefault, tsA, tsB)
111-
assert.Panics(t, func() { s.SetRange(0, 1, ts(99)) },
112-
"unregistered state value must panic, not silently no-op")
120+
s, err := NewStateTracker(tsDefault, tsA, tsB)
121+
require.NoError(t, err)
122+
require.Error(t, s.SetRange(0, 1, ts(99)),
123+
"unregistered state value must error, not silently no-op")
113124
})
114125
}

packages/orchestrator/pkg/sandbox/uffd/userfaultfd/page_tracker.go

Lines changed: 0 additions & 33 deletions
This file was deleted.

packages/orchestrator/pkg/sandbox/uffd/userfaultfd/rpc_services_test.go

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ import (
1010
"os"
1111
"sync"
1212

13+
"github.com/RoaringBitmap/roaring/v2"
14+
1315
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/fdexit"
1416
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/memory"
1517
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/testutils/testharness"
@@ -186,24 +188,24 @@ func (p *Paging) Resume(_ *testharness.Empty, _ *testharness.Empty) error {
186188
}
187189

188190
// pageStateEntries returns a wire-format snapshot of pageTracker.
189-
// settleRequests.Lock drains fault workers (mirrors PrefetchData);
190-
// pageTracker.mu.RLock is defensive against a future REMOVE writer
191-
// that mutates pageTracker.m outside settleRequests.
191+
// settleRequests.Lock drains fault workers (mirrors PrefetchData) so
192+
// the snapshot is consistent w.r.t. concurrent installs.
192193
func (u *Userfaultfd) pageStateEntries() ([]testharness.PageStateEntry, error) {
193194
u.settleRequests.Lock()
194195
defer u.settleRequests.Unlock()
195196

196-
u.pageTracker.mu.RLock()
197-
defer u.pageTracker.mu.RUnlock()
198-
199-
entries := make([]testharness.PageStateEntry, 0, len(u.pageTracker.m))
200-
for addr, state := range u.pageTracker.m {
201-
offset, err := u.ma.GetOffset(addr)
202-
if err != nil {
203-
return nil, fmt.Errorf("address %#x not in mapping: %w", addr, err)
197+
bmFaulted, bmRemoved := u.pageTracker.Export()
198+
entries := make([]testharness.PageStateEntry, 0, bmFaulted.GetCardinality()+bmRemoved.GetCardinality())
199+
emit := func(bm *roaring.Bitmap, state pageState) {
200+
for _, idx := range bm.ToArray() {
201+
entries = append(entries, testharness.PageStateEntry{
202+
State: uint8(state),
203+
Offset: uint64(idx) * uint64(u.pageSize),
204+
})
204205
}
205-
entries = append(entries, testharness.PageStateEntry{State: uint8(state), Offset: uint64(offset)})
206206
}
207+
emit(bmFaulted, faulted)
208+
emit(bmRemoved, removed)
207209

208210
return entries, nil
209211
}

packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/fdexit"
2323
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/memory"
2424
"github.com/e2b-dev/infra/packages/shared/pkg/logger"
25+
"github.com/e2b-dev/infra/packages/shared/pkg/storage/header"
2526
)
2627

2728
var tracer = otel.Tracer("github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/userfaultfd")
@@ -46,13 +47,23 @@ func hasEvent(revents, event int16) bool {
4647
return revents&event != 0
4748
}
4849

50+
// pageState tracks which UFFD page-management action has been applied
51+
// to each registered page. The default (zero) value is missing.
52+
type pageState uint8
53+
54+
const (
55+
missing pageState = iota
56+
faulted
57+
removed
58+
)
59+
4960
type Userfaultfd struct {
5061
fd Fd
5162

5263
src block.Slicer
5364
ma *memory.Mapping
5465
pageSize uintptr
55-
pageTracker *pageTracker
66+
pageTracker *block.StateTracker[pageState]
5667

5768
// We use the settleRequests to guard the pageTracker so we can access a consistent state of the pageTracker after the requests are finished.
5869
settleRequests sync.RWMutex
@@ -88,11 +99,16 @@ func NewUserfaultfdFromFd(fd uintptr, src block.Slicer, m *memory.Mapping, logge
8899
}
89100
}
90101

102+
pageTracker, err := block.NewStateTracker(missing, faulted, removed)
103+
if err != nil {
104+
return nil, fmt.Errorf("failed to init page tracker: %w", err)
105+
}
106+
91107
u := &Userfaultfd{
92108
fd: Fd(fd),
93109
src: src,
94110
pageSize: uintptr(blockSize),
95-
pageTracker: newPageTracker(uintptr(blockSize)),
111+
pageTracker: pageTracker,
96112
prefetchTracker: block.NewPrefetchTracker(blockSize),
97113
ma: m,
98114
logger: logger,
@@ -418,7 +434,14 @@ retryLoop:
418434
return fmt.Errorf("failed uffdio copy: %w", joinedErr)
419435
}
420436

421-
u.pageTracker.setState(addr, addr+u.pageSize, faulted)
437+
idx := uint64(header.BlockIdx(offset, int64(u.pageSize)))
438+
if err := u.pageTracker.SetRange(idx, idx+1, faulted); err != nil {
439+
// Programming bug only — the serve loop is still healthy and
440+
// the page is correctly installed in guest memory. Log and
441+
// continue rather than abort.
442+
u.logger.Error(ctx, "UFFD serve pageTracker SetRange error",
443+
zap.Uint64("idx", idx), zap.Error(err))
444+
}
422445
u.prefetchTracker.Add(offset, accessType)
423446

424447
return nil

0 commit comments

Comments
 (0)