Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions packages/orchestrator/cmd/resume-build/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,15 @@ func main() {
cmdSignalPause := flag.String("cmd-signal-pause", "", "execute command in sandbox, then wait for SIGUSR1 before pausing")
optimize := flag.Bool("optimize", false, "collect fresh prefetch mapping after pause (resumes snapshot to record page faults)")

// Pause-time FPH override; 0 = use LD default (off).
fphTimeoutMs := flag.Int("fph-timeout-ms", 0, "override free-page-hinting-timeout-ms LD flag (0 = use LD default)")

flag.Parse()

if *fphTimeoutMs > 0 {
featureflags.NewIntFlag("free-page-hinting-timeout-ms", *fphTimeoutMs)
}

if *fromBuild == "" {
log.Fatal("-from-build required")
}
Expand Down
37 changes: 34 additions & 3 deletions packages/orchestrator/pkg/sandbox/fc/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -427,8 +427,11 @@ func (c *apiClient) startVM(ctx context.Context) error {
return nil
}

func (c *apiClient) enableFreePageReporting(ctx context.Context) error {
ctx, span := tracer.Start(ctx, "enable-free-page-reporting")
// installBalloon installs the virtio-balloon pre-boot with target size 0.
// FreePageReporting and FreePageHinting are each gated independently at
// template build time (FC version for FPR; guest kernel version for FPH).
func (c *apiClient) installBalloon(ctx context.Context, freePageReporting, freePageHinting bool) error {
ctx, span := tracer.Start(ctx, "install-balloon")
defer span.End()

amountMib := int64(0)
Expand All @@ -439,7 +442,8 @@ func (c *apiClient) enableFreePageReporting(ctx context.Context) error {
Body: &models.Balloon{
AmountMib: &amountMib,
DeflateOnOom: &deflateOnOom,
FreePageReporting: true,
FreePageReporting: freePageReporting,
FreePageHinting: freePageHinting,
},
}

Expand All @@ -451,6 +455,33 @@ func (c *apiClient) enableFreePageReporting(ctx context.Context) error {
return nil
}

// startBalloonHinting issues the StartBalloonHinting operation through the
// Firecracker API client. acknowledgeOnStop is forwarded verbatim in the
// request body. Any client error is wrapped and returned.
func (c *apiClient) startBalloonHinting(ctx context.Context, acknowledgeOnStop bool) error {
	cmd := models.BalloonStartCmd{AcknowledgeOnStop: acknowledgeOnStop}
	req := operations.StartBalloonHintingParams{
		Context: ctx,
		Body:    &cmd,
	}

	if _, err := c.client.Operations.StartBalloonHinting(&req); err != nil {
		return fmt.Errorf("error starting balloon hinting: %w", err)
	}

	return nil
}

// describeBalloonHinting queries the DescribeBalloonHinting operation and
// returns the host and guest hinting command counters from the response.
//
// A missing response payload or a nil HostCmd is reported as zero rather than
// an error, so callers that poll can treat "nothing reported yet" as
// "not finished". Client errors are wrapped with %w so the underlying typed
// error stays reachable through errors.As / errors.Is.
func (c *apiClient) describeBalloonHinting(ctx context.Context) (hostCmd, guestCmd int64, err error) {
	params := operations.DescribeBalloonHintingParams{Context: ctx}

	res, err := c.client.Operations.DescribeBalloonHinting(&params)
	if err != nil {
		return 0, 0, fmt.Errorf("error describing balloon hinting: %w", err)
	}

	// Guard against an empty response instead of panicking on a nil
	// dereference; zero counters never satisfy a completion check that
	// requires a positive host command.
	if res == nil || res.Payload == nil {
		return 0, 0, nil
	}

	if res.Payload.HostCmd != nil {
		hostCmd = *res.Payload.HostCmd
	}
	guestCmd = res.Payload.GuestCmd

	return hostCmd, guestCmd, nil
}

func (c *apiClient) memoryMapping(ctx context.Context) (*memory.Mapping, error) {
params := operations.GetMemoryMappingsParams{
Context: ctx,
Expand Down
36 changes: 36 additions & 0 deletions packages/orchestrator/pkg/sandbox/fc/fph_gates.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package fc

import (
"strings"

"github.com/e2b-dev/infra/packages/shared/pkg/fcversion"
"github.com/e2b-dev/infra/packages/shared/pkg/utils"
)

// MinFreePageHintingKernelVersion is the minimum guest kernel version that
// contains the virtio-balloon free-page-hinting race fix. Templates built
// against an older kernel get the balloon installed with FreePageHinting
// disabled so the race can't be triggered, regardless of any runtime
// LaunchDarkly toggle. Bump this once the fixed kernel is published to
// e2b-dev/fc-kernels.
//
// NOTE(review): "999.0.0" looks like a placeholder chosen to be higher than
// any real kernel version, which would keep free-page hinting disabled for
// every template until the value is bumped — confirm against the comparison
// semantics of utils.IsGTEVersion.
const MinFreePageHintingKernelVersion = "999.0.0"

// kernelSupportsFreePageHinting reports whether kernelVersion (e.g.
// "vmlinux-6.1.158") includes the FPH/MADV_DONTNEED race fix, i.e. whether it
// compares greater than or equal to MinFreePageHintingKernelVersion once the
// "vmlinux-" prefix has been stripped.
//
// The gate fails closed: if the version comparison reports an error, the
// kernel is treated as not supporting free-page hinting.
func kernelSupportsFreePageHinting(kernelVersion string) bool {
	v := strings.TrimPrefix(kernelVersion, "vmlinux-")

	ok, err := utils.IsGTEVersion(v, MinFreePageHintingKernelVersion)
	if err != nil {
		// Never enable a feature with a known guest race on the strength of
		// a result that came back alongside an error.
		return false
	}

	return ok
}

// fcSupportsFreePageHinting tells whether the given Firecracker version
// string parses successfully and its version info reports free-page-hinting
// support (the start_balloon_hinting / describe_balloon_hinting API, v1.14+).
// A version string that fails to parse is treated as unsupported.
func fcSupportsFreePageHinting(fcVersion string) bool {
	parsed, parseErr := fcversion.New(fcVersion)

	return parseErr == nil && parsed.HasFreePageHinting()
}
67 changes: 63 additions & 4 deletions packages/orchestrator/pkg/sandbox/fc/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/rootfs"
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/socket"
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/template"
"github.com/e2b-dev/infra/packages/shared/pkg/fc/client/operations"
"github.com/e2b-dev/infra/packages/shared/pkg/keys"
"github.com/e2b-dev/infra/packages/shared/pkg/logger"
sbxlogger "github.com/e2b-dev/infra/packages/shared/pkg/logger/sandbox"
Expand Down Expand Up @@ -440,13 +441,13 @@ func (p *Process) Create(
telemetry.ReportEvent(ctx, "set fc entropy config")

if freePageReporting {
err = p.client.enableFreePageReporting(ctx)
if err != nil {
freePageHinting := fcSupportsFreePageHinting(p.Versions.FirecrackerVersion) && kernelSupportsFreePageHinting(p.Versions.KernelVersion)
if err := p.client.installBalloon(ctx, freePageReporting, freePageHinting); err != nil {
fcStopErr := p.Stop(ctx)

return errors.Join(fmt.Errorf("error enabling free page reporting: %w", err), fcStopErr)
return errors.Join(fmt.Errorf("error installing balloon device: %w", err), fcStopErr)
}
telemetry.ReportEvent(ctx, "enabled free page reporting")
telemetry.ReportEvent(ctx, "installed balloon device", attribute.Bool("balloon.free_page_hinting", freePageHinting))
}

err = p.client.startVM(ctx)
Expand Down Expand Up @@ -710,6 +711,64 @@ func (p *Process) Pause(ctx context.Context) error {
return p.client.pauseVM(ctx)
}

// DrainBalloon triggers a free-page-hinting run and blocks until the guest
// acknowledges it or ctx is done.
//
// It is a no-op (returns nil) on FC < v1.14, which has no hinting API, and
// when no balloon device is configured (FC answers the start request with
// 400), so it survives snapshot/resume without local state.
//
// The result is recorded on the span as `drain-balloon.outcome`:
//
//	ok | fc-unsupported | not-configured | timeout | start-failed | describe-failed
//
// "timeout" is reported whenever ctx is done, including when the deadline
// fires in the middle of a start or describe API call; the returned error in
// that case is still the wrapped API error.
//
// Returns ctx.Err() when ctx is done between polls, or a wrapped error when
// the start or describe call fails.
func (p *Process) DrainBalloon(ctx context.Context) error {
	ctx, span := tracer.Start(ctx, "drain-balloon")
	outcome := "ok"
	defer func() {
		span.SetAttributes(attribute.String("drain-balloon.outcome", outcome))
		span.End()
	}()

	if !fcSupportsFreePageHinting(p.Versions.FirecrackerVersion) {
		outcome = "fc-unsupported"

		return nil
	}

	if err := p.client.startBalloonHinting(ctx, true /* ackOnStop */); err != nil {
		var notConfigured *operations.StartBalloonHintingBadRequest

		switch {
		case errors.As(err, &notConfigured):
			outcome = "not-configured"

			return nil
		case ctx.Err() != nil:
			// The request was aborted by our own deadline/cancellation;
			// don't misreport it as an FC-side failure.
			outcome = "timeout"
		default:
			outcome = "start-failed"
		}

		return fmt.Errorf("start balloon hinting: %w", err)
	}

	// Poll with exponential backoff (5ms doubling up to 50ms). A single timer
	// is reused across iterations instead of allocating one per poll.
	backoff := 5 * time.Millisecond
	timer := time.NewTimer(backoff)
	defer timer.Stop()

	for {
		select {
		case <-ctx.Done():
			outcome = "timeout"

			return ctx.Err()
		case <-timer.C:
		}

		host, guest, err := p.client.describeBalloonHinting(ctx)
		if err != nil {
			if ctx.Err() != nil {
				// Deadline expired while the status request was in flight.
				outcome = "timeout"
			} else {
				outcome = "describe-failed"
			}

			return fmt.Errorf("balloon hinting status: %w", err)
		}
		// host_cmd is monotonic and we just called start, so host > 0
		// after FC accepts it. Require it to guard against transient
		// nil/zero responses returning a false-positive completion.
		if host > 0 && guest >= host {
			return nil
		}

		backoff = min(backoff*2, 50*time.Millisecond)
		// timer.C was drained by the select above, so Reset is safe here.
		timer.Reset(backoff)
	}
}

// CreateSnapshot VM needs to be paused before creating a snapshot.
func (p *Process) CreateSnapshot(ctx context.Context, snapfilePath string) error {
ctx, childSpan := tracer.Start(ctx, "create-snapshot-fc")
Expand Down
19 changes: 17 additions & 2 deletions packages/orchestrator/pkg/sandbox/sandbox.go
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,8 @@ type Sandbox struct {
files *storage.SandboxFiles
cleanup *Cleanup

featureFlags *featureflags.Client

process *fc.Process
cgroupHandle *cgroup.CgroupHandle

Expand Down Expand Up @@ -458,7 +460,8 @@ func (f *Factory) CreateSandbox(
files: sandboxFiles,
process: fcHandle,

cleanup: cleanup,
cleanup: cleanup,
featureFlags: f.featureFlags,

APIStoredConfig: apiConfigToStore,

Expand Down Expand Up @@ -799,7 +802,8 @@ func (f *Factory) ResumeSandbox(
files: sandboxFiles,
process: fcHandle,

cleanup: cleanup,
cleanup: cleanup,
featureFlags: f.featureFlags,

APIStoredConfig: apiConfigToStore,
CABundle: f.egressProxy.CABundle(),
Expand Down Expand Up @@ -1053,6 +1057,17 @@ func (s *Sandbox) Pause(
// Stop the health check before pausing the VM
s.Checks.Stop()

// Drain free-page-hinting before pause so the snapshot doesn't capture
// pages the guest already considers free. No-op when no balloon. Failures
// are logged but non-fatal. Timeout=0 disables the step.
if t := time.Duration(s.featureFlags.IntFlag(ctx, featureflags.FreePageHintingTimeoutMs)) * time.Millisecond; t > 0 {
drainCtx, cancel := context.WithTimeout(ctx, t)
if err := s.process.DrainBalloon(drainCtx); err != nil {
telemetry.ReportError(ctx, "balloon hinting drain failed (continuing pause)", err)
}
cancel()
}

if err := s.process.Pause(ctx); err != nil {
return nil, fmt.Errorf("failed to pause VM: %w", err)
}
Expand Down
7 changes: 7 additions & 0 deletions packages/shared/pkg/fcversion/sandbox_features.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,10 @@ func (v *Info) HasHugePages() bool {
// HasFreePageReporting reports whether the last release version is 1.14 or
// newer.
func (v *Info) HasFreePageReporting() bool {
	major := v.lastReleaseVersion.Major()
	if major != 1 {
		// Anything on a later major line qualifies; major 0 does not.
		return major > 1
	}

	return v.lastReleaseVersion.Minor() >= 14
}

// HasFreePageHinting tells whether this Firecracker version exposes the
// balloon free-page-hinting API (start_balloon_hinting /
// describe_balloon_hinting), which is gated on the last release version
// being 1.14 or newer.
func (v *Info) HasFreePageHinting() bool {
	switch major := v.lastReleaseVersion.Major(); {
	case major > 1:
		return true
	case major == 1:
		return v.lastReleaseVersion.Minor() >= 14
	default:
		return false
	}
}
5 changes: 4 additions & 1 deletion packages/shared/pkg/featureflags/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,10 @@ var (
BestOfKMaxOvercommit = NewIntFlag("best-of-k-max-overcommit", 400) // Default R=4 (stored as percentage, max over-commit ratio)
BestOfKAlpha = NewIntFlag("best-of-k-alpha", 50) // Default Alpha=0.5 (stored as percentage for int flag, current usage weight)
EnvdInitTimeoutMilliseconds = NewIntFlag("envd-init-request-timeout-milliseconds", 50) // Timeout for envd init request in milliseconds
HostStatsSamplingInterval = NewIntFlag("host-stats-sampling-interval", 5000) // Host stats sampling interval in milliseconds (default 5s)
// FreePageHintingTimeoutMs gates a pre-pause virtio-balloon free-page-hinting
// drain. 0 disables it. Operator opts in once the kernel has the FPH race fix.
FreePageHintingTimeoutMs = NewIntFlag("free-page-hinting-timeout-ms", 0)
HostStatsSamplingInterval = NewIntFlag("host-stats-sampling-interval", 5000) // Host stats sampling interval in milliseconds (default 5s)
MaxCacheWriterConcurrencyFlag = NewIntFlag("max-cache-writer-concurrency", 10)

// BuildCacheMaxUsagePercentage the maximum percentage of the cache disk storage
Expand Down
Loading