548 changes: 386 additions & 162 deletions packages/api/internal/api/api.gen.go

Large diffs are not rendered by default.

48 changes: 48 additions & 0 deletions packages/api/internal/handlers/sandbox_metadata_patch.go
@@ -0,0 +1,48 @@
package handlers

import (
"fmt"
"net/http"

"github.com/gin-gonic/gin"

"github.com/e2b-dev/infra/packages/api/internal/api"
"github.com/e2b-dev/infra/packages/api/internal/utils"
"github.com/e2b-dev/infra/packages/auth/pkg/auth"
"github.com/e2b-dev/infra/packages/shared/pkg/ginutils"
"github.com/e2b-dev/infra/packages/shared/pkg/telemetry"
)

func (a *APIStore) PatchSandboxesSandboxIDMetadata(
c *gin.Context,
sandboxID string,
) {
ctx := c.Request.Context()

var err error
sandboxID, err = utils.ShortID(sandboxID)
Comment on lines +22 to +23 (Contributor) — Suggested change:

-	var err error
-	sandboxID, err = utils.ShortID(sandboxID)
+	sandboxID, err := utils.ShortID(sandboxID)

if err != nil {
a.sendAPIStoreError(c, http.StatusBadRequest, "Invalid sandbox ID")

return
}

team := auth.MustGetTeamInfo(c)

body, err := ginutils.ParseBody[api.PatchSandboxesSandboxIDMetadataJSONRequestBody](ctx, c)
if err != nil {
a.sendAPIStoreError(c, http.StatusBadRequest, fmt.Sprintf("Error when parsing request: %s", err))
telemetry.ReportCriticalError(ctx, "error when parsing request", err)

return
}

if apiErr := a.orchestrator.PatchSandboxMetadata(ctx, team.ID, sandboxID, body); apiErr != nil {
telemetry.ReportErrorByCode(ctx, apiErr.Code, "error patching sandbox metadata", apiErr.Err)
a.sendAPIStoreError(c, apiErr.Code, apiErr.ClientMsg)

return
}

c.Status(http.StatusNoContent)
}
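
For orientation, here is a client-side sketch of the request this handler serves. The route path and the auth header are assumptions inferred from the generated operation name `PatchSandboxesSandboxIDMetadata` (neither appears in this diff); per the merge-patch semantics, a JSON `null` value deletes a key:

```go
package main

import (
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	// "env" is upserted; "debug" is deleted (JSON null -> nil *string on the server).
	body := []byte(`{"env":"staging","debug":null}`)

	// Hypothetical URL and sandbox ID, inferred from the operation name.
	req, err := http.NewRequest(http.MethodPatch,
		"https://api.example.com/sandboxes/<sandbox-id>/metadata",
		bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("X-API-Key", "<api-key>") // auth header name is an assumption

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status) // expect "204 No Content" on success
}
```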
@@ -99,7 +99,7 @@ func (n *Node) GetSandboxes(ctx context.Context) ([]sandbox.Sandbox, error) {
config.GetExecutionId(),
teamID,
buildID,
-			config.GetMetadata(),
+			sbx.GetMetadata(),
time.Duration(config.GetMaxSandboxLength())*time.Hour,
sbx.GetStartTime().AsTime(),
sbx.GetEndTime().AsTime(),
121 changes: 121 additions & 0 deletions packages/api/internal/orchestrator/patch_metadata.go
@@ -0,0 +1,121 @@
package orchestrator

import (
"context"
"errors"
"fmt"
"maps"
"net/http"

"github.com/google/uuid"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"

"github.com/e2b-dev/infra/packages/api/internal/api"
"github.com/e2b-dev/infra/packages/api/internal/sandbox"
"github.com/e2b-dev/infra/packages/api/internal/utils"
orchestratorgrpc "github.com/e2b-dev/infra/packages/shared/pkg/grpc/orchestrator"
"github.com/e2b-dev/infra/packages/shared/pkg/telemetry"
)

// PatchSandboxMetadata applies a JSON-Merge-Patch-style update: non-nil values
// upsert the key, a nil pointer (or empty string) removes it, and absent keys
// are left alone.
func (o *Orchestrator) PatchSandboxMetadata(
ctx context.Context,
teamID uuid.UUID,
sandboxID string,
patch map[string]*string,
) *api.APIError {
var merged map[string]string

updateFunc := func(sbx sandbox.Sandbox) (sandbox.Sandbox, error) {
if sbx.State != sandbox.StateRunning {
return sbx, &sandbox.NotRunningError{SandboxID: sandboxID, State: sbx.State}
}

merged = applyMetadataPatch(sbx.Metadata, patch)
sbx.Metadata = merged

return sbx, nil
}

var sbxNotRunningErr *sandbox.NotRunningError

sbx, err := o.sandboxStore.Update(ctx, teamID, sandboxID, updateFunc)
if err != nil {
switch {
case errors.As(err, &sbxNotRunningErr):
return &api.APIError{Code: http.StatusConflict, ClientMsg: utils.SandboxChangingStateMsg(sandboxID, sbxNotRunningErr.State), Err: err}
case errors.Is(err, sandbox.ErrNotFound):
return &api.APIError{Code: http.StatusNotFound, ClientMsg: utils.SandboxNotFoundMsg(sandboxID), Err: err}
default:
return &api.APIError{Code: http.StatusInternalServerError, ClientMsg: "Error patching sandbox metadata", Err: err}
}
}

return o.patchSandboxMetadataOnNode(ctx, sbx, merged)

Check failure on line 59 in packages/api/internal/orchestrator/patch_metadata.go — Claude / Claude Code Review: Non-atomic two-phase write in PatchSandboxMetadata causes Redis/node divergence

Comment on lines +47 to +59:
🔴 The two-phase write in PatchSandboxMetadata (patch_metadata.go lines 47–59) is non-atomic in two distinct failure modes: (1) if patchSandboxMetadataOnNode fails after sandboxStore.Update succeeds, Redis permanently holds the new merged metadata while the node retains the old values — no rollback exists; (2) concurrent PATCH calls serialize at the Redis level but their gRPC calls are completely unserialized, so whichever gRPC arrives at the node last silently wins regardless of Redis write order, leaving Redis and the live sandbox permanently diverged. Fix (1) by capturing old metadata before sandboxStore.Update and issuing a compensating update in the error path; fix (2) by adding per-sandbox mutual exclusion around the (store.Update, patchOnNode) pair, or switching the node RPC to a conditional/versioned write.

Extended reasoning...

What the bug is and how it manifests

PatchSandboxMetadata in packages/api/internal/orchestrator/patch_metadata.go (lines 26–60) performs a two-phase write: it first commits the merged metadata to Redis via sandboxStore.Update (line 47), then propagates the result to the running sandbox on the orchestrator node via gRPC (patchSandboxMetadataOnNode, line 59). This two-step sequence is not atomic at either the failure-recovery or the concurrency level, producing two independent bugs.

Bug 1 – No rollback on gRPC failure (split-brain)

sandboxStore.Update acquires a Redis distributed lock, performs a read-modify-write, then releases the lock before returning. At that point Redis durably holds the new merged metadata. If the subsequent patchSandboxMetadataOnNode gRPC call fails for any reason (node unreachable, timeout, codes.NotFound, network partition), the function returns an error to the caller but there is no compensating sandboxStore.Update on the error path. Redis now permanently reflects the new metadata while the running sandbox has the old metadata. Because the API store is Redis-backed, this divergence survives API restarts. Any subsequent GET /sandboxes/{id} call will return the new (incorrect) metadata from Redis while the sandbox environment runs with the old values. The only recovery path is a subsequent successful end-to-end PATCH, which callers who received an error have no reason to believe is necessary.

Bug 2 – Concurrent PATCH calls silently overwrite each other at the node

The Redis-level sandboxStore.Update callback serializes concurrent writes correctly: each call acquires the distributed lock, reads the current state, computes a merged map, writes it, and releases the lock. However, the critical section ends before the gRPC call. Each call captures its own merged map via closure at the time of its Redis write, then fires the gRPC independently. Because gRPC calls are completely unserialized with respect to each other, whichever call's RPC arrives at the node last wins — regardless of which Redis write was later. The gRPC carries the full resolved map (not a delta), so the earlier call's stale snapshot fully replaces whatever the node received from the later call.

Why existing code does not prevent this

The orchestrator-side Update handler (sandboxes.go:336–347) correctly uses ApplyAllOrNone with compensating closures to protect the node's in-memory apiMetadata, but this cannot reach back to undo an already-committed Redis write. The Reconcile loop only kills orphaned sandboxes; it never pushes Redis metadata back to a running node. There is no per-sandbox mutex or sequencing primitive around the (store.Update, patchOnNode) pair in the API layer.

Step-by-step proof of Bug 1

  1. Client PATCHes {env: staging} on sandbox S. sandboxStore.Update succeeds → Redis: {env: staging}.
  2. patchSandboxMetadataOnNode gRPC fails (timeout). API returns 500.
  3. Redis permanently holds {env: staging}; node still has {env: prod}.
  4. Any GET /sandboxes/S now returns {env: staging} from Redis while the sandbox process runs with {env: prod}.
  5. Divergence persists across API restarts until a successful end-to-end PATCH overwrites both.

Step-by-step proof of Bug 2

Initial metadata: {}. Call A patches {x:1}, call B patches {y:2} concurrently.
  1. A's Redis write: Redis={x:1}, A holds merged={x:1}.
  2. B's Redis write (reads A's result): Redis={x:1,y:2}, B holds merged={x:1,y:2}.
  3. B's gRPC fires first → node apiMetadata={x:1,y:2}.
  4. A's gRPC fires second → node apiMetadata={x:1}.
Final: Redis={x:1,y:2} (correct) vs. node={x:1} (stale). The divergence is silent, returns no error, and persists for the sandbox lifetime.

How to fix

Bug 1: Capture old metadata before calling sandboxStore.Update and issue a compensating sandboxStore.Update restoring the old value in the patchSandboxMetadataOnNode error path. Alternatively, reverse the order: call patchSandboxMetadataOnNode first (already rollback-safe on the node side) and only commit to Redis on success. Bug 2: Wrap the (sandboxStore.Update, patchSandboxMetadataOnNode) pair in a per-sandbox mutex or keyed lock so concurrent PATCH calls for the same sandbox are serialized end-to-end.

}
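
As a concrete starting point for fix (2) above, here is a minimal sketch of a per-sandbox keyed lock; the names `keyedMutex` and `newKeyedMutex` are hypothetical and not part of this PR:

```go
package orchestrator

import "sync"

// keyedMutex serializes work per sandbox ID so that the
// (sandboxStore.Update, patchSandboxMetadataOnNode) pair for one sandbox
// cannot interleave with a concurrent PATCH for the same sandbox.
type keyedMutex struct {
	mu    sync.Mutex
	locks map[string]*sync.Mutex
}

func newKeyedMutex() *keyedMutex {
	return &keyedMutex{locks: make(map[string]*sync.Mutex)}
}

// lock returns the per-key mutex, already locked; callers must Unlock it.
// Entries are never evicted here -- a production version would
// reference-count locks and delete idle entries.
func (k *keyedMutex) lock(key string) *sync.Mutex {
	k.mu.Lock()
	m, ok := k.locks[key]
	if !ok {
		m = &sync.Mutex{}
		k.locks[key] = m
	}
	k.mu.Unlock()

	m.Lock()
	return m
}
```

PatchSandboxMetadata would take the lock before sandboxStore.Update and release it after patchSandboxMetadataOnNode returns. Note this only addresses fix (2); fix (1) still needs a compensating store update (or a reversed write order) when the gRPC call fails.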

func applyMetadataPatch(current map[string]string, patch map[string]*string) map[string]string {
out := make(map[string]string, len(current)+len(patch))
maps.Copy(out, current)
for k, v := range patch {
if v == nil || *v == "" {
delete(out, k)
} else {
out[k] = *v
}
}

return out
}
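
A test-style sketch of the merge semantics above, with hypothetical values:

```go
package orchestrator

import (
	"maps"
	"testing"
)

func TestApplyMetadataPatch(t *testing.T) {
	staging, empty := "staging", ""
	current := map[string]string{"env": "prod", "owner": "alice", "tier": "free"}
	patch := map[string]*string{
		"env":   &staging, // non-nil, non-empty: upsert
		"owner": nil,      // nil pointer: delete
		"tier":  &empty,   // empty string: also delete
		// keys absent from the patch (e.g. "region") would be left alone
	}

	got := applyMetadataPatch(current, patch)
	want := map[string]string{"env": "staging"}
	if !maps.Equal(got, want) {
		t.Fatalf("got %v, want %v", got, want)
	}
}
```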

func (o *Orchestrator) patchSandboxMetadataOnNode(
ctx context.Context,
sbx sandbox.Sandbox,
metadata map[string]string,
) *api.APIError {
ctx, span := tracer.Start(ctx, "patch-sandbox-metadata-on-node",
trace.WithAttributes(
attribute.String("instance.id", sbx.SandboxID),
Comment (Contributor): telemetry.WithSandboxID
),
)
defer span.End()

node := o.getOrConnectNode(ctx, sbx.ClusterID, sbx.NodeID)
if node == nil {
return &api.APIError{
Code: http.StatusInternalServerError,
ClientMsg: fmt.Sprintf("Node hosting sandbox '%s' not found", sbx.SandboxID),
Err: fmt.Errorf("node '%s' not found for cluster '%s'", sbx.NodeID, sbx.ClusterID),
}
}

client, ctx := node.GetClient(ctx)
_, err := client.Sandbox.Update(ctx, &orchestratorgrpc.SandboxUpdateRequest{
SandboxId: sbx.SandboxID,
Metadata: &orchestratorgrpc.SandboxMetadataUpdate{Entries: metadata},
})
if err != nil {
grpcErr, ok := status.FromError(err)
if ok && grpcErr.Code() == codes.NotFound {
return &api.APIError{Code: http.StatusNotFound, ClientMsg: utils.SandboxNotFoundMsg(sbx.SandboxID), Err: err}
}

err = utils.UnwrapGRPCError(err)
telemetry.ReportCriticalError(ctx, "failed to patch sandbox metadata on node", err)

return &api.APIError{
Code: http.StatusInternalServerError,
ClientMsg: "Error applying metadata to sandbox",
Err: fmt.Errorf("failed to patch sandbox metadata on node: %w", err),
}
}

telemetry.ReportEvent(ctx, "Patched sandbox metadata on node")

return nil
}
9 changes: 9 additions & 0 deletions packages/orchestrator/orchestrator.proto
@@ -101,6 +101,11 @@ message SandboxUpdateRequest {
// All fields are optional — only set fields are applied.
optional google.protobuf.Timestamp end_time = 2;
optional SandboxNetworkEgressConfig egress = 3;
optional SandboxMetadataUpdate metadata = 4;
}

message SandboxMetadataUpdate {
map<string, string> entries = 1;
}

message SandboxDeleteRequest {
@@ -127,6 +132,10 @@ message RunningSandbox {

google.protobuf.Timestamp start_time = 3;
google.protobuf.Timestamp end_time = 4;

// Live user-facing metadata tags. Authoritative over config.metadata, which
// reflects only the create-time snapshot.
map<string, string> metadata = 5;
Comment on lines +135 to +138 (Contributor): why do we need this change?

}

message SandboxListResponse {
39 changes: 32 additions & 7 deletions packages/orchestrator/pkg/sandbox/sandbox.go
@@ -182,9 +182,10 @@ type Metadata struct {
Config *Config
Runtime RuntimeMetadata

-	rwmu      sync.RWMutex // protects startedAt, endAt
-	startedAt time.Time
-	endAt     time.Time
+	rwmu        sync.RWMutex // protects startedAt, endAt, apiMetadata
+	startedAt   time.Time
+	endAt       time.Time
+	apiMetadata map[string]string
}

// GetEndAt returns the sandbox end time in a thread-safe manner.
@@ -203,6 +204,28 @@ func (m *Metadata) SetEndAt(t time.Time) {
m.endAt = t
}

// GetAPIMetadata returns the user-facing sandbox metadata tags. Callers must
// not mutate the map.
func (m *Metadata) GetAPIMetadata() map[string]string {
m.rwmu.RLock()
defer m.rwmu.RUnlock()

return m.apiMetadata
}

// SetAPIMetadata replaces the user-facing metadata. Callers must not mutate the
// map after handing it over.
func (m *Metadata) SetAPIMetadata(metadata map[string]string) {
if metadata == nil {
metadata = map[string]string{}
}

m.rwmu.Lock()
defer m.rwmu.Unlock()

m.apiMetadata = metadata
}
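
Since GetAPIMetadata hands out the internal map, the no-mutation contract is load-bearing. A defensive-copy variant (a sketch, not part of this PR) would trade one allocation per call for safety:

```go
// GetAPIMetadataCopy is a hypothetical alternative: callers receive their
// own map and may mutate it freely. Requires the stdlib "maps" package
// (Go 1.21+) in this file's imports.
func (m *Metadata) GetAPIMetadataCopy() map[string]string {
	m.rwmu.RLock()
	defer m.rwmu.RUnlock()

	out := make(map[string]string, len(m.apiMetadata))
	maps.Copy(out, m.apiMetadata)

	return out
}
```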

type Sandbox struct {
*Resources
*Metadata
@@ -456,8 +479,9 @@ func (f *Factory) CreateSandbox(
Config: config,
Runtime: runtime,

-		startedAt: time.Now(),
-		endAt:     time.Now().Add(sandboxTimeout),
+		startedAt:   time.Now(),
+		endAt:       time.Now().Add(sandboxTimeout),
+		apiMetadata: apiConfigToStore.GetMetadata(),
}

sbx := &Sandbox{
@@ -803,8 +827,9 @@ func (f *Factory) ResumeSandbox(
Config: config,
Runtime: runtime,

-		startedAt: startedAt,
-		endAt:     endAt,
+		startedAt:   startedAt,
+		endAt:       endAt,
+		apiMetadata: apiConfigToStore.GetMetadata(),
}

sbx := &Sandbox{
26 changes: 20 additions & 6 deletions packages/orchestrator/pkg/server/sandboxes.go
@@ -336,6 +336,17 @@
})
}

if req.GetMetadata() != nil {
(levb marked this conversation as resolved.)
updates = append(updates, func(_ context.Context) (func(context.Context), error) {
oldMetadata := sbx.GetAPIMetadata()
sbx.SetAPIMetadata(req.GetMetadata().GetEntries())

return func(_ context.Context) {
sbx.SetAPIMetadata(oldMetadata)
}, nil
})
}

if err := utils.ApplyAllOrNone(ctx, updates); err != nil {
telemetry.ReportCriticalError(ctx, "failed to update sandbox", err)

@@ -355,7 +366,6 @@
"allowed_domains": egress.GetAllowedDomains(),
}
}

go s.sbxEventsService.Publish(
context.WithoutCancel(ctx),
teamID,
@@ -401,6 +411,7 @@
ClientId: s.info.ClientId,
StartTime: timestamppb.New(startedAt),
EndTime: timestamppb.New(sbx.GetEndAt()),
Metadata: sbx.GetAPIMetadata(),
})
}

@@ -629,12 +640,15 @@
)
if err != nil {
telemetry.ReportCriticalError(ctx, "error resuming sandbox after checkpoint", err, telemetry.WithSandboxID(in.GetSandboxId()))

return nil, status.Errorf(codes.Internal, "error resuming sandbox after checkpoint: %s", err)
}
// ResumeSandbox seeds apiMetadata from the (immutable) APIStoredConfig
// snapshot — override with the live value so any PATCH carries over.
resumedSbx.SetAPIMetadata(sbx.GetAPIMetadata())

// Collect prefetch data immediately after resume while it's most accurate
prefetchData, prefetchErr := resumedSbx.MemoryPrefetchData(ctx)

Check failure on line 651 in packages/orchestrator/pkg/server/sandboxes.go — Claude / Claude Code Review: Checkpoint handler: SetAPIMetadata race window after MarkRunning

Comment on lines 643 to 651:
🔴 The Checkpoint handler calls resumedSbx.SetAPIMetadata(sbx.GetAPIMetadata()) at line 648 after ResumeSandbox returns, but ResumeSandbox calls MarkRunning internally as its final step — making resumedSbx immediately discoverable via Sandboxes.Get() before the metadata override is applied. A concurrent PATCH gRPC Update call that lands in this window will apply the user's patched metadata to resumedSbx, which the Checkpoint handler then silently overwrites with the old sandbox's pre-patch metadata, causing Redis (already updated by sandboxStore.Update) and the node to diverge.

Extended reasoning...

What the bug is and how it manifests

Inside ResumeSandbox (sandbox.go:933), f.Sandboxes.MarkRunning(ctx, sbx) is the last substantive call before return sbx, nil. The moment MarkRunning completes, Map.Get() (map.go:81-88) will return resumedSbx for any caller because IsRunning() is now true. The Checkpoint handler at sandboxes.go:625 then waits for ResumeSandbox to return, and only afterward calls resumedSbx.SetAPIMetadata(sbx.GetAPIMetadata()) at line 648. There is a real, non-zero window between those two events.

The specific code path that triggers it

  1. Checkpoint handler calls ResumeSandbox(..., sbx.APIStoredConfig) (line 625).
  2. Deep inside ResumeSandbox, line 933 executes f.Sandboxes.MarkRunning(ctx, sbx) → resumedSbx is now visible to concurrent Get() callers. Two goroutines are spawned immediately after, creating Go scheduling points where other goroutines can run.
  3. ResumeSandbox returns to the Checkpoint handler.
  4. The Checkpoint handler executes resumedSbx.SetAPIMetadata(sbx.GetAPIMetadata()) (line 648) — note: sbx is the OLD sandbox; its GetAPIMetadata() reflects pre-PATCH state.

Why existing code does not prevent it

The Update gRPC handler (sandboxes.go:336-347) correctly saves and restores metadata via an ApplyAllOrNone closure, so the node-side rollback is safe. However, that rollback only protects the node state. The API layer's sandboxStore.Update (Redis) is committed before the gRPC call in patchSandboxMetadataOnNode, meaning Redis already holds the post-PATCH value when the race occurs. The rwmu lock inside SetAPIMetadata only prevents torn writes; it does nothing to prevent the Checkpoint handler from overwriting a legitimately applied PATCH.

What the impact would be

Consider: sbx.apiMetadata = {"env": "staging"} (the live value after a PATCH), while sbx.APIStoredConfig.Metadata = {"env": "prod"} (the immutable create-time snapshot). After the race:

  • Redis holds {"env": "staging"} (from sandboxStore.Update)
  • resumedSbx.apiMetadata is reset to {"env": "prod"} by the Checkpoint handler

Any subsequent GetAPIMetadata() call on the resumed sandbox returns stale data. The List RPC ships the wrong value, event telemetry records the wrong tags, and if the sandbox is checkpointed again the stale value propagates further.

How to fix it

Option A (preferred — atomic): Pass the live metadata into ResumeSandbox as a parameter and seed apiMetadata from it rather than from apiConfigToStore.GetMetadata(). This ensures the correct value is in place before MarkRunning makes the sandbox visible.

Option B (surgical): Extract MarkRunning out of ResumeSandbox and call it in the Checkpoint handler after resumedSbx.SetAPIMetadata(sbx.GetAPIMetadata()). This keeps the existing structure but requires callers to remember to call MarkRunning themselves.

Step-by-step proof of divergence

  1. Sandbox is created with metadata {"env": "prod"}; both Redis and apiMetadata hold this value.
  2. Client calls PATCH /sandboxes/{id}/metadata with {"env": "staging"}.
  3. sandboxStore.Update commits {"env": "staging"} to Redis.
  4. gRPC Update arrives at the node; sbx.SetAPIMetadata({"env": "staging"}) sets the live value.
  5. Concurrently, client calls POST /sandboxes/{id}/checkpoints.
  6. Checkpoint handler calls ResumeSandbox; inside, MarkRunning fires — resumedSbx is now publicly visible.
  7. Before the Checkpoint handler reaches line 648, a second Update gRPC call (e.g., another concurrent PATCH) arrives and calls resumedSbx.SetAPIMetadata({"env": "staging"}).
  8. Checkpoint handler executes resumedSbx.SetAPIMetadata(sbx.GetAPIMetadata()) — sbx still has {"env": "prod"} in its create-time snapshot or has not yet been updated.
  9. Result: Redis = {"env": "staging"}, node resumedSbx.apiMetadata = {"env": "prod"}. Silent divergence.
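
A runnable toy model of Option A (all names are illustrative stand-ins for ResumeSandbox, MarkRunning, and Sandboxes.Get):

```go
package main

import (
	"fmt"
	"sync"
)

// sandboxState models the node-side sandbox; registry models the map that
// MarkRunning publishes into.
type sandboxState struct {
	metadata map[string]string
}

type registry struct {
	mu      sync.Mutex
	running map[string]*sandboxState
}

// markRunning publishes the sandbox. Anything set on s *after* this call
// races with concurrent readers -- the window the review describes.
func (r *registry) markRunning(id string, s *sandboxState) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.running[id] = s
}

// resume takes the live metadata as a parameter and applies it before
// publication, mirroring the suggested ResumeSandbox signature change.
func (r *registry) resume(id string, snapshot, live map[string]string) *sandboxState {
	s := &sandboxState{metadata: snapshot}
	if live != nil {
		s.metadata = live // seeded before markRunning, so no race window
	}
	r.markRunning(id, s) // publication stays the final step
	return s
}

func main() {
	r := &registry{running: map[string]*sandboxState{}}
	s := r.resume("sbx1",
		map[string]string{"env": "prod"},    // create-time snapshot
		map[string]string{"env": "staging"}) // live, post-PATCH value
	fmt.Println(s.metadata["env"]) // staging
}
```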

if prefetchErr != nil {
sbxlogger.I(resumedSbx).Warn(ctx, "failed to get prefetch data for checkpoint", zap.Error(prefetchErr))
}
@@ -692,13 +706,13 @@
}

buildId := ""
eventData := make(map[string]any)
if sbx.APIStoredConfig != nil {
buildId = sbx.APIStoredConfig.GetBuildId()
if sbx.APIStoredConfig.Metadata != nil {
// Copy the map to avoid race conditions
eventData["sandbox_metadata"] = utils.ShallowCopyMap(sbx.APIStoredConfig.GetMetadata())
}
}

eventData := make(map[string]any)
if md := sbx.GetAPIMetadata(); len(md) > 0 {
eventData["sandbox_metadata"] = md
}

return teamID, buildId, eventData