From fd15a5f8c259250bffc095244c30b0b661517c7b Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Wed, 10 Jun 2026 11:59:32 +0200
Subject: [PATCH] fix(cli): remove eval benchmark json flag

---
 .../cli/src/commands/eval/benchmark-writer.ts | 85 -------------------
 apps/cli/src/commands/eval/commands/run.ts    |  7 --
 apps/cli/src/commands/eval/run-eval.ts        | 18 ----
 .../commands/eval/benchmark-writer.test.ts    | 74 ----------------
 apps/cli/test/eval.integration.test.ts        | 31 +++++++
 .../guides/skill-improvement-workflow.mdx     |  2 +-
 .../docs/integrations/agent-skills-evals.mdx  | 12 ++-
 docs/plans/2026-06-09-eval-output-surface.md  |  4 +-
 8 files changed, 43 insertions(+), 190 deletions(-)
 delete mode 100644 apps/cli/src/commands/eval/benchmark-writer.ts
 delete mode 100644 apps/cli/test/commands/eval/benchmark-writer.test.ts
diff --git a/apps/cli/src/commands/eval/benchmark-writer.ts b/apps/cli/src/commands/eval/benchmark-writer.ts
deleted file mode 100644
index 671de1160..000000000
--- a/apps/cli/src/commands/eval/benchmark-writer.ts
+++ /dev/null
@@ -1,85 +0,0 @@
-import { writeFile } from 'node:fs/promises';
-
-import { DEFAULT_THRESHOLD, type EvaluationResult } from '@agentv/core';
-
-interface BenchmarkStats {
-  readonly mean: number;
-  readonly stddev: number;
-}
-
-interface BenchmarkRunSummary {
-  readonly pass_rate: BenchmarkStats;
-  readonly time_seconds: BenchmarkStats;
-  readonly tokens: BenchmarkStats;
-}
-
-interface BenchmarkJson {
-  readonly run_summary: {
-    readonly with_skill: BenchmarkRunSummary;
-  };
-}
-
-function computeStats(values: readonly number[]): BenchmarkStats {
-  if (values.length === 0) {
-    return { mean: 0, stddev: 0 };
-  }
-  const mean = values.reduce((sum, v) => sum + v, 0) / values.length;
-  const variance = values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / values.length;
-  return {
-    mean: Math.round(mean * 1000) / 1000,
-    stddev: Math.round(Math.sqrt(variance) * 1000) / 1000,
-  };
-}
-
-/**
- * Compute per-test pass_rate from grader scores.
- *
- * For each test, pass_rate = count(evaluator.score >= 0.8) / total_evaluators.
- * If no per-grader scores exist, falls back to the top-level result score
- * with the same threshold (>= 0.8 → 1.0, else 0.0).
- */
-function computePassRate(result: EvaluationResult): number {
-  const scores = result.scores;
-  if (scores && scores.length > 0) {
-    const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length;
-    return passed / scores.length;
-  }
-  return result.score >= DEFAULT_THRESHOLD ? 1.0 : 0.0;
-}
-
-/**
- * Build an Agent Skills benchmark.json from AgentV evaluation results.
- */
-export function buildBenchmarkJson(results: readonly EvaluationResult[]): BenchmarkJson {
-  const passRates = results.map(computePassRate);
-  const timings = results
-    .filter((r) => r.durationMs != null)
-    .map((r) => (r.durationMs as number) / 1000);
-  const tokens = results
-    .filter((r) => r.tokenUsage != null)
-    .map((r) => {
-      const usage = r.tokenUsage as { input?: number; output?: number };
-      return (usage.input ?? 0) + (usage.output ?? 0);
-    });
-
-  return {
-    run_summary: {
-      with_skill: {
-        pass_rate: computeStats(passRates),
-        time_seconds: computeStats(timings),
-        tokens: computeStats(tokens),
-      },
-    },
-  };
-}
-
-/**
- * Write benchmark.json to disk.
- */
-export async function writeBenchmarkJson(
-  outputPath: string,
-  results: readonly EvaluationResult[],
-): Promise<void> {
-  const benchmark = buildBenchmarkJson(results);
-  await writeFile(outputPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8');
-}
diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
index c0e7f045f..b6bc035c2 100644
--- a/apps/cli/src/commands/eval/commands/run.ts
+++ b/apps/cli/src/commands/eval/commands/run.ts
@@ -179,12 +179,6 @@ export const evalRunCommand = command({
       long: 'strict',
       description: 'Exit with error on version mismatch (instead of warning)',
     }),
-    benchmarkJson: option({
-      type: optional(string),
-      long: 'benchmark-json',
-      description:
-        '[Deprecated: benchmark.json is included in artifact dir] Write Agent Skills benchmark.json to the specified path',
-    }),
     artifacts: option({
       type: optional(string),
       long: 'artifacts',
@@ -282,7 +276,6 @@ export const evalRunCommand = command({
       resume: args.resume,
       rerunFailed: args.rerunFailed,
       strict: args.strict,
-      benchmarkJson: args.benchmarkJson,
       artifacts: args.artifacts,
       graderTarget: args.graderTarget,
       model: args.model,
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 3badda9d0..0fe445eac 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -40,7 +40,6 @@ import {
   writeArtifactsFromResults,
   writeInitialBenchmarkArtifact,
 } from './artifact-writer.js';
-import { writeBenchmarkJson } from './benchmark-writer.js';
 import { loadEnvFromHierarchy } from './env.js';
 import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js';
 import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js';
@@ -113,8 +112,6 @@ interface NormalizedOptions {
   readonly workspaceMode?: 'pooled' | 'temp' | 'static';
   readonly workspacePath?: string;
   readonly keepWorkspaces: boolean;
-  /** Deprecated: benchmark.json is always written to artifact dir */
-  readonly benchmarkJson?: string;
   /** Removed: use --output instead */
   readonly artifacts?: string;
   /** Removed: the run directory always uses index.jsonl */
@@ -461,7 +458,6 @@ function normalizeOptions(
       normalizeBoolean(rawOptions.keepWorkspaces) ||
       yamlExecution?.keep_workspaces === true ||
       config?.execution?.keepWorkspaces === true,
-    benchmarkJson: normalizeString(rawOptions.benchmarkJson),
     artifacts: normalizeString(rawOptions.artifacts),
     outputFormat: normalizeString(rawOptions.outputFormat),
     graderTarget: normalizeString(rawOptions.graderTarget),
@@ -1250,13 +1246,6 @@ export async function runEvalCommand(
     console.log(`Repository root: ${repoRoot}`);
   }
 
-  // Emit deprecation warnings for remaining legacy flags.
-  if (options.benchmarkJson) {
-    console.warn(
-      'Warning: --benchmark-json is deprecated. benchmark.json is always written to the artifact directory.',
-    );
-  }
-
   // Resolve artifact directory (runDir) and primary output path.
   // Precedence: --output > config output.dir > default
   const explicitDir = options.outputDir;
@@ -1776,13 +1765,6 @@ export async function runEvalCommand(
       console.log(formatMatrixSummary(summaryResults));
     }
 
-    // Write Agent Skills benchmark.json if requested (deprecated flag — backward compat)
-    if (options.benchmarkJson && allResults.length > 0) {
-      const benchmarkPath = path.resolve(options.benchmarkJson);
-      await writeBenchmarkJson(benchmarkPath, allResults);
-      console.log(`Benchmark written to: ${benchmarkPath}`);
-    }
-
     // Write artifacts to the run directory (always, not conditional on flags)
     if (allResults.length > 0) {
       const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
diff --git a/apps/cli/test/commands/eval/benchmark-writer.test.ts b/apps/cli/test/commands/eval/benchmark-writer.test.ts
deleted file mode 100644
index 56349ecd0..000000000
--- a/apps/cli/test/commands/eval/benchmark-writer.test.ts
+++ /dev/null
@@ -1,74 +0,0 @@
-import { describe, expect, it } from 'bun:test';
-
-import type { EvaluationResult } from '@agentv/core';
-import { buildBenchmarkJson } from '../../../src/commands/eval/benchmark-writer.js';
-
-function makeResult(overrides: Partial<EvaluationResult> = {}): EvaluationResult {
-  return {
-    timestamp: '2026-03-13T00:00:00.000Z',
-    testId: 'test-1',
-    score: 0.9,
-    assertions: [],
-    output: [{ role: 'assistant' as const, content: 'test answer' }],
-    target: 'test-target',
-    verdict: 'pass',
-    executionStatus: 'ok',
-    ...overrides,
-  } as EvaluationResult;
-}
-
-describe('buildBenchmarkJson', () => {
-  it('computes pass_rate from per-grader scores', () => {
-    const results = [
-      makeResult({
-        scores: [
-          { name: 'a1', type: 'llm-grader', score: 0.9, assertions: [] },
-          { name: 'a2', type: 'llm-grader', score: 0.7, assertions: [] },
-          { name: 'a3', type: 'llm-grader', score: 0.85, assertions: [] },
-        ],
-      }),
-    ];
-    const benchmark = buildBenchmarkJson(results);
-    // 2 of 3 pass (>= 0.8), so pass_rate = 2/3 ≈ 0.667
-    expect(benchmark.run_summary.with_skill.pass_rate.mean).toBeCloseTo(0.667, 2);
-    expect(benchmark.run_summary.with_skill.pass_rate.stddev).toBe(0);
-  });
-
-  it('falls back to top-level score when no grader scores', () => {
-    const results = [makeResult({ score: 0.9 }), makeResult({ score: 0.5 })];
-    const benchmark = buildBenchmarkJson(results);
-    // First passes (>= 0.8 → 1.0), second fails (< 0.8 → 0.0), mean = 0.5
-    expect(benchmark.run_summary.with_skill.pass_rate.mean).toBe(0.5);
-    expect(benchmark.run_summary.with_skill.pass_rate.stddev).toBe(0.5);
-  });
-
-  it('computes time_seconds from durationMs', () => {
-    const results = [makeResult({ durationMs: 30000 }), makeResult({ durationMs: 60000 })];
-    const benchmark = buildBenchmarkJson(results);
-    expect(benchmark.run_summary.with_skill.time_seconds.mean).toBe(45);
-    expect(benchmark.run_summary.with_skill.time_seconds.stddev).toBe(15);
-  });
-
-  it('computes tokens from tokenUsage', () => {
-    const results = [
-      makeResult({ tokenUsage: { input: 1000, output: 500 } } as Partial<EvaluationResult>),
-      makeResult({ tokenUsage: { input: 2000, output: 1000 } } as Partial<EvaluationResult>),
-    ];
-    const benchmark = buildBenchmarkJson(results);
-    expect(benchmark.run_summary.with_skill.tokens.mean).toBe(2250);
-    expect(benchmark.run_summary.with_skill.tokens.stddev).toBe(750);
-  });
-
-  it('handles empty results', () => {
-    const benchmark = buildBenchmarkJson([]);
-    expect(benchmark.run_summary.with_skill.pass_rate.mean).toBe(0);
-    expect(benchmark.run_summary.with_skill.pass_rate.stddev).toBe(0);
-  });
-
-  it('handles results without timing or token data', () => {
-    const results = [makeResult({})];
-    const benchmark = buildBenchmarkJson(results);
-    expect(benchmark.run_summary.with_skill.time_seconds.mean).toBe(0);
-    expect(benchmark.run_summary.with_skill.tokens.mean).toBe(0);
-  });
-});
diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts
index e1117b17e..88a3bf5fd 100644
--- a/apps/cli/test/eval.integration.test.ts
+++ b/apps/cli/test/eval.integration.test.ts
@@ -557,4 +557,35 @@ describe('agentv eval CLI', () => {
       .toLowerCase();
     expect(transcriptHelp).not.toContain('cache');
   }, 30_000);
+
+  it('omits removed benchmark JSON export flag from help', async () => {
+    const result = await execa('bun', ['--no-env-file', CLI_ENTRY, 'eval', 'run', '--help'], {
+      cwd: projectRoot,
+      env: { ...process.env, CI: 'true' },
+      reject: false,
+    });
+    const helpText = `${result.stdout}\n${result.stderr}`;
+    expect(helpText).not.toContain('--benchmark-json');
+    expect(helpText).toContain('--output');
+    expect(helpText).toContain('benchmark.json');
+  }, 30_000);
+
+  it('rejects the removed benchmark JSON export flag as an unknown argument', async () => {
+    const fixture = await createFixture();
+    try {
+      const result = await runCli(fixture, [
+        'eval',
+        fixture.testFilePath,
+        '--benchmark-json',
+        path.join(fixture.baseDir, 'benchmark.json'),
+      ]);
+
+      expect(result.exitCode).not.toBe(0);
+      const output = `${result.stdout}\n${result.stderr}`;
+      expect(output).toContain('Unknown arguments');
+      expect(output).toContain('--benchmark-json');
+    } finally {
+      await rm(fixture.baseDir, { recursive: true, force: true });
+    }
+  }, 30_000);
 });
diff --git a/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx b/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx
index ec5ac0d2e..c926ed6f5 100644
--- a/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx
+++ b/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx
@@ -255,7 +255,7 @@ If you've been using the Agent Skills skill-creator workflow, AgentV reads your
 | `evals.json` | `agentv eval evals.json` | Direct — no conversion needed |
 | `claude -p "prompt"` | `agentv eval evals.json --target claude` | Same eval, richer engine |
 | `grading.json` (read) | `<test-id>/grading.json` (write) | Same per-test schema, AgentV writes one grading file per test case |
-| `benchmark.json` (read) | `benchmark.json` (write) | Same schema, AgentV produces it |
+| `benchmark.json` (read) | `<output>/benchmark.json` (write) | AgentV writes the canonical run summary; if another tool needs a different benchmark shape, convert this file in a wrapper |
 | n/a | `index.jsonl` (write) | AgentV-specific per-test manifest for filtering, retry, and replay workflows |
 | with-skill vs without-skill | `--target baseline --target candidate` | Structured comparison |
 | Graduate to richer evals | `agentv convert evals.json` → EVAL.yaml | Adds workspace, code graders, etc. |
diff --git a/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx b/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx
index 54c03807e..3b4639677 100644
--- a/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx
+++ b/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx
@@ -107,19 +107,23 @@ The rest of the bundle follows the same pattern:
 
 ## Benchmark output
 
-Generate an Agent Skills compatible `benchmark.json` alongside the standard result JSONL. The `benchmark.json` is automatically written to the artifact directory:
+AgentV writes a run-level `benchmark.json` alongside the standard result JSONL. The file is written automatically to the artifact directory:
 
 ```bash
 agentv eval evals.json --target claude --output ./results
 # benchmark.json is written to ./results/benchmark.json
 ```
 
-The benchmark uses AgentV's pass threshold (score >= 0.8) to map continuous scores to the binary pass/fail that Agent Skills `pass_rate` expects:
+The benchmark applies AgentV's pass threshold (score >= 0.8) to compute each target's `pass_rate`, and also reports timing and token summaries for each target:
 
 ```json
 {
+  "metadata": {
+    "targets": ["claude"],
+    "tests_run": ["example-test"]
+  },
   "run_summary": {
-    "with_skill": {
+    "claude": {
       "pass_rate": {"mean": 0.83, "stddev": 0.06},
       "time_seconds": {"mean": 45.0, "stddev": 12.0},
       "tokens": {"mean": 3800, "stddev": 400}
@@ -128,6 +132,8 @@ The benchmark uses AgentV's pass threshold (score >= 0.8) to map continuous scor
 }
 ```
 
+If another tool needs a different benchmark shape, keep `--output` as the source of truth and convert `<output>/benchmark.json` in a wrapper.
+
 ## Converting to EVAL.yaml
 
 When you're ready to graduate, convert your evals.json to EVAL.yaml:
diff --git a/docs/plans/2026-06-09-eval-output-surface.md b/docs/plans/2026-06-09-eval-output-surface.md
index 849cddbd1..dabf81c0e 100644
--- a/docs/plans/2026-06-09-eval-output-surface.md
+++ b/docs/plans/2026-06-09-eval-output-surface.md
@@ -14,7 +14,7 @@ The eval run command currently exposes several overlapping ways to choose where
 - `--out <path>` is deprecated and currently treated as a file path whose dirname becomes the artifact directory.
 - `--artifacts <dir>` is deprecated and currently aliases the artifact directory.
 - `--output-format` is deprecated and ignored because run directories always use `index.jsonl`.
-- `--benchmark-json` is deprecated, still writes an extra Agent Skills compatibility file, and is outside this cleanup's requested removal set.
+- `--benchmark-json` was deprecated and wrote an extra Agent Skills compatibility file; it was outside this cleanup's requested removal set. A follow-up cleanup removes the flag and keeps the run directory's `benchmark.json` as the canonical output.
 - Dashboard launch paths already pass `--output <dir>` and expect `<dir>/index.jsonl`.
 - Repository docs/examples still contain old `agentv eval --out <file>` guidance in compare workflows, grader-score helper comments, and local scripts.
 
@@ -51,7 +51,7 @@ Removed now:
 
 Warned/scheduled:
 
-- `--benchmark-json` remains deprecated for now because the Bead did not list it as a known surface and it writes a specialized compatibility artifact. Follow-up cleanup should remove it after a separate audit.
+- `--benchmark-json` is removed by the follow-up cleanup after auditing for consumers; use `--output <dir>` and read `<dir>/benchmark.json` instead of requesting a second benchmark file.
 
 ## Migration