From fd15a5f8c259250bffc095244c30b0b661517c7b Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 10 Jun 2026 11:59:32 +0200 Subject: [PATCH] fix(cli): remove eval benchmark json flag --- .../cli/src/commands/eval/benchmark-writer.ts | 85 ------------------- apps/cli/src/commands/eval/commands/run.ts | 7 -- apps/cli/src/commands/eval/run-eval.ts | 18 ---- .../commands/eval/benchmark-writer.test.ts | 74 ---------------- apps/cli/test/eval.integration.test.ts | 31 +++++++ .../guides/skill-improvement-workflow.mdx | 2 +- .../docs/integrations/agent-skills-evals.mdx | 12 ++- docs/plans/2026-06-09-eval-output-surface.md | 4 +- 8 files changed, 43 insertions(+), 190 deletions(-) delete mode 100644 apps/cli/src/commands/eval/benchmark-writer.ts delete mode 100644 apps/cli/test/commands/eval/benchmark-writer.test.ts diff --git a/apps/cli/src/commands/eval/benchmark-writer.ts b/apps/cli/src/commands/eval/benchmark-writer.ts deleted file mode 100644 index 671de1160..000000000 --- a/apps/cli/src/commands/eval/benchmark-writer.ts +++ /dev/null @@ -1,85 +0,0 @@ -import { writeFile } from 'node:fs/promises'; - -import { DEFAULT_THRESHOLD, type EvaluationResult } from '@agentv/core'; - -interface BenchmarkStats { - readonly mean: number; - readonly stddev: number; -} - -interface BenchmarkRunSummary { - readonly pass_rate: BenchmarkStats; - readonly time_seconds: BenchmarkStats; - readonly tokens: BenchmarkStats; -} - -interface BenchmarkJson { - readonly run_summary: { - readonly with_skill: BenchmarkRunSummary; - }; -} - -function computeStats(values: readonly number[]): BenchmarkStats { - if (values.length === 0) { - return { mean: 0, stddev: 0 }; - } - const mean = values.reduce((sum, v) => sum + v, 0) / values.length; - const variance = values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / values.length; - return { - mean: Math.round(mean * 1000) / 1000, - stddev: Math.round(Math.sqrt(variance) * 1000) / 1000, - }; -} - -/** - * Compute per-test pass_rate from grader 
scores. - * - * For each test, pass_rate = count(evaluator.score >= 0.8) / total_evaluators. - * If no per-grader scores exist, falls back to the top-level result score - * with the same threshold (>= 0.8 → 1.0, else 0.0). - */ -function computePassRate(result: EvaluationResult): number { - const scores = result.scores; - if (scores && scores.length > 0) { - const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length; - return passed / scores.length; - } - return result.score >= DEFAULT_THRESHOLD ? 1.0 : 0.0; -} - -/** - * Build an Agent Skills benchmark.json from AgentV evaluation results. - */ -export function buildBenchmarkJson(results: readonly EvaluationResult[]): BenchmarkJson { - const passRates = results.map(computePassRate); - const timings = results - .filter((r) => r.durationMs != null) - .map((r) => (r.durationMs as number) / 1000); - const tokens = results - .filter((r) => r.tokenUsage != null) - .map((r) => { - const usage = r.tokenUsage as { input?: number; output?: number }; - return (usage.input ?? 0) + (usage.output ?? 0); - }); - - return { - run_summary: { - with_skill: { - pass_rate: computeStats(passRates), - time_seconds: computeStats(timings), - tokens: computeStats(tokens), - }, - }, - }; -} - -/** - * Write benchmark.json to disk. 
- */ -export async function writeBenchmarkJson( - outputPath: string, - results: readonly EvaluationResult[], -): Promise<void> { - const benchmark = buildBenchmarkJson(results); - await writeFile(outputPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8'); -} diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index c0e7f045f..b6bc035c2 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -179,12 +179,6 @@ export const evalRunCommand = command({ long: 'strict', description: 'Exit with error on version mismatch (instead of warning)', }), - benchmarkJson: option({ - type: optional(string), - long: 'benchmark-json', - description: - '[Deprecated: benchmark.json is included in artifact dir] Write Agent Skills benchmark.json to the specified path', - }), artifacts: option({ type: optional(string), long: 'artifacts', @@ -282,7 +276,6 @@ export const evalRunCommand = command({ resume: args.resume, rerunFailed: args.rerunFailed, strict: args.strict, - benchmarkJson: args.benchmarkJson, artifacts: args.artifacts, graderTarget: args.graderTarget, model: args.model, diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 3badda9d0..0fe445eac 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -40,7 +40,6 @@ import { writeArtifactsFromResults, writeInitialBenchmarkArtifact, } from './artifact-writer.js'; -import { writeBenchmarkJson } from './benchmark-writer.js'; import { loadEnvFromHierarchy } from './env.js'; import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js'; import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js'; @@ -113,8 +112,6 @@ interface NormalizedOptions { readonly workspaceMode?: 'pooled' | 'temp' | 'static'; readonly workspacePath?: string; readonly keepWorkspaces: boolean; - /** Deprecated: 
benchmark.json is always written to artifact dir */ - readonly benchmarkJson?: string; /** Removed: use --output instead */ readonly artifacts?: string; /** Removed: the run directory always uses index.jsonl */ @@ -461,7 +458,6 @@ function normalizeOptions( normalizeBoolean(rawOptions.keepWorkspaces) || yamlExecution?.keep_workspaces === true || config?.execution?.keepWorkspaces === true, - benchmarkJson: normalizeString(rawOptions.benchmarkJson), artifacts: normalizeString(rawOptions.artifacts), outputFormat: normalizeString(rawOptions.outputFormat), graderTarget: normalizeString(rawOptions.graderTarget), @@ -1250,13 +1246,6 @@ export async function runEvalCommand( console.log(`Repository root: ${repoRoot}`); } - // Emit deprecation warnings for remaining legacy flags. - if (options.benchmarkJson) { - console.warn( - 'Warning: --benchmark-json is deprecated. benchmark.json is always written to the artifact directory.', - ); - } - // Resolve artifact directory (runDir) and primary output path. // Precedence: --output > config output.dir > default const explicitDir = options.outputDir; @@ -1776,13 +1765,6 @@ export async function runEvalCommand( console.log(formatMatrixSummary(summaryResults)); } - // Write Agent Skills benchmark.json if requested (deprecated flag — backward compat) - if (options.benchmarkJson && allResults.length > 0) { - const benchmarkPath = path.resolve(options.benchmarkJson); - await writeBenchmarkJson(benchmarkPath, allResults); - console.log(`Benchmark written to: ${benchmarkPath}`); - } - // Write artifacts to the run directory (always, not conditional on flags) if (allResults.length > 0) { const evalFile = activeTestFiles.length === 1 ? 
activeTestFiles[0] : ''; diff --git a/apps/cli/test/commands/eval/benchmark-writer.test.ts b/apps/cli/test/commands/eval/benchmark-writer.test.ts deleted file mode 100644 index 56349ecd0..000000000 --- a/apps/cli/test/commands/eval/benchmark-writer.test.ts +++ /dev/null @@ -1,74 +0,0 @@ -import { describe, expect, it } from 'bun:test'; - -import type { EvaluationResult } from '@agentv/core'; -import { buildBenchmarkJson } from '../../../src/commands/eval/benchmark-writer.js'; - -function makeResult(overrides: Partial<EvaluationResult> = {}): EvaluationResult { - return { - timestamp: '2026-03-13T00:00:00.000Z', - testId: 'test-1', - score: 0.9, - assertions: [], - output: [{ role: 'assistant' as const, content: 'test answer' }], - target: 'test-target', - verdict: 'pass', - executionStatus: 'ok', - ...overrides, - } as EvaluationResult; -} - -describe('buildBenchmarkJson', () => { - it('computes pass_rate from per-grader scores', () => { - const results = [ - makeResult({ - scores: [ - { name: 'a1', type: 'llm-grader', score: 0.9, assertions: [] }, - { name: 'a2', type: 'llm-grader', score: 0.7, assertions: [] }, - { name: 'a3', type: 'llm-grader', score: 0.85, assertions: [] }, - ], - }), - ]; - const benchmark = buildBenchmarkJson(results); - // 2 of 3 pass (>= 0.8), so pass_rate = 2/3 ≈ 0.667 - expect(benchmark.run_summary.with_skill.pass_rate.mean).toBeCloseTo(0.667, 2); - expect(benchmark.run_summary.with_skill.pass_rate.stddev).toBe(0); - }); - - it('falls back to top-level score when no grader scores', () => { - const results = [makeResult({ score: 0.9 }), makeResult({ score: 0.5 })]; - const benchmark = buildBenchmarkJson(results); - // First passes (>= 0.8 → 1.0), second fails (< 0.8 → 0.0), mean = 0.5 - expect(benchmark.run_summary.with_skill.pass_rate.mean).toBe(0.5); - expect(benchmark.run_summary.with_skill.pass_rate.stddev).toBe(0.5); - }); - - it('computes time_seconds from durationMs', () => { - const results = [makeResult({ durationMs: 30000 }), makeResult({ 
durationMs: 60000 })]; - const benchmark = buildBenchmarkJson(results); - expect(benchmark.run_summary.with_skill.time_seconds.mean).toBe(45); - expect(benchmark.run_summary.with_skill.time_seconds.stddev).toBe(15); - }); - - it('computes tokens from tokenUsage', () => { - const results = [ - makeResult({ tokenUsage: { input: 1000, output: 500 } } as Partial<EvaluationResult>), - makeResult({ tokenUsage: { input: 2000, output: 1000 } } as Partial<EvaluationResult>), - ]; - const benchmark = buildBenchmarkJson(results); - expect(benchmark.run_summary.with_skill.tokens.mean).toBe(2250); - expect(benchmark.run_summary.with_skill.tokens.stddev).toBe(750); - }); - - it('handles empty results', () => { - const benchmark = buildBenchmarkJson([]); - expect(benchmark.run_summary.with_skill.pass_rate.mean).toBe(0); - expect(benchmark.run_summary.with_skill.pass_rate.stddev).toBe(0); - }); - - it('handles results without timing or token data', () => { - const results = [makeResult({})]; - const benchmark = buildBenchmarkJson(results); - expect(benchmark.run_summary.with_skill.time_seconds.mean).toBe(0); - expect(benchmark.run_summary.with_skill.tokens.mean).toBe(0); - }); -}); diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts index e1117b17e..88a3bf5fd 100644 --- a/apps/cli/test/eval.integration.test.ts +++ b/apps/cli/test/eval.integration.test.ts @@ -557,4 +557,35 @@ describe('agentv eval CLI', () => { .toLowerCase(); expect(transcriptHelp).not.toContain('cache'); }, 30_000); + + it('omits removed benchmark JSON export flag from help', async () => { + const result = await execa('bun', ['--no-env-file', CLI_ENTRY, 'eval', 'run', '--help'], { + cwd: projectRoot, + env: { ...process.env, CI: 'true' }, + reject: false, + }); + const helpText = `${result.stdout}\n${result.stderr}`; + expect(helpText).not.toContain('--benchmark-json'); + expect(helpText).toContain('--output'); + expect(helpText).toContain('benchmark.json'); + }, 30_000); + + it('rejects the removed 
benchmark JSON export flag as an unknown argument', async () => { + const fixture = await createFixture(); + try { + const result = await runCli(fixture, [ + 'eval', + fixture.testFilePath, + '--benchmark-json', + path.join(fixture.baseDir, 'benchmark.json'), + ]); + + expect(result.exitCode).not.toBe(0); + const output = `${result.stdout}\n${result.stderr}`; + expect(output).toContain('Unknown arguments'); + expect(output).toContain('--benchmark-json'); + } finally { + await rm(fixture.baseDir, { recursive: true, force: true }); + } + }, 30_000); }); diff --git a/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx b/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx index ec5ac0d2e..c926ed6f5 100644 --- a/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx +++ b/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx @@ -255,7 +255,7 @@ If you've been using the Agent Skills skill-creator workflow, AgentV reads your | `evals.json` | `agentv eval evals.json` | Direct — no conversion needed | | `claude -p "prompt"` | `agentv eval evals.json --target claude` | Same eval, richer engine | | `grading.json` (read) | `/grading.json` (write) | Same per-test schema, AgentV writes one grading file per test case | -| `benchmark.json` (read) | `benchmark.json` (write) | Same schema, AgentV produces it | +| `benchmark.json` (read) | `/benchmark.json` (write) | AgentV writes the canonical run summary; convert it in a wrapper if another tool needs a narrower compatibility shape | | n/a | `index.jsonl` (write) | AgentV-specific per-test manifest for filtering, retry, and replay workflows | | with-skill vs without-skill | `--target baseline --target candidate` | Structured comparison | | Graduate to richer evals | `agentv convert evals.json` → EVAL.yaml | Adds workspace, code graders, etc. 
| diff --git a/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx b/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx index 54c03807e..3b4639677 100644 --- a/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx +++ b/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx @@ -107,19 +107,23 @@ The rest of the bundle follows the same pattern: ## Benchmark output -Generate an Agent Skills compatible `benchmark.json` alongside the standard result JSONL. The `benchmark.json` is automatically written to the artifact directory: +Generate the run `benchmark.json` alongside the standard result JSONL. The `benchmark.json` is automatically written to the artifact directory: ```bash agentv eval evals.json --target claude --output ./results # benchmark.json is written to ./results/benchmark.json ``` -The benchmark uses AgentV's pass threshold (score >= 0.8) to map continuous scores to the binary pass/fail that Agent Skills `pass_rate` expects: +The benchmark uses AgentV's pass threshold (score >= 0.8) for each target's `pass_rate`, plus timing and token summaries: ```json { + "metadata": { + "targets": ["claude"], + "tests_run": ["example-test"] + }, "run_summary": { - "with_skill": { + "claude": { "pass_rate": {"mean": 0.83, "stddev": 0.06}, "time_seconds": {"mean": 45.0, "stddev": 12.0}, "tokens": {"mean": 3800, "stddev": 400} @@ -128,6 +132,8 @@ The benchmark uses AgentV's pass threshold (score >= 0.8) to map continuous scor } ``` +If another tool needs a different benchmark shape, keep `--output` as the source of truth and convert `/benchmark.json` in a wrapper. 
+ ## Converting to EVAL.yaml When you're ready to graduate, convert your evals.json to EVAL.yaml: diff --git a/docs/plans/2026-06-09-eval-output-surface.md b/docs/plans/2026-06-09-eval-output-surface.md index 849cddbd1..dabf81c0e 100644 --- a/docs/plans/2026-06-09-eval-output-surface.md +++ b/docs/plans/2026-06-09-eval-output-surface.md @@ -14,7 +14,7 @@ The eval run command currently exposes several overlapping ways to choose where - `--out ` is deprecated and currently treated as a file path whose dirname becomes the artifact directory. - `--artifacts ` is deprecated and currently aliases the artifact directory. - `--output-format` is deprecated and ignored because run directories always use `index.jsonl`. -- `--benchmark-json` is deprecated, still writes an extra Agent Skills compatibility file, and is outside this cleanup's requested removal set. +- `--benchmark-json` was a deprecated extra Agent Skills compatibility output path outside this cleanup's requested removal set; a follow-up cleanup removes that flag and keeps the run directory `benchmark.json` as canonical. - Dashboard launch paths already pass `--output ` and expect `/index.jsonl`. - Repository docs/examples still contain old `agentv eval --out ` guidance in compare workflows, grader-score helper comments, and local scripts. @@ -51,7 +51,7 @@ Removed now: Warned/scheduled: -- `--benchmark-json` remains deprecated for now because the Bead did not list it as a known surface and it writes a specialized compatibility artifact. Follow-up cleanup should remove it after a separate audit. +- `--benchmark-json` is removed by the follow-up cleanup after auditing for consumers; use `--output ` and read `/benchmark.json` instead of requesting a second benchmark file. ## Migration