-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathreporting.py
More file actions
131 lines (113 loc) · 4.65 KB
/
reporting.py
File metadata and controls
131 lines (113 loc) · 4.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""jsonl → per-adapter rollup → markdown summary + CSV.
The rollup is a dict keyed by adapter name; each value is an AdapterSummary.
The primary-axis winner is determined mechanically: whichever adapter has the
best value on the declared `primary_axis` key of AdapterSummary.
"""
from __future__ import annotations
import csv
import json
import statistics
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Iterator
from benchmarks.suite.scoring import latency_stats
@dataclass
class AdapterSummary:
    """Aggregated benchmark metrics for a single adapter.

    One instance is built per adapter by `rollup_from_jsonl`. The declaration
    order of the fields is significant: `write_csv` uses it as the CSV column
    order, so new fields should be appended rather than inserted.
    """

    # Identity and sample size.
    adapter: str
    n_queries: int  # number of result rows seen for this adapter

    # Retrieval quality, as percentages of n_queries (rounded to 2 decimals
    # by rollup_from_jsonl).
    coverage_pct: float  # rows with paths_count > 0
    acc_at_1_pct: float  # rows with rank == 1
    acc_at_5_pct: float  # rows with a rank <= 5
    acc_at_10_pct: float  # rows with a rank <= 10
    mrr: float  # mean of 1/rank, a missing rank contributing 0

    # Cost metrics.
    avg_latency_ms: float
    median_latency_ms: float
    p95_latency_ms: float
    avg_tokens: float
def _iter_rows(jsonl_path: Path) -> Iterator[dict]:
    """Lazily yield one decoded JSON value per non-blank line of a JSONL file.

    Args:
        jsonl_path: Path of the JSON Lines file to read.

    Yields:
        The result of `json.loads` for each line that is not empty after
        stripping surrounding whitespace. Callers in this module expect each
        value to be a dict.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8: JSON text is UTF-8, and relying on the
    # locale's default encoding mis-reads non-ASCII rows on some platforms.
    with jsonl_path.open(encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank separator lines and a trailing newline.
                continue
            yield json.loads(line)
def rollup_from_jsonl(jsonl_path: Path) -> dict[str, AdapterSummary]:
    """Aggregate per-query result rows into one `AdapterSummary` per adapter.

    Every row read from `jsonl_path` must carry the keys ``adapter``,
    ``latency_ms``, ``paths_count``, ``rank`` and ``tokens``. A ``rank`` of
    ``None`` counts as a miss for every accuracy metric and contributes 0 to
    the mean reciprocal rank.

    Args:
        jsonl_path: JSON Lines file holding one result row per line.

    Returns:
        A dict keyed by adapter name, in order of first appearance in the
        file.
    """
    # Bucket rows by adapter; dict insertion order keeps first-seen order.
    grouped: dict[str, list[dict]] = {}
    for record in _iter_rows(jsonl_path):
        grouped.setdefault(record["adapter"], []).append(record)

    summaries: dict[str, AdapterSummary] = {}
    for adapter_name, records in grouped.items():
        total = len(records)

        def as_pct(count: int) -> float:
            # Share of this adapter's rows, as a percentage with 2 decimals.
            return round(count / total * 100, 2) if total else 0.0

        latency = latency_stats([rec["latency_ms"] for rec in records])
        with_paths = sum(1 for rec in records if rec["paths_count"] > 0)

        ranks = [rec["rank"] for rec in records]
        found = [rk for rk in ranks if rk is not None]
        top1 = ranks.count(1)
        top5 = sum(1 for rk in found if rk <= 5)
        top10 = sum(1 for rk in found if rk <= 10)
        # Misses add nothing to the sum but still count in the denominator.
        reciprocal_total = sum(1.0 / rk for rk in found)

        token_counts = [rec["tokens"] for rec in records]
        if token_counts:
            mean_tokens = round(statistics.mean(token_counts), 0)
        else:
            mean_tokens = 0.0

        summaries[adapter_name] = AdapterSummary(
            adapter=adapter_name,
            n_queries=total,
            coverage_pct=as_pct(with_paths),
            acc_at_1_pct=as_pct(top1),
            acc_at_5_pct=as_pct(top5),
            acc_at_10_pct=as_pct(top10),
            mrr=round(reciprocal_total / total, 3) if total else 0.0,
            avg_latency_ms=round(latency["mean"], 2),
            median_latency_ms=round(latency["median"], 2),
            p95_latency_ms=round(latency["p95"], 2),
            avg_tokens=mean_tokens,
        )
    return summaries
def _primary_axis_value(s: AdapterSummary, axis: str) -> float:
    """Return the value of attribute `axis` on summary `s`, as a float.

    Raises:
        AttributeError: If `s` has no attribute named `axis`.
    """
    # Dotted access supported later (e.g., callers_of.recall); for Bench #0
    # the axis is always a direct attribute.
    return float(getattr(s, axis))


def format_markdown(
    rollup: dict[str, AdapterSummary],
    bench_id: str,
    primary_axis: str,
    dataset_version: str,
    n_queries: int,
    *,
    lower_is_better: bool | None = None,
) -> str:
    """Render the rollup as a markdown report with a primary-axis verdict.

    The report has a title, a short metadata block, one table row per
    adapter (in the rollup's insertion order) and a "Primary axis result"
    section naming the winning adapter.

    Args:
        rollup: Adapter name -> summary, as produced by `rollup_from_jsonl`.
        bench_id: Used as the report title.
        primary_axis: Name of the `AdapterSummary` attribute that decides the
            winner. Axes ending in ``_pct`` are printed as percentages with
            one decimal; all others with three decimals.
        dataset_version: Printed in the metadata block.
        n_queries: Printed in the metadata block.
        lower_is_better: Whether the smallest value wins. When ``None`` (the
            default) this is inferred: axes whose name contains ``latency``
            are minimised, every other axis is maximised.

    Returns:
        The markdown document, terminated by a newline. An empty rollup
        yields a report with no table rows and a note instead of a winner.

    Raises:
        AttributeError: If `primary_axis` is not an attribute of the
            summaries.
    """
    lines = [
        f"# {bench_id}",
        "",
        f"**Primary axis:** `{primary_axis}`",
        f"**Queries:** {n_queries}",
        f"**Dataset version:** {dataset_version}",
        "",
        "| Adapter | Coverage | Acc@1 | Acc@5 | Acc@10 | MRR | Avg latency (ms) | Avg tokens |",
        "|---|---:|---:|---:|---:|---:|---:|---:|",
    ]
    # Stable order: declared order of insertion into rollup preserved by dict.
    for s in rollup.values():
        lines.append(
            f"| {s.adapter} | {s.coverage_pct:.1f}% | {s.acc_at_1_pct:.1f}% | "
            f"{s.acc_at_5_pct:.1f}% | {s.acc_at_10_pct:.1f}% | {s.mrr:.3f} | "
            f"{s.avg_latency_ms:.2f} | {int(round(s.avg_tokens))} |"
        )

    lines.extend(["", "## Primary axis result", ""])

    if not rollup:
        # Nothing to rank; max()/min() on an empty sequence would raise.
        lines.append("_No adapter results to compare._")
        return "\n".join(lines) + "\n"

    if lower_is_better is None:
        # Latency is a cost: the fastest adapter wins, not the slowest.
        lower_is_better = "latency" in primary_axis
    pick_best = min if lower_is_better else max

    def axis_value(summary: AdapterSummary) -> float:
        return _primary_axis_value(summary, primary_axis)

    if primary_axis.endswith("_pct"):
        def show(value: float) -> str:
            return f"{value:.1f}%"
    else:
        def show(value: float) -> str:
            return f"{value:.3f}"

    # Primary-axis winner and the best of the remaining adapters.
    winner = pick_best(rollup.values(), key=axis_value)
    runners = [s for s in rollup.values() if s.adapter != winner.adapter]
    verdict = f"✅ **{winner.adapter} wins** `{primary_axis}` "
    if runners:
        best_other = pick_best(runners, key=axis_value)
        verdict += f"({show(axis_value(winner))} vs {show(axis_value(best_other))})."
    else:
        # Do not invent a 0.0 opponent when there is nobody to compare with.
        verdict += f"({show(axis_value(winner))}; no other adapters to compare)."
    lines.append(verdict)
    return "\n".join(lines) + "\n"
def write_csv(rollup: dict[str, AdapterSummary], dst: Path) -> None:
    """Write the rollup to `dst` as CSV, one row per adapter.

    The header row is the `AdapterSummary` field names in declaration order;
    data rows follow the rollup's insertion order. Missing parent directories
    of `dst` are created and an existing file is overwritten.

    Args:
        rollup: Adapter name -> summary.
        dst: Destination file path.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    dst.parent.mkdir(parents=True, exist_ok=True)
    # newline="" lets the csv module control line endings. The explicit
    # encoding keeps the output identical across platforms instead of
    # depending on the locale default, which may reject non-ASCII names.
    with dst.open("w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(list(AdapterSummary.__dataclass_fields__.keys()))
        w.writerows(list(asdict(s).values()) for s in rollup.values())