Skip to content

Commit d3965a8

Browse files
Fix report crash on headerless raw CSV
A leftover 0-byte session_<id>_raw.csv (e.g. from an interrupted or restarted run) was appended to without ever receiving a header, because _initialize_csv only wrote one when the file was missing. load_session then read the file with csv.DictReader, which treated the first data row as the header and raised KeyError: 'timestamp', aborting report generation even though all the data was valid.

- _initialize_csv: write the header when the file is missing OR empty.
- load_session: tolerate a headerless CSV by mapping rows positionally against CSV_HEADER (a header row is detected via its first cell), padding short rows.
- Add regression tests for both paths.
1 parent 6aedeed commit d3965a8

2 files changed

Lines changed: 78 additions & 31 deletions

File tree

src/benchmarking/metrics/collector.py

Lines changed: 49 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,11 @@ def __init__(self, session_id: str, results_dir: str = "./results"):
3737
self._initialize_csv()
3838

3939
def _initialize_csv(self) -> None:
40-
if not self.csv_path.exists():
40+
# Write the header when the file is missing OR exists but is empty.
41+
# A leftover 0-byte file (e.g. from an interrupted/restarted run)
42+
# would otherwise be appended to without ever getting a header,
43+
# producing a headerless CSV that breaks report generation.
44+
if not self.csv_path.exists() or self.csv_path.stat().st_size == 0:
4145
with open(self.csv_path, "w", newline="") as f:
4246
csv.writer(f).writerow(CSV_HEADER)
4347

@@ -88,37 +92,51 @@ def _f(v: str | None) -> float | None:
8892
return None
8993
return float(v)
9094

91-
with open(csv_path, "r") as f:
92-
for row in csv.DictReader(f):
93-
timestamp = datetime.fromisoformat(row["timestamp"])
94-
if start_time is None or timestamp < start_time:
95-
start_time = timestamp
96-
if end_time is None or timestamp > end_time:
97-
end_time = timestamp
98-
99-
services.add(row["service_name"])
100-
models.add(row.get("model_name", ""))
101-
workloads.add(row["workload_type"])
102-
iterations_per_workload = max(iterations_per_workload, int(row["iteration"]) + 1)
103-
104-
metrics.append(
105-
BenchmarkMetric(
106-
session_id=row["session_id"],
107-
service_name=row["service_name"],
108-
model_name=row.get("model_name", ""),
109-
workload_type=row["workload_type"],
110-
iteration=int(row["iteration"]),
111-
ttft_ms=_f(row.get("ttft_ms")),
112-
end_to_end_latency_ms=float(row["end_to_end_latency_ms"]),
113-
tokens_generated=int(row["tokens_generated"]),
114-
prompt_tokens=int(row["prompt_tokens"]),
115-
tokens_per_sec=_f(row.get("tokens_per_sec")),
116-
inter_token_latency_ms=_f(row.get("inter_token_latency_ms")),
117-
attempts=int(row.get("attempts") or 1),
118-
timestamp=timestamp,
119-
error=row["error"] if row["error"] else None,
120-
)
95+
with open(csv_path, "r", newline="") as f:
96+
data_rows = [r for r in csv.reader(f) if r]
97+
98+
# Tolerate raw CSVs written without a header row. A genuine header
99+
# begins with the literal "session_id" column name; anything else is
100+
# treated as data and mapped positionally against CSV_HEADER. This
101+
# lets us still produce reports from files an interrupted/restarted
102+
# run left headerless.
103+
if data_rows and data_rows[0][0] == CSV_HEADER[0]:
104+
data_rows = data_rows[1:]
105+
106+
for raw in data_rows:
107+
# Pad short rows so every column is present (mirrors DictReader's
108+
# restval behaviour) before zipping into a name->value mapping.
109+
padded = list(raw) + [""] * (len(CSV_HEADER) - len(raw))
110+
row = dict(zip(CSV_HEADER, padded))
111+
timestamp = datetime.fromisoformat(row["timestamp"])
112+
if start_time is None or timestamp < start_time:
113+
start_time = timestamp
114+
if end_time is None or timestamp > end_time:
115+
end_time = timestamp
116+
117+
services.add(row["service_name"])
118+
models.add(row.get("model_name", ""))
119+
workloads.add(row["workload_type"])
120+
iterations_per_workload = max(iterations_per_workload, int(row["iteration"]) + 1)
121+
122+
metrics.append(
123+
BenchmarkMetric(
124+
session_id=row["session_id"],
125+
service_name=row["service_name"],
126+
model_name=row.get("model_name", ""),
127+
workload_type=row["workload_type"],
128+
iteration=int(row["iteration"]),
129+
ttft_ms=_f(row.get("ttft_ms")),
130+
end_to_end_latency_ms=float(row["end_to_end_latency_ms"]),
131+
tokens_generated=int(row["tokens_generated"]),
132+
prompt_tokens=int(row["prompt_tokens"]),
133+
tokens_per_sec=_f(row.get("tokens_per_sec")),
134+
inter_token_latency_ms=_f(row.get("inter_token_latency_ms")),
135+
attempts=int(row.get("attempts") or 1),
136+
timestamp=timestamp,
137+
error=row["error"] if row["error"] else None,
121138
)
139+
)
122140

123141
return BenchmarkSession(
124142
session_id=session_id,

tests/test_metrics/test_collector.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,32 @@ def test_metrics_collector_round_trips_result(tmp_path: Path) -> None:
3232
assert session.results[0].ttft_ms == 250.0
3333
assert session.results[0].end_to_end_latency_ms == 1500.0
3434
assert session.results[0].attempts == 2
35+
36+
37+
def test_load_session_tolerates_headerless_csv(tmp_path: Path) -> None:
38+
"""A raw CSV left headerless by an interrupted/restarted run must still
39+
load (it previously crashed report generation with KeyError: 'timestamp')."""
40+
csv_path = tmp_path / "session_legacy_raw.csv"
41+
csv_path.write_text(
42+
"legacy,Provider A,model-a,ctx_256,0,250.0,1500.0,100,25,80.0,12.5,1,"
43+
"2026-06-09T18:15:05.451820+00:00,\n"
44+
)
45+
46+
collector = MetricsCollector("legacy", str(tmp_path))
47+
session = collector.load_session("legacy")
48+
49+
assert len(session.results) == 1
50+
assert session.results[0].service_name == "Provider A"
51+
assert session.results[0].tokens_generated == 100
52+
assert session.results[0].error is None
53+
54+
55+
def test_initialize_csv_repairs_empty_file(tmp_path: Path) -> None:
56+
"""A pre-existing 0-byte session file must get a header written, otherwise
57+
every subsequent append produces a headerless CSV."""
58+
csv_path = tmp_path / "session_empty_raw.csv"
59+
csv_path.write_text("") # leftover 0-byte file
60+
61+
MetricsCollector("empty", str(tmp_path))
62+
63+
assert csv_path.read_text().startswith("session_id,")

0 commit comments

Comments
 (0)