#!/usr/bin/env python3
# /// script
# requires-python = ">=3.9"
# ///
"""Generate an aggregate HTML report for a run folder.

Reads run.json, execution-summary.json, each <eval_id>/grading.json (if
present), and triggers-result.json (if present), then renders a single-file
HTML report.

Usage:
    python3 generate_report.py --run-dir PATH [-o report.html]
"""

from __future__ import annotations

import argparse
import html as html_lib
import json
import sys
from pathlib import Path

_PASS_MARK = "✔"
_FAIL_MARK = "✘"

# Kept outside the page f-string so the CSS braces need no doubling.
_PAGE_STYLE = """
body { font-family: system-ui, sans-serif; margin: 2rem; }
table { border-collapse: collapse; margin: 0.5rem 0; }
th, td { border: 1px solid #ccc; padding: 0.25rem 0.5rem; text-align: left; vertical-align: top; }
tr.pass td:first-child { color: #1a7f37; }
tr.fail td:first-child { color: #cf222e; }
section.eval { border-top: 1px solid #ddd; margin-top: 1.5rem; }
"""


def esc(s: object) -> str:
    """Return ``str(s)`` escaped for use in HTML text and quoted attributes."""
    return html_lib.escape(str(s), quote=True)


def load(path: Path) -> dict | list | None:
    """Parse the JSON document stored at *path*.

    Returns None when *path* is not a regular file, cannot be read, is not
    valid UTF-8, or does not contain valid JSON, so callers can treat every
    "no usable data" case the same way.
    """
    if not path.is_file():
        return None
    try:
        return json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return None


def _as_dict(value: object) -> dict:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _as_list(value: object) -> list:
    """Return *value* if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def _status_cells(passed: bool) -> tuple[str, str]:
    """Return the (CSS class, mark) pair used for a pass/fail table row."""
    return ("pass", _PASS_MARK) if passed else ("fail", _FAIL_MARK)


def _format_rate(value: object) -> str:
    """Format a trigger rate with two decimals, or "?" when it is not numeric."""
    try:
        return f"{float(value):.2f}"
    except (TypeError, ValueError):
        return "?"


def _expectation_rows(grading: dict) -> tuple[list[str], int]:
    """Build one table row per graded expectation.

    Returns the rows and the number of expectations that passed; the total is
    the length of the returned list. Entries that are not objects are skipped.
    """
    rows: list[str] = []
    passed_count = 0
    for exp in _as_list(grading.get("expectations")):
        if not isinstance(exp, dict):
            continue
        passed = bool(exp.get("passed"))
        if passed:
            passed_count += 1
        css_class, mark = _status_cells(passed)
        rows.append(
            f'<tr class="{css_class}">'
            f"<td>{mark}</td>"
            f'<td>{esc(exp.get("text", ""))}</td>'
            f'<td>{esc(exp.get("evidence", ""))}</td>'
            "</tr>"
        )
    return rows, passed_count


def _render_feedback(grading: dict) -> str:
    """Render the grader's feedback on the evals, or "" when there is none."""
    feedback = _as_dict(grading.get("eval_feedback"))
    if not feedback:
        return ""
    items: list[str] = []
    for suggestion in _as_list(feedback.get("suggestions")):
        if isinstance(suggestion, dict):
            items.append(
                f"<li><b>{esc(suggestion.get('assertion', '(general)'))}</b>: "
                f"{esc(suggestion.get('reason', ''))}</li>"
            )
        else:
            # Tolerate plain-string suggestions instead of crashing on .get().
            items.append(f"<li>{esc(suggestion)}</li>")
    suggestions_html = f"<ul>{''.join(items)}</ul>" if items else ""
    return (
        "<details><summary>Grader feedback on the evals</summary>"
        f"<p>{esc(feedback.get('overall', ''))}</p>"
        f"{suggestions_html}"
        "</details>"
    )


def _render_artifacts(eval_dir: Path) -> str:
    """List the files under ``eval_dir/artifacts``, or "" when there are none."""
    artifacts_dir = eval_dir / "artifacts"
    if not artifacts_dir.is_dir():
        return ""
    files = sorted(p for p in artifacts_dir.rglob("*") if p.is_file())
    if not files:
        return ""
    items = "".join(
        f"<li><code>{esc(p.relative_to(artifacts_dir).as_posix())}</code></li>"
        for p in files
    )
    return f"<ul>{items}</ul>"


def _render_eval(run_dir: Path, res: dict) -> tuple[str, int, int]:
    """Render one eval result.

    Returns ``(html, expectations_total, expectations_passed)`` so the caller
    can accumulate grading totals across evals.
    """
    eval_id = str(res.get("eval_id", "?"))
    eval_dir = run_dir / eval_id
    grading = _as_dict(load(eval_dir / "grading.json"))
    # Metrics embedded in the summary win over the per-eval metrics.json.
    metrics = _as_dict(res.get("metrics") or load(eval_dir / "metrics.json"))

    rows, passed = _expectation_rows(grading)
    if rows:
        grading_html = (
            "<table><thead><tr><th></th><th>Expectation</th><th>Evidence</th></tr></thead>"
            f"<tbody>{''.join(rows)}</tbody></table>"
        )
    else:
        grading_html = "<p>No grading.json yet.</p>"

    tool_calls = _as_dict(metrics.get("tool_calls"))
    tool_summary = ", ".join(f"{k}={v}" for k, v in sorted(tool_calls.items())) or "—"
    artifacts_html = _render_artifacts(eval_dir) or "<p>No artifacts captured.</p>"

    block = (
        '<section class="eval">'
        f"<h2>Eval {esc(eval_id)} "
        f"<small>rc={esc(res.get('return_code'))} · {esc(metrics.get('elapsed_s', '?'))}s</small></h2>"
        f"<p>Tool calls: {esc(tool_summary)} · "
        f"output {esc(metrics.get('output_chars', 0))}b · "
        f"transcript {esc(metrics.get('transcript_chars', 0))}b</p>"
        f"{grading_html}"
        f"{_render_feedback(grading)}"
        f"<h3>Artifacts</h3>{artifacts_html}"
        "</section>"
    )
    return block, len(rows), passed


def _render_triggers(triggers: dict) -> str:
    """Render the trigger-eval table, or "" when no trigger data was loaded."""
    if not triggers:
        return ""
    rows: list[str] = []
    for result in _as_list(triggers.get("results")):
        if not isinstance(result, dict):
            continue
        css_class, mark = _status_cells(bool(result.get("pass")))
        rows.append(
            f'<tr class="{css_class}">'
            f"<td>{mark}</td>"
            f"<td>{esc(result.get('query', ''))}</td>"
            f"<td>{esc(result.get('should_trigger', '?'))}</td>"
            f"<td>{esc(result.get('triggers', '?'))}/{esc(result.get('runs', '?'))} "
            f"({_format_rate(result.get('trigger_rate'))})</td>"
            "</tr>"
        )
    summary = _as_dict(triggers.get("summary"))
    return (
        f"<h2>Trigger Evals — {esc(summary.get('passed', 0))}/{esc(summary.get('total', 0))} pass</h2>"
        "<table><thead><tr><th></th><th>Query</th><th>Should fire</th><th>Rate</th></tr></thead>"
        f"<tbody>{''.join(rows)}</tbody></table>"
    )


def _render_summary(exec_summary: dict, grading_passed: int, grading_total: int) -> str:
    """Render the one-line execution/grading summary, or "" without a summary."""
    if not exec_summary:
        return ""
    return (
        '<p class="summary">'
        f"Executed {esc(exec_summary.get('executed', 0))} / {esc(exec_summary.get('total', 0))} "
        f"evals · {esc(exec_summary.get('exec_failures', 0))} execution failures · "
        f"grader: {grading_passed}/{grading_total} expectations passed"
        "</p>"
    )


def render(run_dir: Path) -> str:
    """Render the complete HTML report for *run_dir* and return it as a string.

    Missing, unreadable, or unexpectedly shaped JSON inputs are treated as
    empty, so a partially finished run still produces a report.
    """
    run_meta = _as_dict(load(run_dir / "run.json"))
    exec_summary = _as_dict(load(run_dir / "execution-summary.json"))
    triggers = _as_dict(load(run_dir / "triggers-result.json"))

    eval_blocks: list[str] = []
    grading_total = 0
    grading_passed = 0
    for res in _as_list(exec_summary.get("results")):
        if not isinstance(res, dict):
            continue
        block, total, passed = _render_eval(run_dir, res)
        eval_blocks.append(block)
        grading_total += total
        grading_passed += passed

    skill = esc(run_meta.get("skill_name", "?"))
    summary_html = _render_summary(exec_summary, grading_passed, grading_total)
    triggers_html = _render_triggers(triggers)

    return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Eval Run — {skill}</title>
<style>{_PAGE_STYLE}</style>
</head>
<body>
<h1>{skill} — eval run</h1>
<p class="meta">Run id: {esc(run_meta.get('run_id', '?'))} · isolation: {esc(run_meta.get('isolation', '?'))} · started: {esc(run_meta.get('started_at', '?'))}</p>
{summary_html}
{''.join(eval_blocks)}
{triggers_html}
</body>
</html>
"""


def main() -> int:
    """Parse CLI arguments, write the report, and return a process exit code.

    Returns 2 when ``--run-dir`` is not an existing directory, otherwise 0
    after writing the report (default: ``<run-dir>/report.html``) and printing
    its path.
    """
    parser = argparse.ArgumentParser(description="Generate HTML report for an eval run folder")
    parser.add_argument("--run-dir", required=True, type=Path)
    parser.add_argument("-o", "--output", type=Path, default=None)
    args = parser.parse_args()

    run_dir = args.run_dir.resolve()
    if not run_dir.is_dir():
        print(f"run-dir not found: {run_dir}", file=sys.stderr)
        return 2

    out = args.output or (run_dir / "report.html")
    out.write_text(render(run_dir), encoding="utf-8")
    print(str(out))
    return 0


if __name__ == "__main__":
    sys.exit(main())