Files
sar/.claude/skills/bmad-eval-runner/scripts/generate_report.py
julian 17c08e6392 chore: initial monorepo scaffold + WDS Phase 1+2 artifacts
- Nx 22.7 monorepo (pnpm 11.1, TypeScript 5.9, Node 24)
- apps/api: NestJS 11 (CJS conforme CODING-RULES.md PGD-DB-004)
- apps/web: React 19 + Vite 8 (ESM)
- libs/shared/api-interface: Zod contract base
- Docker Compose dev: Postgres 18, Valkey 8, MinIO, Mailpit
- WDS artifacts:
  - design-artifacts/A-Product-Brief/ (5 docs canônicos + 16 dialogs)
  - design-artifacts/B-Trigger-Map/ (hub + 4 personas + feature impact)
- Stack canon: STACK.md v2.2 + CODING-RULES.md v2.0 + brand.md
- AGENTS.md + README.md como entrada para devs/agentes

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 14:34:20 +00:00

185 lines
7.1 KiB
Python

#!/usr/bin/env python3
# /// script
# requires-python = ">=3.9"
# ///
"""Generate an aggregate HTML report for a run folder.
Reads run.json, execution-summary.json, each <eval-id>/grading.json (if present),
and triggers-result.json (if present), then renders a single-file HTML report.
Usage:
python3 generate_report.py --run-dir PATH [-o report.html]
"""
from __future__ import annotations
import argparse
import html as html_lib
import json
import sys
from pathlib import Path
def esc(s: object) -> str:
return html_lib.escape(str(s), quote=True)
def load(path: Path) -> dict | list | None:
if not path.is_file():
return None
try:
return json.loads(path.read_text(encoding="utf-8"))
except json.JSONDecodeError:
return None
def render(run_dir: Path) -> str:
run_meta = load(run_dir / "run.json") or {}
exec_summary = load(run_dir / "execution-summary.json") or {}
triggers = load(run_dir / "triggers-result.json")
eval_blocks: list[str] = []
grading_total = 0
grading_passed = 0
for res in exec_summary.get("results", []):
eval_id = str(res.get("eval_id", "?"))
eval_dir = run_dir / eval_id
grading = load(eval_dir / "grading.json")
metrics = res.get("metrics") or load(eval_dir / "metrics.json") or {}
rc = res.get("return_code")
rows: list[str] = []
if grading:
for exp in grading.get("expectations", []):
passed = bool(exp.get("passed"))
grading_total += 1
if passed:
grading_passed += 1
rows.append(
f'<tr class="{ "pass" if passed else "fail" }">'
f'<td>{ "" if passed else "" }</td>'
f'<td>{esc(exp.get("text", ""))}</td>'
f'<td>{esc(exp.get("evidence", ""))}</td></tr>'
)
feedback = (grading or {}).get("eval_feedback") or {}
feedback_html = ""
if feedback:
sugg = feedback.get("suggestions") or []
sugg_html = "".join(
f"<li><strong>{esc(s.get('assertion','(general)'))}</strong>: {esc(s.get('reason',''))}</li>"
for s in sugg
)
overall = esc(feedback.get("overall", ""))
feedback_html = (
f'<details class="feedback"><summary>Grader feedback on the evals</summary>'
f'<p>{overall}</p>'
f'{"<ul>" + sugg_html + "</ul>" if sugg_html else ""}'
f'</details>'
)
artifacts_listing = ""
artifacts_dir = eval_dir / "artifacts"
if artifacts_dir.is_dir():
files = sorted(p for p in artifacts_dir.rglob("*") if p.is_file())
if files:
artifacts_listing = "<ul>" + "".join(
f'<li><code>{esc(p.relative_to(eval_dir))}</code> '
f'<span class="muted">({p.stat().st_size}b)</span></li>'
for p in files
) + "</ul>"
tool_calls = metrics.get("tool_calls", {})
tool_summary = ", ".join(f"{k}={v}" for k, v in sorted(tool_calls.items())) or ""
eval_blocks.append(f"""
<section class="eval">
<h3>Eval {esc(eval_id)} <span class="muted">rc={esc(rc)} · {esc(metrics.get('elapsed_s', '?'))}s</span></h3>
<p class="muted">Tool calls: {esc(tool_summary)} · output {esc(metrics.get('output_chars', 0))}b · transcript {esc(metrics.get('transcript_chars', 0))}b</p>
{ '<table><thead><tr><th></th><th>Expectation</th><th>Evidence</th></tr></thead><tbody>' + ''.join(rows) + '</tbody></table>' if rows else '<p class="muted">No grading.json yet.</p>' }
{feedback_html}
<details><summary>Artifacts</summary>{artifacts_listing or '<p class="muted">No artifacts captured.</p>'}</details>
</section>
""")
triggers_html = ""
if triggers:
rows = []
for r in triggers.get("results", []):
rows.append(
f'<tr class="{ "pass" if r["pass"] else "fail" }">'
f'<td>{ "" if r["pass"] else "" }</td>'
f'<td>{esc(r["query"])}</td>'
f'<td>{esc(r["should_trigger"])}</td>'
f'<td>{r["triggers"]}/{r["runs"]} ({r["trigger_rate"]:.2f})</td></tr>'
)
s = triggers.get("summary", {})
triggers_html = f"""
<section class="triggers">
<h2>Trigger Evals — {s.get('passed',0)}/{s.get('total',0)} pass</h2>
<table><thead><tr><th></th><th>Query</th><th>Should fire</th><th>Rate</th></tr></thead>
<tbody>{''.join(rows)}</tbody></table>
</section>
"""
artifact_summary = ""
if exec_summary:
artifact_summary = (
f"<p>Executed {exec_summary.get('executed', 0)} / {exec_summary.get('total', 0)} "
f"evals · {exec_summary.get('exec_failures', 0)} execution failures · "
f"grader: {grading_passed}/{grading_total} expectations passed</p>"
)
return f"""<!doctype html>
<html><head><meta charset="utf-8"><title>Eval Run — {esc(run_meta.get('skill_name','?'))}</title>
<style>
body {{ font: 14px/1.5 system-ui, sans-serif; max-width: 1080px; margin: 2em auto; color: #222; padding: 0 1em; }}
h1, h2, h3 {{ font-weight: 600; }}
h1 {{ font-size: 1.6em; margin-bottom: 0.2em; }}
.meta {{ color: #666; margin-bottom: 1.5em; }}
.muted {{ color: #888; font-weight: normal; }}
section.eval {{ border: 1px solid #ddd; border-radius: 6px; padding: 1em 1.2em; margin: 1em 0; background: #fafafa; }}
table {{ width: 100%; border-collapse: collapse; margin: 0.5em 0; font-size: 13px; }}
th, td {{ text-align: left; padding: 6px 8px; border-bottom: 1px solid #eee; vertical-align: top; }}
tr.pass td:first-child {{ color: #2c8a3a; font-weight: 700; }}
tr.fail td:first-child {{ color: #b3261e; font-weight: 700; }}
tr.fail {{ background: #fdf3f2; }}
details.feedback {{ margin-top: 0.6em; padding: 0.4em 0.7em; background: #fff8e1; border-radius: 4px; }}
details summary {{ cursor: pointer; font-weight: 600; }}
code {{ background: #eee; padding: 1px 4px; border-radius: 3px; }}
</style></head>
<body>
<h1>{esc(run_meta.get('skill_name','?'))} — eval run</h1>
<div class="meta">
Run id: <code>{esc(run_meta.get('run_id','?'))}</code> ·
isolation: {esc(run_meta.get('isolation','?'))} ·
started: {esc(run_meta.get('started_at','?'))}
</div>
{artifact_summary}
{''.join(eval_blocks)}
{triggers_html}
</body></html>
"""
def main() -> int:
parser = argparse.ArgumentParser(description="Generate HTML report for an eval run folder")
parser.add_argument("--run-dir", required=True, type=Path)
parser.add_argument("-o", "--output", type=Path, default=None)
args = parser.parse_args()
run_dir = args.run_dir.resolve()
if not run_dir.is_dir():
print(f"run-dir not found: {run_dir}", file=sys.stderr)
return 2
out = args.output or (run_dir / "report.html")
out.write_text(render(run_dir), encoding="utf-8")
print(str(out))
return 0
if __name__ == "__main__":
sys.exit(main())