chore: initial monorepo scaffold + WDS Phase 1+2 artifacts
- Nx 22.7 monorepo (pnpm 11.1, TypeScript 5.9, Node 24) - apps/api: NestJS 11 (CJS conforme CODING-RULES.md PGD-DB-004) - apps/web: React 19 + Vite 8 (ESM) - libs/shared/api-interface: Zod contract base - Docker Compose dev: Postgres 18, Valkey 8, MinIO, Mailpit - WDS artifacts: - design-artifacts/A-Product-Brief/ (5 docs canônicos + 16 dialogs) - design-artifacts/B-Trigger-Map/ (hub + 4 personas + feature impact) - Stack canon: STACK.md v2.2 + CODING-RULES.md v2.0 + brand.md - AGENTS.md + README.md como entrada para devs/agentes Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
366
.claude/skills/bmad-eval-runner/scripts/run_triggers.py
Normal file
366
.claude/skills/bmad-eval-runner/scripts/run_triggers.py
Normal file
@@ -0,0 +1,366 @@
|
||||
#!/usr/bin/env python3
|
||||
# /// script
|
||||
# requires-python = ">=3.9"
|
||||
# ///
|
||||
"""Run trigger evals: does the skill's description fire on each query?
|
||||
|
||||
Adapted from Anthropic skill-creator's run_eval.py
|
||||
(https://github.com/anthropics/skills/tree/main/skills/skill-creator) with two
|
||||
adaptations:
|
||||
|
||||
1. Isolation. Each query runs in either a fresh Docker container off
|
||||
bmad-eval-runner:latest, or a fresh local tmp dir under ~/bmad-evals/<run-id>/
|
||||
with HOME overridden to a clean directory. This prevents the host's global
|
||||
CLAUDE.md and auto-memory from biasing whether the skill fires.
|
||||
|
||||
2. Output. Results are written to a run folder alongside the artifact eval
|
||||
run-folder layout (so triggers and artifacts can share a single report).
|
||||
|
||||
Usage:
|
||||
python3 run_triggers.py \\
|
||||
--skill-path PATH \\
|
||||
--triggers-file PATH/triggers.json \\
|
||||
--output-dir PATH \\
|
||||
--isolation docker|local \\
|
||||
[--workers N] [--runs-per-query N] [--timeout SECS] [--threshold 0.5] [--quiet]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import uuid
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
# Put this script's own directory first on the import path so the sibling
# `utils` module imported just below resolves regardless of the caller's cwd.
sys.path.insert(0, str(SCRIPT_DIR))
|
||||
|
||||
from utils import ( # noqa: E402
|
||||
new_run_id,
|
||||
parse_skill_md,
|
||||
read_json,
|
||||
read_macos_keychain_credentials,
|
||||
stage_credentials,
|
||||
utc_now_iso,
|
||||
write_json,
|
||||
)
|
||||
|
||||
# Image tag that run_query_docker() starts one fresh container from per query.
DOCKER_IMAGE = "bmad-eval-runner:latest"
|
||||
# Read once at import time and reused by every query run: handed to
# stage_credentials() in local mode, written to a mounted .credentials.json in
# docker mode. None/empty means no credentials file is staged for docker.
# NOTE(review): presumably returns None on non-macOS hosts — confirm in utils.
_KEYCHAIN_CREDS: str | None = read_macos_keychain_credentials()
|
||||
|
||||
|
||||
def write_synthetic_skill(skills_dir: Path, skill_name: str, description: str, unique_id: str) -> tuple[Path, str]:
    """Place a synthetic skill at <skills_dir>/<clean_name>/SKILL.md.

    The Skill tool only fires for entries discovered as actual skills (frontmatter
    `name` + `description` under a `.claude/skills/<name>/SKILL.md`). Slash-commands
    under `.claude/commands/` do not auto-invoke the Skill tool, so the previous
    implementation could never observe a positive trigger. This places the synthetic
    skill where Claude Code looks for skills, with a unique name so the detector
    can disambiguate it from any pre-existing skill of the same display name.

    The description is written as a YAML literal block scalar. Surrounding
    whitespace is stripped and line endings are normalised first: a block scalar
    takes its indentation from its first non-empty line, so a description that
    starts with extra indentation (or carries stray ``\\r`` characters from CRLF
    input) would otherwise yield malformed or noisy frontmatter.

    Args:
        skills_dir: Directory that holds skill folders; created if missing.
        skill_name: Display name of the skill under evaluation.
        description: Skill description to place in the frontmatter.
        unique_id: Suffix that makes the synthetic skill name unique.

    Returns:
        ``(path_to_SKILL_md, clean_name)`` where ``clean_name`` is
        ``"<skill_name>-skill-<unique_id>"``.
    """
    clean_name = f"{skill_name}-skill-{unique_id}"
    skill_root = skills_dir / clean_name
    skill_root.mkdir(parents=True, exist_ok=True)
    path = skill_root / "SKILL.md"

    # splitlines() handles \n, \r\n and \r alike; `or [""]` keeps the block
    # scalar well-formed even for an empty description.
    desc_lines = description.strip().splitlines() or [""]
    normalized_desc = "\n".join(desc_lines)
    indented_desc = "\n  ".join(desc_lines)

    path.write_text(
        f"---\n"
        f"name: {clean_name}\n"
        f"description: |\n"
        f"  {indented_desc}\n"
        f"---\n\n"
        f"# {skill_name}\n\n"
        f"This skill handles: {normalized_desc}\n",
        encoding="utf-8",
    )
    return path, clean_name
|
||||
|
||||
|
||||
def parse_stream_for_trigger(buffer: str, clean_name: str) -> tuple[bool | None, str]:
    """Scan stream-json output for the first decision on whether the skill fired.

    Only complete (newline-terminated) lines are consumed; any trailing partial
    line is handed back so the caller can prepend it to the next chunk.

    Decision rules, applied to each JSON event in order:
      * ``stream_event`` / ``content_block_start`` with a ``tool_use`` block:
        ``Skill`` or ``Read`` starts accumulating the tool's input JSON; any
        other tool means the skill was not the first thing invoked -> False.
      * ``content_block_delta`` while accumulating: True as soon as
        ``clean_name`` appears in the accumulated input JSON.
      * ``content_block_stop`` / ``message_stop``: resolves a pending tool to
        whether ``clean_name`` was seen; a bare ``message_stop`` -> False.
      * ``assistant`` message: the first ``tool_use`` item decides (True when it
        is a ``Skill``/``Read`` call whose ``skill``/``file_path`` contains
        ``clean_name``). Messages without any ``tool_use`` item are skipped.
      * ``result``: the run ended without a trigger -> False.

    Lines that are blank, not valid JSON, or not JSON objects are ignored, as
    are fields whose values have an unexpected type.

    Args:
        buffer: Raw stream text accumulated so far.
        clean_name: Unique synthetic skill name to look for.

    Returns:
        ``(decision, leftover)``. ``decision`` is True/False once decided (with
        an empty leftover) or None when undecided, in which case ``leftover`` is
        the unconsumed tail of ``buffer``.
    """
    # NOTE: this state lives only for one call, so a tool_use block split
    # across two calls is resolved by the later `assistant` event instead.
    pending_tool: str | None = None
    accumulated_json = ""

    while "\n" in buffer:
        line, buffer = buffer.split("\n", 1)
        line = line.strip()
        if not line:
            continue
        try:
            evt = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(evt, dict):
            # Valid JSON but not an event object (e.g. a bare number or list).
            continue

        evt_type = evt.get("type")

        if evt_type == "stream_event":
            se = evt.get("event")
            if not isinstance(se, dict):
                continue
            t = se.get("type", "")
            if t == "content_block_start":
                cb = se.get("content_block")
                if isinstance(cb, dict) and cb.get("type") == "tool_use":
                    name = cb.get("name", "")
                    if name in ("Skill", "Read"):
                        pending_tool = name
                        accumulated_json = ""
                    else:
                        return False, ""
            elif t == "content_block_delta" and pending_tool:
                delta = se.get("delta")
                if isinstance(delta, dict) and delta.get("type") == "input_json_delta":
                    piece = delta.get("partial_json")
                    if isinstance(piece, str):
                        accumulated_json += piece
                    if clean_name in accumulated_json:
                        return True, ""
            elif t in ("content_block_stop", "message_stop"):
                if pending_tool:
                    return clean_name in accumulated_json, ""
                if t == "message_stop":
                    return False, ""

        elif evt_type == "assistant":
            message = evt.get("message")
            content = message.get("content") if isinstance(message, dict) else None
            for item in content if isinstance(content, list) else ():
                if not isinstance(item, dict) or item.get("type") != "tool_use":
                    continue
                tool_input = item.get("input")
                if not isinstance(tool_input, dict):
                    tool_input = {}
                tool_name = item.get("name", "")
                if tool_name == "Skill":
                    target = tool_input.get("skill", "")
                elif tool_name == "Read":
                    target = tool_input.get("file_path", "")
                else:
                    target = None
                # The first tool call settles the outcome either way.
                return isinstance(target, str) and clean_name in target, ""

        elif evt_type == "result":
            return False, ""

    return None, buffer
|
||||
|
||||
|
||||
def run_query_local(query: str, skill_name: str, description: str,
                    workspace_root: Path, timeout: int) -> bool:
    """Run one query through the `claude` CLI in an isolated local workspace.

    Layout created under ``workspace_root``:
      * ``.home/`` - used as HOME, with ``.home/.claude`` as CLAUDE_CONFIG_DIR,
        so the host's own Claude configuration is not picked up.
      * ``project/.claude/skills/<synthetic skill>/SKILL.md`` - the skill under
        test; removed again when the run finishes.

    The CLI's stream-json output is parsed incrementally and the process is
    killed as soon as a decision is reached or ``timeout`` seconds elapse.

    Args:
        query: Prompt passed to ``claude -p``.
        skill_name: Display name of the skill being evaluated.
        description: Skill description written into the synthetic skill.
        workspace_root: Per-run scratch directory (created if missing).
        timeout: Maximum number of seconds to let the CLI run.

    Returns:
        True when the synthetic skill was triggered, False otherwise
        (including timeouts and runs that ended without a decision).

    Raises:
        OSError: if the ``claude`` executable cannot be started.
    """
    # Function-scope imports keep this block self-contained.
    import codecs
    import threading

    workspace_root.mkdir(parents=True, exist_ok=True)
    home_dir = workspace_root / ".home"
    (home_dir / ".claude").mkdir(parents=True, exist_ok=True)
    stage_credentials(home_dir / ".claude", _KEYCHAIN_CREDS)
    project_dir = workspace_root / "project"
    skills_dir = project_dir / ".claude" / "skills"
    project_dir.mkdir(parents=True, exist_ok=True)

    unique = uuid.uuid4().hex[:8]
    cmd_file, clean_name = write_synthetic_skill(skills_dir, skill_name, description, unique)

    # Minimal environment: nothing from the host leaks in besides PATH and,
    # when present, the API key.
    env = {
        "HOME": str(home_dir),
        "CLAUDE_CONFIG_DIR": str(home_dir / ".claude"),
        "PATH": os.environ.get("PATH", ""),
    }
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if api_key:
        # Only export a real key; an unset key stays unset rather than becoming
        # an empty string.
        env["ANTHROPIC_API_KEY"] = api_key

    cmd = [
        "claude", "-p", query,
        "--output-format", "stream-json",
        "--verbose",
        "--include-partial-messages",
        "--dangerously-skip-permissions",
    ]

    try:
        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            cwd=str(project_dir),
            env=env,
        )
        # read1() blocks until data or EOF, so the loop's clock check alone
        # cannot stop a silent process; the watchdog kills it at the deadline,
        # which closes the pipe and unblocks the read.
        watchdog = threading.Timer(timeout, proc.kill)
        watchdog.daemon = True
        watchdog.start()

        # Incremental decoding keeps multi-byte characters intact when a chunk
        # boundary falls in the middle of one.
        decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
        buffer = ""
        triggered: bool | None = None
        deadline = time.monotonic() + timeout
        try:
            while time.monotonic() < deadline:
                chunk = proc.stdout.read1(8192)
                if not chunk:
                    # EOF: the process exited (or was killed by the watchdog).
                    break
                buffer += decoder.decode(chunk)
                decided, buffer = parse_stream_for_trigger(buffer, clean_name)
                if decided is not None:
                    triggered = decided
                    break
        finally:
            watchdog.cancel()
            if proc.poll() is None:
                proc.kill()
            proc.wait()
            proc.stdout.close()

        if triggered is None:
            # Flush the decoder and give an unterminated last line a chance.
            buffer += decoder.decode(b"", final=True)
            decided, _ = parse_stream_for_trigger(buffer + "\n", clean_name)
            triggered = bool(decided)
        return bool(triggered)
    finally:
        # Drop the synthetic skill so it cannot affect later runs that reuse
        # this workspace.
        shutil.rmtree(cmd_file.parent, ignore_errors=True)
|
||||
|
||||
|
||||
def run_query_docker(query: str, skill_name: str, description: str,
                     workspace_root: Path, timeout: int) -> bool:
    """Run one query through the `claude` CLI inside a fresh Docker container.

    Host directories created under ``workspace_root``:
      * ``skills_in/`` - holds the synthetic skill; mounted read-only at /skills.
      * ``creds_in/``  - only when keychain credentials are available; mounted
        read-only at /creds and deleted again once the run is over.
      * ``output/``    - mounted at /output; receives ``stream.jsonl``.

    The container copies the skill into ``/workspace/.claude/skills``, installs
    the credentials file if present, runs the query and writes the stream-json
    output to ``/output/stream.jsonl``, which is parsed after the container
    finishes.

    Args:
        query: Prompt, passed to the container via the EVAL_QUERY variable.
        skill_name: Display name of the skill being evaluated.
        description: Skill description written into the synthetic skill.
        workspace_root: Per-run scratch directory (created if missing).
        timeout: Query time budget in seconds; the container is given
            ``timeout + 30`` seconds before it is forcibly removed.

    Returns:
        True when the synthetic skill was triggered, False otherwise
        (including when no stream output was produced).
    """
    workspace_root.mkdir(parents=True, exist_ok=True)
    unique = uuid.uuid4().hex[:8]
    skills_in = workspace_root / "skills_in"
    skills_in.mkdir(parents=True, exist_ok=True)
    _, clean_name = write_synthetic_skill(skills_in, skill_name, description, unique)

    creds_dir: Path | None = None
    if _KEYCHAIN_CREDS:
        creds_dir = workspace_root / "creds_in"
        creds_dir.mkdir(parents=True, exist_ok=True)
        (creds_dir / ".credentials.json").write_text(_KEYCHAIN_CREDS, encoding="utf-8")

    # The query travels via $EVAL_QUERY, so it is never interpolated into the
    # shell script text itself.
    container_script = """
set -e
mkdir -p /workspace/.claude/skills
cp -R /skills/. /workspace/.claude/skills/ 2>/dev/null || true
if [ -f /creds/.credentials.json ]; then
  mkdir -p /home/evaluator/.claude
  cp /creds/.credentials.json /home/evaluator/.claude/.credentials.json
fi
cd /workspace
claude -p "$EVAL_QUERY" \\
  --output-format stream-json --verbose --include-partial-messages \\
  --dangerously-skip-permissions \\
  > /output/stream.jsonl 2>/dev/null || true
"""

    output_dir = workspace_root / "output"
    output_dir.mkdir(parents=True, exist_ok=True)

    # A known container name lets us remove the container if the client times out.
    container_name = f"bmad-eval-trigger-{unique}"
    cmd = [
        "docker", "run", "--rm",
        "--name", container_name,
        "-v", f"{skills_in}:/skills:ro",
        "-v", f"{output_dir}:/output",
        "-e", "ANTHROPIC_API_KEY",
        "-e", f"EVAL_QUERY={query}",
    ]
    if creds_dir:
        cmd += ["-v", f"{creds_dir}:/creds:ro"]
    cmd += [DOCKER_IMAGE, "bash", "-c", container_script]

    try:
        try:
            subprocess.run(cmd, capture_output=True, timeout=timeout + 30)
        except subprocess.TimeoutExpired:
            # The timeout only kills the docker CLI client; the container can
            # keep running, so remove it explicitly (best effort).
            try:
                subprocess.run(
                    ["docker", "rm", "-f", container_name],
                    capture_output=True,
                    timeout=30,
                )
            except (subprocess.TimeoutExpired, OSError):
                pass
    finally:
        # Do not leave a plaintext copy of the credentials in the run folder.
        if creds_dir is not None:
            shutil.rmtree(creds_dir, ignore_errors=True)

    stream_file = output_dir / "stream.jsonl"
    if not stream_file.is_file():
        return False
    # Whatever was written before a timeout is still parsed.
    stream_text = stream_file.read_text(encoding="utf-8", errors="replace")
    decided, _ = parse_stream_for_trigger(stream_text + "\n", clean_name)
    return bool(decided)
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point: run every trigger query and write a report.

    Reads the skill's name/description and the list of trigger queries, runs
    each query ``--runs-per-query`` times in the selected isolation mode using
    a thread pool, and compares the observed trigger rate with ``--threshold``:
    a ``should_trigger`` query passes when the rate is at or above the
    threshold, any other query passes when it is below. A query with no
    completed runs (every attempt raised) never passes, because a trigger rate
    of 0.0 from zero observations is not evidence that the skill stayed quiet.

    Files written under ``<output-dir>/<run-id>/``: ``run.json`` (run
    parameters), ``queries/`` (per-run workspaces) and
    ``triggers-result.json`` (the report, also printed to stdout).

    Returns:
        2 when the triggers file does not exist, otherwise 0 (regardless of
        how many queries passed).
    """
    parser = argparse.ArgumentParser(description="Run trigger evals in isolation")
    parser.add_argument("--skill-path", required=True, type=Path)
    parser.add_argument("--triggers-file", required=True, type=Path)
    parser.add_argument("--output-dir", required=True, type=Path)
    parser.add_argument("--isolation", choices=("docker", "local"), required=True)
    parser.add_argument("--workers", type=int, default=8)
    parser.add_argument("--runs-per-query", type=int, default=3)
    parser.add_argument("--timeout", type=int, default=45)
    parser.add_argument("--threshold", type=float, default=0.5)
    parser.add_argument("--quiet", action="store_true")
    args = parser.parse_args()

    skill_path = args.skill_path.resolve()
    triggers_file = args.triggers_file.resolve()
    if not triggers_file.is_file():
        print(f"triggers file not found: {triggers_file}", file=sys.stderr)
        return 2

    skill_name, description, _ = parse_skill_md(skill_path)
    # Each entry is read below via its "query" and "should_trigger" keys.
    queries = read_json(triggers_file)

    run_id = new_run_id(f"{skill_name}-triggers")
    run_dir = (args.output_dir / run_id).resolve()
    (run_dir / "queries").mkdir(parents=True, exist_ok=True)

    write_json(run_dir / "run.json", {
        "run_id": run_id,
        "skill_name": skill_name,
        "description": description,
        "isolation": args.isolation,
        "started_at": utc_now_iso(),
        "query_count": len(queries),
        "runs_per_query": args.runs_per_query,
        "threshold": args.threshold,
    })

    runner = run_query_docker if args.isolation == "docker" else run_query_local

    def run_one(idx: int, q: dict, run_idx: int) -> tuple[int, bool]:
        # Every (query, repetition) pair gets its own workspace directory.
        ws = run_dir / "queries" / f"q{idx:03d}-r{run_idx}"
        triggered = runner(q["query"], skill_name, description, ws, args.timeout)
        return idx, triggered

    per_query: dict[int, list[bool]] = {}
    if not args.quiet:
        print(f"[run_triggers] {len(queries)} queries × {args.runs_per_query} runs, isolation={args.isolation}", file=sys.stderr)

    with ThreadPoolExecutor(max_workers=args.workers) as pool:
        futures = [
            pool.submit(run_one, idx, q, run_idx)
            for idx, q in enumerate(queries)
            for run_idx in range(args.runs_per_query)
        ]
        for fut in as_completed(futures):
            try:
                idx, triggered = fut.result()
            except Exception as e:
                # Best effort: a failed run is reported and left out of the
                # trigger rate instead of aborting the whole evaluation.
                print(f"Warning: query failed: {e}", file=sys.stderr)
                continue
            per_query.setdefault(idx, []).append(triggered)

    results = []
    for idx, q in enumerate(queries):
        triggers = per_query.get(idx, [])
        rate = (sum(triggers) / len(triggers)) if triggers else 0.0
        should = bool(q["should_trigger"])
        if not triggers:
            # No completed runs: nothing was observed, so this cannot pass.
            passed = False
        elif should:
            passed = rate >= args.threshold
        else:
            passed = rate < args.threshold
        results.append({
            "query": q["query"],
            "should_trigger": should,
            "trigger_rate": rate,
            "triggers": int(sum(triggers)),
            "runs": len(triggers),
            "pass": passed,
        })

    output = {
        "run_id": run_id,
        "completed_at": utc_now_iso(),
        "skill_name": skill_name,
        "description": description,
        "isolation": args.isolation,
        "results": results,
        "summary": {
            "total": len(results),
            "passed": sum(1 for r in results if r["pass"]),
            "failed": sum(1 for r in results if not r["pass"]),
        },
    }
    write_json(run_dir / "triggers-result.json", output)
    print(json.dumps(output, indent=2))
    return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user