sar/.claude/skills/bmad-workflow-builder/scripts/extract-report-json.py

#!/usr/bin/env python3
"""Deterministic extraction of report-data.json from analysis outputs.

Reads scanner outputs (markdown + JSON) and extracts structured data without
LLM synthesis. Ensures no data loss and completes in <10 seconds.

Usage:
  python3 extract-report-json.py {skill-path} {quality-report-dir} -o {output-file}
"""

from __future__ import annotations

import argparse
import json
import re
import sys
from datetime import datetime, timezone
from pathlib import Path


def extract_section(content: str, section_name: str, level: int = 2) -> str | None:
    """Extract a section from markdown by heading name."""
    pattern = r'^#{' + str(level) + r'}\s+' + re.escape(section_name) + r'\s*\n(.*?)(?=^#{1,' + str(level) + r'}\s|\Z)'
    match = re.search(pattern, content, re.MULTILINE | re.DOTALL)
    return match.group(1).strip() if match else None


def _bullets_after_label(section: str, label_pattern: str) -> list[str]:
    """Return the ``- `` bullet items listed under a bold ``**Label**`` line.

    ``label_pattern`` is a regex fragment for the label text (for example
    ``Friction points?``). The list ends at the next bold marker, the next
    line starting with a capital letter, or the end of ``section``.
    """
    block = re.search(
        r'\*\*' + label_pattern + r'[:\*]*\*\*\s*\n(.*?)(?=\n\*\*|\n[A-Z]|$)',
        section,
        re.DOTALL,
    )
    if block is None:
        return []
    lines = (raw.strip() for raw in block.group(1).split('\n'))
    return [line[2:].strip() for line in lines if line.startswith('- ')]


def extract_journeys(content: str) -> list[dict]:
    """Extract user journey archetypes from enhancement-analysis.md.

    A journey starts at a heading of the form ``### N. {Name}: {Description}``
    and runs until the next line beginning with ``###`` or the end of
    ``content``. The name must sit on the heading line itself, so numbered
    headings without a colon are not treated as journeys.

    Returns one dict per journey with the keys ``archetype`` (the name),
    ``summary``, ``friction_points`` and ``bright_spots`` (lists of bullet
    texts found under the ``**Friction points**`` / ``**Bright spots**``
    labels). Returns an empty list when no journey heading is found.
    """
    journeys = []
    # [^:\n] keeps the name on the heading line; without the \n exclusion a
    # colon-less heading would absorb following lines up to the next colon.
    heading = r'^###\s+\d+\.\s+([^:\n]+):\s+(.+?)(?=^###|\Z)'
    for match in re.finditer(heading, content, re.MULTILINE | re.DOTALL):
        name = match.group(1).strip()
        # The section starts with the heading's description text.
        section = match.group(2)

        # Summary: the first run of lines that is directly followed by a bold
        # marker or a line starting with a capital letter; a leading
        # "Narrative:" / "Narrative." label is dropped when the run starts on it.
        # NOTE(review): when the heading is directly followed by a capitalized
        # line, this yields the heading description rather than the narrative.
        narrative_match = re.search(
            r'(?:Narrative[:.]\s*)?([^\n]+(?:\n[^*\n][^\n]*)*?)(?=\n\*\*|\n[A-Z])',
            section,
        )
        summary = narrative_match.group(1).strip() if narrative_match else ""

        journeys.append({
            'archetype': name,
            'summary': summary,
            'friction_points': _bullets_after_label(section, r'Friction points?'),
            'bright_spots': _bullets_after_label(section, r'Bright spots?'),
        })

    return journeys


def extract_autonomous(content: str) -> dict:
    """Extract headless/automation assessment from enhancement-analysis.md.

    Reads the level-2 ``Headless Assessment`` section. Returns ``{}`` when the
    section is missing or empty; otherwise a dict with:

    - ``potential``: the text following a ``Current Level`` or ``Potential``
      label, up to the first period or line break (``"unknown"`` when neither
      label is present).
    - ``notes``: the section text after that label match (or the whole
      section when there is no match), cut to at most 200 characters.
    """
    section = extract_section(content, 'Headless Assessment', level=2)
    if not section:
        return {}

    label_hit = re.search(r'(?:Current Level|Potential)[:\*]*\s*([^\n.]+)', section)
    if label_hit is None:
        potential = "unknown"
        remainder = section
    else:
        potential = label_hit.group(1).strip()
        # Everything after the matched label/value becomes the notes.
        remainder = section[label_hit.end():].strip()

    # Slicing an empty string already yields "", so no separate guard is needed.
    return {'potential': potential, 'notes': remainder[:200]}


def _severity_from_opportunity(label: str) -> str:
    """Map an opportunity label (e.g. ``HIGH-OPPORTUNITY``) to a severity.

    Labels containing ``high`` map to ``'high'``, labels containing ``medium``
    or ``secondary`` map to ``'medium'``, and anything else to ``'low'``.
    """
    label = label.lower()
    if 'high' in label:
        return 'high'
    if 'medium' in label or 'secondary' in label:
        return 'medium'
    return 'low'


def _findings_under_headings(content: str, heading_pattern: str, to_severity, source_scanner: str) -> list[dict]:
    """Collect ``#### [N.] Title`` findings grouped under level-3 headings.

    ``heading_pattern`` must contain exactly one capturing group (the section
    label); ``to_severity`` converts that label to the severity string.
    """
    findings = []
    # With one capturing group, re.split yields
    # [preamble, label, body, label, body, ...].
    parts = re.split(heading_pattern, content, flags=re.MULTILINE)
    for label, body in zip(parts[1::2], parts[2::2]):
        # A level-3 section ends at the next heading of level 1-3. Without
        # this cut, "####" headings of later, unrelated sections would be
        # attributed to the last matched label.
        section_end = re.search(r'^#{1,3}\s', body, re.MULTILINE)
        if section_end:
            body = body[:section_end.start()]

        # Skip empty sections and sections explicitly marked "None".
        if body.strip() in ('', 'None'):
            continue

        severity = to_severity(label)
        for match in re.finditer(r'^####\s+(\d+\.\s+)?(.+?)$', body, re.MULTILINE):
            title = match.group(2).strip()
            if title:
                findings.append({
                    'title': title,
                    'severity': severity,
                    'source': source_scanner,
                })
    return findings


def extract_findings_from_md(content: str, source_scanner: str) -> list[dict]:
    """Extract individual findings from analysis markdown.

    Handles multiple formats, selected by ``source_scanner``:

    - ``architecture``: ``#### [N.] Title`` headings under severity sections
      (``### CRITICAL`` / ``### HIGH`` / ``### MEDIUM`` / ``### LOW``).
    - ``determinism``: ``### **[SEVERITY] Title**`` headings.
    - ``customization``: ``### N. **Title** (OPPORTUNITY-TYPE)`` headings.
    - ``enhancement``: ``#### [N.] Title`` headings under
      ``### LABEL Findings`` / ``### LABEL Opportunity`` /
      ``### LABEL Opportunities`` sections.

    Opportunity labels are mapped to severities: ``high`` -> high,
    ``medium``/``secondary`` -> medium, anything else -> low.

    Returns a list of dicts with the keys ``title``, ``severity`` and
    ``source``. Any other ``source_scanner`` value yields an empty list.
    """
    if source_scanner == 'architecture':
        return _findings_under_headings(
            content,
            r'^###\s+(CRITICAL|HIGH|MEDIUM|LOW)\s*$',
            str.lower,
            source_scanner,
        )

    if source_scanner == 'enhancement':
        return _findings_under_headings(
            content,
            # "Opportunit(?:y|ies)" accepts both the singular and the plural.
            r'^###\s+([A-Z-]+)\s+(?:Findings|Opportunit(?:y|ies))',
            _severity_from_opportunity,
            source_scanner,
        )

    findings = []

    if source_scanner == 'determinism':
        # Determinism format: ### **[SEVERITY] Title**
        pattern = r'###\s+\*\*\[([A-Z]+)\]\s+([^*]+)\*\*'
        for match in re.finditer(pattern, content, re.MULTILINE):
            title = match.group(2).strip()
            if title:
                findings.append({
                    'title': title,
                    'severity': match.group(1).lower(),
                    'source': source_scanner,
                })

    elif source_scanner == 'customization':
        # Customization format: ### N. **Title** (OPPORTUNITY-TYPE)
        pattern = r'###\s+\d+\.\s+\*\*([^*]+)\*\*\s+\(([A-Z-]+)\)'
        for match in re.finditer(pattern, content, re.MULTILINE):
            title = match.group(1).strip()
            if title:
                findings.append({
                    'title': title,
                    'severity': _severity_from_opportunity(match.group(2)),
                    'source': source_scanner,
                })

    return findings


def merge_prepass_data(report_dir: Path) -> dict:
    """Load and merge all ``*-prepass.json`` files found in ``report_dir``.

    Files are processed in sorted path order so that, when two files define
    the same top-level key, the winner (the later file) is the same on every
    run regardless of filesystem enumeration order.

    Files that cannot be read, are not valid JSON, or whose top-level value
    is not a JSON object are skipped with a warning on stderr; they never
    abort the merge.

    Returns the merged dict (empty when nothing could be loaded).
    """
    merged: dict = {}

    for json_file in sorted(report_dir.glob('*-prepass.json')):
        try:
            data = json.loads(json_file.read_text(encoding='utf-8'))
        except (OSError, ValueError) as exc:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            print(f'Warning: skipping {json_file.name}: {exc}', file=sys.stderr)
            continue

        if not isinstance(data, dict):
            print(
                f'Warning: skipping {json_file.name}: top-level JSON value is not an object',
                file=sys.stderr,
            )
            continue

        merged.update(data)

    return merged


def build_report_json(skill_path: str, quality_report_dir: str) -> dict:
    """Assemble the report-data.json structure from the analysis markdown files.

    Reads ``architecture-analysis.md``, ``determinism-analysis.md``,
    ``customization-analysis.md`` and ``enhancement-analysis.md`` from
    ``quality_report_dir``; a file that does not exist is treated as empty.

    Returns a dict with ``meta`` (skill name taken from the last component of
    ``skill_path``, the path itself, a UTC ISO-8601 timestamp, and a fixed
    scanner count), placeholder top-level fields (``narrative``, ``grade``,
    ``broken``, ``opportunities``, ``strengths``, ``recommendations``) and
    ``detailed_analysis`` holding the per-scanner assessment and findings.
    """
    report_dir = Path(quality_report_dir)
    skill_name = Path(skill_path).name
    timestamp = datetime.now(timezone.utc).isoformat()

    def load(filename: str) -> str:
        # Missing analysis files contribute empty content rather than failing.
        candidate = report_dir / filename
        return candidate.read_text(encoding='utf-8') if candidate.exists() else ""

    architecture_md = load('architecture-analysis.md')
    determinism_md = load('determinism-analysis.md')
    customization_md = load('customization-analysis.md')
    enhancement_md = load('enhancement-analysis.md')

    # Per-scanner assessment text; each scanner uses its own section title.
    architecture_summary = extract_section(architecture_md, 'Assessment', level=2) or ""
    determinism_summary = extract_section(determinism_md, 'Assessment', level=2) or ""
    customization_summary = extract_section(customization_md, 'Overall Assessment', level=2) or ""
    enhancement_summary = extract_section(enhancement_md, 'Summary', level=2) or ""

    journeys = extract_journeys(enhancement_md)
    autonomous = extract_autonomous(enhancement_md)

    # Assessments are capped at 500 characters each.
    architecture_block = {
        'assessment': architecture_summary[:500],
        'findings': extract_findings_from_md(architecture_md, 'architecture'),
    }
    determinism_block = {
        'assessment': determinism_summary[:500],
        'findings': extract_findings_from_md(determinism_md, 'determinism'),
    }
    customization_block = {
        'assessment': customization_summary[:500],
        'posture': 'not-opted-in',  # Fixed value; not derived from the file here
        'findings': extract_findings_from_md(customization_md, 'customization'),
    }
    enhancement_block = {
        'assessment': enhancement_summary[:500],
        'journeys': journeys,
        'autonomous': autonomous,
        'findings': extract_findings_from_md(enhancement_md, 'enhancement'),
    }

    meta = {
        'skill_name': skill_name,
        'skill_path': skill_path,
        'timestamp': timestamp,
        'scanner_count': 4,
    }

    # Minimal structure for now; the report creator expands it if needed.
    return {
        'meta': meta,
        'narrative': enhancement_summary[:150],  # Placeholder
        'grade': 'Good',  # Placeholder - report creator sets this
        'broken': [],
        'opportunities': [],
        'strengths': [],
        'recommendations': [],
        'detailed_analysis': {
            'architecture': architecture_block,
            'determinism': determinism_block,
            'customization': customization_block,
            'enhancement': enhancement_block,
        },
    }


def main():
    """Command-line entry point: build the report and write it as JSON.

    Takes the skill path and the quality-report directory as positional
    arguments; ``-o/--output`` overrides the default destination of
    ``{quality_report_dir}/report-data.json``.

    On success a human-readable message goes to stderr and a small JSON
    status object goes to stdout. Any exception is reported on stderr and
    the process exits with status 1.
    """
    cli = argparse.ArgumentParser(description='Extract report-data.json from analysis outputs')
    cli.add_argument('skill_path', help='Path to the skill being analyzed')
    cli.add_argument('quality_report_dir', help='Directory with analysis outputs and where to write report')
    cli.add_argument('-o', '--output', help='Output file path (default: {quality_report_dir}/report-data.json)')
    options = cli.parse_args()

    if options.output:
        output_path = options.output
    else:
        output_path = str(Path(options.quality_report_dir) / 'report-data.json')

    try:
        payload = build_report_json(options.skill_path, options.quality_report_dir)

        # Serialize first, then write the file in one call.
        serialized = json.dumps(payload, indent=2, ensure_ascii=False)
        Path(output_path).write_text(serialized, encoding='utf-8')

        # Human-readable note on stderr; machine-readable status on stdout.
        print(f'Report JSON written to {output_path}', file=sys.stderr)
        status = {'status': 'success', 'output': output_path}
        print(json.dumps(status, indent=2))
    except Exception as e:
        print(f'Error: {e}', file=sys.stderr)
        sys.exit(1)


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()