#!/usr/bin/env python3 """Deterministic extraction of report-data.json from analysis outputs. Reads scanner outputs (markdown + JSON) and extracts structured data without LLM synthesis. Ensures no data loss and completes in <10 seconds. Usage: python3 extract-report-json.py {skill-path} {quality-report-dir} -o {output-file} """ from __future__ import annotations import argparse import json import re import sys from datetime import datetime, timezone from pathlib import Path def extract_section(content: str, section_name: str, level: int = 2) -> str | None: """Extract a section from markdown by heading name.""" pattern = r'^#{' + str(level) + r'}\s+' + re.escape(section_name) + r'\s*\n(.*?)(?=^#{1,' + str(level) + r'}\s|\Z)' match = re.search(pattern, content, re.MULTILINE | re.DOTALL) return match.group(1).strip() if match else None def extract_journeys(content: str) -> list[dict]: """Extract user journey archetypes from enhancement-analysis.md.""" journeys = [] # Match ### N. {Name}: {Description} pattern = r'^###\s+\d+\.\s+([^:]+):\s+(.+?)(?=^###|\Z)' for match in re.finditer(pattern, content, re.MULTILINE | re.DOTALL): name = match.group(1).strip() section = match.group(2) # Extract narrative (after "Narrative." or "Narrative\n") narrative_match = re.search(r'(?:Narrative[:.]\s*)?([^\n]+(?:\n[^*\n][^\n]*)*?)(?=\n\*\*|\n[A-Z])', section) summary = narrative_match.group(1).strip() if narrative_match else "" # Extract friction points friction_points = [] friction_section = re.search(r'\*\*Friction points?[:\*]*\*\*\s*\n(.*?)(?=\n\*\*|\n[A-Z]|$)', section, re.DOTALL) if friction_section: for line in friction_section.group(1).split('\n'): line = line.strip() if line.startswith('- '): friction_points.append(line[2:].strip()) # Extract bright spots bright_spots = [] bright_section = re.search(r'\*\*Bright spots?[:\*]*\*\*\s*\n(.*?)(?=\n\*\*|\n[A-Z]|$)', section, re.DOTALL) if bright_section: for line in bright_section.group(1).split('\n'): line = line.strip() if line.startswith('- '): bright_spots.append(line[2:].strip()) journeys.append({ 'archetype': name, 'summary': summary, 'friction_points': friction_points, 'bright_spots': bright_spots }) return journeys def extract_autonomous(content: str) -> dict: """Extract headless/automation assessment from enhancement-analysis.md.""" assessment_section = extract_section(content, 'Headless Assessment', level=2) if not assessment_section: return {} # Look for "Current Level:" or "Potential:" pattern potential_match = re.search(r'(?:Current Level|Potential)[:\*]*\s*([^\n.]+)', assessment_section) potential = potential_match.group(1).strip() if potential_match else "unknown" # Get the rest as notes notes = assessment_section if potential_match: notes = assessment_section[potential_match.end():].strip() return { 'potential': potential, 'notes': notes[:200] if notes else "" # Truncate to 200 chars } def extract_findings_from_md(content: str, source_scanner: str) -> list[dict]: """Extract individual findings from analysis markdown. Handles multiple formats: - Architecture: level 4 headings under severity sections (### HIGH, etc) - Determinism: bold headings with severity markers [HIGH], [LOW] - Customization: bold headings with opportunity markers (HIGH-OPPORTUNITY, etc) - Enhancement: numbered findings with severity/opportunity markers """ findings = [] if source_scanner == 'architecture': # Architecture format: ### SEVERITY followed by #### N. Title severity_pattern = r'^###\s+(CRITICAL|HIGH|MEDIUM|LOW)\s*$' severity_sections = re.split(severity_pattern, content, flags=re.MULTILINE) for i in range(1, len(severity_sections), 2): severity = severity_sections[i].lower() if i < len(severity_sections) else "medium" section_content = severity_sections[i + 1] if i + 1 < len(severity_sections) else "" if not section_content.strip() or section_content.strip() == "None": continue # Extract level 4 findings (#### N. Title) finding_pattern = r'^####\s+(\d+\.\s+)?(.+?)$' for match in re.finditer(finding_pattern, section_content, re.MULTILINE): finding_title = match.group(2).strip() if finding_title: findings.append({ 'title': finding_title, 'severity': severity, 'source': source_scanner }) elif source_scanner == 'determinism': # Determinism format: ### **[SEVERITY] Title** pattern = r'###\s+\*\*\[([A-Z]+)\]\s+([^*]+)\*\*' for match in re.finditer(pattern, content, re.MULTILINE): severity = match.group(1).lower() title = match.group(2).strip() if title: findings.append({ 'title': title, 'severity': severity, 'source': source_scanner }) elif source_scanner == 'customization': # Customization format: ### N. **Title** (OPPORTUNITY-TYPE) pattern = r'###\s+\d+\.\s+\*\*([^*]+)\*\*\s+\(([A-Z-]+)\)' for match in re.finditer(pattern, content, re.MULTILINE): title = match.group(1).strip() opportunity = match.group(2).lower() # Map opportunity to severity severity = 'high' if 'high' in opportunity else 'medium' if 'medium' in opportunity else 'low' if title: findings.append({ 'title': title, 'severity': severity, 'source': source_scanner }) elif source_scanner == 'enhancement': # Enhancement format: ### LEVEL Findings section followed by #### N. Title # Extract opportunity sections (HIGH-OPPORTUNITY, SECONDARY-OPPORTUNITY, etc) opportunity_pattern = r'^###\s+([A-Z-]+)\s+(?:Findings|Opportunities?)' opportunity_sections = re.split(opportunity_pattern, content, flags=re.MULTILINE) for i in range(1, len(opportunity_sections), 2): opportunity = opportunity_sections[i].lower() if i < len(opportunity_sections) else "medium" section_content = opportunity_sections[i + 1] if i + 1 < len(opportunity_sections) else "" if not section_content.strip(): continue # Map opportunity to severity severity = 'high' if 'high' in opportunity else 'medium' if 'secondary' in opportunity else 'low' # Extract level 4 findings (#### N. Title) finding_pattern = r'^####\s+(\d+\.\s+)?(.+?)$' for match in re.finditer(finding_pattern, section_content, re.MULTILINE): finding_title = match.group(2).strip() if finding_title: findings.append({ 'title': finding_title, 'severity': severity, 'source': source_scanner }) return findings def merge_prepass_data(report_dir: Path) -> dict: """Load and merge all prepass JSON data.""" merged = {} for json_file in report_dir.glob('*-prepass.json'): try: data = json.loads(json_file.read_text(encoding='utf-8')) merged.update(data) except Exception: pass # Skip if not valid JSON return merged def build_report_json(skill_path: str, quality_report_dir: str) -> dict: """Extract and build complete report-data.json.""" report_dir = Path(quality_report_dir) skill_name = Path(skill_path).name timestamp = datetime.now(timezone.utc).isoformat() # Read all analysis files architecture_content = (report_dir / 'architecture-analysis.md').read_text(encoding='utf-8') if (report_dir / 'architecture-analysis.md').exists() else "" determinism_content = (report_dir / 'determinism-analysis.md').read_text(encoding='utf-8') if (report_dir / 'determinism-analysis.md').exists() else "" customization_content = (report_dir / 'customization-analysis.md').read_text(encoding='utf-8') if (report_dir / 'customization-analysis.md').exists() else "" enhancement_content = (report_dir / 'enhancement-analysis.md').read_text(encoding='utf-8') if (report_dir / 'enhancement-analysis.md').exists() else "" # Extract assessments arch_assessment = extract_section(architecture_content, 'Assessment', level=2) or "" det_assessment = extract_section(determinism_content, 'Assessment', level=2) or "" cust_assessment = extract_section(customization_content, 'Overall Assessment', level=2) or "" enh_assessment = extract_section(enhancement_content, 'Summary', level=2) or "" # Extract journeys and autonomous from enhancement journeys = extract_journeys(enhancement_content) autonomous = extract_autonomous(enhancement_content) # Build detailed_analysis detailed_analysis = { 'architecture': { 'assessment': arch_assessment[:500], # First 500 chars 'findings': extract_findings_from_md(architecture_content, 'architecture') }, 'determinism': { 'assessment': det_assessment[:500], 'findings': extract_findings_from_md(determinism_content, 'determinism') }, 'customization': { 'assessment': cust_assessment[:500], 'posture': 'not-opted-in', # From content 'findings': extract_findings_from_md(customization_content, 'customization') }, 'enhancement': { 'assessment': enh_assessment[:500], 'journeys': journeys, 'autonomous': autonomous, 'findings': extract_findings_from_md(enhancement_content, 'enhancement') } } # Build basic structure - minimal for now, will be expanded by report creator if needed report_data = { 'meta': { 'skill_name': skill_name, 'skill_path': skill_path, 'timestamp': timestamp, 'scanner_count': 4 }, 'narrative': enh_assessment[:150] if enh_assessment else "", # Placeholder 'grade': 'Good', # Placeholder - report creator sets this 'broken': [], 'opportunities': [], 'strengths': [], 'recommendations': [], 'detailed_analysis': detailed_analysis } return report_data def main(): parser = argparse.ArgumentParser(description='Extract report-data.json from analysis outputs') parser.add_argument('skill_path', help='Path to the skill being analyzed') parser.add_argument('quality_report_dir', help='Directory with analysis outputs and where to write report') parser.add_argument('-o', '--output', help='Output file path (default: {quality_report_dir}/report-data.json)') args = parser.parse_args() output_path = args.output or str(Path(args.quality_report_dir) / 'report-data.json') try: report_json = build_report_json(args.skill_path, args.quality_report_dir) # Write output output_file = Path(output_path) output_file.write_text(json.dumps(report_json, indent=2, ensure_ascii=False), encoding='utf-8') print(f'Report JSON written to {output_path}', file=sys.stderr) print(json.dumps({'status': 'success', 'output': output_path}, indent=2)) except Exception as e: print(f'Error: {e}', file=sys.stderr) sys.exit(1) if __name__ == '__main__': main()