WoMenQuNaJu/MeetSpot/tools/postmortem_init.py

#!/usr/bin/env python3
"""
Postmortem Onboarding 脚本
分析历史 fix commits，生成初始 postmortem 集合

使用方法:
    python tools/postmortem_init.py [--since 2025-06-01] [--limit 50] [--min-quality 0.2] [--dry-run]
"""
import argparse
import asyncio
import json
import re
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

import yaml

# Put the directory two levels above this file (the project root) at the front
# of the import path; presumably so that `app.llm`, imported lazily further
# down, resolves when the script is run directly.
sys.path.insert(0, str(Path(__file__).parent.parent))

# Directory where postmortem YAML files are written. Its parent is also used as
# the working directory for every git command in this script.
POSTMORTEM_DIR = Path(__file__).parent.parent / "postmortem"


def get_fix_commits(since: Optional[str] = None, limit: int = 100) -> List[Dict]:
    """Return the commits selected by ``git log --grep=^fix -i --all``.

    Args:
        since: Optional value forwarded to ``git log --since``.
        limit: Maximum number of commits to return (applied after parsing).

    Returns:
        A list of dicts with the keys ``hash``, ``subject`` and ``date``
        (author date in strict ISO 8601, as produced by ``%aI``).
        An empty list is returned when git prints nothing.
    """
    cmd = [
        "git",
        "log",
        "--grep=^fix",
        "-i",
        "--all",
        "--format=%H|%s|%aI",
    ]
    if since:
        cmd.extend(["--since", since])

    # Decode explicitly as UTF-8 (git's default log output encoding) instead of
    # the locale encoding, and never crash on undecodable bytes.
    result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace",
        cwd=POSTMORTEM_DIR.parent,
    )
    if result.returncode != 0:
        # Keep the best-effort behaviour (parse whatever was printed), but do
        # not hide the failure from the user.
        print(f"Warning: git log failed: {result.stderr.strip()}", file=sys.stderr)

    commits = []
    for line in result.stdout.strip().split("\n"):
        if not line:
            continue
        # The hash is everything before the first "|" and the date everything
        # after the last one, so a subject that itself contains "|" stays intact.
        commit_hash, first_sep, remainder = line.partition("|")
        subject, last_sep, date = remainder.rpartition("|")
        if not (first_sep and last_sep):
            continue
        commits.append(
            {
                "hash": commit_hash,
                "subject": subject,
                "date": date,
            }
        )

    return commits[:limit]


def get_commit_details(commit_hash: str) -> Dict:
    """Collect the body, changed files and a truncated diff for one commit.

    Args:
        commit_hash: Revision passed to ``git log`` / ``git show``.

    Returns:
        A dict with:
        ``body``  - the commit message body (``%b``), stripped;
        ``files`` - non-empty lines of ``git show --name-only``;
        ``diff``  - the first 6000 characters of ``git show --stat -p``
                    restricted to the ``*.py`` pathspec.
    """
    cwd = POSTMORTEM_DIR.parent

    def run_git(*args: str) -> str:
        # Decode as UTF-8 with replacement so that non-ASCII commit messages or
        # odd bytes in a diff cannot raise UnicodeDecodeError under a
        # non-UTF-8 locale.
        completed = subprocess.run(
            ["git", *args],
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            cwd=cwd,
        )
        return completed.stdout

    # Commit message body
    body = run_git("log", "-1", "--format=%b", commit_hash).strip()

    # Paths touched by the commit (empty --format suppresses the header)
    name_output = run_git("show", commit_hash, "--name-only", "--format=")
    files = [f for f in name_output.strip().split("\n") if f]

    # Diff content, Python files only, capped to keep later prompts small
    diff = run_git("show", commit_hash, "--stat", "-p", "--", "*.py")[:6000]

    return {"body": body, "files": files, "diff": diff}


def assess_commit_quality(commit: Dict, details: Dict) -> float:
    """Rate how informative a commit is, as a score capped at 1.0.

    Points are awarded for: body longer than 50 characters (0.3), body longer
    than 150 characters (0.2), a problem-related keyword in the body (0.2),
    list/heading markers in the body (0.1), parentheses in the subject (0.1)
    and at least one changed file matching an "important" path fragment (0.1).

    Args:
        commit: Dict that may carry a ``subject`` string.
        details: Dict that may carry ``body`` (str) and ``files`` (list of str).

    Returns:
        The accumulated score, never above 1.0.
    """
    message_body = details.get("body", "")
    headline = commit.get("subject", "")
    total = 0.0

    # Reward a detailed description (two tiers)
    body_length = len(message_body)
    if body_length > 50:
        total += 0.3
    if body_length > 150:
        total += 0.2

    # Reward problem-description vocabulary (case-insensitive)
    lowered_body = message_body.lower()
    for keyword in ("问题", "原因", "修复", "bug", "error", "issue", "cause", "fix"):
        if keyword in lowered_body:
            total += 0.2
            break

    # Reward structured formatting
    for marker in ("##", "- ", "1.", "*"):
        if marker in message_body:
            total += 0.1
            break

    # Reward an explicit scope such as "fix(api): ..."
    if all(bracket in headline for bracket in "()"):
        total += 0.1

    # Reward changes to important files
    key_fragments = ("recommender", "api/index", "llm", "config")
    changed_paths = details.get("files", [])
    if any(fragment in path for path in changed_paths for fragment in key_fragments):
        total += 0.1

    return total if total < 1.0 else 1.0


def parse_llm_response(response: str) -> Dict:
    """Parse an LLM reply into a dict, tolerating common formatting noise.

    Attempts, in order:
      1. parse the reply as JSON as-is;
      2. strip markdown code fences and parse again;
      3. decode the first JSON object that starts at the first ``{``,
         ignoring anything that follows it.

    Only JSON *objects* are accepted; a list, number, string or ``null`` is
    treated as a parse failure because callers use the result as a dict.

    Args:
        response: Raw LLM output. ``None`` or a non-string value is tolerated.

    Returns:
        The parsed dict, or a placeholder dict tagged ``parse-failed`` whose
        ``description`` holds the first 500 characters of the reply.
    """
    if isinstance(response, str):
        text = response
    else:
        text = "" if response is None else str(response)

    def load_object(candidate: str) -> Optional[dict]:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            return None
        return parsed if isinstance(parsed, dict) else None

    # Try the reply unchanged
    parsed = load_object(text)
    if parsed is not None:
        return parsed

    # Strip markdown code fences
    cleaned = re.sub(r"^```(?:json)?\s*", "", text, flags=re.MULTILINE)
    cleaned = re.sub(r"\s*```$", "", cleaned, flags=re.MULTILINE)
    cleaned = cleaned.strip()

    parsed = load_object(cleaned)
    if parsed is not None:
        return parsed

    # Extract an embedded JSON object. raw_decode stops at the end of the first
    # complete value, so trailing prose (even prose containing braces) is fine.
    start = cleaned.find("{")
    if start != -1:
        try:
            candidate, _end = json.JSONDecoder().raw_decode(cleaned, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate

    # Nothing usable: return a minimal placeholder structure
    return {
        "title": "解析失败",
        "description": text[:500],
        "severity": "low",
        "tags": ["parse-failed"],
    }


async def generate_postmortem_with_llm(commit: Dict, details: Dict) -> Dict:
    """Ask the project LLM to draft a postmortem entry for one fix commit.

    The prompt embeds the commit subject and date, up to 1000 characters of
    the body, up to 15 changed file paths and up to 3000 characters of diff.

    Args:
        commit: Dict with at least ``subject`` and ``date``.
        details: Dict with optional ``body``, ``files`` and ``diff`` entries.

    Returns:
        The result of ``parse_llm_response`` applied to the LLM reply. When
        the LLM module cannot be imported, constructed or called, the result
        of ``extract_from_commit`` is returned instead.
    """
    try:
        # Imported lazily so the script still works without the LLM stack.
        from app.llm import LLM

        llm = LLM()
    except ImportError as e:
        print(f"  Warning: Cannot import LLM module: {e}")
        return extract_from_commit(commit, details)
    except Exception as e:
        print(f"  Warning: LLM init failed: {e}")
        return extract_from_commit(commit, details)

    # `body` is normally present but empty, so test truthiness rather than
    # relying on a .get() default; otherwise the placeholder is never shown.
    body_text = (details.get("body") or "(无)")[:1000]
    files_text = "\n".join((details.get("files") or [])[:15])
    diff_text = (details.get("diff") or "")[:3000]

    prompt = f"""分析以下 git fix commit，生成一个 postmortem 条目。

Commit 信息:
- Subject: {commit['subject']}
- Date: {commit['date']}
- Body: {body_text}

修改的文件:
{files_text}

代码变更摘要:
{diff_text}

请生成 JSON 格式的 postmortem，包含以下字段：
1. title: 简短标题（中文，10-30字）
2. description: 问题描述（2-3句话，描述问题现象和影响）
3. root_cause: 根因分析（1-2句话）
4. severity: critical/high/medium/low（根据影响范围判断）
5. triggers: 对象，包含:
   - files: 相关文件模式列表（如 "app/tool/*.py"）
   - functions: 相关函数名列表（从 diff 中提取）
   - patterns: 正则匹配模式列表（用于匹配未来的 diff 内容）
   - keywords: 关键词列表（中英文都可以）
6. fix_pattern: 对象，包含:
   - approach: 修复方法描述
   - key_changes: 关键变更点列表
7. verification: 验证检查点列表（未来修改相关代码时应检查的事项）
8. tags: 标签列表（用于分类，如 geocoding, ui, api 等）

只返回 JSON，不要其他文字。"""

    try:
        # NOTE(review): assumes llm.ask returns the reply text - confirm
        # against app.llm.
        response = await llm.ask(
            messages=[{"role": "user", "content": prompt}],
            stream=False,
            temperature=0.2,
        )
        return parse_llm_response(response)
    except Exception as e:
        # Best effort: any LLM failure degrades to the commit-only extraction.
        print(f"  Warning: LLM call failed: {e}")
        return extract_from_commit(commit, details)


def extract_from_commit(commit: Dict, details: Dict) -> Dict:
    """Build a postmortem dict straight from commit data (fallback without LLM).

    Args:
        commit: Dict that may carry a ``subject`` string.
        details: Dict that may carry ``body``, ``files`` and ``diff``.

    Returns:
        A dict with the keys ``title``, ``description``, ``root_cause``,
        ``severity``, ``triggers``, ``fix_pattern``, ``verification`` and
        ``tags``. The tag is the ``fix(<scope>)`` scope when the subject has
        one, otherwise ``"general"``.
    """
    subject = commit.get("subject", "")
    body = details.get("body", "")
    files = details.get("files", [])
    diff = details.get("diff", "")

    # Derive the tag from the scope. Accept any characters up to ")" so that
    # scopes such as "api-v2" or "ui/map" are recognised, matching the pattern
    # used for title cleanup below.
    scope_match = re.search(r"fix\(([^)]+)\)", subject, re.IGNORECASE)
    scope = scope_match.group(1).strip() if scope_match else ""
    tags = [scope] if scope else ["general"]

    # Strip the "fix:", "fix(scope):" or "fix(scope)!:" prefix from the title
    title = re.sub(r"^fix(\([^)]+\))?!?:\s*", "", subject, flags=re.IGNORECASE)

    # Function names defined in the diff: de-duplicated in first-seen order so
    # the result is deterministic across runs (a set would not be).
    functions = list(dict.fromkeys(re.findall(r"def\s+(\w+)\s*\(", diff)))[:5]

    return {
        "title": title[:50] if title else "Fix commit",
        "description": body[:300] if body else subject,
        "root_cause": "See commit body for details",
        "severity": "medium",
        "triggers": {
            "files": files[:5],
            "functions": functions,
            "patterns": [],
            # Separate list objects: PyYAML emits anchors/aliases when the
            # very same list is referenced twice in one document.
            "keywords": list(tags),
        },
        "fix_pattern": {
            "approach": title,
            "key_changes": [title],
        },
        "verification": ["Review related code changes"],
        "tags": list(tags),
    }


def get_next_pm_id(year: int) -> str:
    """Return the next free postmortem ID for ``year``, e.g. ``PM-2025-004``.

    Creates ``POSTMORTEM_DIR`` if it does not exist, then looks at the
    ``PM-<year>-*.yaml`` files in it and returns one more than the highest
    numeric suffix found (zero-padded to at least three digits).

    Args:
        year: Year component of the ID.

    Returns:
        ``PM-<year>-001`` when no numbered file exists yet, otherwise the
        successor of the highest existing number.
    """
    POSTMORTEM_DIR.mkdir(exist_ok=True)

    numbers = []
    for path in POSTMORTEM_DIR.glob(f"PM-{year}-*.yaml"):
        suffix = path.stem.split("-")[-1]
        try:
            numbers.append(int(suffix))
        except ValueError:
            # Ignore files such as "PM-2025-draft.yaml" instead of crashing.
            continue

    next_number = max(numbers, default=0) + 1
    return f"PM-{year}-{next_number:03d}"


def save_postmortem(pm_data: Dict, commit: Dict, details: Dict, pm_id: str) -> Path:
    """Write one postmortem as ``<pm_id>.yaml`` inside ``POSTMORTEM_DIR``.

    Args:
        pm_data: Postmortem fields (from the LLM or the commit fallback).
            Missing fields get defaults; a non-dict value is treated as empty.
        commit: Dict with a ``hash`` entry; its first 7 characters are stored.
        details: Dict whose ``files`` list is recorded as ``files_changed`` and
            used (first 5 entries) when ``triggers`` has no ``files``.
        pm_id: Identifier used for the ``id`` field and the file name.

    Returns:
        Path of the written YAML file.
    """
    # Local import keeps this function self-contained; the module itself only
    # imports `datetime` from the datetime package.
    from datetime import timezone

    def as_list(value) -> list:
        # Normalise to a *fresh* list: None -> [], scalar -> [scalar].
        # Copying also avoids PyYAML anchors/aliases, which appear when the
        # same list object is referenced twice in one document.
        if value is None:
            return []
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    POSTMORTEM_DIR.mkdir(exist_ok=True)

    if not isinstance(pm_data, dict):
        pm_data = {}

    # Make sure triggers has the full structure
    triggers = pm_data.get("triggers", {})
    if not isinstance(triggers, dict):
        triggers = {}

    changed_files = as_list(details.get("files"))
    trigger_files = triggers.get("files")
    if trigger_files is None:
        trigger_files = changed_files[:5]

    # Same "YYYY-MM-DDTHH:MM:SS[.ffffff]Z" format as before, without relying on
    # the deprecated datetime.utcnow().
    created_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    output = {
        "id": pm_id,
        "created_at": created_at,
        "source_commit": commit["hash"][:7],
        "severity": pm_data.get("severity", "medium"),
        "title": pm_data.get("title", "Untitled"),
        "description": pm_data.get("description", ""),
        "root_cause": pm_data.get("root_cause", ""),
        "triggers": {
            "files": as_list(trigger_files),
            "functions": as_list(triggers.get("functions")),
            "patterns": as_list(triggers.get("patterns")),
            "keywords": as_list(triggers.get("keywords")),
        },
        "fix_pattern": pm_data.get("fix_pattern") or {},
        "verification": as_list(pm_data.get("verification")),
        "related": {
            "files_changed": changed_files,
        },
        "tags": as_list(pm_data.get("tags")),
    }

    filepath = POSTMORTEM_DIR / f"{pm_id}.yaml"
    with open(filepath, "w", encoding="utf-8") as f:
        yaml.dump(
            output, f, allow_unicode=True, default_flow_style=False, sort_keys=False
        )

    return filepath


async def main():
    """Command-line entry point: turn historical fix commits into postmortems.

    Parses ``--since``, ``--limit``, ``--dry-run`` and ``--min-quality``,
    lists the fix commits, scores each one and, unless it falls below the
    quality threshold or dry-run mode is on, generates and saves a postmortem
    for it. Progress and a final summary are printed to stdout.
    """
    cli = argparse.ArgumentParser(description="Postmortem Onboarding")
    cli.add_argument("--since", help="Start date (YYYY-MM-DD)")
    cli.add_argument("--limit", type=int, default=50, help="Max commits to process")
    cli.add_argument("--dry-run", action="store_true", help="Preview only, no generation")
    cli.add_argument(
        "--min-quality",
        type=float,
        default=0.2,
        help="Minimum quality score to generate postmortem",
    )
    options = cli.parse_args()

    print("Fetching fix commits...")
    fix_commits = get_fix_commits(since=options.since, limit=options.limit)
    total = len(fix_commits)
    print(f"Found {total} fix commits")

    if total == 0:
        print("No fix commits found.")
        return

    generated_count = 0
    skipped_count = 0

    for position, entry in enumerate(fix_commits, start=1):
        short_hash = entry["hash"][:7]
        print(f"\n[{position}/{total}] {short_hash}: {entry['subject'][:60]}")

        info = get_commit_details(entry["hash"])
        score = assess_commit_quality(entry, info)

        print(f"  Quality: {score:.2f}, Files: {len(info['files'])}")

        # Low-quality commits are counted as skipped and not processed further
        if score < options.min_quality:
            print(f"  Skipped: quality below threshold ({options.min_quality})")
            skipped_count += 1
            continue

        # Dry-run only previews; nothing is generated or counted
        if options.dry_run:
            print("  [DRY-RUN] Would generate postmortem")
            continue

        # Generate the postmortem content
        postmortem = await generate_postmortem_with_llm(entry, info)

        # The ID uses the year taken from the commit date
        commit_year = int(entry["date"][:4])
        new_id = get_next_pm_id(commit_year)

        saved_path = save_postmortem(postmortem, entry, info, new_id)
        print(f"  Saved: {saved_path}")
        generated_count += 1

    print(f"\n{'=' * 50}")
    print(f"Summary: Generated {generated_count}, Skipped {skipped_count}")
    if generated_count > 0:
        print(f"Postmortems saved to: {POSTMORTEM_DIR}/")

# Run the async CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())