# build_backend_code_knowledge.py 20.8 KB
#
# Raw Blame History Permalink

#!/usr/bin/env python3
"""
扫描后台 Java 多模块仓库，生成代码实现补充知识资产。

输出：
  dist/backend_code/code_atoms.jsonl
  dist/backend_code/01_接口契约.md
  dist/backend_code/02_枚举与状态.md
  dist/backend_code/03_实现约束.md
  dist/backend_code/04_模块映射.md
  dist/backend_code/05_业务实现主题.md
"""

from __future__ import annotations

import argparse
import hashlib
import json
import re
from collections import defaultdict
from pathlib import Path


# Parent of the directory that contains this script.
BASE_DIR = Path(__file__).parent.parent
# Directory that receives the generated JSONL atoms and Markdown reports.
OUT_DIR = BASE_DIR / "dist" / "backend_code"

# Glob patterns, evaluated relative to the scanned repository root.
# Controllers are picked by the "*Controller.java" file-name suffix.
CTRL_GLOBS = ("**/src/main/java/**/*Controller.java",)
# Enum candidates are picked by file-name suffix only; files without a
# "public enum" declaration are skipped later by extract_enum_atoms.
ENUM_GLOBS = ("**/src/main/java/**/*Enum.java", "**/src/main/java/**/*Type.java", "**/src/main/java/**/*Status.java")
# Every Java source file; used when collecting implementation constraints.
JAVA_GLOBS = ("**/src/main/java/**/*.java",)

# Annotation names (without the leading "@") that extract_auth_annotations
# looks for as "@<name>" substrings in a handler's annotation block.
AUTH_ANNOS = {
    "DoctorAccess",
    "PatientAccess",
    "InternalAccess",
    "AssistantDeny",
    "AdminAccess",
}

# Top-level directory name prefix -> business module tag.
# infer_business_module walks this mapping in insertion order and returns the
# tag of the first prefix that matches, so the order of entries is significant.
# Directories that match no prefix fall back to "GENERAL".
MODULE_MAP = {
    "account": "AUTH",
    "doctor": "AUTH",
    "income": "INCOME",
    "pay": "INCOME",
    "alipay": "INCOME",
    "appointment": "CLINIC",
    "patient": "PATIENT",
    "relation": "PATIENT",
    "followup": "PATIENT",
    "chat": "INQUIRY",
    "solution": "INQUIRY",
    "classical": "INQUIRY",
    "express": "GENERAL",
    "other": "GENERAL",
    "openai": "GENERAL",
    "medconnect": "GENERAL",
    "his": "GENERAL",
    "studio": "GENERAL",
    "wechat": "NOTIFICATION",
    "sms": "NOTIFICATION",
    "netease": "NOTIFICATION",
}

# Business module tag -> display name used in the Markdown headings.
# render_business_topics iterates this mapping, so its order is also the
# order of the module sections in that report.
# NOTE(review): no MODULE_MAP entry produces "BACKSTAGE" in this file.
MODULE_NAMES = {
    "AUTH": "认证",
    "INCOME": "收入提现",
    "INQUIRY": "问诊",
    "CLINIC": "门诊",
    "PATIENT": "患者",
    "NOTIFICATION": "通知",
    "BACKSTAGE": "后台",
    "GENERAL": "通用",
}


def clean_text(text: str) -> str:
    """Collapse each whitespace run in ``text`` to one space and trim both ends.

    Falsy input (``None``, ``""``, ``0`` ...) yields an empty string; any other
    non-string value is converted with ``str()`` before normalising.
    """
    source = str(text) if text else ""
    collapsed = re.sub(r"\s+", " ", source)
    return collapsed.strip()


def top_module_from_path(path: Path, repo_root: Path) -> str:
    """Return the first path component of ``path`` below ``repo_root``.

    For a file inside a multi-module repository this is the name of the
    top-level module directory. ``Path.relative_to`` raises ``ValueError``
    when ``path`` is not located under ``repo_root``.
    """
    relative_parts = path.relative_to(repo_root).parts
    return relative_parts[0]


def infer_business_module(module_name: str) -> str:
    """Map a top-level module directory name to its business module tag.

    The prefixes in ``MODULE_MAP`` are tried in insertion order and the tag of
    the first prefix that ``module_name`` starts with is returned. Names that
    match no prefix are tagged ``"GENERAL"``.
    """
    matching_tags = (
        tag for prefix, tag in MODULE_MAP.items() if module_name.startswith(prefix)
    )
    return next(matching_tags, "GENERAL")


def make_atom_id(atom_type: str, source_path: str, method_or_name: str) -> str:
    """Build a deterministic identifier for an atom.

    The three parts are joined with ``:`` and hashed with MD5; the hex digest
    (32 characters) is returned. MD5 serves only as a stable fingerprint here.
    """
    key = "{}:{}:{}".format(atom_type, source_path, method_or_name)
    digest = hashlib.md5()
    digest.update(key.encode("utf-8"))
    return digest.hexdigest()


def read_text(path: Path) -> str:
    """Read ``path`` as UTF-8 text, tolerating invalid byte sequences.

    A strict decode is attempted first. If it fails, the file is read again
    with undecodable bytes silently dropped, so one badly encoded source file
    does not abort the whole scan.
    """
    try:
        content = path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        # Second pass over the same file: skip the bytes that cannot be decoded.
        content = path.read_text(encoding="utf-8", errors="ignore")
    return content


def first_doc_comment(text: str, anchor: str) -> str:
    """Return the Javadoc summary attached to the declaration found at ``anchor``.

    The closest ``/** ... */`` block that ends before the first occurrence of
    ``anchor`` is used. Leading ``*`` decorations are removed, lines starting
    with a Javadoc tag (``@param``, ``@return`` ...) are dropped, and the
    remaining lines are joined into a single whitespace-normalised string.

    Args:
        text: Java source text.
        anchor: Literal text that starts the declaration, for example
            ``"public class Foo"``.

    Returns:
        The summary, or ``""`` when the anchor is missing, no Javadoc precedes
        it, or the closest Javadoc belongs to a different declaration.
    """
    anchor_pos = text.find(anchor)
    if anchor_pos == -1:
        return ""
    head = text[:anchor_pos]

    # Keep only the last Javadoc block in front of the anchor.
    last_doc = None
    for last_doc in re.finditer(r"/\*\*(.*?)\*/", head, re.S):
        pass
    if last_doc is None:
        return ""

    # Only annotations may sit between a doc comment and its declaration.
    # A ';' in that gap means a statement (package, import, field ...) was
    # terminated there, so the comment documents something else, e.g. a file
    # header in front of the package line.
    if ";" in head[last_doc.end():]:
        return ""

    lines = []
    for raw in last_doc.group(1).splitlines():
        line = re.sub(r"^\s*\*\s?", "", raw).strip()
        if line and not line.startswith("@"):
            lines.append(line)
    return clean_text(" ".join(lines))


def class_request_mapping(text: str) -> str:
    """Return the class-level ``@RequestMapping`` path of a controller source.

    The last ``@RequestMapping(...)`` that appears before the public class
    declaration is inspected. An explicit ``value=`` or ``path=`` attribute is
    preferred; otherwise the first non-empty quoted string is used.

    Args:
        text: Java source text of the controller.

    Returns:
        The base path, or ``""`` when there is no public class declaration,
        no class-level ``@RequestMapping(...)`` or no quoted path in it.
    """
    # Allow modifiers between "public" and "class" so abstract/final
    # controllers keep their base path.
    declaration = re.search(r"\bpublic\s+(?:(?:abstract|final|static|strictfp)\s+)*class\b", text)
    if declaration is None:
        return ""
    head = text[: declaration.start()]
    matches = list(re.finditer(r"@RequestMapping\(([^)]*)\)", head, re.S))
    if not matches:
        return ""
    inner = matches[-1].group(1)
    # Named attributes may appear in any order, so look for value=/path=
    # before falling back to the first quoted string.
    value = re.search(r'\b(?:value|path)\s*=\s*\{?\s*"([^"]+)"', inner) or re.search(r'"([^"]+)"', inner)
    return value.group(1) if value else ""


def prettify_scope_from_path(base_path: str) -> str:
    """Derive a short, readable scope label from a route base path.

    Empty segments and the literal segment ``api`` are ignored. The last two
    remaining segments are joined with ``" / "``; a single remaining segment is
    returned as is, and ``""`` is returned when nothing remains.
    """
    segments = []
    for segment in base_path.split("/"):
        if segment in ("", "api"):
            continue
        segments.append(segment)
    # Slicing covers all three cases: zero, one, or two-or-more segments.
    return " / ".join(segments[-2:])


def controller_feature_scope(class_name: str, class_doc: str, base_path: str) -> str:
    """Pick a feature-scope label for a controller.

    Preference order:
      1. the class Javadoc summary, if it is non-empty, at most 24 characters
         long and does not contain the word "controller" (case-insensitive);
      2. a label derived from the class-level route path;
      3. the class name with every "Controller" occurrence removed.
    """
    summary = clean_text(class_doc)
    doc_is_usable = bool(summary) and len(summary) <= 24 and "controller" not in summary.lower()
    if doc_is_usable:
        return summary
    return prettify_scope_from_path(base_path) or class_name.replace("Controller", "")


def extract_auth_annotations(block: str) -> list[str]:
    """Return the known auth annotation names present in ``block``, sorted.

    Detection is a plain substring test for ``"@<name>"`` against each entry
    of ``AUTH_ANNOS``; the returned names do not include the leading ``@``.
    """
    return sorted(name for name in AUTH_ANNOS if "@" + name in block)


def method_http_and_path(annotation: str) -> tuple[str, str]:
    """Extract the HTTP verb and sub-path from a Spring mapping annotation.

    Args:
        annotation: Annotation text such as ``@GetMapping("/list")`` or
            ``@RequestMapping(value = "/x", method = RequestMethod.POST)``.

    Returns:
        ``(verb, path)``. The verb comes from the composed annotation name
        (``@GetMapping``, ``@PostMapping``, ``@PutMapping``, ``@DeleteMapping``,
        ``@PatchMapping``) or, failing that, from the first ``RequestMethod.X``
        reference; ``"REQUEST"`` is used when neither is present. The path is
        the ``value=``/``path=`` attribute when given, otherwise the first
        quoted string, otherwise ``""``.
    """
    shortcut = re.match(r"@(Get|Post|Put|Delete|Patch)Mapping", annotation)
    if shortcut:
        verb = shortcut.group(1).upper()
    else:
        verb_match = re.search(r"RequestMethod\.([A-Z]+)", annotation)
        verb = verb_match.group(1) if verb_match else "REQUEST"

    # Named attributes can appear in any order (e.g. produces= before value=),
    # so prefer the explicit value=/path= attribute over the first quoted string.
    path_match = re.search(r'\b(?:value|path)\s*=\s*\{?\s*"([^"]*)"', annotation) or re.search(
        r'"([^"]*)"', annotation
    )
    path = path_match.group(1) if path_match else ""
    return verb, path


def _owned_javadoc_summary(text: str, decl_start: int) -> str:
    """Return the summary of the Javadoc that documents the declaration at ``decl_start``.

    The closest ``/** ... */`` block ending before ``decl_start`` is accepted
    only when nothing but blank lines, annotation lines and ``//`` comments
    separates it from the declaration. Otherwise the comment belongs to an
    earlier member and ``""`` is returned.
    """
    last_doc = None
    for last_doc in re.finditer(r"/\*\*(.*?)\*/", text[:decl_start], re.S):
        pass
    if last_doc is None:
        return ""
    for gap_line in text[last_doc.end():decl_start].splitlines():
        gap_line = gap_line.strip()
        if gap_line and not gap_line.startswith(("@", "//")):
            return ""
    kept = []
    for raw in last_doc.group(1).splitlines():
        line = re.sub(r"^\s*\*\s?", "", raw).strip()
        # Javadoc tags (@param, @return ...) are not part of the summary.
        if line and not line.startswith("@"):
            kept.append(line)
    return clean_text(" ".join(kept))


def extract_api_atoms(repo_root: Path) -> list[dict]:
    """Scan controller sources and build one ``api_contract`` atom per mapped handler.

    Every file matching ``CTRL_GLOBS`` is searched for public methods that are
    preceded by a block of annotations. Methods whose annotation block holds
    a Spring mapping annotation (``@GetMapping``, ``@PostMapping``,
    ``@PutMapping``, ``@DeleteMapping``, ``@PatchMapping``, ``@RequestMapping``)
    produce an atom with the HTTP verb, the full route (class base path plus
    method sub-path), the method's own Javadoc summary and any auth annotations.

    Args:
        repo_root: Root directory of the Java repository to scan.

    Returns:
        Atom dictionaries in file order (sorted paths), then source order.
    """
    # Annotation block followed by a public method signature.
    handler_pattern = re.compile(
        r"(?P<block>(?:\s*@[\w().,=\"/{}\s:-]+\n)+)\s*public\s+[^{;=\n]+?\s+(?P<method>\w+)\s*\(",
        re.M,
    )
    # Mapping annotation with its (possibly multi-line) argument list.
    mapping_pattern = re.compile(r"@(?:Get|Post|Put|Delete|Patch|Request)Mapping\b(?:\s*\([^)]*\))?")

    atoms = []
    for glob in CTRL_GLOBS:
        for path in sorted(repo_root.glob(glob)):
            text = read_text(path)
            module_name = top_module_from_path(path, repo_root)
            business_module = infer_business_module(module_name)
            base_path = class_request_mapping(text)
            class_name = path.stem
            class_doc = first_doc_comment(text, f"public class {class_name}")
            # Identical for every handler in the file, so computed once.
            feature_scope = clean_text(controller_feature_scope(class_name, class_doc, base_path)) or class_name
            relative_path = str(path.relative_to(repo_root))

            for match in handler_pattern.finditer(text):
                block = match.group("block")
                method = match.group("method")
                mapping = mapping_pattern.search(block)
                if mapping is None:
                    continue
                mapping_text = mapping.group(0)
                verb, sub_path = method_http_and_path(mapping_text)
                if mapping_text.startswith("@PatchMapping"):
                    # Pin the verb here so PATCH routes are labelled correctly
                    # however the verb helper classifies the annotation.
                    verb = "PATCH"
                auth = extract_auth_annotations(block)
                # Only a Javadoc directly above this handler counts; falling
                # back to an earlier member's comment would describe the wrong API.
                summary = _owned_javadoc_summary(text, match.start())
                full_path = f"{base_path.rstrip('/')}/{sub_path.lstrip('/')}".replace("//", "/")
                rule = f"{verb} {full_path} -> {method}"
                if summary:
                    rule += f"；说明：{summary}"
                if auth:
                    rule += f"；鉴权：{', '.join(auth)}"
                atoms.append(
                    {
                        # NOTE(review): the id is derived from (type, relative path,
                        # method name), so overloaded handlers in one controller
                        # share an id.
                        "atom_id": make_atom_id("api_contract", relative_path, method),
                        "atom_type": "api_contract",
                        "feature_scope": feature_scope,
                        "module_name": module_name,
                        "primary_module": business_module,
                        "modules": [business_module],
                        "source_path": str(path),
                        "repo_relative_path": relative_path,
                        "method_name": method,
                        "http_method": verb,
                        "route_path": full_path,
                        "rule_text": rule,
                        "canon_text": clean_text(rule),
                        "qa_status": "validated",
                    }
                )
    return atoms


def _enum_constants_source(text: str, body_start: int) -> str:
    """Return the enum-constant section that begins at ``body_start``, comments removed.

    The section ends at the first ``;`` (end of the constant list) or ``}``
    (end of an enum without members), whichever comes first.
    """
    rest = re.sub(r"/\*.*?\*/", "", text[body_start:], flags=re.S)
    # Drop line comments, but keep "://" so URLs inside string arguments survive.
    rest = re.sub(r"(?<!:)//[^\n]*", "", rest)
    return re.split(r"[;}]", rest, maxsplit=1)[0]


def _enum_items(constants_src: str, limit: int = 10) -> list[str]:
    """Turn the constant section of an enum into short ``NAME(arg / arg)`` samples."""
    items = []
    for raw in constants_src.splitlines():
        line = clean_text(raw)
        if not line or line.startswith(("/", "*")):
            continue
        with_args = re.match(r"^([A-Z0-9_]+)\s*\((.*)", line)
        if with_args:
            name, arg_text = with_args.groups()
            # Search the argument text only, so digits in the constant name
            # (e.g. TYPE_1) are not mistaken for an argument.
            found = re.findall(r'"([^"]+)"|(-?\d+)', arg_text)
            flat = [quoted or number for quoted, number in found if quoted or number]
            desc = " / ".join(flat[:2])
            items.append(f"{name}({desc})" if desc else name)
        else:
            # Bare constants, one or several per line: "A," or "A, B, C".
            tokens = [token.strip() for token in line.rstrip(",;").split(",")]
            if all(re.fullmatch(r"[A-Z0-9_]+", token) for token in tokens):
                items.extend(tokens)
        if len(items) >= limit:
            break
    return items


def extract_enum_atoms(repo_root: Path) -> list[dict]:
    """Scan enum-like sources and build one ``enum_definition`` atom per public enum.

    Files matching ``ENUM_GLOBS`` are visited once each. A file contributes an
    atom when it declares ``public enum <Name>`` (an ``implements`` clause is
    allowed). The atom's rule text holds the enum name, its Javadoc summary
    (or a fixed placeholder) and up to six sample constants.

    Args:
        repo_root: Root directory of the Java repository to scan.

    Returns:
        Atom dictionaries in glob order, then sorted path order.
    """
    atoms = []
    seen_paths = set()
    for glob in ENUM_GLOBS:
        for path in sorted(repo_root.glob(glob)):
            # A file can match several suffix globs; process it only once.
            if path in seen_paths:
                continue
            seen_paths.add(path)
            text = read_text(path)
            module_name = top_module_from_path(path, repo_root)
            business_module = infer_business_module(module_name)
            # "[^{;]*" skips an optional "implements ..." clause before the body.
            enum_match = re.search(r"public enum (\w+)[^{;]*\{", text)
            if not enum_match:
                continue
            enum_name = enum_match.group(1)
            items = _enum_items(_enum_constants_source(text, enum_match.end()))
            doc = first_doc_comment(text, f"public enum {enum_name}")
            rule = f"{enum_name}：{doc or '代码枚举定义'}"
            if items:
                rule += f"；取值样例：{', '.join(items[:6])}"
            relative_path = str(path.relative_to(repo_root))
            atoms.append(
                {
                    "atom_id": make_atom_id("enum_definition", relative_path, enum_name),
                    "atom_type": "enum_definition",
                    "feature_scope": enum_name,
                    "module_name": module_name,
                    "primary_module": business_module,
                    "modules": [business_module],
                    "source_path": str(path),
                    "repo_relative_path": relative_path,
                    "enum_name": enum_name,
                    "rule_text": rule,
                    "canon_text": clean_text(rule),
                    "qa_status": "validated",
                }
            )
    return atoms


def extract_constraint_atoms(repo_root: Path) -> list[dict]:
    """Collect ``impl_constraint`` atoms from every Java source file.

    Three kinds of constraints are recognised by regular expression:
    ``Assert.isTrue(..., "message")`` messages, messages of
    ``throw new BusinessException("message")`` and the argument text of
    ``@RequestLock(...)`` annotations.

    Atoms are de-duplicated per ``(repo-relative path, rule text)``: the first
    occurrence fixes the position in the result, the last occurrence supplies
    the atom that is kept.

    Args:
        repo_root: Root directory of the Java repository to scan.

    Returns:
        The de-duplicated atom dictionaries.
    """
    # (regex, atom-id prefix, rule label, normalise the captured text first?)
    extractors = (
        (r'Assert\.isTrue\([^,]+,\s*"([^"]+)"\)', "assert", "断言约束：", False),
        (r'throw new BusinessException\("([^"]+)"\)', "biz", "业务异常：", False),
        (r"@RequestLock\(([^)]*)\)", "lock", "请求锁：", True),
    )

    unique: dict[tuple[str, str], dict] = {}
    for pattern in JAVA_GLOBS:
        for java_file in sorted(repo_root.glob(pattern)):
            source = read_text(java_file)
            module_dir = top_module_from_path(java_file, repo_root)
            tag = infer_business_module(module_dir)
            owner = java_file.stem
            relative = str(java_file.relative_to(repo_root))

            for regex, id_prefix, label, normalise in extractors:
                for position, captured in enumerate(re.findall(regex, source)):
                    detail = clean_text(captured) if normalise else captured
                    rule_text = clean_text(f"{label}{detail}")
                    # Re-assigning an existing key keeps its original position
                    # but replaces the stored atom with the latest one.
                    unique[(relative, rule_text)] = {
                        "atom_id": make_atom_id("impl_constraint", relative, f"{id_prefix}-{position}"),
                        "atom_type": "impl_constraint",
                        "feature_scope": owner,
                        "module_name": module_dir,
                        "primary_module": tag,
                        "modules": [tag],
                        "source_path": str(java_file),
                        "repo_relative_path": relative,
                        "rule_text": rule_text,
                        "canon_text": rule_text,
                        "qa_status": "validated",
                    }
    return list(unique.values())


def render_api_contracts(atoms: list[dict]) -> str:
    """Render the API contract atoms as a Markdown document.

    Atoms are grouped by ``primary_module`` (sections sorted by tag). Within a
    section each atom becomes its own ``###`` entry, ordered by feature scope,
    route path and method name. Every section ends with a ``---`` rule.
    """
    by_module: dict[str, list[dict]] = defaultdict(list)
    for atom in atoms:
        by_module[atom["primary_module"]].append(atom)

    out = [
        "# 后台代码接口契约",
        "",
        "来源：后台仓库 controller 层，提取了 API 路径、方法和鉴权注解。",
        "",
    ]
    for module in sorted(by_module):
        out += [f"## {module} / {MODULE_NAMES.get(module, module)}", ""]
        ordered = sorted(
            by_module[module],
            key=lambda entry: (entry["feature_scope"], entry["route_path"], entry["method_name"]),
        )
        for entry in ordered:
            out += [
                f"### {entry['feature_scope']}",
                "",
                f"- 接口：{entry['http_method']} {entry['route_path']}",
                f"- 方法：{entry['method_name']}",
                f"- 模块目录：{entry['module_name']}",
                f"- 文件：{entry['repo_relative_path']}",
                f"- 说明：{entry['rule_text']}",
                "",
            ]
        out += ["---", ""]
    return "\n".join(out)


def render_enums(atoms: list[dict]) -> str:
    """Render the enum atoms as a Markdown document.

    Atoms are grouped by ``primary_module`` (sections sorted by tag) and
    listed by feature scope; each atom yields a bullet with its rule text
    followed by an indented line naming the source file.
    """
    by_module: dict[str, list[dict]] = defaultdict(list)
    for atom in atoms:
        by_module[atom["primary_module"]].append(atom)

    out = [
        "# 后台代码枚举与状态",
        "",
        "来源：后台仓库 enum/constant 层，适合补状态机、类型取值、默认枚举口径。",
        "",
    ]
    for module in sorted(by_module):
        out += [f"## {module} / {MODULE_NAMES.get(module, module)}", ""]
        for entry in sorted(by_module[module], key=lambda item: item["feature_scope"]):
            out += [
                f"- {entry['feature_scope']}：{entry['rule_text']}",
                f"  文件：{entry['repo_relative_path']}",
            ]
        out.append("")
    return "\n".join(out)


def render_constraints(atoms: list[dict]) -> str:
    """Render the implementation-constraint atoms as a Markdown document.

    Atoms are grouped by ``primary_module`` (sections sorted by tag) and
    listed by feature scope, then repo-relative path, one bullet per atom.
    """
    by_module: dict[str, list[dict]] = defaultdict(list)
    for atom in atoms:
        by_module[atom["primary_module"]].append(atom)

    out = [
        "# 后台代码实现约束",
        "",
        "来源：后台仓库中的断言、业务异常和请求锁，适合补实现边界和预评审风险。",
        "",
    ]
    for module in sorted(by_module):
        out += [f"## {module} / {MODULE_NAMES.get(module, module)}", ""]
        ordered = sorted(
            by_module[module],
            key=lambda item: (item["feature_scope"], item["repo_relative_path"]),
        )
        out.extend(
            f"- {entry['feature_scope']}：{entry['rule_text']} | {entry['repo_relative_path']}"
            for entry in ordered
        )
        out.append("")
    return "\n".join(out)


def render_module_map(all_atoms: list[dict]) -> str:
    """Render which top-level module directories feed each business module tag.

    For every ``primary_module`` found in ``all_atoms`` (sorted), the distinct
    ``module_name`` values are listed in sorted order.
    """
    dirs_by_tag: dict[str, set[str]] = defaultdict(set)
    for atom in all_atoms:
        dirs_by_tag[atom["primary_module"]].add(atom["module_name"])

    out = [
        "# 后台代码模块映射",
        "",
        "这个映射用于把后台实现层挂回现有产品知识库的辅助模块标签。",
        "",
    ]
    for tag in sorted(dirs_by_tag):
        out.append(f"## {tag} / {MODULE_NAMES.get(tag, tag)}")
        out.append("")
        out.extend(f"- {directory}" for directory in sorted(dirs_by_tag[tag]))
        out.append("")
    return "\n".join(out)


def render_business_topics(api_atoms: list[dict], enum_atoms: list[dict], constraint_atoms: list[dict]) -> str:
    """Render a per-topic digest of APIs, constraints and enums as Markdown.

    A topic is a ``(primary_module, scope)`` pair. API and constraint atoms use
    their ``feature_scope`` as scope, enum atoms use their ``module_name``.
    Modules are emitted in ``MODULE_NAMES`` order; modules without topics are
    skipped. Per module at most 40 topics are shown, sorted by lower-cased
    scope and then by the number of API plus constraint atoms. Each topic
    lists up to 5 API samples, 5 constraint samples and 3 enum samples.
    """
    topics: dict[tuple[str, str], dict[str, list[dict]]] = defaultdict(
        lambda: {"api": [], "enum": [], "constraint": []}
    )
    # Fill order (api, enum, constraint) decides the order in which topics are first seen.
    for source_atoms, kind, scope_field in (
        (api_atoms, "api", "feature_scope"),
        (enum_atoms, "enum", "module_name"),
        (constraint_atoms, "constraint", "feature_scope"),
    ):
        for atom in source_atoms:
            topics[(atom["primary_module"], atom[scope_field])][kind].append(atom)

    out = [
        "# 后台代码业务实现主题",
        "",
        "按业务主题汇总后台实现层的接口、状态和约束，便于接入预评审和代码知识库。",
        "",
    ]
    for module, display_name in MODULE_NAMES.items():
        module_topics = [(scope, bucket) for (owner, scope), bucket in topics.items() if owner == module]
        if not module_topics:
            continue
        out += [f"## {module} / {display_name}", ""]
        ranked = sorted(
            module_topics,
            key=lambda pair: (pair[0].lower(), len(pair[1]["api"]) + len(pair[1]["constraint"])),
        )
        for scope, bucket in ranked[:40]:
            out += [f"### {scope}", ""]
            apis, constraints, enums = bucket["api"], bucket["constraint"], bucket["enum"]
            if apis:
                out.append(f"- 接口数量：{len(apis)}")
                for atom in sorted(apis, key=lambda item: (item["route_path"], item["method_name"]))[:5]:
                    # Everything after the auth marker in the rule text, if present.
                    auth = atom["rule_text"].partition("鉴权：")[2]
                    sample = f"- 接口样例：{atom['http_method']} {atom['route_path']}"
                    if auth:
                        sample += f" | {auth}"
                    out.append(sample)
            if constraints:
                out.append(f"- 约束数量：{len(constraints)}")
                out.extend(f"- 约束样例：{atom['rule_text']}" for atom in constraints[:5])
            if enums:
                out.append(f"- 枚举数量：{len(enums)}")
                out.extend(f"- 枚举样例：{atom['feature_scope']}" for atom in enums[:3])
            out.append("")
        out += ["---", ""]
    return "\n".join(out)


def main() -> None:
    """Command-line entry point: scan a repository and write all outputs.

    Reads the required ``--repo`` argument, extracts API, enum and constraint
    atoms, removes the files already present in ``OUT_DIR``, then writes
    ``code_atoms.jsonl`` plus the five Markdown reports and prints a short
    summary. Exits with ``repo_not_found: <path>`` when the repository path
    does not exist.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--repo", required=True, help="后台仓库根目录")
    options = cli.parse_args()

    repo_root = Path(options.repo).expanduser().resolve()
    if not repo_root.exists():
        raise SystemExit(f"repo_not_found: {repo_root}")

    api_atoms = extract_api_atoms(repo_root)
    enum_atoms = extract_enum_atoms(repo_root)
    constraint_atoms = extract_constraint_atoms(repo_root)
    all_atoms = api_atoms + enum_atoms + constraint_atoms

    # Start from a clean output directory; sub-directories are left untouched.
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    for stale in OUT_DIR.glob("*"):
        if not stale.is_file():
            continue
        stale.unlink()

    with (OUT_DIR / "code_atoms.jsonl").open("w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(atom, ensure_ascii=False) + "\n" for atom in all_atoms)

    # Each report is rendered right before it is written, in this order.
    reports = (
        ("01_接口契约.md", render_api_contracts, (api_atoms,)),
        ("02_枚举与状态.md", render_enums, (enum_atoms,)),
        ("03_实现约束.md", render_constraints, (constraint_atoms,)),
        ("04_模块映射.md", render_module_map, (all_atoms,)),
        ("05_业务实现主题.md", render_business_topics, (api_atoms, enum_atoms, constraint_atoms)),
    )
    for filename, render, render_args in reports:
        (OUT_DIR / filename).write_text(render(*render_args), encoding="utf-8")

    print(f"repo={repo_root}")
    print(f"api_atoms={len(api_atoms)}")
    print(f"enum_atoms={len(enum_atoms)}")
    print(f"constraint_atoms={len(constraint_atoms)}")
    print(f"output={OUT_DIR.relative_to(BASE_DIR)}")


# Run the scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()