parse_training_pdf.py 18.1 KB

Raw Blame History Permalink

#!/usr/bin/env python3
"""
产品培训文档 PDF → doc_atoms.jsonl（最高优先级知识源）

用法：
  python3 scripts/parse_training_pdf.py              # 批量处理 pdf/ 目录下所有 PDF
  python3 scripts/parse_training_pdf.py 4.40.0       # 只处理指定版本
  python3 scripts/parse_training_pdf.py --force      # 强制覆盖已存在的 doc_atoms.jsonl

同一版本多个 PDF（如 4.31.10.pdf / 4.31.10 补充1.pdf）自动合并处理。
输出：build/v{version}/doc_atoms.jsonl
"""

import json
import hashlib
import re
import sys
import os
from pathlib import Path
from collections import defaultdict

# Project root: two directory levels above this file.
BASE_DIR = Path(__file__).parent.parent
# Input directory scanned for training PDFs.
PDF_DIR = BASE_DIR / "pdf"
# Output root; one sub-directory per version is created below it.
BUILD_DIR = BASE_DIR / "build"

# ── Module keywords (kept in sync with parse_testcase) ────────────────────
# Maps a module code to the substrings whose presence in a text counts as
# one vote for that module.
MODULE_KEYWORDS = {
    "AUTH": ["认证", "证照", "身份证", "执业", "资质", "卫健委", "人脸", "医师类别",
             "工作室开通", "开通工作室", "电子签名", "证件", "备案", "互联网医院",
             "医师分类", "合规医"],
    "INCOME": ["提现", "签约", "工猫", "才燊", "结算", "税", "银行卡", "余额",
               "绩效", "收入", "诊金", "优惠券", "折扣", "立减", "分组优惠"],
    "INQUIRY": ["问诊", "咨询", "会话", "主诉", "咨询费", "义诊"],
    "CLINIC": ["开方", "处方", "坐诊", "预约", "挂号", "门诊", "排班", "拍方",
               "药房", "购药", "方案", "明医好方"],
    "PATIENT": ["患者", "就诊人", "档案", "关注", "粉丝"],
    "NOTIFICATION": ["通知", "消息", "待办", "push", "推送"],
    "BACKSTAGE": ["猫头鹰", "审核", "客服", "运营", "后台", "药店端"],
}


def infer_modules(text: str) -> tuple:
    """Infer which product modules a piece of text belongs to.

    Every keyword of ``MODULE_KEYWORDS`` that occurs in ``text`` adds one
    point to its module. Matching is case-insensitive, so an ASCII keyword
    such as ``"push"`` also matches ``"Push"`` or ``"PUSH"``.

    Args:
        text: Free text to classify.

    Returns:
        A ``(primary, modules)`` pair: the highest-scoring module code and a
        list of at most three module codes ordered by descending score.
        ``("GENERAL", ["GENERAL"])`` when no keyword matches.
    """
    # Fold case once up front; the variable used to be named "lower" without
    # actually being lowercased, which made ASCII keywords case-sensitive.
    text_lower = text.lower()
    scores = defaultdict(int)
    for mod, kws in MODULE_KEYWORDS.items():
        for kw in kws:
            if kw.lower() in text_lower:
                scores[mod] += 1
    if not scores:
        return "GENERAL", ["GENERAL"]
    # sorted() is stable, so ties keep the declaration order of MODULE_KEYWORDS.
    sorted_mods = sorted(scores.items(), key=lambda x: -x[1])
    primary = sorted_mods[0][0]
    modules = [m for m, _ in sorted_mods[:3]]
    return primary, modules


def fingerprint(text: str) -> str:
    """Return a short, stable identifier for *text*.

    Surrounding whitespace is stripped, the remainder is UTF-8 encoded and
    hashed with SHA-1; the first 12 hex digits of the digest are returned.
    """
    payload = text.strip().encode("utf-8")
    digest = hashlib.sha1(payload)
    return digest.hexdigest()[:12]


def normalize(text: str) -> str:
    """Collapse *text* into a single, whitespace-normalised line.

    A falsy input (``None`` or ``""``) yields ``""``. Otherwise every
    "hyphen + newline" pair is removed, each run of whitespace becomes one
    space, and leading/trailing whitespace is stripped.
    """
    if text:
        # Repair hyphenated line breaks (ASCII minus followed by a newline)
        # produced by the Feishu PDF export.
        rejoined = re.sub(r"-\n", "", text)
        return re.sub(r"\s+", " ", rejoined).strip()
    return ""


def extract_version_from_filename(name: str) -> str:
    """Extract the version number from a file name.

    The first ``X.Y`` or ``X.Y.Z`` digit group found in *name* is returned,
    e.g. ``"4.31.10 补充1.pdf"`` gives ``"4.31.10"``. When the name contains
    no such group the function returns ``None``.
    """
    found = re.search(r"(\d+\.\d+(?:\.\d+)?)", name)
    if found is None:
        return None
    return found.group(1)


def clean_pdf_line(line: str) -> str:
    """Clean a single line of text extracted by pypdf.

    Control characters such as ``\\x01`` (which pypdf uses as an in-line
    separator) are removed; tab, line feed and carriage return are not in the
    removed set. The result is stripped of surrounding whitespace.
    """
    control_chars = r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]"
    without_controls = re.sub(control_chars, "", line)
    return without_controls.strip()


def extract_text_from_pdf(pdf_path: Path) -> str:
    """Extract the text of a PDF, preferring pypdf and falling back to PyPDF2.

    Each page is rendered as a ``-- <page> of <total> --`` marker line
    followed by the page's non-empty lines after ``clean_pdf_line``; pages
    are separated by a blank line.

    Raises:
        RuntimeError: if neither pypdf nor PyPDF2 can be imported.
    """
    def render_pages(reader) -> str:
        page_count = len(reader.pages)
        rendered = []
        for page_no, page in enumerate(reader.pages, start=1):
            page_text = page.extract_text() or ""
            # Clean every line and drop those that end up empty
            # (e.g. lines consisting only of control characters).
            kept = [
                cleaned
                for cleaned in map(clean_pdf_line, page_text.split("\n"))
                if cleaned
            ]
            rendered.append(f"-- {page_no} of {page_count} --\n" + "\n".join(kept))
        return "\n\n".join(rendered)

    source = str(pdf_path)

    # First choice: pypdf.
    try:
        from pypdf import PdfReader
        return render_pages(PdfReader(source))
    except ImportError:
        pass

    # Fallback: the older PyPDF2 package.
    try:
        from PyPDF2 import PdfReader
        return render_pages(PdfReader(source))
    except ImportError:
        pass

    raise RuntimeError(
        "未找到 PDF 解析库，请运行：python3 -m pip install pypdf"
    )


# ── 文本解析：提取功能块 ───────────────────────────────────────────────────

def is_feature_title(line: str) -> bool:
    """Tell whether *line* is a pipe-marked feature-block title.

    The Feishu PDF export renders such titles as ``|Title|`` or ``|Title``,
    with either the ASCII ``|`` or the full-width ``｜``. The title text must
    be 2-40 characters long; for the single-bar form it must not contain
    another bar of the same kind.
    """
    candidate = line.strip()
    title_patterns = (
        # Bar on both sides.
        r"^\|.{2,40}\|$",
        r"^｜.{2,40}｜$",
        # Leading bar only (common Feishu form); no further bar allowed so
        # that table rows like "| xxx |" are not picked up by these two.
        r"^\|[^|]{2,40}$",
        r"^｜[^｜]{2,40}$",
    )
    return any(re.match(pattern, candidate) for pattern in title_patterns)


def get_feature_title(line: str) -> str:
    """Return the title text of *line* without surrounding bars or whitespace.

    Both the ASCII ``|`` and the full-width ``｜`` are removed from either end.
    """
    without_bars = line.strip().strip("|｜")
    return without_bars.strip()


def parse_feature_blocks(text: str) -> list[dict]:
    """
    Parse the full PDF text into a list of feature blocks.

    Three title layouts are recognised:
      A: a pipe-marked title (``|Title|`` / ``|Title``), as decided by
         ``is_feature_title``.
      B: an unmarked title line immediately followed by a background/scene
         or goal/design marker line.
      C: an unmarked title line immediately followed by a rule numbered 1
         (numbering restarts for each feature).

    Args:
        text: Extracted PDF text; ``-- N of M --`` page markers are removed
            before parsing.

    Returns:
        A list of dicts with the keys ``"title"``, ``"background"``,
        ``"goal"`` and ``"lines"`` (the remaining raw lines of the block).
        Lines before the first title are ignored. When no title is found at
        all, a single fallback block titled ``"版本说明"`` holding every
        numbered line longer than 8 characters is returned, or an empty list
        if there is no such line.
    """
    # Strip the per-page "-- N of M --" markers.
    text = re.sub(r"--\s*\d+\s*of\s*\d+\s*--", "", text)
    lines = [raw.strip() for raw in text.split("\n") if raw.strip()]

    # Marker words for background and goal (both document formats).
    BG_PREFIXES = ("背景：", "背景:", "场景：", "场景:")
    GOAL_PREFIXES = ("目标：", "目标:", "功能设计：", "功能设计:", "功能：", "功能:")
    MARKER_PREFIXES = BG_PREFIXES + GOAL_PREFIXES

    def starts_with_any(line, prefixes):
        return any(line.startswith(p) for p in prefixes)

    def extract_after_prefix(line, prefixes):
        for p in prefixes:
            if line.startswith(p):
                return line[len(p):].strip()
        return line

    def is_numbered_rule_start(line: str) -> bool:
        """Whether the line is a rule numbered 1 (start of a new feature)."""
        return bool(re.match(r"^1[.．、]\s*.{3,}", line))

    def is_rule_line(line: str) -> bool:
        return bool(re.match(r"^\d+[.．、）)]\s*", line))

    # Pass 1: decide which line indices are feature titles.
    title_line_indices = set()
    for i, line in enumerate(lines):
        if is_feature_title(line):
            title_line_indices.add(i)
            continue

        # Format B: the next line is a background/goal marker line.
        if i + 1 < len(lines) and starts_with_any(lines[i + 1], MARKER_PREFIXES):
            if (not re.match(r"^\d+[、.．）)]", line)
                    and not re.match(r"^\d+\.\d+", line)
                    # A marker line is never a title. Without this check a
                    # short "背景：…" line directly followed by "目标：…" was
                    # taken as a title, splitting the block and dropping its
                    # background.
                    and not starts_with_any(line, MARKER_PREFIXES)
                    and 2 < len(line) < 50):
                title_line_indices.add(i)
                continue

        # Format C: the next line is a rule numbered 1.
        if i + 1 < len(lines) and is_numbered_rule_start(lines[i + 1]):
            if (not is_rule_line(line)
                    and not re.match(r"^\d+\.\d+", line)  # exclude version numbers
                    and not starts_with_any(line, MARKER_PREFIXES)
                    and 2 < len(line) < 60):
                title_line_indices.add(i)

    # Pass 2: assemble the blocks.
    blocks = []
    current_block = None
    bg_continued = False    # following lines still belong to the background
    goal_continued = False  # following lines still belong to the goal

    for i, line in enumerate(lines):
        # Title line: close the previous block and open a new one.
        if i in title_line_indices:
            if current_block:
                blocks.append(current_block)
            current_block = {
                "title": get_feature_title(line),
                "background": "",
                "goal": "",
                "lines": [],
            }
            bg_continued = False
            goal_continued = False
            continue

        # Anything before the first title is skipped.
        if current_block is None:
            continue

        # Background / scene line.
        if starts_with_any(line, BG_PREFIXES):
            current_block["background"] = extract_after_prefix(line, BG_PREFIXES)
            bg_continued = True
            goal_continued = False
            continue

        # Goal / feature-design line.
        if starts_with_any(line, GOAL_PREFIXES):
            current_block["goal"] = extract_after_prefix(line, GOAL_PREFIXES)
            goal_continued = True
            bg_continued = False
            continue

        # Continuation handling: unnumbered lines extend the background or
        # goal until the first numbered line shows up.
        is_new_section = bool(re.match(r"^\d+[、.．]", line))
        if bg_continued and not is_new_section:
            current_block["background"] += " " + line
            continue
        if goal_continued and not is_new_section:
            current_block["goal"] += " " + line
            continue

        bg_continued = False
        goal_continued = False
        current_block["lines"].append(line)

    if current_block:
        blocks.append(current_block)

    # Fallback: no feature block recognised (image-only document or unusual
    # layout) -> collect every numbered rule into one default block.
    if not blocks:
        fallback = {
            "title": "版本说明",
            "background": "",
            "goal": "",
            "lines": [
                candidate for candidate in lines
                if re.match(r"^\d+[、.．）)]", candidate) and len(candidate) > 8
            ],
        }
        if fallback["lines"]:
            blocks.append(fallback)

    return blocks


def split_into_rules(raw_lines: list[str], feature_title: str) -> list[dict]:
    """
    Split the raw lines of a feature block into rule groups.

    Line classification:
      - ``1、xxx`` (Chinese enumeration comma, 2-45 chars of text) starts a
        new sub-section named ``"<feature_title> > xxx"``.
      - ``1. xxx`` / ``1．xxx`` is a numbered rule. Rule text of 5 characters
        or fewer is dropped. If the text does not end with closing
        punctuation it is held back so that following lines can be appended.
      - ``1）xxx`` / ``1)xxx`` is a sub-item stored as its own rule (if longer
        than 5 characters).
      - Any other line continues a held-back rule if there is one; otherwise
        it is stored as a prose rule when longer than 10 characters.

    A held-back rule is finalised when it ends with closing punctuation,
    grows beyond 100 characters, or the next numbered rule, sub-item,
    sub-section heading or the end of the input is reached.

    Args:
        raw_lines: Lines of one feature block (blank lines are ignored).
        feature_title: Title of the block; also the name of the initial
            section.

    Returns:
        A list of ``{"section": str, "rules": list[str]}`` dicts in document
        order. Sections without rules are omitted.
    """
    ends_sentence = re.compile(r"[；。？！」\)）]$")

    rule_groups = []
    current_section = feature_title
    current_rules = []
    pending_continuation = None  # rule text still waiting for continuation lines

    def flush():
        """Emit the rules gathered for the current section, if any."""
        nonlocal current_rules
        if current_rules:
            rule_groups.append({
                "section": current_section,
                "rules": list(current_rules),
            })
            current_rules = []

    def commit_pending():
        """Store the held-back rule (if any) in the current section."""
        nonlocal pending_continuation
        if pending_continuation:
            current_rules.append(pending_continuation)
        pending_continuation = None

    for line in raw_lines:
        line = line.strip()
        if not line:
            continue

        # Sub-section heading: "1、xxx".
        m_section = re.match(r"^(\d+)[、]\s*(.{2,45})$", line)
        if m_section:
            # The held-back rule belongs to the section that is ending;
            # store it before switching, otherwise it would be lost.
            commit_pending()
            flush()
            current_section = f"{feature_title} > {m_section.group(2).strip()}"
            continue

        # Numbered rule: "1. xxx" / "1.xxx" (ASCII or full-width period).
        m_rule = re.match(r"^(\d+)[.．]\s*(.+)$", line)
        if m_rule:
            rule_text = m_rule.group(2).strip()
            commit_pending()
            if len(rule_text) > 5:
                # Closing punctuation means the rule is complete; otherwise
                # it may continue on the following line(s).
                if ends_sentence.search(rule_text):
                    current_rules.append(rule_text)
                else:
                    pending_continuation = rule_text
            continue

        # Sub-item: "1）xxx" / "1)xxx".
        m_sub = re.match(r"^(\d+)[）)]\s*(.+)$", line)
        if m_sub:
            sub_text = m_sub.group(2).strip()
            commit_pending()
            if len(sub_text) > 5:
                current_rules.append(sub_text)
            continue

        # Prose: continuation of the held-back rule ...
        if pending_continuation:
            pending_continuation += line
            if ends_sentence.search(pending_continuation) or len(pending_continuation) > 100:
                commit_pending()
            continue

        # ... or a stand-alone prose rule (more than 10 characters).
        if len(line) > 10:
            current_rules.append(line)

    commit_pending()
    flush()

    return rule_groups


def build_atoms(blocks: list[dict], app_version: str, evidence_prefix: str) -> list[dict]:
    """Turn parsed feature blocks into ``doc_rule`` atom dicts.

    For each rule of each block one atom is produced, where
      C = the block context ("背景：…" and/or "目标：…", or the title when
          both are empty),
      A = the section the rule appears in,
      R = the normalised rule text.
    Rules shorter than 5 characters after normalisation are skipped, and
    atoms whose ``C/A/R`` fingerprint was already produced in this call are
    emitted only once.

    Args:
        blocks: Feature blocks with "title", "lines" and optionally
            "background" / "goal".
        app_version: Version label stored on every atom and used as the
            ``atom_id`` prefix.
        evidence_prefix: Text appended to the evidence string.

    Returns:
        The list of atom dicts in document order.
    """
    result = []
    known_fingerprints = set()

    for block in blocks:
        title = block["title"]

        # Context (C): background + goal, falling back to the title.
        labelled = (
            ("背景", normalize(block.get("background", ""))),
            ("目标", normalize(block.get("goal", ""))),
        )
        context_parts = [f"{label}：{value}" for label, value in labelled if value]
        if context_parts:
            context = "；".join(context_parts)
        else:
            context = title

        for group in split_into_rules(block["lines"], title):
            # A = sub-section / scenario (equals the title for top-level rules).
            scenario = group["section"]
            for raw_rule in group["rules"]:
                rule = normalize(raw_rule)
                if len(rule) < 5:
                    continue

                canon = f"C={context}|A={scenario}|R={rule}"
                fp = fingerprint(canon)
                if fp in known_fingerprints:
                    continue
                known_fingerprints.add(fp)

                primary_module, modules = infer_modules(f"{title} {scenario} {rule}")

                result.append({
                    "atom_id": f"{app_version}_{fp}",
                    "app_version": app_version,
                    "atom_type": "doc_rule",
                    "C": context,
                    "A": scenario,
                    "R": rule,
                    "primary_module": primary_module,
                    "modules": modules,
                    "feature_scope": title,
                    "touchpoints": [],
                    "canon_text": canon,
                    "merge_fingerprint": fp,
                    "evidence": f"📚培训文档 {app_version} · {evidence_prefix}",
                    "search_terms": [],
                })

    return result


def deduplicate(atoms: list[dict]) -> list[dict]:
    """Drop atoms whose ``merge_fingerprint`` was already seen.

    The first atom for each fingerprint is kept and the original order of
    first occurrences is preserved.
    """
    first_by_fingerprint = {}
    for atom in atoms:
        first_by_fingerprint.setdefault(atom["merge_fingerprint"], atom)
    return list(first_by_fingerprint.values())


# ── 主流程 ────────────────────────────────────────────────────────────────

def group_pdfs_by_version() -> dict[str, list[Path]]:
    """Scan ``PDF_DIR`` and group its ``*.pdf`` files by version.

    Files are visited in sorted order. The key of each group is the version
    extracted from the file name, prefixed with ``"v"``. Files without a
    recognisable version are reported on stdout and skipped.
    """
    by_version = defaultdict(list)
    for candidate in sorted(PDF_DIR.glob("*.pdf")):
        detected = extract_version_from_filename(candidate.name)
        if not detected:
            print(f"  ⚠️  无法识别版本号，跳过：{candidate.name}")
            continue
        by_version[f"v{detected}"].append(candidate)
    return by_version


def process_version(version: str, pdf_files: list[Path], force: bool = False) -> dict:
    """Build ``doc_atoms.jsonl`` for one version from all of its PDFs.

    Args:
        version: Version label; also the name of the output sub-directory
            below ``BUILD_DIR``.
        pdf_files: PDFs belonging to this version; their atoms are merged.
        force: Overwrite an existing output file instead of skipping.

    Returns:
        A result dict with ``"version"``, ``"status"`` and ``"atoms"``.
        ``status`` is ``"skipped"`` (output exists and ``force`` is false),
        ``"empty"`` (no atoms extracted, nothing written) or ``"ok"``; an
        ``"ok"`` result additionally carries ``"files"`` and
        ``"module_dist"``.
    """
    target_dir = BUILD_DIR / version
    target_file = target_dir / "doc_atoms.jsonl"

    if target_file.exists() and not force:
        return {"version": version, "status": "skipped", "atoms": 0}

    collected = []
    for source in pdf_files:
        # Best effort: a PDF that fails to parse is reported and the
        # remaining files are still processed.
        try:
            parsed_blocks = parse_feature_blocks(extract_text_from_pdf(source))
            # The file name without extension serves as the evidence label.
            collected.extend(build_atoms(parsed_blocks, version, source.stem))
        except Exception as e:
            print(f"  ❌ {source.name} 解析失败：{e}")

    unique_atoms = deduplicate(collected)
    if not unique_atoms:
        return {"version": version, "status": "empty", "atoms": 0}

    target_dir.mkdir(parents=True, exist_ok=True)
    with open(target_file, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(atom, ensure_ascii=False) + "\n" for atom in unique_atoms
        )

    # Summary of the five most frequent primary modules.
    from collections import Counter
    per_module = Counter(atom["primary_module"] for atom in unique_atoms)
    summary = "  ".join(
        f"{module}:{count}" for module, count in per_module.most_common(5)
    )

    return {
        "version": version,
        "status": "ok",
        "atoms": len(unique_atoms),
        "files": [pdf.name for pdf in pdf_files],
        "module_dist": summary,
    }


def main():
    """Command-line entry point.

    ``--force`` overwrites existing outputs. The last argument that does not
    start with ``--`` selects a single version (a leading ``v`` is added when
    missing). Exits with status 1 when the pdf directory, any PDF, or the
    requested version cannot be found; otherwise processes the versions in
    ascending numeric order and prints a summary.
    """
    force = "--force" in sys.argv

    # The last positional argument wins; normalise it to the "vX.Y.Z" form.
    positional = [arg for arg in sys.argv[1:] if not arg.startswith("--")]
    version_filter = None
    if positional:
        requested = positional[-1]
        version_filter = requested if requested.startswith("v") else f"v{requested}"

    if not PDF_DIR.exists():
        print(f"❌ 未找到 pdf/ 目录：{PDF_DIR}")
        sys.exit(1)

    groups = group_pdfs_by_version()
    if not groups:
        print("❌ pdf/ 目录下未找到任何 PDF 文件")
        sys.exit(1)

    if version_filter:
        if version_filter in groups:
            groups = {version_filter: groups[version_filter]}
        else:
            groups = {}
        if not groups:
            print(f"❌ 未找到版本 {version_filter} 的 PDF")
            sys.exit(1)

    def version_key(label):
        # Compare versions numerically, component by component.
        return [int(part) for part in label.lstrip("v").split(".")]

    versions = sorted(groups, key=version_key)
    pdf_total = sum(len(files) for files in groups.values())

    print(f"找到 {len(versions)} 个版本（{pdf_total} 个 PDF），开始处理...\n")

    tally = {"ok": 0, "skipped": 0, "failed": 0}
    total_atoms = 0

    for version in versions:
        pdf_files = groups[version]
        file_names = "、".join(pdf.name for pdf in pdf_files)
        print(f"▶ {version}  [{file_names}]")

        result = process_version(version, pdf_files, force)
        status = result["status"]

        if status == "skipped":
            print("  ⏭  跳过: 已存在，使用 --force 覆盖\n")
            tally["skipped"] += 1
        elif status == "empty":
            print("  ⚠️  未提取到有效规则（可能是纯图片文档）\n")
            tally["failed"] += 1
        elif status == "ok":
            print(f"  ✅ {version} → {result['atoms']} atoms")
            print(f"  模块: {result['module_dist']}\n")
            total_atoms += result["atoms"]
            tally["ok"] += 1
        else:
            # Any unknown status counts as a failure.
            tally["failed"] += 1

    print("=" * 50)
    print(f"完成: {tally['ok']} 个  |  跳过: {tally['skipped']} 个  |  异常: {tally['failed']} 个")
    print(f"总计: {total_atoms} atoms")


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()