# extract_testcase_candidates.py (5.95 KB)
#!/usr/bin/env python3
"""
将测试用例 XML 提取为结构化候选项,供模型蒸馏 case_atoms 使用。

输出:
  build/<app_version>/case_candidates.jsonl

用法:
  python3 scripts/extract_testcase_candidates.py
  python3 scripts/extract_testcase_candidates.py 4.57.3
"""

from __future__ import annotations

import json
import re
import sys
import xml.etree.ElementTree as ET
from html.parser import HTMLParser
from pathlib import Path


# Directory two levels above this file (the parent of the directory that
# contains this script); all other paths are resolved relative to it.
BASE_DIR = Path(__file__).parent.parent
# Input directory: main() globs "*.xml" files directly inside it.
TESTCASE_DIR = BASE_DIR / "testCase"
# Output root: process_file() writes <BUILD_DIR>/<app_version>/case_candidates.jsonl.
BUILD_DIR = BASE_DIR / "build"


class HTMLStripper(HTMLParser):
    """Collect the text content of an HTML fragment, discarding all markup.

    Every text run reported by the parser is trimmed; non-empty runs are
    stored in ``text_parts`` in document order.
    """

    def __init__(self) -> None:
        super().__init__()
        # Trimmed, non-empty text runs seen so far.
        self.text_parts: list[str] = []

    def handle_data(self, data: str) -> None:
        """Record one text run, ignoring runs that are only whitespace."""
        text = data.strip()
        if text:
            self.text_parts.append(text)

    def get_text(self) -> str:
        """Return the collected text runs joined by single spaces."""
        return " ".join(self.text_parts).strip()


def strip_html(html_str: str) -> str:
    """Return the plain text of *html_str* with tags removed and whitespace collapsed.

    Args:
        html_str: HTML fragment (may be empty).

    Returns:
        The text content with every run of whitespace reduced to a single
        space and no leading/trailing whitespace; ``""`` for empty input.
    """
    if not html_str:
        return ""
    html_str = html_str.replace("&nbsp;", " ").replace("&#160;", " ")
    # NOTE(review): these entities are decoded *before* parsing, so escaped
    # markup such as "&lt;b&gt;" becomes a real tag and is stripped. This looks
    # like it compensates for double-escaped input - confirm before changing.
    html_str = html_str.replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">")
    parser = HTMLStripper()
    parser.feed(html_str)
    # feed() holds back trailing text that contains an "&" not followed by
    # whitespace or ";" (it might be a character reference cut in half).
    # close() tells the parser the input is complete so that text is flushed;
    # without it, inputs such as "Q&A" silently produced an empty string.
    parser.close()
    return re.sub(r"\s+", " ", parser.get_text()).strip()


def preprocess_xml(content: str) -> str:
    """Sanitize raw XML text so it can be parsed and stays reasonably small.

    Three passes are applied in order:

    1. Whole ``<img>`` tags whose ``src`` is an inline base64 image are
       replaced with the placeholder ``[截图]``.
    2. Any remaining inline base64 image payload of at least 50 characters
       is replaced with the same placeholder.
    3. Every ``&`` that does not start one of the five predefined XML
       entities or a well-formed numeric character reference is escaped as
       ``&amp;``.

    Args:
        content: Raw XML document text.

    Returns:
        The sanitized XML text.
    """
    content = re.sub(
        r'<img\b[^>]*\bsrc=["\']data:image/[^;]+;base64,[A-Za-z0-9+/=\s]+["\'][^>]*>',
        '[截图]',
        content,
        flags=re.IGNORECASE | re.DOTALL,
    )
    content = re.sub(
        r'data:image/[^;]+;base64,[A-Za-z0-9+/=\r\n]{50,}',
        '[截图]',
        content,
        flags=re.IGNORECASE,
    )
    # Only "&#<digits>;" and "&#x<hex>;" are valid character references.
    # Accepting any "&#" prefix left text such as "R&#D" unescaped, which
    # makes the XML parser reject the whole document.
    content = re.sub(
        r'&(?!(?:amp|lt|gt|quot|apos);|#[0-9]+;|#x[0-9a-fA-F]+;)',
        '&amp;',
        content,
    )
    return content


def extract_version(filename: str) -> str | None:
    """Return the first dotted version number in *filename*, prefixed with ``v``.

    A version is two or more dot-separated groups of digits (``4.57``,
    ``4.57.3``, ...). Returns ``None`` when the name contains no such number.
    """
    found = re.search(r'(\d+\.\d+(?:\.\d+)*)', filename)
    if found is None:
        return None
    return "v" + found.group(1)


def get_cdata(elem: ET.Element, tag: str) -> str:
    """Return the stripped text of the first child matching *tag*.

    Yields ``""`` when *elem* has no such child or the child has no text.
    """
    node = elem.find(tag)
    if node is None:
        return ""
    raw = node.text
    if not raw:
        return ""
    return raw.strip()


def build_suite_path(path: list[str], suite_name: str) -> list[str]:
    """Return *path* extended with the trimmed *suite_name*.

    A blank name leaves the path untouched: the very same list object is
    returned. Otherwise a new list is built and *path* is not modified.
    """
    cleaned = suite_name.strip()
    if not cleaned:
        return path
    return path + [cleaned]


def testcase_to_candidate(tc: ET.Element, suite_path: list[str], app_version: str, xml_file: str) -> dict:
    """Convert one ``<testcase>`` element into a flat candidate record.

    Args:
        tc: The ``<testcase>`` element.
        suite_path: Names of the enclosing suites, outermost first.
        app_version: Version label stored on the record.
        xml_file: Source file label stored in the evidence section.

    Returns:
        A dict holding the case metadata, its non-empty steps, and an
        ``evidence`` section whose ``raw_text`` is capped at 2000 characters.
    """

    def clean(node: ET.Element, tag: str) -> str:
        # Text of the child element with HTML markup removed.
        return strip_html(get_cdata(node, tag))

    case_name = tc.get("name", "").strip()

    steps = []
    steps_container = tc.find("steps")
    step_nodes = [] if steps_container is None else steps_container.findall("step")
    for position, node in enumerate(step_nodes, start=1):
        action = clean(node, "actions")
        expected = clean(node, "expectedresults")
        if not (action or expected):
            # Blank steps are dropped but still consume a position number.
            continue
        steps.append({"index": position, "action": action, "expected": expected})

    summary = clean(tc, "summary")
    preconditions = clean(tc, "preconditions")

    step_lines = [f"step{s['index']}: {s['action']} => {s['expected']}" for s in steps]
    raw_text = " | ".join(
        piece for piece in [summary, preconditions, *step_lines] if piece
    )

    if suite_path:
        # Only the three innermost suites describe the feature scope.
        feature_scope = " > ".join(suite_path[-3:])
        suite_name = suite_path[-1]
    else:
        feature_scope = case_name
        suite_name = ""

    return {
        "candidate_type": "testcase",
        "app_version": app_version,
        "suite_name": suite_name,
        "suite_path": suite_path,
        "feature_scope": feature_scope,
        "case_name": case_name,
        "internal_id": tc.get("internalid", ""),
        "external_id": get_cdata(tc, "externalid"),
        "case_revision_version": get_cdata(tc, "version"),
        "summary": summary,
        "preconditions": preconditions,
        "importance": clean(tc, "importance"),
        "execution_type": clean(tc, "execution_type"),
        "steps": steps,
        "step_count": len(steps),
        "evidence": {
            "xml_file": xml_file,
            "suite_path": suite_path,
            "case_name": case_name,
            "raw_text": raw_text[:2000],
        },
    }


def parse_suite(suite_elem: ET.Element, suite_path: list[str], app_version: str, xml_file: str, output: list[dict]) -> None:
    """Recursively collect candidates from a suite element into *output*.

    Nested ``<testsuite>`` children are visited first, then the suite's own
    direct ``<testcase>`` children are appended. *output* is mutated in place.
    """
    here = build_suite_path(suite_path, suite_elem.get("name", ""))
    for nested in suite_elem.findall("testsuite"):
        parse_suite(nested, here, app_version, xml_file, output)
    output.extend(
        testcase_to_candidate(case, here, app_version, xml_file)
        for case in suite_elem.findall("testcase")
    )


def process_file(xml_path: Path) -> tuple[str, int]:
    """Extract every test case in *xml_path* and write them out as JSONL.

    The version label is taken from the file name; a file without one is
    skipped and ``("", 0)`` is returned. Otherwise the candidates are written,
    one JSON object per line, to ``BUILD_DIR/<version>/case_candidates.jsonl``
    (overwriting any existing file).

    Returns:
        ``(version_label, candidate_count)``.
    """
    version = extract_version(xml_path.name)
    if not version:
        return "", 0

    xml_text = preprocess_xml(xml_path.read_text(encoding="utf-8"))
    tree_root = ET.fromstring(xml_text)
    source_label = str(xml_path.relative_to(BASE_DIR))

    rows: list[dict] = []
    parse_suite(tree_root, [], version, source_label, rows)

    target_dir = BUILD_DIR / version
    target_dir.mkdir(parents=True, exist_ok=True)
    with (target_dir / "case_candidates.jsonl").open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(row, ensure_ascii=False) + "\n" for row in rows
        )
    return version, len(rows)


def main() -> None:
    """Process the XML files in ``TESTCASE_DIR`` and print per-version counts.

    The first command-line argument that starts with ``<digits>.<digits>`` is
    used as a filter: only files whose name contains it are processed.
    """
    version_filter = None
    for arg in sys.argv[1:]:
        if re.match(r"\d+\.\d+", arg):
            version_filter = arg
            break

    xml_files = sorted(TESTCASE_DIR.glob("*.xml"))
    if version_filter:
        xml_files = [
            candidate for candidate in xml_files if version_filter in candidate.name
        ]

    total = 0
    for xml_file in xml_files:
        version, count = process_file(xml_file)
        if version:
            total += count
            print(f"{version} candidates={count}")
    print(f"total={total}")


# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()