extract_testcase_candidates.py
5.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#!/usr/bin/env python3
"""
将测试用例 XML 提取为结构化候选项,供模型蒸馏 case_atoms 使用。
输出:
build/<app_version>/case_candidates.jsonl
用法:
python3 scripts/extract_testcase_candidates.py
python3 scripts/extract_testcase_candidates.py 4.57.3
"""
from __future__ import annotations
import json
import re
import sys
import xml.etree.ElementTree as ET
from html.parser import HTMLParser
from pathlib import Path
# Repository root: this script lives under <root>/scripts/ (see module usage).
BASE_DIR = Path(__file__).parent.parent
# Input directory holding the exported testcase XML files.
TESTCASE_DIR = BASE_DIR / "testCase"
# Output root; a per-app-version subdirectory is created beneath it.
BUILD_DIR = BASE_DIR / "build"
class HTMLStripper(HTMLParser):
    """HTMLParser subclass that accumulates the visible text of a document.

    Feed it markup, then call :meth:`get_text` to obtain the collected
    text fragments joined by single spaces.
    """

    def __init__(self) -> None:
        super().__init__()
        # Non-blank text fragments, in document order.
        self.text_parts: list[str] = []

    def handle_data(self, data: str) -> None:
        # Discard whitespace-only runs; keep trimmed fragments.
        fragment = data.strip()
        if not fragment:
            return
        self.text_parts.append(fragment)

    def get_text(self) -> str:
        """Return all collected fragments joined with single spaces."""
        return " ".join(self.text_parts).strip()
def strip_html(html_str: str) -> str:
    """Strip tags from *html_str* and collapse all whitespace to single spaces.

    Returns "" for empty input. Non-breaking spaces and the common named
    entities are normalized before parsing so the extracted text contains
    only plain characters.
    """
    if not html_str:
        return ""
    # Normalize non-breaking spaces (entity form and literal U+00A0).
    # NOTE(review): the previous code had these literals corrupted into
    # no-op replace(" ", " ") calls, presumably by an HTML-unescaping step
    # in a copy/paste pipeline; the entity forms are restored here.
    html_str = html_str.replace("&nbsp;", " ").replace("\u00a0", " ")
    # Decode the common named entities (previously corrupted into
    # no-op replace("&", "&") etc.) so double-escaped markup is parsed.
    html_str = html_str.replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">")
    parser = HTMLStripper()
    parser.feed(html_str)
    return re.sub(r"\s+", " ", parser.get_text()).strip()
def preprocess_xml(content: str) -> str:
    """Sanitize raw testcase XML so ElementTree can parse it.

    Two problems occur in the exported XML:
    - inline base64 screenshots bloat the file: whole ``<img src="data:...">``
      tags and bare long ``data:`` URIs are replaced by a ``[截图]``
      placeholder, and
    - bare ``&`` characters are not well-formed XML: they are escaped to
      ``&amp;`` unless they already begin a recognized entity reference.
    """
    # Replace complete <img src="data:image/...;base64,..."> tags first.
    content = re.sub(
        r'<img\b[^>]*\bsrc=["\']data:image/[^;]+;base64,[A-Za-z0-9+/=\s]+["\'][^>]*>',
        '[截图]',
        content,
        flags=re.IGNORECASE | re.DOTALL,
    )
    # Catch remaining long base64 payloads that appear outside <img> tags.
    content = re.sub(
        r'data:image/[^;]+;base64,[A-Za-z0-9+/=\r\n]{50,}',
        '[截图]',
        content,
        flags=re.IGNORECASE,
    )
    # Escape bare ampersands, leaving existing entity references untouched.
    # BUG FIX: the replacement text was previously '&' (a no-op), which left
    # the document unparseable whenever a raw '&' appeared.
    content = re.sub(r'&(?!amp;|lt;|gt;|quot;|apos;|#)', '&amp;', content)
    return content
def extract_version(filename: str) -> str | None:
match = re.search(r'(\d+\.\d+(?:\.\d+)*)', filename)
return f"v{match.group(1)}" if match else None
def get_cdata(elem: ET.Element, tag: str) -> str:
    """Return the stripped text of *elem*'s first child named *tag*, or ""."""
    node = elem.find(tag)
    if node is None or not node.text:
        return ""
    return node.text.strip()
def build_suite_path(path: list[str], suite_name: str) -> list[str]:
    """Return *path* extended with the trimmed *suite_name*.

    A blank name contributes nothing and the original list is returned
    unchanged; otherwise a new list is built (the input is never mutated).
    """
    name = suite_name.strip()
    if not name:
        return path
    return [*path, name]
def testcase_to_candidate(tc: ET.Element, suite_path: list[str], app_version: str, xml_file: str) -> dict:
    """Convert one ``<testcase>`` element into a structured candidate dict.

    Suite context, identifiers, HTML-stripped text fields and the per-step
    action/expected pairs are flattened into a single JSON-serializable
    record; ``evidence.raw_text`` carries a truncated plain-text digest.
    """
    case_name = tc.get("name", "").strip()
    internal_id = tc.get("internalid", "")
    external_id = get_cdata(tc, "externalid")
    case_revision_version = get_cdata(tc, "version")
    summary = strip_html(get_cdata(tc, "summary"))
    preconditions = strip_html(get_cdata(tc, "preconditions"))
    importance = strip_html(get_cdata(tc, "importance"))
    execution_type = strip_html(get_cdata(tc, "execution_type"))

    steps: list[dict] = []
    steps_elem = tc.find("steps")
    if steps_elem is not None:
        # Index counts every <step> element, so skipped (empty) steps
        # leave gaps rather than renumbering the remainder.
        for idx, step_elem in enumerate(steps_elem.findall("step"), start=1):
            action = strip_html(get_cdata(step_elem, "actions"))
            expected = strip_html(get_cdata(step_elem, "expectedresults"))
            if not (action or expected):
                continue
            steps.append({"index": idx, "action": action, "expected": expected})

    # Scope label: the last (up to) three suite levels, or the case name
    # when there is no suite context at all.
    feature_scope = " > ".join(suite_path[-3:]) if suite_path else case_name

    digest_parts = [summary, preconditions]
    for step in steps:
        digest_parts.append(
            f"step{step['index']}: {step['action']} => {step['expected']}"
        )
    raw_text = " | ".join(part for part in digest_parts if part)

    return {
        "candidate_type": "testcase",
        "app_version": app_version,
        "suite_name": suite_path[-1] if suite_path else "",
        "suite_path": suite_path,
        "feature_scope": feature_scope,
        "case_name": case_name,
        "internal_id": internal_id,
        "external_id": external_id,
        "case_revision_version": case_revision_version,
        "summary": summary,
        "preconditions": preconditions,
        "importance": importance,
        "execution_type": execution_type,
        "steps": steps,
        "step_count": len(steps),
        "evidence": {
            "xml_file": xml_file,
            "suite_path": suite_path,
            "case_name": case_name,
            "raw_text": raw_text[:2000],
        },
    }
def parse_suite(suite_elem: ET.Element, suite_path: list[str], app_version: str, xml_file: str, output: list[dict]) -> None:
    """Recursively collect testcase candidates from *suite_elem* into *output*.

    The suite's own (non-blank) name is appended to *suite_path* before
    descending; nested ``<testsuite>`` elements are walked first, then the
    directly contained ``<testcase>`` elements are converted.
    """
    current = build_suite_path(suite_path, suite_elem.get("name", ""))
    for nested in suite_elem.findall("testsuite"):
        parse_suite(nested, current, app_version, xml_file, output)
    output.extend(
        testcase_to_candidate(tc, current, app_version, xml_file)
        for tc in suite_elem.findall("testcase")
    )
def process_file(xml_path: Path) -> tuple[str, int]:
    """Parse one testcase XML export and write its candidates as JSONL.

    Returns ``(app_version, candidate_count)``; ``("", 0)`` when the file
    name carries no recognizable version. Output goes to
    ``build/<app_version>/case_candidates.jsonl`` (directories are created
    as needed, any existing file is overwritten).
    """
    app_version = extract_version(xml_path.name)
    if app_version is None:
        return "", 0
    raw = xml_path.read_text(encoding="utf-8")
    root = ET.fromstring(preprocess_xml(raw))
    candidates: list[dict] = []
    parse_suite(root, [], app_version, str(xml_path.relative_to(BASE_DIR)), candidates)
    out_path = BUILD_DIR / app_version / "case_candidates.jsonl"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(row, ensure_ascii=False) + "\n" for row in candidates
        )
    return app_version, len(candidates)
def main() -> None:
    """CLI entry point: convert every matching XML export under testCase/.

    An optional version-looking argument (e.g. ``4.57.3``) restricts
    processing to files whose name contains it. Per-file counts and a
    grand total are printed to stdout.
    """
    version_filter = None
    for arg in sys.argv[1:]:
        if re.match(r"\d+\.\d+", arg):
            version_filter = arg
            break
    xml_files = sorted(TESTCASE_DIR.glob("*.xml"))
    if version_filter:
        xml_files = [path for path in xml_files if version_filter in path.name]
    total = 0
    for xml_file in xml_files:
        version, count = process_file(xml_file)
        # Files without a recognizable version are silently skipped.
        if not version:
            continue
        total += count
        print(f"{version} candidates={count}")
    print(f"total={total}")
# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()