#!/usr/bin/env python3
"""
将培训文档 PDF 提取为结构化段落块,供模型蒸馏 doc_atoms 使用。
输出:
build/<app_version>/doc_segments.jsonl
用法:
python3 scripts/extract_pdf_segments.py
python3 scripts/extract_pdf_segments.py 4.57.3
"""
from __future__ import annotations
import json
import re
import sys
from pathlib import Path
BASE_DIR = Path(__file__).parent.parent
PDF_DIR = BASE_DIR / "pdf"
BUILD_DIR = BASE_DIR / "build"
def extract_version_from_filename(name: str) -> str | None:
match = re.search(r"(\d+\.\d+(?:\.\d+)?)", name)
return f"v{match.group(1)}" if match else None
def clean_pdf_line(line: str) -> str:
    """Drop ASCII control characters (keeping tab/newline/CR) and trim edge whitespace."""
    # Same set as the regex class [\x00-\x08\x0b\x0c\x0e-\x1f\x7f], deleted
    # via str.translate (mapping to None removes the character).
    control_codes = [*range(0x00, 0x09), 0x0B, 0x0C, *range(0x0E, 0x20), 0x7F]
    return line.translate(dict.fromkeys(control_codes)).strip()
def extract_text_from_pdf(pdf_path: Path) -> list[dict]:
    """Parse *pdf_path* into one dict per page with cleaned, non-empty text lines.

    Each dict carries "page" (1-based), "page_label" ("i/total") and "lines".
    Tries pypdf first, then falls back to PyPDF2; raises RuntimeError when
    neither library is installed.
    """

    def _collect(reader) -> list[dict]:
        """Walk the reader's pages and build the per-page line dicts."""
        page_count = len(reader.pages)
        collected: list[dict] = []
        for number, page in enumerate(reader.pages, start=1):
            raw_text = page.extract_text() or ""
            kept = [
                cleaned
                for cleaned in (clean_pdf_line(part) for part in raw_text.split("\n"))
                if cleaned
            ]
            collected.append(
                {
                    "page": number,
                    "page_label": f"{number}/{page_count}",
                    "lines": kept,
                }
            )
        return collected

    try:
        from pypdf import PdfReader  # type: ignore
    except ImportError:
        try:
            from PyPDF2 import PdfReader  # type: ignore
        except ImportError:
            # Suppress the chained ImportError so the message stands alone,
            # matching the original's bare raise.
            raise RuntimeError("未找到 PDF 解析库,请先安装 pypdf") from None
    return _collect(PdfReader(str(pdf_path)))
def is_title_line(line: str) -> bool:
stripped = line.strip()
if re.match(r"^\|.{2,40}\|?$", stripped):
return True
if re.match(r"^|.{2,40}|?$", stripped):
return True
return False
def normalize_title(line: str) -> str:
    """Strip surrounding whitespace and title-delimiter bars from *line*.

    Fix: the original strip set was "||" — a duplicated ASCII bar, presumably
    a mangled fullwidth "|" (U+FF5C) — so fullwidth-delimited titles kept
    their bars. Both bar forms are stripped now; behavior on ASCII-only
    titles is unchanged.
    """
    return line.strip().strip("||").strip()
def chunk_page_lines(page: dict) -> list[dict]:
    """Group one page's lines into text blocks keyed by the most recent title.

    A title line (per is_title_line) closes the current block and becomes the
    title of the following ones; a numbered list item ("1." / "2、" ...) also
    starts a new block so enumerated steps become separate segments.
    Blocks whose joined text is empty are dropped.
    """
    blocks: list[dict] = []
    current_title = ""
    buffer: list[str] = []

    def flush() -> None:
        """Emit the buffered lines as one block; no-op when the buffer is empty."""
        nonlocal buffer
        if not buffer:
            return
        text = " ".join(buffer).strip()
        if text:
            blocks.append(
                {
                    "page": page["page"],
                    "title": current_title or "未识别标题",
                    "text": text,
                    "line_count": len(buffer),
                }
            )
        buffer = []

    for line in page["lines"]:
        if is_title_line(line):
            flush()
            current_title = normalize_title(line)
            continue
        # Fix: the original class was "[..、)]" — the duplicated "." was almost
        # certainly a mangled fullwidth full stop, so fullwidth "。" and "）"
        # are accepted as numbered-item delimiters too (backward-compatible).
        if re.match(r"^\d+[.。、)）]\s*", line) and buffer:
            flush()
        buffer.append(line)
    flush()
    return blocks
def process_pdf(pdf_path: Path) -> tuple[str, int]:
    """Extract one PDF into build/<version>/doc_segments.jsonl.

    Returns (app_version, segment_count); ("", 0) when no version number can
    be parsed from the file name. Overwrites any existing output file for
    the same version.
    """
    app_version = extract_version_from_filename(pdf_path.name)
    if not app_version:
        return "", 0

    relative_source = str(pdf_path.relative_to(BASE_DIR))
    segments = [
        {
            "candidate_type": "doc_segment",
            "app_version": app_version,
            "source_file": relative_source,
            "page": block["page"],
            "segment_index": position,  # 1-based, restarts on every page
            "title": block["title"],
            "text": block["text"][:4000],  # cap very long blocks
            "line_count": block["line_count"],
        }
        for page in extract_text_from_pdf(pdf_path)
        for position, block in enumerate(chunk_page_lines(page), start=1)
    ]

    out_dir = BUILD_DIR / app_version
    out_dir.mkdir(parents=True, exist_ok=True)
    payload = "".join(
        json.dumps(row, ensure_ascii=False) + "\n" for row in segments
    )
    (out_dir / "doc_segments.jsonl").write_text(payload, encoding="utf-8")
    return app_version, len(segments)
def main() -> None:
    """CLI entry point: process every PDF under pdf/, optionally filtered.

    The first command-line argument that looks like a version number
    ("4.57" / "4.57.3") restricts processing to files whose name contains it.
    """
    requested_version = None
    for arg in sys.argv[1:]:
        if re.match(r"\d+\.\d+", arg):
            requested_version = arg
            break

    candidates = sorted(PDF_DIR.glob("*.pdf"))
    if requested_version:
        candidates = [p for p in candidates if requested_version in p.name]

    grand_total = 0
    for candidate in candidates:
        version, count = process_pdf(candidate)
        if not version:
            continue  # file name carried no parsable version
        grand_total += count
        print(f"{version} segments={count} file={candidate.name}")
    print(f"total={grand_total}")


if __name__ == "__main__":
    main()