import re
from pathlib import Path
from pypdf import PdfReader, PdfWriter
import tkinter as tk
from tkinter import filedialog, messagebox
# Matches a chapter heading of the form "第 X 章 <title>": X is a run of the
# listed Chinese numerals, ASCII digits, or spaces, and the match runs to the
# end of the line.
CHAPTER_PATTERN = re.compile(r"第\s*[一二三四五六七八九十百千 0-9]+\s*章[^\n\r]*")
# When True, split_pdf_by_chapters also writes each page of a chapter as its
# own single-page PDF.
EXPORT_SINGLE_PAGES = True
# Tk root window and log Text widget. Both are None until the GUI is built in
# the __main__ section; append_log falls back to print() while log_text is None.
root = None
log_text = None
def sanitize_filename(name: str) -> str:
    """Return *name* made safe for use as a Windows file or directory name.

    Each character Windows forbids in a name (backslash, slash, colon,
    asterisk, question mark, double quote, angle brackets, pipe) and each
    ASCII control character (tabs and line breaks included) is replaced
    with an underscore. Trailing dots and spaces are then removed, because
    Windows drops them and the created path would no longer match the
    requested one.

    Args:
        name: The raw name, e.g. a chapter title.

    Returns:
        The cleaned name, or ``"_"`` if nothing usable is left.
    """
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", name)
    cleaned = cleaned.rstrip(". ")
    # Never hand back an empty name: it cannot be used as a path component.
    return cleaned or "_"
def find_chapters_from_outline(reader: PdfReader):
    """Collect chapter start pages from the PDF's bookmarks (outline).

    Every outline entry, at any nesting depth, whose title matches
    CHAPTER_PATTERN ("第 X 章 ...") is resolved to a page index. Entries whose
    destination cannot be resolved to a valid page are skipped.

    Args:
        reader: The pypdf reader for the document.

    Returns:
        A list of ``{"title": str, "start": int}`` dicts sorted by ``start``
        (0-based page index). When several bookmarks point at the same page,
        only the first one encountered is kept. Returns an empty list when
        the PDF has no usable outline.
    """
    # Try the ``outline`` attribute first and fall back to ``outlines``;
    # any failure to read either one is treated as "no bookmarks".
    try:
        outlines = reader.outline
    except Exception:
        try:
            outlines = reader.outlines
        except Exception:
            outlines = None
    if not outlines:
        return []

    first_by_page = {}

    def walk(items):
        for item in items:
            # Nested lists hold the children of the preceding entry.
            if isinstance(item, list):
                walk(item)
                continue
            try:
                title = item.title
            except AttributeError:
                title = str(item)
            if not isinstance(title, str):
                title = str(title)
            if not CHAPTER_PATTERN.search(title):
                continue
            try:
                page_num = reader.get_destination_page_number(item)
            except Exception:
                # Best effort: a broken bookmark must not abort the scan.
                continue
            # An unresolved destination may come back as None (or a negative
            # number); storing it would break sorting and the page-range
            # arithmetic done by callers.
            if not isinstance(page_num, int) or page_num < 0:
                continue
            # setdefault keeps the first bookmark seen for each page.
            first_by_page.setdefault(
                page_num, {"title": title.strip(), "start": page_num}
            )

    walk(outlines)
    return sorted(first_by_page.values(), key=lambda c: c["start"])
def _line_containing(text, pos):
    """Return the line of *text* that holds character offset *pos*.

    Lines are delimited exactly as ``str.splitlines`` delimits them, and the
    returned line carries no terminator. Returns ``""`` when *pos* lies past
    the end of *text*.
    """
    consumed = 0
    for raw_line in text.splitlines(keepends=True):
        # Count the real terminator length so "\r\n" does not shift offsets.
        consumed += len(raw_line)
        if pos < consumed:
            return raw_line.splitlines()[0]
    return ""


def find_chapters_from_text(reader: PdfReader):
    """Guess chapter start pages and titles by scanning the page text.

    Rules applied to each page:
    - Pages with no extractable text are skipped.
    - Pages whose first 100 characters contain "目录", "Contents" or
      "CONTENTS" are treated as a table of contents and skipped.
    - A page may contain several "第 X 章" matches; they are checked in order.
    - The first match that starts within the first 400 characters and whose
      line does not look like a table-of-contents entry (leader dots followed
      by a page number) marks the page as a chapter start.

    Args:
        reader: The pypdf reader for the document.

    Returns:
        A list of ``{"title": str, "start": int}`` dicts in page order, with
        at most one entry per page; ``start`` is a 0-based page index.
    """
    toc_entry = re.compile(r"[\.·…]{3,}\s*\d+\s*$")
    chapters = []
    for i, page in enumerate(reader.pages):
        text = page.extract_text() or ""
        if not text.strip():
            continue
        head = text[:100]
        if "目录" in head or "Contents" in head or "CONTENTS" in head:
            continue
        for m in CHAPTER_PATTERN.finditer(text):
            if m.start() > 400:
                # finditer yields matches left to right, so every remaining
                # match is even further down the page.
                break
            if toc_entry.search(_line_containing(text, m.start())):
                continue
            # Only one chapter is recorded per page, so stop at the first hit.
            chapters.append({"title": m.group(0).strip(), "start": i})
            break
    return chapters
def find_chapters(reader: PdfReader, logger=print):
    """Detect chapters, preferring bookmarks over page text.

    Args:
        reader: The pypdf reader for the document.
        logger: Callable that receives one status message string.

    Returns:
        The chapter list from find_chapters_from_outline when it is non-empty,
        otherwise the result of find_chapters_from_text.
    """
    from_bookmarks = find_chapters_from_outline(reader)
    if not from_bookmarks:
        logger("⚠️ 此 PDF 没有可用书签,改用正文文本识别章节")
        return find_chapters_from_text(reader)
    logger("✅ 使用 PDF 书签识别章节")
    return from_bookmarks
def fill_chapter_ranges(chapters, num_pages):
    """Add an inclusive ``end`` page index to every chapter, in place.

    A chapter ends on the page just before the next chapter's ``start``; the
    last chapter ends on the final page of the document.

    Args:
        chapters: Chapter dicts, each with a 0-based ``start`` page index,
            in page order.
        num_pages: Total number of pages in the document.

    Returns:
        The same ``chapters`` list, with ``end`` set on each dict.
    """
    starts = [chapter["start"] for chapter in chapters]
    # Pair each chapter with the start of its successor; the document length
    # acts as the "next start" for the final chapter.
    following_starts = starts[1:] + [num_pages]
    for chapter, next_start in zip(chapters, following_starts):
        chapter["end"] = next_start - 1
    return chapters
def split_pdf_by_chapters(pdf_path, output_root, logger=None):
    """Split a PDF into one folder per detected chapter.

    For every chapter a sub-folder named ``NN_<title>`` is created under
    *output_root*, holding the whole chapter as ``NN_<title>.pdf``. When
    EXPORT_SINGLE_PAGES is true, each page is additionally written as
    ``pNNNN.pdf`` (numbered by its 1-based position in the source PDF).
    All progress messages go through *logger*.

    Args:
        pdf_path: Path of the source PDF.
        output_root: Directory that receives the chapter folders; created
            if missing.
        logger: Callable taking one message string; defaults to ``print``.

    Raises:
        FileNotFoundError: *pdf_path* does not exist.
        ValueError: No chapter heading could be detected.
    """
    log = print if logger is None else logger
    source = Path(pdf_path)
    destination = Path(output_root)

    if not source.exists():
        msg = f"PDF 文件不存在:{source}"
        log(msg)
        raise FileNotFoundError(msg)

    reader = PdfReader(str(source))
    num_pages = len(reader.pages)
    log(f"开始处理:{source.name}")
    log(f"总页数:{num_pages}")

    chapters = find_chapters(reader, logger=log)
    if not chapters:
        msg = "未识别到任何章节标题,请检查:PDF 是否有书签/正文是否能提取文字/正则是否合适。"
        log(msg)
        raise ValueError(msg)

    log(f"共识别到 {len(chapters)} 章:")
    for number, chapter in enumerate(chapters, start=1):
        log(f" 第{number}章 → {chapter['title']}(起始页:{chapter['start']+1})")

    chapters = fill_chapter_ranges(chapters, num_pages)
    destination.mkdir(parents=True, exist_ok=True)
    log(f"输出目录:{destination}")

    for number, chapter in enumerate(chapters, start=1):
        title = chapter["title"]
        first, last = chapter["start"], chapter["end"]
        page_indices = range(first, last + 1)
        # The folder and the whole-chapter PDF share the same base name.
        stem = f"{number:02d}_{sanitize_filename(title)}"
        chapter_dir = destination / stem
        chapter_dir.mkdir(parents=True, exist_ok=True)

        log("")
        log(f"==== 处理章节 {number}: {title} ====")
        log(f"页码范围:{first + 1} - {last + 1}(共 {last - first + 1} 页)")
        log(f"章节输出目录:{chapter_dir}")

        # Whole chapter as a single PDF.
        chapter_writer = PdfWriter()
        for index in page_indices:
            chapter_writer.add_page(reader.pages[index])
        chapter_pdf = chapter_dir / f"{stem}.pdf"
        with open(chapter_pdf, "wb") as handle:
            chapter_writer.write(handle)
        log(f" ✅ 已生成整章 PDF: {chapter_pdf.name}")

        if not EXPORT_SINGLE_PAGES:
            continue

        # One PDF per page, named after the page's 1-based number.
        for index in page_indices:
            page_writer = PdfWriter()
            page_writer.add_page(reader.pages[index])
            with open(chapter_dir / f"p{index + 1:04d}.pdf", "wb") as handle:
                page_writer.write(handle)
        log(" ✅ 已生成单页 PDF 文件(按页命名)")

    log("")
    log("🎉 拆分完成!")
# Current GUI selections. Both start empty; choose_pdf stores the picked PDF
# path and choose_output_and_run stores the picked output directory.
selected_pdf_file = ""
selected_output_dir = ""
def append_log(msg: str):
    """Append *msg* as a new line to the log widget and scroll to the end.

    Falls back to ``print`` while the log widget has not been created.
    """
    if log_text is not None:
        # The widget is kept disabled, so enable it only for the insert.
        log_text.config(state="normal")
        log_text.insert(tk.END, msg + "\n")
        log_text.see(tk.END)
        log_text.config(state="disabled")
        if root is not None:
            # Process pending idle tasks so the new line is drawn right away.
            root.update_idletasks()
    else:
        print(msg)
def choose_pdf():
    """Button 1 handler: ask for a PDF file and remember the choice.

    Does nothing when the dialog returns an empty path.
    """
    global selected_pdf_file
    chosen = filedialog.askopenfilename(
        title="请选择 PDF 文件",
        filetypes=[("PDF 文件", "*.pdf"), ("所有文件", "*.*")],
    )
    if not chosen:
        return
    selected_pdf_file = chosen
    label_pdf.config(text=f"已选择 PDF:{chosen}")
    append_log(f"已选择 PDF 文件:{chosen}")
def choose_output_and_run():
    """Button 2 handler: pick the output directory, then run the split.

    Warns and returns if no PDF has been selected yet, and returns silently
    when the directory dialog yields an empty path. Any exception raised
    while splitting is written to the log and shown in an error dialog.
    """
    global selected_output_dir, selected_pdf_file
    if not selected_pdf_file:
        messagebox.showwarning("提示", "请先选择 PDF 文件!")
        return

    chosen_dir = filedialog.askdirectory(title="请选择输出目录")
    if chosen_dir:
        selected_output_dir = chosen_dir
        label_output.config(text=f"输出目录:{chosen_dir}")
        append_log("")
        append_log(f"输出目录设置为:{chosen_dir}")
        append_log("开始拆分,请稍候...\n")
        try:
            split_pdf_by_chapters(
                selected_pdf_file,
                selected_output_dir,
                logger=append_log,
            )
            messagebox.showinfo("完成", "拆分完成!\n请到输出目录查看各章节文件夹。")
        except Exception as exc:
            append_log(f"❌ 出错:{exc}")
            messagebox.showerror("错误", f"处理过程中出现错误:\n{exc}")
if __name__ == "__main__":
    # Build the fixed-size main window.
    root = tk.Tk()
    root.title("PDF 章节拆分工具")
    root.geometry("400x400")
    root.resizable(False, False)

    # Top area: the two action buttons, each followed by a status label.
    btn_frame = tk.Frame(root)
    btn_frame.pack(padx=10, pady=10, fill="x")
    btn_pdf = tk.Button(btn_frame, text="1. 请选择你的 PDF 文件", command=choose_pdf)
    btn_pdf.pack(fill="x")
    label_pdf = tk.Label(btn_frame, text="尚未选择 PDF 文件", anchor="w")
    label_pdf.pack(fill="x", pady=(5, 10))
    btn_output = tk.Button(btn_frame, text="2. 请选择输出目录并开始拆分", command=choose_output_and_run)
    btn_output.pack(fill="x")
    label_output = tk.Label(btn_frame, text="尚未选择输出目录", anchor="w")
    label_output.pack(fill="x", pady=(5, 0))

    # Bottom area: log text with a vertical scrollbar.
    log_frame = tk.Frame(root)
    log_frame.pack(padx=10, pady=10, fill="both", expand=True)
    log_text = tk.Text(log_frame, state="disabled")
    scrollbar = tk.Scrollbar(log_frame, command=log_text.yview)
    # Pack the scrollbar before the text widget. The packer hands out space
    # in packing order, and the Text widget's default requested width is
    # larger than this fixed-size window, so packing the Text first would
    # leave no room for the scrollbar.
    scrollbar.pack(side="right", fill="y")
    log_text.pack(side="left", fill="both", expand=True)
    log_text.config(yscrollcommand=scrollbar.set)

    append_log("日志初始化完成。")
    append_log("提示:先选择 PDF 文件,再选择输出目录开始拆分。")
    root.mainloop()