Python 实战：构建文档总结、代码生成与智能检索助手

你是否遇到过这些场景？

📄 论文太多看不完：文件夹里躺着 50 篇论文，每篇都 30 页起步…
💻 重复代码写到手软：CRUD 接口、数据清洗脚本，一遍遍重复劳动…
📚 查资料效率低下：为了找一个 API 参数，翻了 10 个网页还没找到…

如果告诉你，用不到 200 行 Python 代码，就能打造一个 AI 助手帮你解决这些问题，你信吗？今天我将带你从零开始，用 Python 打造三个 AI 工具。

一、准备工作：环境与 API 配置

1.1 技术栈选择

技术组件	推荐方案	成本	说明
LLM 模型	DeepSeek / Qwen	免费/低价	国内模型，中文优秀
API 平台	硅基流动 / 魔搭社区	¥0.001/1k tokens	新用户有免费额度
文档解析	PyPDF2 / Unstructured	免费	支持 PDF/Word/Markdown
代码运行	Subprocess / Docker	免费	本地沙箱执行
搜索引擎	Bing Search API	付费（有免费层）	或用 DuckDuckGo 免费版

1.2 环境配置

# 创建虚拟环境
python -m venv ai-tools-env
source ai-tools-env/bin/activate  # Windows 用：ai-tools-env\Scripts\activate

# 安装依赖
pip install openai pypdf2 requests beautifulsoup4 python-dotenv
pip install aiohttp httpx  # 异步请求支持

创建 .env 文件配置密钥：

# API 配置
DEEPSEEK_API_KEY=your_deepseek_api_key
DEEPSEEK_BASE_URL=https://api.deepseek.com/v1

# 或使用硅基流动（支持多个模型）
SILICONFLOW_API_KEY=your_siliconflow_key
SILICONFLOW_BASE_URL=https://api.siliconflow.cn/v1

# 搜索 API（可选）
BING_SEARCH_API_KEY=your_bing_key

1.3 核心工具类封装

import asyncio from typing import List, Optional from pathlib import Path import PyPDF2 from bs4 import BeautifulSoup import aiohttp from dataclasses import dataclass from datetime import datetime @dataclass class DocumentSummary: """文档摘要结果""" title: str summary: str key_points: List[str] reading_time: int # 预计阅读时间（分钟） word_count: int created_at: str class DocumentParser: """文档解析器""" @staticmethod async def parse_pdf(file_path: str) -> str: text = "" with open(file_path, 'rb') as file: pdf_reader = PyPDF2.PdfReader(file) for page in pdf_reader.pages: text += page.extract_text() + "\n" return text @staticmethod async def parse_text(file_path: str) -> str: with open(file_path, 'r', encoding='utf-8') as f: return f.read() @staticmethod async def parse_url(url: str) -> str: async with aiohttp.ClientSession() as session: async with session.get(url) as response: html = await response.text() soup = BeautifulSoup(html, 'html.parser') for script in soup(['script', 'style']): script.decompose() return soup.get_text(separator='\n', strip=True) class TextChunker: """文本分块器""" def __init__(self, chunk_size: int = 3000, overlap: int = 200): self.chunk_size = chunk_size self.overlap = overlap def chunk(self, text: str) -> List[str]: paragraphs = text.split('\n\n') chunks = [] current_chunk = "" for para in paragraphs: if len(current_chunk) + len(para) <= self.chunk_size: current_chunk += para + "\n\n" else: if current_chunk: chunks.append(current_chunk.strip()) if len(para) > self.chunk_size: for i in range(0, len(para), self.chunk_size - self.overlap): chunks.append(para[i:i + self.chunk_size]) current_chunk = "" if current_chunk: chunks.append(current_chunk.strip()) return chunks class DocumentSummarizer: """智能文档总结器""" def __init__(self, llm_client: LLMClient): self.llm = llm_client self.parser = DocumentParser() self.chunker = TextChunker() async def summarize(self, source: str, source_type: str = "file", output_format: str = "markdown") -> DocumentSummary: print(f"📖 正在解析文档：{source}") if source_type == "url": text = await self.parser.parse_url(source) title = await self._extract_title_from_url(text) else: if source.endswith('.pdf'): text = await self.parser.parse_pdf(source) else: text = await self.parser.parse_text(source) title = Path(source).stem word_count = len(text) reading_time = max(1, word_count // 500) print(f"✅ 解析完成，共 {word_count} 字，预计阅读 {reading_time} 分钟") chunks = self.chunker.chunk(text) print(f"🔪 正在分块... 分成 {len(chunks)} 个块") chunk_summaries = await self._summarize_chunks(chunks) final_summary = await self._merge_summaries(chunk_summaries, title) key_points = await self._extract_key_points(final_summary) return DocumentSummary( title=title, summary=final_summary, key_points=key_points, reading_time=reading_time, word_count=word_count, created_at=datetime.now().strftime("%Y-%m-%d %H:%M:%S") ) async def _summarize_chunks(self, chunks: List[str]) -> List[str]: semaphore = asyncio.Semaphore(5) # 限制并发数 async def summarize_chunk(chunk: str, index: int): async with semaphore: prompt = f"请总结以下文本的核心内容，要求：1. 保留关键信息 2. 省略细节 3. 简洁表达 4. 200 字以内。文本内容：{chunk}" response = await self.llm.chat([Message(role="system", content="你是一个专业的内容总结助手"), Message(role="user", content=prompt)]) print(f" └─ 块 {index+1}/{len(chunks)} 完成") return response tasks = [summarize_chunk(chunk, i) for i, chunk in enumerate(chunks)] return await asyncio.gather(*tasks) async def _merge_summaries(self, summaries: List[str], title: str) -> str: combined = "\n\n".join([f"• {s}" for s in summaries]) prompt = f"以下是文档《{title}》的分块摘要，请整合成一篇完整的总结。\n{combined}\n请按格式输出：标题、核心内容、主要观点。" response = await self.llm.chat([Message(role="system", content="你是一个专业的内容整合助手"), Message(role="user", content=prompt)]) return response async def _extract_key_points(self, summary: str) -> List[str]: prompt = f"从以下总结中提取 5-7 个关键要点，每点不超过 20 字。只输出要点列表。\n{summary}" response = await self.llm.chat([Message(role="user", content=prompt)]) return [line.strip() for line in response.split('\n') if line.strip()] async def _extract_title_from_url(self, text: str) -> str: prompt = f"从以下文本中提取文章标题，只返回标题。\n{text[:500]}" response = await self.llm.chat([Message(role="user", content=prompt)]) return response.strip() async def main_summarizer(): llm = LLMClient() summarizer = DocumentSummarizer(llm) result = await summarizer.summarize(source="research_paper.pdf", source_type="file") print(f"📄 标题：{result.title}") print(f"⏱️ 预计阅读时间：{result.reading_time} 分钟") print(f"🔑 关键要点:") for point in result.key_points: print(f" • {point}") if __name__ == "__main__": asyncio.run(main_summarizer())

文档类型	原始阅读时间	AI 总结时间	效率提升
论文（30 页）	60 分钟	30 秒	120 倍
技术文档	20 分钟	15 秒	80 倍
新闻文章	5 分钟	10 秒	30 倍

import re import subprocess import tempfile from typing import Dict, List, Optional, Tuple from enum import Enum import ast class CodeMode(Enum): GENERATE = "generate" EXPLAIN = "explain" OPTIMIZE = "optimize" DEBUG = "debug" TEST = "test" @dataclass class CodeResult: code: str language: str explanation: str tests: Optional[str] = None warnings: List[str] = None class CodeGenerator: def __init__(self, llm_client: LLMClient): self.llm = llm_client self.quality_rules = { "security": [r"eval\s*\(", r"exec\s*\(", r"pickle\.loads?"], "performance": [r"for\s+\w+\s+in\s+range\(len\("] } async def generate(self, requirement: str, language: str = "python", mode: CodeMode = CodeMode.GENERATE, context: str = "") -> CodeResult: mode_prompts = { CodeMode.GENERATE: self._build_generate_prompt, CodeMode.EXPLAIN: self._build_explain_prompt, CodeMode.OPTIMIZE: self._build_optimize_prompt, CodeMode.DEBUG: self._build_debug_prompt, CodeMode.TEST: self._build_test_prompt, } prompt_builder = mode_prompts[mode] prompt = prompt_builder(requirement, language, context) print(f"🤖 正在生成{mode.value}...") response = await self.llm.chat([ Message(role="system", content=self._get_system_prompt(language)), Message(role="user", content=prompt) ]) code, explanation = self._parse_code_response(response, language) warnings = self._security_check(code) tests = None if mode == CodeMode.GENERATE: tests = await self._generate_tests(code, language) return CodeResult(code=code, language=language, explanation=explanation, tests=tests, warnings=warnings) def _get_system_prompt(self, language: str) -> str: return f"""你是一个专业的{language}程序员和教师。 1. 代码必须可直接运行 2. 添加必要的注释和文档字符串 3. 遵循最佳实践和 PEP8 规范 4. 包含错误处理 5. 代码后附上简洁的使用说明""" def _build_generate_prompt(self, requirement: str, language: str, context: str) -> str: if context: return f"请根据以下需求生成{language}代码。需求：{requirement}。上下文代码：```{context}```。请生成完整可运行的代码。" return f"请根据以下需求生成{language}代码。需求：{requirement}。要求：1. 代码完整且可运行 2. 包含输入验证 3. 添加清晰注释。请生成代码：" def _build_explain_prompt(self, code: str, language: str, context: str) -> str: return f"请详细解释以下{language}代码的功能和工作原理。```{code}```。请从整体功能、关键逻辑、数据结构、复杂度等方面解释。" def _build_optimize_prompt(self, code: str, language: str, context: str) -> str: return f"请优化以下{language}代码。```{code}```。目标：提升性能、改善可读性、增强健壮性。请给出优化后的代码和说明。" def _build_debug_prompt(self, code: str, language: str, context: str) -> str: error_info = context if context else "[无]" return f"请分析以下{language}代码中的问题并修复。```{code}```。错误信息：{error_info}。请给出问题分析、修复代码及预防建议。" def _build_test_prompt(self, code: str, language: str, context: str) -> str: return f"请为以下{language}代码生成完整的 pytest 测试用例。```{code}```。要求覆盖正常、边界及异常情况。只输出测试代码。" def _parse_code_response(self, response: str, language: str) -> Tuple[str, str]: code_pattern = rf"```{language}\n(.*?)```" code_match = re.search(code_pattern, response, re.DOTALL) if code_match: code = code_match.group(1).strip() explanation = response.replace(code_match.group(0), "").strip() else: code = response explanation = "无额外说明" return code, explanation def _security_check(self, code: str) -> List[str]: warnings = [] for category, patterns in self.quality_rules.items(): for pattern in patterns: if re.search(pattern, code): warnings.append(f"⚠️ 安全警告：检测到 {pattern} 使用") try: ast.parse(code) except SyntaxError as e: warnings.append(f"⚠️ 语法错误：{e}") return warnings async def _generate_tests(self, code: str, language: str) -> str: prompt = f"为以下{language}代码编写 pytest 测试。要求：函数名以 test_开头，包含正常和异常情况。只输出测试代码。\n```{code}```" response = await self.llm.chat([Message(role="user", content=prompt)]) return response async def execute_code(self, code: str, language: str = "python", timeout: int = 10) -> Dict: with tempfile.NamedTemporaryFile(mode='w', suffix=f'.{language}', delete=False) as f: f.write(code) temp_file = f.name try: result = subprocess.run(['python', temp_file], capture_output=True, text=True, timeout=timeout) return {"success": result.returncode == 0, "output": result.stdout, "error": result.stderr} except subprocess.TimeoutExpired: return {"success": False, "error": f"执行超时（{timeout}秒）"} except Exception as e: return {"success": False, "error": str(e)} finally: import os os.unlink(temp_file)

import aiohttp from typing import List, Dict, Optional from dataclasses import dataclass import re from urllib.parse import quote, urljoin import json import os @dataclass class SearchResult: title: str url: str snippet: str source: str relevance: float = 0.0 @dataclass class ResearchResult: answer: str sources: List[SearchResult] related_questions: List[str] confidence: float class SearchEngine: def __init__(self, bing_api_key: str = None): self.bing_api_key = bing_api_key or os.getenv("BING_SEARCH_API_KEY") self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"} async def search_bing(self, query: str, count: int = 10) -> List[SearchResult]: if not self.bing_api_key: return await self._search_duckduckgo(query, count) url = "https://api.bing.microsoft.com/v7.0/search" params = {"q": query, "count": count, "responseFilter": "webpages"} async with aiohttp.ClientSession() as session: async with session.get(url, params=params, headers={"Ocp-Apim-Subscription-Key": self.bing_api_key}) as response: data = await response.json() results = [] for item in data.get("webPages", {}).get("value", []): results.append(SearchResult(title=item["name"], url=item["url"], snippet=item["snippet"], source="bing")) return results async def _search_duckduckgo(self, query: str, count: int = 10) -> List[SearchResult]: url = f"https://html.duckduckgo.com/html/?q={quote(query)}" async with aiohttp.ClientSession() as session: async with session.get(url, headers=self.headers) as response: html = await response.text() from bs4 import BeautifulSoup soup = BeautifulSoup(html, 'html.parser') results = [] for result in soup.select('.result')[:count]: title_elem = result.select_one('.result__a') snippet_elem = result.select_one('.result__snippet') url_elem = result.select_one('.result__url') if title_elem and url_elem: results.append(SearchResult( title=title_elem.get_text(), url=url_elem.get('href', ''), snippet=snippet_elem.get_text() if snippet_elem else '', source="duckduckgo" )) return results async def search_stackoverflow(self, query: str, count: int = 5) -> List[SearchResult]: search_query = f"site:stackoverflow.com {query}" results = await self._search_duckduckgo(search_query, count) for r in results: r.source = "stackoverflow" return results class IntelligentResearcher: def __init__(self, llm_client: LLMClient, search_engine: SearchEngine): self.llm = llm_client self.search = search_engine async def research(self, question: str, depth: int = 1, sources: List[str] = None) -> ResearchResult: print(f"🔍 正在研究：{question}") search_tasks = [] if not sources or "google" in sources: search_tasks.append(self.search.search_bing(question)) if not sources or "stackoverflow" in sources: search_tasks.append(self.search.search_stackoverflow(question)) search_results_list = await asyncio.gather(*search_tasks) all_results = [] for results in search_results_list: all_results.extend(results) print(f"📊 找到 {len(all_results)} 条相关结果") if depth > 1: all_results = await self._fetch_page_contents(all_results[:5]) answer = await self._synthesize_answer(question, all_results) related = await self._generate_related_questions(question, answer) confidence = self._calculate_confidence(all_results) return ResearchResult(answer=answer, sources=all_results[:5], related_questions=related, confidence=confidence) async def _fetch_page_contents(self, results: List[SearchResult]) -> List[SearchResult]: async def fetch_content(result: SearchResult): try: async with aiohttp.ClientSession() as session: async with session.get(result.url, headers=self.search.headers, timeout=aiohttp.ClientTimeout(total=10)) as response: html = await response.text() from bs4 import BeautifulSoup soup = BeautifulSoup(html, 'html.parser') for script in soup(['script', 'style', 'nav', 'footer']): script.decompose() text = soup.get_text(separator='\n', strip=True) result.snippet = text[:2000] + "..." result.relevance = 1.0 except Exception as e: print(f" ⚠️ 获取失败 {result.url}: {e}") tasks = [fetch_content(r) for r in results] await asyncio.gather(*tasks) return results async def _synthesize_answer(self, question: str, results: List[SearchResult]) -> str: context = "\n\n".join([f"来源{i+1}: {r.title}\n{r.snippet}\n链接：{r.url}" for i, r in enumerate(results[:5])]) prompt = f"基于以下搜索结果回答问题，要求准确引用信息来源，综合多个来源的信息。如果信息冲突，说明不同观点。问题：{question}\n搜索结果：{context}\n请给出详细答案：" answer = await self.llm.chat([ Message(role="system", content="你是一个专业的研究助手，擅长综合多源信息给出准确答案"), Message(role="user", content=prompt) ]) return answer async def _generate_related_questions(self, question: str, answer: str) -> List[str]: prompt = f"基于以下问答，生成 3-5 个相关的深入研究问题。问题：{question}\n答案：{answer[:500]}...\n请生成相关问题，每行一个：" response = await self.llm.chat([Message(role="user", content=prompt)]) return [line.strip() for line in response.split('\n') if line.strip() and not line.startswith('-')][:5] def _calculate_confidence(self, results: List[SearchResult]) -> float: if not results: return 0.0 base_confidence = min(1.0, len(results) / 10) has_docs = any(r.source == "docs" for r in results) if has_docs: base_confidence = min(1.0, base_confidence + 0.2) return round(base_confidence, 2) async def main_researcher(): llm = LLMClient() search = SearchEngine() researcher = IntelligentResearcher(llm, search) result = await researcher.research(question="Python 中 asyncio 和 multiprocessing 的区别是什么？", depth=2) print(f"\n{'='*60}\n📚 研究结果\n{'='*60}") print(f"\n置信度：{result.confidence*100}%\n") print(f"答案:\n{result.answer}\n") print("📖 参考来源:") for i, source in enumerate(result.sources, 1): print(f"{i}. {source.title}\n {source.url}\n 来源：{source.source}\n") if __name__ == "__main__": asyncio.run(main_researcher())

import argparse import asyncio from pathlib import Path import json class AIToolsCLI: def __init__(self): self.llm = LLMClient() self.summarizer = DocumentSummarizer(self.llm) self.code_assistant = InteractiveCodeAssistant(self.llm) self.researcher = IntelligentResearcher(self.llm, SearchEngine()) async def run(self): parser = argparse.ArgumentParser(description="AI 工具集 - 你的智能助手") subparsers = parser.add_subparsers(dest='command', help='可用命令') sum_parser = subparsers.add_parser('summarize', help='总结文档') sum_parser.add_argument('file', help='文件路径或 URL') sum_parser.add_argument('-t', '--type', default='file', choices=['file', 'url'], help='输入类型') code_parser = subparsers.add_parser('code', help='生成/处理代码') code_parser.add_argument('prompt', help='需求或代码') code_parser.add_argument('-m', '--mode', choices=['generate', 'explain', 'optimize', 'debug'], default='generate', help='处理模式') code_parser.add_argument('-l', '--language', default='python', help='编程语言') res_parser = subparsers.add_parser('research', help='研究问题') res_parser.add_argument('question', help='研究问题') res_parser.add_argument('-d', '--depth', type=int, default=1, choices=[1, 2, 3], help='研究深度') args = parser.parse_args() if not args.command: parser.print_help() return if args.command == 'summarize': await self._cmd_summarize(args) elif args.command == 'code': await self._cmd_code(args) elif args.command == 'research': await self._cmd_research(args) async def _cmd_summarize(self, args): print(f"📖 正在总结：{args.file}") result = await self.summarizer.summarize(source=args.file, source_type=args.type) output = f"# {result.title}\n**统计信息**\n- 字数：{result.word_count}\n- 预计阅读时间：{result.reading_time} 分钟\n\n**关键要点**\n" + "\n".join(f'{i+1}. {p}' for i, p in enumerate(result.key_points)) + f"\n\n**总结**\n{result.summary}" print(output) async def _cmd_code(self, args): print(f"💻 正在处理：{args.prompt[:50]}...") result = await self.code_assistant.generator.generate(requirement=args.prompt, language=args.language, mode=CodeMode(args.mode)) print(f"\n```{args.language}\n{result.code}\n```\n") print(f"**说明**\n{result.explanation}\n") if result.warnings: print("**警告**") for w in result.warnings: print(f" {w}") async def _cmd_research(self, args): print(f"🔍 正在研究：{args.question}") result = await self.researcher.research(question=args.question, depth=args.depth) print(f"\n# 研究结果\n**置信度**: {result.confidence*100}%\n## 答案\n{result.answer}\n## 参考来源") for i, source in enumerate(result.sources, 1): print(f"{i}. **{source.title}**\n 链接：{source.url}\n 来源：{source.source}\n") async def main(): cli = AIToolsCLI() await cli.run() if __name__ == "__main__": asyncio.run(main())

工具	核心价值	适用场景
智能文档总结器	10 秒读完 100 页	论文研读、报告分析
AI 代码生成器	说人话写代码	快速原型、学习参考
智能资料助手	秒速精准检索	技术调研、问题解决

Python 实战：构建文档总结、代码生成与智能检索助手