如果你想让 AI 不止停留在对话框,而是真正帮你干活,那它必须能自己调接口、查网页。这一层能力就是执行式 AI 的骨架。下面的代码和思路是我在实践过程中总结的,希望能帮你少走弯路。
一个最精简的 Agent 骨架
先看一个极简版本——它用大模型理解任务,然后依次执行规划好的步骤。
class AIAgent:
    """Minimal plan-and-execute agent driven by an LLM and a list of tools.

    The agent asks the LLM to restate the task goal, asks it for a
    line-per-step plan, runs each step through the first tool that claims it,
    and finally asks the LLM to summarize the collected results.

    Args:
        llm: Object exposing ``generate(prompt) -> str``.
        tools: Optional iterable of tool objects. Each tool is used through
            ``can_handle(step)``, ``execute(step)`` and ``name``.
        max_replans: Upper bound on how many times a failed step may trigger
            a new plan during a single ``execute`` call. Prevents an endless
            fail/replan cycle when a step can never succeed.
    """

    def __init__(self, llm, tools=None, max_replans=3):
        self.llm = llm
        self.tools = tools or []
        # One record per executed step: {'step', 'tool', 'result'}.
        self.memory = []
        self.max_replans = max_replans

    def execute(self, task):
        """Run ``task`` end to end and return the LLM-written summary.

        When a step fails verification, the remaining steps are replaced by
        the freshly generated plan. Once ``max_replans`` replans have been
        used, the next failure stops execution and whatever results were
        gathered so far are summarized.
        """
        understanding = self._understand(task)
        plan = self._plan(understanding)
        results = []
        replans_left = self.max_replans
        position = 0
        # An explicit cursor is used instead of ``for step in plan`` because
        # rebinding ``plan`` inside a for-loop does not affect the iteration,
        # which would silently discard every replanned step.
        while position < len(plan):
            step = plan[position]
            position += 1
            result = self._execute_step(step)
            results.append(result)
            if self._verify(result):
                continue
            if replans_left <= 0:
                break
            replans_left -= 1
            plan = self._replan(step, result)
            position = 0
        return self._summarize(results)

    @staticmethod
    def _split_lines(text):
        """Return the non-blank lines of ``text`` with whitespace stripped."""
        return [line.strip() for line in text.split('\n') if line.strip()]

    def _understand(self, task):
        """Ask the LLM to state the core goal of ``task``."""
        return self.llm.generate(f"分析以下任务的核心目标:{task}")

    def _plan(self, understanding):
        """Ask the LLM for a plan and return it as a list of step strings."""
        plan_text = self.llm.generate(f"为以下目标制定执行计划:{understanding}")
        return self._split_lines(plan_text)

    def _execute_step(self, step):
        """Execute one step with the selected tool and record it in memory."""
        tool = self._select_tool(step)
        result = tool.execute(step)
        self.memory.append({'step': step, 'tool': tool.name, 'result': result})
        return result

    def _verify(self, result):
        """Return the ``'success'`` flag of ``result`` (falsy when absent).

        Results that do not offer a ``get`` method (for example a plain
        string) carry no success flag and are treated as failed instead of
        raising ``AttributeError``.
        """
        getter = getattr(result, 'get', None)
        if not callable(getter):
            return False
        return getter('success', False)

    def _replan(self, failed_step, result):
        """Ask the LLM for an adjusted plan after ``failed_step`` failed."""
        new_plan = self.llm.generate(f"步骤'{failed_step}'执行失败,结果:{result},请调整计划")
        return self._split_lines(new_plan)

    def _summarize(self, results):
        """Ask the LLM to summarize the list of step results."""
        return self.llm.generate(f"总结以下执行结果:{results}")

    def _select_tool(self, step):
        """Return the first tool whose ``can_handle(step)`` is truthy."""
        for tool in self.tools:
            if tool.can_handle(step):
                return tool
        # NOTE(review): DefaultTool is not defined in the visible snippet; it
        # must be provided by the surrounding module or this line raises
        # NameError when no tool matches.
        return DefaultTool()
这个类本身没什么魔法,关键在于把 LLM 的生成结果当成分步指令,用工具去落地。如果某一步失败,就重新规划。注意:示例中的 DefaultTool 是没有工具能处理当前步骤时的兜底工具,上面的代码并没有给出它的定义,需要你自行实现(至少提供 name 属性和 execute 方法)。
让 Agent 具备'思考-行动'循环:ReAct
更实际一点的框架是 ReAct(Reasoning + Acting),它会输出思考过程,再选择工具,观察结果,继续循环。
class ReActAgent:
    """Agent that alternates LLM "thought" steps with tool calls (ReAct loop).

    Each iteration asks the LLM for a thought, stops if the thought signals
    completion, otherwise asks the LLM which tool to call, runs that tool and
    appends thought, action and observation to the running context.

    Args:
        llm: Object exposing ``generate(prompt) -> str``.
        tools: Iterable of tool objects with a ``name`` attribute and an
            ``execute(action_input)`` method; they are indexed by ``name``.
        max_iterations: Maximum number of think/act rounds per ``run`` call.
    """

    # Prefixes that introduce the final answer inside a thought. Both colon
    # widths are accepted because the prompt in ``_think`` shows the ASCII
    # form while models answering in Chinese often emit the full-width one.
    _ANSWER_MARKERS = ("任务完成:", "任务完成:", "Final Answer:")
    _TOOL_KEYWORDS = ("工具", "tool")
    _INPUT_KEYWORDS = ("参数", "input")
    _COLONS = (":", ":")

    def __init__(self, llm, tools, max_iterations=10):
        self.llm = llm
        self.tools = {tool.name: tool for tool in tools}
        self.max_iterations = max_iterations

    def run(self, task):
        """Drive the loop for ``task``; return the answer or a give-up message."""
        context = f"任务:{task}\n"
        for _ in range(self.max_iterations):
            thought = self._think(context)
            print(f"[思考] {thought}")
            if "任务完成" in thought or "Final Answer:" in thought:
                return self._extract_answer(thought)
            action, action_input = self._decide_action(thought)
            print(f"[行动] {action}({action_input})")
            observation = self._observe(action, action_input)
            print(f"[观察] {observation}")
            context += f"\n思考:{thought}\n行动:{action}({action_input})\n观察:{observation}"
        return "达到最大迭代次数,任务未完成"

    def _think(self, context):
        """Ask the LLM what to do next given the accumulated context."""
        prompt = f""" {context} 请思考下一步应该做什么。如果任务已完成,请回答"任务完成:[结果]" """
        return self.llm.generate(prompt)

    def _decide_action(self, thought):
        """Ask the LLM for a tool and its input; return ``(action, input)``."""
        response = self.llm.generate(f"根据思考'{thought}',选择要执行的工具和参数")
        return self._parse_action(response)

    def _observe(self, action, action_input):
        """Run the named tool, or return an unknown-tool message."""
        if action in self.tools:
            return self.tools[action].execute(action_input)
        return f"未知工具:{action}"

    def _extract_answer(self, thought):
        """Return the text following the last answer marker in ``thought``.

        If no marker with a colon is present, the whole thought is returned
        with surrounding whitespace removed.
        """
        answer_start = -1
        for marker in self._ANSWER_MARKERS:
            found = thought.rfind(marker)
            if found != -1:
                answer_start = max(answer_start, found + len(marker))
        if answer_start == -1:
            return thought.strip()
        return thought[answer_start:].strip()

    @classmethod
    def _value_after(cls, line, keywords):
        """Return the text after the first ``<keyword><colon>`` in ``line``.

        Keywords are matched case-insensitively and either colon width is
        accepted. Only the first matching colon splits the line, so values
        that themselves contain colons (URLs, times) stay intact. Returns
        ``None`` when no keyword/colon pair is found.
        """
        for index, char in enumerate(line):
            if char in cls._COLONS and line[:index].lower().endswith(keywords):
                return line[index + 1:].strip()
        return None

    def _parse_action(self, response):
        """Parse tool name and input from an LLM reply.

        Lines are scanned in order and later matches overwrite earlier ones.
        Defaults are ``"default"`` and ``""`` when nothing matches.
        """
        action = "default"
        action_input = ""
        for line in response.strip().split('\n'):
            tool_name = self._value_after(line, self._TOOL_KEYWORDS)
            if tool_name is not None:
                action = tool_name
            argument = self._value_after(line, self._INPUT_KEYWORDS)
            if argument is not None:
                action_input = argument
        return action, action_input
配套的工具类可以很简单:
class FileTool:
    """Placeholder file tool: performs no I/O and only reports completion."""

    name = "file_tool"

    def execute(self, input_data):
        """Return a completion message that embeds ``input_data``."""
        rendered = f"{input_data}"
        return "文件操作完成:" + rendered
class WebTool:
    """Placeholder web tool: sends no request and only reports completion."""

    name = "web_tool"

    def execute(self, input_data):
        """Return a completion message that embeds ``input_data``."""
        rendered = f"{input_data}"
        return "网络请求完成:" + rendered
class MockLLM:
    """Canned-response stand-in for a real LLM, keyed on prompt keywords."""

    def generate(self, prompt):
        """Return a fixed reply chosen by which keyword ``prompt`` contains.

        The tool-selection keyword is tested first: the agent's
        decide-action prompt contains both "思考" and "选择", so testing
        "思考" first would make the tool-selection reply unreachable for it.
        """
        if "选择" in prompt:
            return "工具:web_tool\n参数:搜索 AI Agent"
        if "思考" in prompt:
            return "我需要先搜索相关信息"
        return "处理完成"
怎么用起来:从选模型到控制成本
模型选择
我现在通常这样看:
- 简单任务用性价比高的模型(比如 GPT-3.5 或国产小模型),响应快、便宜;
- 复杂推理或需要大量代码生成的场景,上 GPT-4 或 Claude;
- 数据敏感的本地部署就选 LLaMA 或 Qwen 系列。
评估效果
评估不仅仅是看跑通了多少个用例,还得关注耗时、步骤数和用户满意度。下面是一个简单的评估框架(其中 rate_quality 是给结果质量打分的函数,这里没有给出实现,需要你按业务自行定义):
def evaluate_agent(agent, test_cases):
    """Run ``agent`` over ``test_cases`` and return averaged metrics.

    Args:
        agent: Object exposing ``execute(task)`` and a ``memory`` list that
            grows as steps are executed.
        test_cases: Iterable of dicts with ``'task'`` and ``'expected'`` keys.

    Returns:
        Dict with ``success_rate``, ``avg_time`` (seconds), ``avg_steps`` and
        ``user_satisfaction``. All values are 0 when there are no test cases.
    """
    # Local import keeps this snippet runnable without a module-level import.
    import time

    metrics = {'success_rate': 0, 'avg_time': 0, 'avg_steps': 0, 'user_satisfaction': 0}
    results = []
    for case in test_cases:
        # agent.memory accumulates across calls, so the per-case step count
        # is the growth during this call, not the total length.
        steps_before = len(agent.memory)
        start = time.perf_counter()
        result = agent.execute(case['task'])
        elapsed = time.perf_counter() - start
        results.append({
            'success': result == case['expected'],
            'time': elapsed,
            'steps': len(agent.memory) - steps_before,
            # NOTE(review): rate_quality is not defined in the visible
            # snippet; it must be supplied by the surrounding module.
            'quality': rate_quality(result, case['expected']),
        })
    if not results:
        # Nothing was run: report zeros instead of dividing by zero.
        return metrics
    total = len(results)
    metrics['success_rate'] = sum(r['success'] for r in results) / total
    metrics['avg_time'] = sum(r['time'] for r in results) / total
    metrics['avg_steps'] = sum(r['steps'] for r in results) / total
    metrics['user_satisfaction'] = sum(r['quality'] for r in results) / total
    return metrics
成本控制
优化提示词以减少 token 消耗,用缓存避免重复调用;权限最小化,敏感操作需二次确认。
别踩这些坑
我曾见过一个团队为了'全面自动化'把 Agent 能力吹得太大,结果上线后各种边界问题,又没有人工兜底,项目最终搁浅。教训很简单:不要为了 AI 而 AI;明确场景边界,预留人工干预入口。
进一步学习
- ReAct 论文:ReAct: Synergizing Reasoning and Acting in Language Models(Yao 等,ICLR 2023)
- Toolformer 论文:Toolformer: Language Models Can Teach Themselves to Use Tools(语言模型自学使用工具,Schick 等,2023)
- AutoGPT 源码:https://github.com/Significant-Gravitas/AutoGPT
- LangChain 文档:https://python.langchain.com


