深入理解多智能体近端策略优化（MAPPO）算法与调参 | 极客日志

深入理解多智能体近端策略优化（MAPPO）算法与调参 | 极客日志

特性	On-Policy	Off-Policy
样本利用率	低（一次性使用）	高（可重复利用）
缓冲区	无长期缓冲区	使用 replay buffer
策略一致性	必须与当前策略一致	可利用任意策略生成的数据
学习效率	依赖高频环境交互	适合数据采样成本高的场景
收敛性	理论收敛性强	收敛性依赖数据分布
应用场景	实时决策、多智能体	复杂连续动作、高成本环境

"""Initialize policy network πθ with parameters θ Initialize value network Vϕ with parameters ϕ for iteration = 1, 2, ..., N do # Collect trajectories (states, actions, rewards) using current policy πθ trajectories = [] for episode = 1, ..., M do state = environment.reset() episode_trajectory = [] for t = 1, ..., T do action = πθ(state) # Sample action from current policy next_state, reward, done = environment.step(action) episode_trajectory.append((state, action, reward)) if done: break state = next_state trajectories.append(episode_trajectory) # Compute advantages using GAE or Monte Carlo returns advantages, returns = compute_advantages(trajectories, Vϕ) # Update policy πθ using clipped surrogate objective for _ in range(policy_update_epochs): policy_loss = compute_policy_loss(πθ, advantages) θ = θ - α_policy * ∇(policy_loss) # Update value network Vϕ for _ in range(value_update_epochs): value_loss = compute_value_loss(Vϕ, returns) ϕ = ϕ - α_value * ∇(value_loss)

"""Initialize Q-network Qθ with parameters θ Initialize target network Qθ' with parameters θ' = θ Initialize replay buffer D for iteration = 1, 2, ..., N do state = environment.reset() for t = 1, ..., T do # ε-greedy policy for exploration if random() < ε: action = random_action() else: action = argmax(Qθ(state)) next_state, reward, done = environment.step(action) # Store transition in replay buffer D.append((state, action, reward, next_state, done)) # Sample minibatch from replay buffer minibatch = random_sample(D, batch_size) # Compute target values target_values = [] for (s, a, r, s_, done) in minibatch: if done: target = r else: target = r + γ * max(Qθ'(s')) target_values.append(target) # Update Q-network Qθ loss = compute_loss(Qθ, minibatch, target_values) θ = θ - α * ∇(loss) # Update state if done: break state = next_state # Periodically update target network if iteration % target_update_freq == 0: θ' = θ