"""《PPO 算法的代码》 时间:2024.12 环境:gym 作者:不去幼儿园 """import torch # Import PyTorch, a popular machine learning libraryimport torch.nn as nn # Import the neural network moduleimport torch.optim as optim # Import optimization algorithmsfrom torch.distributions import Categorical # Import Categorical for probabilistic action samplingimport numpy as np # Import NumPy for numerical computationsimport gym # Import OpenAI Gym for environment simulation
逐行解释 PPO 代码和公式
以下是对用 PyTorch 实现的 PPO 算法代码的详细解释,逐行结合公式进行解析:
1. Actor-Critic 神经网络
# Define Actor-Critic Network
class ActorCritic(nn.Module):
    """Actor-critic model with one shared feature layer and two output heads.

    The actor head turns the shared features into a probability distribution
    over discrete actions; the critic head turns them into a scalar state value.

    Args:
        state_dim: Size of the input state vector.
        action_dim: Number of discrete actions (size of the actor output).
        hidden_dim: Width of the shared hidden layer. Defaults to 128.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()
        # Shared feature extractor feeding both heads.
        self.shared_layer = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
        )
        # Policy head: softmax over the last dimension so the outputs sum to 1.
        self.actor = nn.Sequential(
            nn.Linear(hidden_dim, action_dim),
            nn.Softmax(dim=-1),
        )
        # Value head: a single scalar per input state.
        self.critic = nn.Linear(hidden_dim, 1)

    def forward(self, state):
        """Run the network on ``state``.

        Args:
            state: Float tensor whose last dimension is ``state_dim``.

        Returns:
            Tuple ``(action_probs, state_value)``: the actor's action
            probabilities and the critic's value estimate.
        """
        shared = self.shared_layer(state)
        action_probs = self.actor(shared)
        state_value = self.critic(shared)
        return action_probs, state_value
# Memory to store experiences
class Memory:
    """Rollout buffer holding the agent's experience between policy updates.

    Attributes are parallel lists, one entry per environment step:
    ``states``, ``actions``, ``logprobs``, ``rewards`` and ``is_terminals``.
    """

    def __init__(self):
        # Start from the same empty state that ``clear`` produces.
        self.clear()

    def clear(self):
        """Drop all stored experience by rebinding every buffer to a new list."""
        self.states = []        # Observed states
        self.actions = []       # Actions taken
        self.logprobs = []      # Log probabilities of the actions taken
        self.rewards = []       # Rewards received
        self.is_terminals = []  # Terminal-state flags
Memory 类用于存储一个 episode 的经验数据:
states: 状态
actions: 动作
logprobs: 动作的对数概率
rewards: 即时奖励
is_terminals: 是否为终止状态(布尔值)
作用:为后续策略更新提供样本数据。
3. PPO 初始化
# PPO Agent class
class PPO:
    """PPO agent holding the current policy, a frozen copy of it, and an optimizer.

    Args:
        state_dim: Size of the state vector passed to ``ActorCritic``.
        action_dim: Number of discrete actions passed to ``ActorCritic``.
        lr: Learning rate for the Adam optimizer.
        gamma: Discount factor for rewards.
        eps_clip: Clipping parameter for PPO.
        K_epochs: Number of optimization epochs per update.
        device: Device for both networks. When ``None`` (the default), CUDA is
            used if available, otherwise the CPU.
    """

    def __init__(self, state_dim, action_dim, lr=0.002, gamma=0.99, eps_clip=0.2, K_epochs=4, device=None):
        # Resolve the device here rather than relying on a module-level global
        # being defined before the agent is constructed.
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = torch.device(device)

        # Current policy and the optimizer that updates it.
        self.policy = ActorCritic(state_dim, action_dim).to(self.device)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)

        # Old policy: starts as an exact copy of the current policy's weights.
        self.policy_old = ActorCritic(state_dim, action_dim).to(self.device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()  # Mean squared error loss for critic updates
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
policy:当前策略网络,用于输出动作概率和状态值。
gamma:折扣因子,用于奖励的时间衰减。
eps_clip:剪辑阈值 $\epsilon$,用于限制策略更新幅度。
policy_old:旧策略网络,用于计算概率比率 $r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\text{old}}}(a_t \mid s_t)}$。
4. 动作选择
# Method of the PPO agent: it takes ``self`` and is called as
# ``ppo.select_action(state, memory)``.
def select_action(self, state, memory):
    """Sample an action from the old policy and record the step in ``memory``.

    Args:
        state: Current observation (array-like of floats).
        memory: Buffer with ``states``, ``actions`` and ``logprobs`` lists;
            one entry is appended to each.

    Returns:
        The sampled action as a Python scalar.
    """
    # Put the observation on the same device as the old policy's weights.
    policy_device = next(self.policy_old.parameters()).device
    state = torch.tensor(state, dtype=torch.float32, device=policy_device)

    # Acting needs no gradients; without no_grad every stored log-prob would
    # keep an autograd graph alive for as long as it sits in memory.
    with torch.no_grad():
        action_probs, _ = self.policy_old(state)
        dist = Categorical(action_probs)
        action = dist.sample()
        action_logprob = dist.log_prob(action)

    memory.states.append(state)
    memory.actions.append(action)
    memory.logprobs.append(action_logprob)
    return action.item()
# Hyperparameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Use GPU if available
env = gym.make("CartPole-v1")  # Initialize CartPole environment
state_dim = env.observation_space.shape[0]  # Dimension of state space
action_dim = env.action_space.n  # Number of possible actions
lr = 0.002  # Learning rate
gamma = 0.99  # Discount factor
eps_clip = 0.2  # Clipping parameter
K_epochs = 4  # Number of epochs for policy update
max_episodes = 1000  # Maximum number of episodes
max_timesteps = 300  # Maximum timesteps per episode

# PPO Training
ppo = PPO(state_dim, action_dim, lr, gamma, eps_clip, K_epochs)  # Initialize PPO agent
memory = Memory()  # Initialize memory

for episode in range(1, max_episodes + 1):
    # Newer Gym releases return (observation, info) from reset(); older ones
    # return the observation alone. Accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    total_reward = 0

    for _ in range(max_timesteps):
        action = ppo.select_action(state, memory)  # Select action using PPO

        # Newer Gym releases return a 5-tuple with separate terminated and
        # truncated flags; older ones return a 4-tuple with a single done flag.
        step_result = env.step(action)
        if len(step_result) == 5:
            state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            state, reward, done, _ = step_result

        memory.rewards.append(reward)  # Store reward in memory
        memory.is_terminals.append(done)  # Store terminal state flag in memory
        total_reward += reward

        if done:
            break

    # One policy update per episode, then discard the used rollout.
    # NOTE(review): PPO.update is not defined in the visible part of this file;
    # confirm it exists before running.
    ppo.update(memory)
    memory.clear()
    print(f"Episode {episode}, Total Reward: {total_reward}")  # Print episode statistics

env.close()  # Close the environment