DDPG算法详解
算法概述
DDPG(Deep Deterministic Policy Gradient)是一种适用于连续动作空间的深度强化学习算法。它将DQN的经验回放与目标网络等稳定化技术,同确定性策略梯度(DPG)结合,采用Actor-Critic架构:Actor直接输出连续动作,Critic估计Q值,使算法在连续控制任务中表现出色。
核心实现代码
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
class DDPGAgent:
    """DDPG agent for continuous control.

    Deterministic actor maps state -> action in [-1, 1]; critic estimates
    Q(s, a) from the concatenated state-action vector. Target copies of both
    networks are updated by Polyak averaging to stabilize the bootstrapped
    TD target (the core DDPG recipe).
    """

    def __init__(self, state_dim, action_dim, actor_lr=0.0001, critic_lr=0.001,
                 gamma=0.99, tau=0.005):
        """
        Args:
            state_dim: dimensionality of the (flat) state vector.
            action_dim: dimensionality of the continuous action vector.
            actor_lr / critic_lr: Adam learning rates for each network.
            gamma: discount factor for the TD target.
            tau: soft-update coefficient (target <- tau*online + (1-tau)*target).
        """
        # Stored so select_action() knows the exploration-noise shape
        # (the original referenced an undefined self.actor_output_dim).
        self.action_dim = action_dim

        # Online networks.
        self.actor = self._build_actor(state_dim, action_dim)
        self.critic = self._build_critic(state_dim, action_dim)

        # Target networks: identical architecture, initialized to the same
        # weights, then updated slowly via _soft_update(). Without them the
        # TD target chases a moving estimate and training diverges easily.
        self.actor_target = self._build_actor(state_dim, action_dim)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target = self._build_critic(state_dim, action_dim)
        self.critic_target.load_state_dict(self.critic.state_dict())

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)
        self.gamma = gamma
        self.tau = tau
        # NOTE(review): ReplayBuffer is defined elsewhere in the project;
        # assumed to expose __len__, sample(batch_size) -> 5-tuple of arrays.
        self.memory = ReplayBuffer(100000, state_dim, action_dim)
        self.batch_size = 128

    @staticmethod
    def _build_actor(state_dim, action_dim):
        """Actor MLP: state -> deterministic action, Tanh-bounded to [-1, 1]."""
        return nn.Sequential(
            nn.Linear(state_dim, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, action_dim),
            nn.Tanh(),
        )

    @staticmethod
    def _build_critic(state_dim, action_dim):
        """Critic MLP: concat(state, action) -> scalar Q value."""
        return nn.Sequential(
            nn.Linear(state_dim + action_dim, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, 1),
        )

    def select_action(self, state, noise=0.1):
        """Return a clipped action for `state`, with Gaussian exploration noise.

        Args:
            state: 1-D state vector (array-like of length state_dim).
            noise: std-dev of the additive exploration noise; 0 disables it.

        Returns:
            np.ndarray of shape (action_dim,), clipped to [-1, 1].
        """
        state = torch.FloatTensor(state)
        with torch.no_grad():  # inference only; no graph needed
            action = self.actor(state).numpy()
        action = action + np.random.normal(0, noise, size=self.action_dim)
        # Noise can push values outside the Tanh range; clip back.
        return np.clip(action, -1, 1)

    def train(self):
        """Run one gradient step on critic and actor, then soft-update targets.

        No-op until the replay buffer holds at least one full batch.
        """
        if len(self.memory) < self.batch_size:
            return

        # Sample a batch of transitions from replay.
        states, actions, rewards, next_states, dones = self.memory.sample(self.batch_size)
        states = torch.FloatTensor(states)
        actions = torch.FloatTensor(actions)
        next_states = torch.FloatTensor(next_states)
        # Reshape to (B, 1) so they broadcast against the (B, 1) Q outputs;
        # a 1-D tensor here would silently broadcast to (B, B).
        rewards = torch.FloatTensor(rewards).reshape(-1, 1)
        dones = torch.FloatTensor(dones).reshape(-1, 1)

        # TD target from the *target* networks; no gradient flows through it.
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            next_q_values = self.critic_target(torch.cat([next_states, next_actions], dim=1))
            target_q = rewards + (1 - dones) * self.gamma * next_q_values

        # Critic update: regress Q(s, a) toward the TD target.
        current_q = self.critic(torch.cat([states, actions], dim=1))
        critic_loss = nn.MSELoss()(current_q, target_q)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
        self.critic_optimizer.step()

        # Actor update: ascend Q(s, actor(s)) -- the critic expects the
        # concatenated state-action tensor, not two positional arguments.
        actor_loss = -self.critic(torch.cat([states, self.actor(states)], dim=1)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5)
        self.actor_optimizer.step()

        # Soft (Polyak) update of both target networks.
        self._soft_update(self.actor_target, self.actor)
        self._soft_update(self.critic_target, self.critic)

    def _soft_update(self, target, source):
        """In-place Polyak averaging: target <- tau*source + (1-tau)*target."""
        with torch.no_grad():
            for t_param, s_param in zip(target.parameters(), source.parameters()):
                t_param.mul_(1.0 - self.tau).add_(self.tau * s_param)
算法特点
- 确定性策略:Actor直接输出连续动作,而非动作分布
- 目标网络:使用独立的target网络稳定训练
- 经验回放:存储和复用历史经验
- 噪声探索:添加高斯噪声进行策略探索
- 软更新:目标网络通过Polyak平均(target ← τ·online + (1−τ)·target)缓慢追踪Actor和Critic的在线网络,从而稳定TD目标