The A3C Algorithm in Detail
Algorithm Overview
A3C (Asynchronous Advantage Actor-Critic, Mnih et al., 2016) is an on-policy actor-critic method. Instead of a replay buffer, it runs several parallel workers, each interacting with its own copy of the environment and asynchronously pushing gradients to a shared global network; the parallel, decorrelated experience is what stabilizes training. A3C was introduced as an alternative to DQN's experience replay rather than as a variant of DQN, and its main practical gain is much shorter wall-clock training time on multi-core CPUs.
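As a summary sketch in the notation of the original paper (with \pi_\theta the policy and V_\phi the value function), each worker estimates a k-step advantage over a short rollout segment and pushes the corresponding policy gradient to the shared parameters:

A(s_t, a_t) = \sum_{i=0}^{k-1} \gamma^i r_{t+i} + \gamma^k V_\phi(s_{t+k}) - V_\phi(s_t)

\nabla_\theta J \approx \nabla_\theta \log \pi_\theta(a_t \mid s_t) \, A(s_t, a_t)

The critic is trained in parallel by regressing V_\phi(s_t) towards the same k-step return.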
Core Implementation
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

class A3CAgent:
    def __init__(self, state_dim, action_dim, lr=0.0001, gamma=0.99):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.lr = lr
        # Actor network: maps a state to a probability distribution over actions
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 256),
            nn.ReLU(),
            nn.Linear(256, action_dim),
            nn.Softmax(dim=-1)
        )
        # Critic network: maps a state to a scalar state-value estimate
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr)

    def select_action(self, state):
        # Sample an action from the current policy; no gradients are
        # needed here, since train() re-evaluates the actor on stored states
        state = torch.FloatTensor(state)
        with torch.no_grad():
            probs = self.actor(state)
        dist = Categorical(probs)
        action = dist.sample()
        return action.item()
    def train(self, rollout):
        # A3C is on-policy: each worker updates from the short rollout it
        # just collected, not from a replay buffer.
        # rollout: list of (state, action, reward, next_state, done) tuples
        states = torch.FloatTensor(np.array([t[0] for t in rollout]))
        actions = torch.LongTensor([t[1] for t in rollout])
        rewards = [t[2] for t in rollout]
        dones = [t[4] for t in rollout]

        # n-step returns, bootstrapping from V(s_T) if the rollout
        # did not end in a terminal state
        with torch.no_grad():
            last_next_state = torch.FloatTensor(rollout[-1][3])
            R = 0.0 if dones[-1] else self.critic(last_next_state).item()
        returns = []
        for r, done in zip(reversed(rewards), reversed(dones)):
            R = r + self.gamma * R * (1.0 - float(done))
            returns.insert(0, R)
        returns = torch.FloatTensor(returns)

        # Update the Critic towards the n-step returns
        value_loss = nn.MSELoss()(self.critic(states).squeeze(-1), returns)
        self.critic_optimizer.zero_grad()
        value_loss.backward()
        self.critic_optimizer.step()

        # Advantage: n-step return minus the state-value baseline
        with torch.no_grad():
            advantages = returns - self.critic(states).squeeze(-1)

        # Update the Actor with the advantage-weighted policy gradient;
        # the actor outputs probabilities, so pass probs (not logits)
        probs = self.actor(states)
        dist = Categorical(probs)
        log_probs = dist.log_prob(actions)
        actor_loss = -(log_probs * advantages).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5)
        self.actor_optimizer.step()
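A minimal single-worker usage sketch, assuming the classic Gym API (observation-only reset(), four-value step()). CartPole and the 20-step segment length are illustrative choices, not part of the algorithm:

import gym  # assumption: classic (pre-0.26) Gym API

env = gym.make("CartPole-v1")
agent = A3CAgent(state_dim=4, action_dim=2)

state, rollout = env.reset(), []
for step in range(10000):
    action = agent.select_action(state)
    next_state, reward, done, _ = env.step(action)
    rollout.append((state, action, reward, next_state, done))
    state = env.reset() if done else next_state
    if len(rollout) == 20:  # update on every 20-step segment
        agent.train(rollout)
        rollout = []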
Key Features
- Asynchronous updates: multiple worker processes, each with its own copy of the environment, compute gradients in parallel and asynchronously apply them to a shared global network (see the sketch after this list)
- Advantage actor-critic: the policy is updated with the advantage (n-step return minus the value baseline) rather than the raw return, which reduces the variance of the gradient estimate
- No replay buffer: A3C is on-policy; decorrelation comes from parallel workers rather than from storing and resampling old experience
- No target network: the online critic provides the bootstrap values; parallelism, not a frozen network copy, is what stabilizes training
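A minimal sketch of the asynchronous part itself, using torch.multiprocessing. The global networks live in shared memory; each worker syncs its local copy, collects a short rollout, computes gradients locally, and pushes them to the shared parameters. Here make_env and collect_rollout are hypothetical helpers standing in for environment setup and the rollout loop above, and a production version would also share the Adam statistics across processes (a "SharedAdam"):

import torch.multiprocessing as mp

def push_gradients(local_net, global_net, global_optimizer):
    # Copy gradients computed on the local copy onto the shared
    # global parameters, then take one step of the global optimizer
    for lp, gp in zip(local_net.parameters(), global_net.parameters()):
        gp._grad = lp.grad
    global_optimizer.step()

def worker(global_agent, actor_opt, critic_opt, n_updates):
    env = make_env()  # hypothetical environment factory
    local = A3CAgent(global_agent.state_dim, global_agent.action_dim)
    for _ in range(n_updates):
        # 1. Pull the latest shared weights into the local copy
        local.actor.load_state_dict(global_agent.actor.state_dict())
        local.critic.load_state_dict(global_agent.critic.state_dict())
        # 2. Collect a short on-policy rollout with the local policy
        rollout = collect_rollout(env, local, max_steps=20)  # hypothetical helper
        # 3. Compute the losses locally (as in train()), call backward(),
        #    then push the gradients to the shared networks instead of
        #    stepping the local optimizers:
        #    push_gradients(local.actor, global_agent.actor, actor_opt)
        #    push_gradients(local.critic, global_agent.critic, critic_opt)

if __name__ == "__main__":
    global_agent = A3CAgent(state_dim=4, action_dim=2)
    global_agent.actor.share_memory()   # parameters visible to all workers
    global_agent.critic.share_memory()
    actor_opt = optim.Adam(global_agent.actor.parameters(), lr=1e-4)
    critic_opt = optim.Adam(global_agent.critic.parameters(), lr=1e-4)
    procs = [mp.Process(target=worker, args=(global_agent, actor_opt, critic_opt, 1000))
             for _ in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()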