import os
import pickle
from copy import deepcopy

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

from environment import ESSEnv
from tools import get_episode_return, test_one_episode, optimization_base_result


class ActorPPO(nn.Module):
    def __init__(self, mid_dim, state_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
                                 nn.Linear(mid_dim, action_dim))
        # the logarithm (log) of the standard deviation (std) of the action, a trainable parameter
        self.a_logstd = nn.Parameter(torch.zeros((1, action_dim)) - 0.5, requires_grad=True)
        self.sqrt_2pi_log = np.log(np.sqrt(2 * np.pi))

    def forward(self, state):
        return self.net(state).tanh()  # tanh() bounds the action mean to [-1, 1]

    def get_action(self, state):
        a_avg = self.forward(state)  # action mean
        a_std = self.a_logstd.exp()

        noise = torch.randn_like(a_avg)
        action = a_avg + noise * a_std
        return action, noise

    def get_logprob_entropy(self, state, action):
        a_avg = self.forward(state)
        a_std = self.a_logstd.exp()

        delta = ((a_avg - action) / a_std).pow(2) * 0.5
        logprob = -(self.a_logstd + self.sqrt_2pi_log + delta).sum(1)  # new_logprob

        dist_entropy = (logprob.exp() * logprob).mean()  # policy entropy
        return logprob, dist_entropy

    def get_old_logprob(self, _action, noise):  # noise = (action - a_avg) / a_std
        delta = noise.pow(2) * 0.5
        return -(self.a_logstd + self.sqrt_2pi_log + delta).sum(1)  # old_logprob
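

# Illustrative sketch added for clarity (not called anywhere in this file): ActorPPO's
# hand-rolled diagonal-Gaussian log-probability should agree with torch.distributions.Normal.
# The helper name, network sizes and batch size below are arbitrary example values, not
# settings used by the training script.
def _check_actor_logprob(mid_dim=64, state_dim=5, action_dim=3, batch=8):
    actor = ActorPPO(mid_dim, state_dim, action_dim)
    state = torch.randn(batch, state_dim)
    action, _noise = actor.get_action(state)  # un-squashed Gaussian sample

    logprob, _entropy = actor.get_logprob_entropy(state, action)

    dist = torch.distributions.Normal(actor.forward(state), actor.a_logstd.exp())
    logprob_ref = dist.log_prob(action).sum(dim=1)
    assert torch.allclose(logprob, logprob_ref, atol=1e-5)
    return logprob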


class CriticAdv(nn.Module):
    def __init__(self, mid_dim, state_dim, _action_dim):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
                                 nn.Linear(mid_dim, 1))

    def forward(self, state):
        return self.net(state)  # state-value estimate, used to compute the advantage


def smooth_rewards(rewards, window=10):
    rewards = rewards.unsqueeze(0).unsqueeze(0)  # reshape rewards to [1, 1, len] for conv1d
    kernel = torch.ones(1, 1, window, device=rewards.device) / window  # uniform moving-average kernel
    smoothed_rewards = F.conv1d(rewards, kernel, padding='valid')  # moving average
    smoothed_rewards = smoothed_rewards.squeeze(0).squeeze(0)  # drop the extra dimensions

    # keep the same length as the original reward sequence: the first window-1 rewards are left unchanged
    return torch.cat((rewards[0, 0, :window - 1], smoothed_rewards))
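

# Illustrative sketch added for clarity (not called anywhere in this file): smooth_rewards
# keeps the sequence length unchanged -- the first `window - 1` entries are passed through
# untouched and every later entry becomes a `window`-point moving average. The helper name,
# length and window below are arbitrary example values.
def _check_smooth_rewards(length=50, window=10):
    rewards = torch.randn(length)
    smoothed = smooth_rewards(rewards, window=window)
    assert smoothed.shape == rewards.shape
    assert torch.allclose(smoothed[:window - 1], rewards[:window - 1])
    assert torch.allclose(smoothed[window - 1], rewards[:window].mean(), atol=1e-6)
    return smoothed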


class AgentPPO:
    def __init__(self):
        super().__init__()
        self.state = None
        self.device = None
        self.action_dim = None
        self.get_obj_critic = None

        self.criterion = torch.nn.SmoothL1Loss()
        self.cri = self.cri_target = self.if_use_cri_target = self.cri_optim = self.ClassCri = None
        self.act = self.act_target = self.if_use_act_target = self.act_optim = self.ClassAct = None

        '''init modify'''
        self.ClassCri = CriticAdv
        self.ClassAct = ActorPPO

        self.ratio_clip = 0.2  # ratio.clamp(1 - clip, 1 + clip)
        self.lambda_entropy = 0.02  # could be 0.01~0.05
        self.lambda_gae_adv = 0.98  # could be 0.95~0.99 (Generalized Advantage Estimation, ICLR 2016)
        self.get_reward_sum = None  # self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw
        self.trajectory_list = None

    def init(self, net_dim, state_dim, action_dim, learning_rate=1e-4, if_use_gae=True, gpu_id=0):
        self.device = torch.device(f"cuda:{gpu_id}" if (torch.cuda.is_available() and (gpu_id >= 0)) else "cpu")
        self.trajectory_list = list()
        # choose whether to use GAE or the raw reward sum
        self.get_reward_sum = self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw

        self.cri = self.ClassCri(net_dim, state_dim, action_dim).to(self.device)
        self.act = self.ClassAct(net_dim, state_dim, action_dim).to(self.device) if self.ClassAct else self.cri
        self.cri_target = deepcopy(self.cri) if self.if_use_cri_target else self.cri
        self.act_target = deepcopy(self.act) if self.if_use_act_target else self.act

        self.cri_optim = torch.optim.Adam(self.cri.parameters(), learning_rate)
        self.act_optim = torch.optim.Adam(self.act.parameters(), learning_rate) if self.ClassAct else self.cri

    def select_action(self, state):
        # NOTE: the actor network is bypassed here; actions are sampled uniformly at random
        # from the action space of the module-level `env` created in the __main__ block.
        # states = torch.as_tensor((state,), dtype=torch.float32, device=self.device)
        # _, noises = self.act.get_action(states)
        actions = np.random.uniform(env.action_space.low, env.action_space.high)
        return actions, 0

    def explore_env(self, env, target_step):
        state = self.state  # the agent keeps the latest state and hands it to this method
        trajectory_temp = list()
        last_done = 0
        for i in range(target_step):
            action, noise = self.select_action(state)
            state, next_state, reward, done = env.step(np.tanh(action))
            trajectory_temp.append((state, reward, done, action, noise))
            if done:
                state = env.reset()
                last_done = i
            else:
                state = next_state
        self.state = state

        '''splice list'''
        # keep the transitions up to (and including) the last finished episode; carry the rest over
        trajectory_list = self.trajectory_list + trajectory_temp[:last_done + 1]
        self.trajectory_list = trajectory_temp[last_done:]
        return trajectory_list

    def update_net(self, buffer, batch_size, repeat_times, soft_update_tau):
        """put data extraction and network updates together"""
        with torch.no_grad():
            buf_len = buffer[0].shape[0]
            # decompose buffer data
            buf_state, buf_action, buf_noise, buf_reward, buf_mask = [ten.to(self.device) for ten in buffer]

            '''get buf_r_sum, buf_logprob'''
            bs = 4096
            buf_value = [self.cri_target(buf_state[i:i + bs]) for i in range(0, buf_len, bs)]
            buf_value = torch.cat(buf_value, dim=0)

            # no need to compute logprob, because the actor network is no longer optimized
            buf_logprob = torch.zeros_like(buf_value)  # a random policy has no meaningful logprob

            buf_r_sum, buf_advantage = self.get_reward_sum(buf_len, buf_reward, buf_mask, buf_value)
            # normalize advantage
            buf_advantage = (buf_advantage - buf_advantage.mean()) / (buf_advantage.std() + 1e-5)
            buf_advantage = smooth_rewards(buf_advantage, window=10)
            del buf_noise, buffer[:]

        '''PPO: Surrogate objective of Trust Region'''
        obj_critic = obj_actor = None

        # since the policy is random, the actor update is skipped;
        # with no policy network to update, only a plain optimization of the critic network remains
        for _ in range(int(buf_len / batch_size * repeat_times)):
            indices = torch.randint(buf_len, size=(batch_size,), requires_grad=False, device=self.device)

            state = buf_state[indices]
            action = buf_action[indices]
            r_sum = buf_r_sum[indices]
            logprob = buf_logprob[indices]
            advantage = buf_advantage[indices]

            # update only the critic network
            value = self.cri(state).squeeze(1)  # the critic predicts the total reward (value) of the state
            obj_critic = self.criterion(value, r_sum)  # value loss
            self.optim_update(self.cri_optim, obj_critic)  # update the critic parameters
            # soft-update the target critic only if a separate target network is used
            if self.cri_target is not self.cri:
                self.soft_update(self.cri_target, self.cri, soft_update_tau)

        a_std_log = getattr(self.act, 'a_logstd', torch.zeros(1))
        return obj_critic.item(), 0, a_std_log.mean().item()  # the actor is not updated, so obj_actor is 0

    def get_reward_sum_raw(self, buf_len, buf_reward, buf_mask, buf_value) -> (torch.Tensor, torch.Tensor):
        buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device)  # reward sum

        pre_r_sum = 0
        for i in range(buf_len - 1, -1, -1):
            buf_r_sum[i] = buf_reward[i] + buf_mask[i] * pre_r_sum
            pre_r_sum = buf_r_sum[i]
        buf_advantage = buf_r_sum - (buf_mask * buf_value[:, 0])
        return buf_r_sum, buf_advantage

    def get_reward_sum_gae(self, buf_len, ten_reward, ten_mask, ten_value) -> (torch.Tensor, torch.Tensor):
        buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device)  # old policy value
        buf_advantage = torch.empty(buf_len, dtype=torch.float32, device=self.device)  # advantage value

        pre_r_sum = 0.0
        pre_advantage = 0.0  # advantage value of the previous step
        for i in range(buf_len - 1, -1, -1):
            buf_r_sum[i] = ten_reward[i] + ten_mask[i] * pre_r_sum
            pre_r_sum = buf_r_sum[i]
            buf_advantage[i] = ten_reward[i] + ten_mask[i] * (pre_advantage - ten_value[i])
            pre_advantage = ten_value[i] + buf_advantage[i] * self.lambda_gae_adv
        return buf_r_sum, buf_advantage

    @staticmethod
    def optim_update(optimizer, objective):
        optimizer.zero_grad()
        objective.backward()
        optimizer.step()

    @staticmethod
    def soft_update(target_net, current_net, tau):
        for tar, cur in zip(target_net.parameters(), current_net.parameters()):
            tar.data.copy_(cur.data * tau + tar.data * (1.0 - tau))
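

# Illustrative sketch added for clarity (not called anywhere in this file): the raw reward-sum
# recursion in AgentPPO.get_reward_sum_raw is a plain discounted return,
# R_i = r_i + mask_i * R_{i+1} with mask_i = gamma * (1 - done_i).
# The helper name, network sizes and the toy trajectory below are arbitrary example values.
def _check_reward_sum_raw(gamma=0.99):
    agent = AgentPPO()
    agent.init(net_dim=64, state_dim=5, action_dim=3, gpu_id=-1)  # gpu_id=-1 forces CPU

    reward = torch.tensor([1.0, 1.0, 1.0])
    mask = torch.full((3,), gamma)  # no terminal step in this toy trajectory
    value = torch.zeros(3, 1)       # pretend the critic predicts zero everywhere

    r_sum, advantage = agent.get_reward_sum_raw(3, reward, mask, value)
    expected = torch.tensor([1.0 + gamma + gamma ** 2, 1.0 + gamma, 1.0])
    assert torch.allclose(r_sum, expected)
    assert torch.allclose(advantage, r_sum)  # advantage equals r_sum when the value estimate is zero
    return r_sum, advantage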


class Arguments:
    def __init__(self, agent=None, env=None):
        self.agent = agent  # Deep Reinforcement Learning algorithm
        self.env = env  # the environment for training
        self.cwd = None  # current working directory. None means it is set automatically
        self.if_remove = False  # remove the cwd folder? (True, False, None: ask me)
        self.visible_gpu = '0'  # for example: os.environ['CUDA_VISIBLE_DEVICES'] = '0, 2,'
        self.num_threads = 32  # cpu num for evaluating the model, torch.set_num_threads(self.num_threads)

        '''Arguments for training'''
        self.num_episode = 1000  # number of training episodes for PPO
        self.gamma = 0.995  # discount factor of future rewards
        self.learning_rate = 1e-4  # 1e-4  2 ** -14  2e-4
        self.soft_update_tau = 2 ** -8  # 1e-3  2 ** -8

        self.net_dim = 256  # the network width
        self.batch_size = 4096  # number of transitions sampled from the replay buffer
        self.repeat_times = 2 ** 5  # repeatedly update the network to keep the critic's loss small
        self.target_step = 4096  # collect target_step transitions, then update the network
        self.max_memo = self.target_step  # capacity of the replay buffer
        self.if_gae_or_raw = True  # GAE for on-policy sparse reward: Generalized Advantage Estimation

        '''Arguments for evaluation'''
        self.random_seed = 1234
        # self.random_seed_list = [1234, 2234, 3234, 4234, 5234]
        self.random_seed_list = [1]
        self.train = True
        self.save_network = True
        self.test_network = True
        self.save_test_data = True
        self.compare_with_gurobi = True
        self.plot_on = True

    def init_before_training(self):
        if self.cwd is None:
            agent_name = self.agent.__class__.__name__
            self.cwd = f'./{agent_name}'

        np.random.seed(self.random_seed)
        torch.manual_seed(self.random_seed)
        torch.set_num_threads(self.num_threads)
        torch.set_default_dtype(torch.float32)
        os.environ['CUDA_VISIBLE_DEVICES'] = str(self.visible_gpu)


def update_buffer(_trajectory):
    # NOTE: relies on the module-level `buffer` and `gamma` defined in the __main__ block
    _trajectory = list(map(list, zip(*_trajectory)))  # 2D-list transpose: split the trajectory into its 5 parts
    ten_state = torch.as_tensor(_trajectory[0])  # state tensor
    ten_reward = torch.as_tensor(_trajectory[1], dtype=torch.float32)
    # _trajectory[2] holds the done flags; replace done with a discount mask to save memory
    ten_mask = (1.0 - torch.as_tensor(_trajectory[2], dtype=torch.float32)) * gamma
    ten_action = torch.as_tensor(_trajectory[3])
    ten_noise = torch.as_tensor(_trajectory[4], dtype=torch.float32)

    buffer[:] = (ten_state, ten_action, ten_noise, ten_reward, ten_mask)  # the list stores tensors

    _steps = ten_reward.shape[0]  # how many steps are collected in all trajectories
    _r_exp = ten_reward.mean()  # the mean reward
    return _steps, _r_exp


if __name__ == '__main__':
    args = Arguments()
    reward_record = {'episode': [], 'steps': [], 'mean_episode_reward': [], 'unbalance': []}
    loss_record = {'episode': [], 'steps': [], 'critic_loss': [], 'actor_loss': [], 'entropy_loss': []}
    args.visible_gpu = '0'
    for seed in args.random_seed_list:
        args.random_seed = seed
        args.agent = AgentPPO()
        agent_name = f'{args.agent.__class__.__name__}'
        args.agent.cri_target = True
        args.env = ESSEnv()
        args.init_before_training()
        '''init agent and environment'''
        agent = args.agent
        env = args.env
        agent.init(args.net_dim, env.state_space.shape[0], env.action_space.shape[0], args.learning_rate,
                   args.if_gae_or_raw)
        cwd = args.cwd
        gamma = args.gamma
        batch_size = args.batch_size  # how much data is used for each network update
        target_step = args.target_step  # how many steps to collect before one exploration phase stops
        repeat_times = args.repeat_times  # how many update passes are run over one batch of collected data
        soft_update_tau = args.soft_update_tau
        num_episode = args.num_episode
        agent.state = env.reset()
        '''init buffer'''
        buffer = list()
        '''init training parameters'''
        # args.train = False
        # args.save_network = False
        # args.test_network = False
        # args.save_test_data = False
        # args.compare_with_gurobi = False
        # args.plot_on = False
        if args.train:
            for i_episode in range(num_episode):
                with torch.no_grad():
                    trajectory_list = agent.explore_env(env, target_step)
                    steps, r_exp = update_buffer(trajectory_list)
                critic_loss, actor_loss, entropy_loss = agent.update_net(buffer, batch_size, repeat_times,
                                                                         soft_update_tau)
                loss_record['critic_loss'].append(critic_loss)
                loss_record['actor_loss'].append(actor_loss)
                loss_record['entropy_loss'].append(entropy_loss)

                with torch.no_grad():
                    episode_reward, episode_unbalance = get_episode_return(env, agent.act, agent.device)
                    reward_record['mean_episode_reward'].append(episode_reward)
                    reward_record['unbalance'].append(episode_unbalance)
                print(f'episode: {i_episode}, reward: {episode_reward}, unbalance: {episode_unbalance}')

        act_save_path = f'{args.cwd}/actor_ram.pth'
        loss_record_path = f'{args.cwd}/loss_ram.pkl'
        reward_record_path = f'{args.cwd}/reward_ram.pkl'

        if args.save_network:
            with open(loss_record_path, 'wb') as tf:
                pickle.dump(loss_record, tf)
            with open(reward_record_path, 'wb') as tf:
                pickle.dump(reward_record, tf)
            torch.save(agent.act.state_dict(), act_save_path)
            print('actor parameters have been saved')

        if args.test_network:
            args.cwd = agent_name
            agent.act.load_state_dict(torch.load(act_save_path))
            print('parameters have been reloaded and the test begins')
            record = test_one_episode(env, agent.act, agent.device)
            eval_data = pd.DataFrame(record['system_info'])
            eval_data.columns = ['time_step', 'price', 'load', 'action', 'real_action', 'soc', 'battery',
                                 'gen1', 'gen2', 'gen3', 'pv', 'wind', 'unbalance', 'operation_cost', 'reward']
        if args.save_test_data:
            test_data_save_path = f'{args.cwd}/test_ram.pkl'
            with open(test_data_save_path, 'wb') as tf:
                pickle.dump(record, tf)

        '''compare with gurobi data and results'''
        if args.compare_with_gurobi:
            month = record['init_info'][0][0]
            day = record['init_info'][0][1]
            initial_soc = record['init_info'][0][3]
            base_result = optimization_base_result(env, month, day, initial_soc)
        if args.plot_on:
            from plotDRL import PlotArgs, make_dir, plot_evaluation_information, plot_optimization_result

            plot_args = PlotArgs()
            plot_args.feature_change = 'ram'
            args.cwd = agent_name
            plot_dir = make_dir(args.cwd, plot_args.feature_change)
            plot_optimization_result(base_result, plot_dir)
            plot_evaluation_information(args.cwd + '/' + 'test_ram.pkl', plot_dir)
        '''compare the costs obtained from gurobi and from PPO'''
        ratio = sum(eval_data['operation_cost']) / sum(base_result['step_cost'])
        print('rl_cost:', sum(eval_data['operation_cost']))
        print('gurobi_cost:', sum(base_result['step_cost']))
        print('ratio:', ratio)