"""PPO agent for the ESS environment whose policy actions are blended with pre-computed LLM actions."""
import json
import os
import pickle
from copy import deepcopy

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from data_manager import *
from environment import ESSEnv
from tools import optimization_base_result


def load_llm_actions(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data


class ActorPPO(nn.Module):
    def __init__(self, mid_dim, state_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
                                 nn.Linear(mid_dim, action_dim))
        # the logarithm (log) of the standard deviation (std) of the action; a trainable parameter
        self.a_logstd = nn.Parameter(torch.zeros((1, action_dim)) - 0.5, requires_grad=True)
        self.sqrt_2pi_log = np.log(np.sqrt(2 * np.pi))

    def forward(self, state):
        return self.net(state).tanh()  # tanh() limits the action output to (-1, 1)

    def get_action(self, state):
        a_avg = self.forward(state)
        a_std = self.a_logstd.exp()
        noise = torch.randn_like(a_avg)
        action = a_avg + noise * a_std  # sample from the Gaussian policy
        return action, noise

    def get_logprob_entropy(self, state, action):
        a_avg = self.forward(state)
        a_std = self.a_logstd.exp()
        delta = ((a_avg - action) / a_std).pow(2) * 0.5
        logprob = -(self.a_logstd + self.sqrt_2pi_log + delta).sum(1)  # new_logprob
        dist_entropy = (logprob.exp() * logprob).mean()  # policy entropy
        return logprob, dist_entropy

    def get_old_logprob(self, _action, noise):  # noise = (action - a_avg) / a_std, from get_action()
        delta = noise.pow(2) * 0.5
        return -(self.a_logstd + self.sqrt_2pi_log + delta).sum(1)  # old_logprob


class CriticAdv(nn.Module):
    def __init__(self, mid_dim, state_dim, _action_dim):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
                                 nn.Linear(mid_dim, 1))

    def forward(self, state):
        return self.net(state)  # advantage value
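
# Shape of the LLM action file (an assumption inferred from usage: entries are
# indexed by a flat hour-of-year index and blended component-wise with a 5-dim
# RL action):
#   data/results.json ≈ [[a1, a2, a3, a4, a5],   # hour 0
#                        [a1, a2, a3, a4, a5],   # hour 1
#                        ...]
# with each component expected to lie in the action range used by ESSEnv.
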
class AgentPPO:
    def __init__(self):
        super().__init__()
        self.state = None
        self.device = None
        self.action_dim = None
        self.get_obj_critic = None
        self.current_index = 0

        self.criterion = torch.nn.SmoothL1Loss()
        self.cri = self.cri_target = self.if_use_cri_target = self.cri_optim = self.ClassCri = None
        self.act = self.act_target = self.if_use_act_target = self.act_optim = self.ClassAct = None
        self.ClassCri = CriticAdv
        self.ClassAct = ActorPPO

        self.ratio_clip = 0.2  # ratio.clamp(1 - clip, 1 + clip)
        self.lambda_entropy = 0.02  # could be 0.01~0.05
        self.lambda_gae_adv = 0.98  # could be 0.95~0.99, GAE (Generalized Advantage Estimation, ICLR 2016)
        self.get_reward_sum = None  # self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw
        self.trajectory_list = None
        self.llm_actions = load_llm_actions('data/results.json')

    def init(self, net_dim, state_dim, action_dim, learning_rate=1e-4, if_use_gae=False, gpu_id=0):
        self.device = torch.device(f"cuda:{gpu_id}" if (torch.cuda.is_available() and (gpu_id >= 0)) else "cpu")
        self.trajectory_list = list()
        # choose whether to use GAE or the raw discounted return
        self.get_reward_sum = self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw

        self.cri = self.ClassCri(net_dim, state_dim, action_dim).to(self.device)
        self.act = self.ClassAct(net_dim, state_dim, action_dim).to(self.device) if self.ClassAct else self.cri
        self.cri_target = deepcopy(self.cri) if self.if_use_cri_target else self.cri
        self.act_target = deepcopy(self.act) if self.if_use_act_target else self.act

        self.cri_optim = torch.optim.Adam(self.cri.parameters(), learning_rate)
        self.act_optim = torch.optim.Adam(self.act.parameters(), learning_rate) if self.ClassAct else self.cri

    def select_action(self, state):
        states = torch.as_tensor((state,), dtype=torch.float32, device=self.device)
        actions, noises = self.act.get_action(states)
        return actions[0].detach().cpu().numpy(), noises[0].detach().cpu().numpy()

    def explore_env(self, env, target_step, index):
        state = self.state  # the agent's current state, handed to the rollout
        trajectory_temp = list()
        last_done = 0
        for i in range(target_step):
            action, noise = self.select_action(state)
            llm_action = self.llm_actions[index]
            # blend the RL action with the LLM-suggested action (95% RL, 5% LLM)
            action = [0.95 * action[j] + 0.05 * llm_action[j] for j in range(5)]
            state, next_state, reward, done = env.step(np.tanh(action))
            trajectory_temp.append((state, reward, done, action, noise))
            if done:
                state = env.reset()
                last_done = i
            else:
                state = next_state
        self.state = state

        '''splice the trajectory list'''
        # return the completed transitions for the update; carry the tail over to the next rollout
        trajectory_list = self.trajectory_list + trajectory_temp[:last_done + 1]
        self.trajectory_list = trajectory_temp[last_done:]
        return trajectory_list
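
    # Design note on explore_env: the LLM plan acts as a small residual prior,
    # a = 0.95 * a_RL + 0.05 * a_LLM, and this blended action (not the raw
    # policy output) is what gets stored in the trajectory and executed after
    # np.tanh(). Also note that the same llm_actions[index] entry is blended at
    # every one of the target_step transitions in a rollout.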
    def update_net(self, buffer, batch_size, repeat_times, soft_update_tau):
        """extract the buffered data and update the networks, in one place"""
        with torch.no_grad():
            buf_len = buffer[0].shape[0]
            # decompose buffer data
            buf_state, buf_action, buf_noise, buf_reward, buf_mask = [ten.to(self.device) for ten in buffer]

            '''get buf_r_sum, buf_logprob'''
            bs = 4096
            buf_value = [self.cri_target(buf_state[i:i + bs]) for i in range(0, buf_len, bs)]
            buf_value = torch.cat(buf_value, dim=0)
            buf_logprob = self.act.get_old_logprob(buf_action, buf_noise)

            buf_r_sum, buf_advantage = self.get_reward_sum(buf_len, buf_reward, buf_mask, buf_value)  # detach()
            # normalize the advantage
            buf_advantage = (buf_advantage - buf_advantage.mean()) / (buf_advantage.std() + 1e-5)
            del buf_noise, buffer[:]

        '''PPO: surrogate objective of Trust Region'''
        obj_critic = obj_actor = None
        for _ in range(int(buf_len / batch_size * repeat_times)):
            indices = torch.randint(buf_len, size=(batch_size,), requires_grad=False, device=self.device)

            state = buf_state[indices]
            action = buf_action[indices]
            r_sum = buf_r_sum[indices]
            logprob = buf_logprob[indices]
            advantage = buf_advantage[indices]

            new_logprob, obj_entropy = self.act.get_logprob_entropy(state, action)  # it's obj_actor
            ratio = (new_logprob - logprob.detach()).exp()
            surrogate1 = advantage * ratio
            surrogate2 = advantage * ratio.clamp(1 - self.ratio_clip, 1 + self.ratio_clip)
            obj_surrogate = -torch.min(surrogate1, surrogate2).mean()
            obj_actor = obj_surrogate + obj_entropy * self.lambda_entropy
            self.optim_update(self.act_optim, obj_actor)  # update the actor

            value = self.cri(state).squeeze(1)  # the critic predicts the reward-to-go (Q value) of the state
            # use SmoothL1Loss to evaluate the value loss
            # obj_critic = self.criterion(value, r_sum) / (r_sum.std() + 1e-6)
            obj_critic = self.criterion(value, r_sum)
            self.optim_update(self.cri_optim, obj_critic)  # back-propagate the value loss and update the critic
            # soft-update the target critic if one is in use
            self.soft_update(self.cri_target, self.cri, soft_update_tau) if self.cri_target is not self.cri else None

        # note: the actor's parameter is named a_logstd; the original looked up 'a_std_log' and always fell back to zeros
        a_std_log = getattr(self.act, 'a_logstd', torch.zeros(1))
        return obj_critic.item(), obj_actor.item(), a_std_log.mean().item()  # logging_tuple

    def get_reward_sum_raw(self, buf_len, buf_reward, buf_mask, buf_value) -> (torch.Tensor, torch.Tensor):
        buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device)  # reward sum
        pre_r_sum = 0
        for i in range(buf_len - 1, -1, -1):
            buf_r_sum[i] = buf_reward[i] + buf_mask[i] * pre_r_sum
            pre_r_sum = buf_r_sum[i]
        buf_advantage = buf_r_sum - (buf_mask * buf_value[:, 0])
        return buf_r_sum, buf_advantage

    def get_reward_sum_gae(self, buf_len, ten_reward, ten_mask, ten_value) -> (torch.Tensor, torch.Tensor):
        buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device)  # old policy value
        buf_advantage = torch.empty(buf_len, dtype=torch.float32, device=self.device)  # advantage value
        pre_r_sum = 0.0
        pre_advantage = 0.0  # advantage value of the previous step
        for i in range(buf_len - 1, -1, -1):
            buf_r_sum[i] = ten_reward[i] + ten_mask[i] * pre_r_sum
            pre_r_sum = buf_r_sum[i]
            buf_advantage[i] = ten_reward[i] + ten_mask[i] * (pre_advantage - ten_value[i])  # fix a bug here
            pre_advantage = ten_value[i] + buf_advantage[i] * self.lambda_gae_adv
        return buf_r_sum, buf_advantage

    @staticmethod
    def optim_update(optimizer, objective):
        optimizer.zero_grad()
        objective.backward()
        optimizer.step()

    @staticmethod
    def soft_update(target_net, current_net, tau):
        for tar, cur in zip(target_net.parameters(), current_net.parameters()):
            tar.data.copy_(cur.data * tau + tar.data * (1.0 - tau))
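
# A minimal sketch (not used by the pipeline) of the discount/mask convention
# shared by get_reward_sum_raw above and update_buffer below: each transition
# stores mask = (1 - done) * gamma, so the discounted reward-to-go is built
# backwards with one multiply-add per step. The function name and the toy
# numbers are illustrative only.
def _reward_to_go_demo(rewards=(1.0, 1.0, 1.0), dones=(0.0, 0.0, 1.0), gamma=0.995):
    masks = [(1.0 - d) * gamma for d in dones]
    r_sum, pre = [0.0] * len(rewards), 0.0
    for i in range(len(rewards) - 1, -1, -1):
        r_sum[i] = rewards[i] + masks[i] * pre  # same recursion as get_reward_sum_raw
        pre = r_sum[i]
    return r_sum  # ≈ [2.985, 1.995, 1.0] for the defaults

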
class Arguments:
    def __init__(self, agent=None, env=None):
        self.agent = agent  # the Deep Reinforcement Learning algorithm
        self.env = env  # the environment for training
        self.cwd = None  # current working directory; None means it is set automatically
        self.visible_gpu = '0'  # e.g. os.environ['CUDA_VISIBLE_DEVICES'] = '0,2'
        self.num_threads = 32  # CPU count for evaluating the model; torch.set_num_threads(self.num_threads)

        '''Arguments for training'''
        self.num_episode = 1000  # controls the number of training episodes for PPO
        self.gamma = 0.995  # discount factor of future rewards
        self.learning_rate = 1e-4  # 2e-4 / 6e-5 / 2 ** -4
        self.soft_update_tau = 2 ** -8  # 5e-3 / 4e-4 / 2 ** -8 / 1e-3
        self.net_dim = 256  # the network width
        self.batch_size = 4096  # number of transitions sampled from the replay buffer
        self.repeat_times = 2 ** 5  # collect target_step transitions, then update the network this many passes
        self.target_step = 4096  # repeatedly update the network to keep the critic's loss small
        self.max_memo = self.target_step  # capacity of the replay buffer
        self.if_gae_or_raw = True  # use GAE (Generalized Advantage Estimation) for on-policy sparse reward

        '''Arguments for evaluation'''
        self.random_seed = 1234  # the random seed is applied in self.init_before_training()
        # self.random_seed_list = [1234, 2234, 3234]
        self.random_seed_list = [3234]
        self.train = True
        self.save_network = True
        self.test_network = True
        self.save_test_data = True
        self.compare_with_gurobi = True
        self.plot_on = True

    def init_before_training(self):
        if self.cwd is None:
            agent_name = self.agent.__class__.__name__
            self.cwd = f'./{agent_name}'
        np.random.seed(self.random_seed)
        torch.manual_seed(self.random_seed)
        torch.set_num_threads(self.num_threads)
        torch.set_default_dtype(torch.float32)
        os.environ['CUDA_VISIBLE_DEVICES'] = str(self.visible_gpu)


def update_buffer(_trajectory):
    # note: `gamma` and `buffer` are globals defined in the __main__ block
    _trajectory = list(map(list, zip(*_trajectory)))  # 2D-list transpose: split the trajectory into 5 parts
    ten_state = torch.as_tensor(_trajectory[0], dtype=torch.float32)
    ten_reward = torch.as_tensor(_trajectory[1], dtype=torch.float32)
    # _trajectory[2] = done; replace done by mask to save memory
    ten_mask = (1.0 - torch.as_tensor(_trajectory[2], dtype=torch.float32)) * gamma
    ten_action = torch.as_tensor(_trajectory[3], dtype=torch.float32)
    ten_noise = torch.as_tensor(_trajectory[4], dtype=torch.float32)

    buffer[:] = (ten_state, ten_action, ten_noise, ten_reward, ten_mask)  # the list stores tensors

    _steps = ten_reward.shape[0]  # number of steps collected across all trajectories
    _r_exp = ten_reward.mean()  # the mean reward
    return _steps, _r_exp


def test_one_episode(env, act, device, index):
    """collect evaluation information; record the unbalance after each action is taken"""
    record_state = []
    record_action = []
    record_reward = []
    record_unbalance = []
    record_system_info = []  # [time, price, netload, action, real action, soc, output*4, unbalance, cost]
    record_init_info = []  # includes month, day, time, initial soc
    env.TRAIN = False
    state = env.reset()
    record_init_info.append([env.month, env.day, env.current_time, env.battery.current_capacity])
    print(f'current testing month is {env.month}, day is {env.day}, initial soc is {env.battery.current_capacity}')
    llm_actions = load_llm_actions('data/results.json')
    for i in range(24):
        s_tensor = torch.as_tensor((state,), device=device)
        a_tensor = act(s_tensor)
        rl_action = a_tensor.detach().cpu().numpy()[0]
        llm_action = llm_actions[index]
        action = [0.95 * rl_action[j] + 0.05 * llm_action[j] for j in range(5)]
        real_action = action
        state, next_state, reward, done = env.step(action)
        record_system_info.append([state[0], state[1], state[3] + env.wind.current_power, action, real_action,
                                   env.battery.SOC(), env.battery.energy_change, next_state[4], next_state[5],
                                   next_state[6], env.solar.current_power, env.wind.current_power,
                                   env.unbalance, env.operation_cost, reward])
        record_state.append(state)
        record_action.append(real_action)
        record_reward.append(reward)
        record_unbalance.append(env.unbalance)
        state = next_state
    # overwrite the last step with the final outputs: gen1, gen2, gen3, pv, wind (columns 7:12) and soc (column 5)
    record_system_info[-1][7:12] = [env.final_step_outputs[0], env.final_step_outputs[1], env.final_step_outputs[2],
                                    env.final_step_outputs[4], env.final_step_outputs[5]]
    record_system_info[-1][5] = env.final_step_outputs[3]
    record = {'init_info': record_init_info, 'system_info': record_system_info, 'state': record_state,
              'action': record_action, 'reward': record_reward, 'unbalance': record_unbalance}
    return record
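
# The flat LLM-action index used throughout this file maps (month, day, hour)
# to an hour-of-year offset. A hypothetical helper (not in the original code)
# capturing the formula used verbatim in the __main__ block below; it assumes
# Constant.MONTHS_LEN (from data_manager) lists the days of each month:
def hour_of_year_index(env):
    return (sum(Constant.MONTHS_LEN[:env.month - 1]) + env.day - 1) * 24 + env.current_time

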
def test_llm(env, index):
    record_system_info = []
    record_init_info = []
    env.TRAIN = False
    record_init_info.append([env.month, env.day, env.current_time, env.battery.current_capacity])
    llm_actions = load_llm_actions('data/results.json')
    cumulative_reward = 0
    for i in range(24):
        action = llm_actions[index + i]
        real_action = action
        state, next_state, reward, done = env.step(action)
        record_system_info.append([state[0], state[1], state[3] + env.wind.current_power, action, real_action,
                                   env.battery.SOC(), env.battery.energy_change, next_state[4], next_state[5],
                                   next_state[6], env.solar.current_power, env.wind.current_power,
                                   env.unbalance, env.operation_cost, reward])
        cumulative_reward += reward
        if done:
            break
    record_system_info[-1][7:12] = [env.final_step_outputs[0], env.final_step_outputs[1], env.final_step_outputs[2],
                                    env.final_step_outputs[4], env.final_step_outputs[5]]
    record_system_info[-1][5] = env.final_step_outputs[3]
    record = {'system_info': record_system_info, 'cumulative_reward': cumulative_reward}
    return record


def get_episode_return(env, act, device, index):
    episode_reward = 0.0
    episode_unbalance = 0.0
    state = env.reset()
    llm_actions = load_llm_actions('data/results.json')
    for i in range(24):
        s_tensor = torch.as_tensor((state,), device=device)
        a_tensor = act(s_tensor)
        rl_action = a_tensor.detach().cpu().numpy()[0]
        llm_action = llm_actions[index]
        action = [0.95 * rl_action[j] + 0.05 * llm_action[j] for j in range(5)]
        state, next_state, reward, done = env.step(action)
        state = next_state
        episode_reward += reward
        episode_unbalance += env.real_unbalance
        if done:
            break
    return episode_reward, episode_unbalance


if __name__ == '__main__':
    args = Arguments()
    reward_record = {'episode': [], 'steps': [], 'mean_episode_reward': [], 'unbalance': []}
    loss_record = {'episode': [], 'steps': [], 'critic_loss': [], 'actor_loss': [], 'entropy_loss': []}

    args.visible_gpu = '0,1'
    for seed in args.random_seed_list:
        args.random_seed = seed
        args.agent = AgentPPO()
        args.agent.cri_target = True
        agent_name = f'{args.agent.__class__.__name__}'
        args.env = ESSEnv()
        args.init_before_training()

        '''init agent and environment'''
        agent = args.agent
        env = args.env
        agent.init(args.net_dim, env.state_space.shape[0], env.action_space.shape[0],
                   args.learning_rate, args.if_gae_or_raw)

        cwd = args.cwd
        gamma = args.gamma
        batch_size = args.batch_size  # how much data is used per network update
        target_step = args.target_step  # how many steps to collect before one exploration round stops
        repeat_times = args.repeat_times  # how many update passes per collected batch
        soft_update_tau = args.soft_update_tau
        agent.state = env.reset()

        '''init buffer'''
        buffer = list()

        '''init training parameters'''
        num_episode = args.num_episode
        args.train = False
        args.save_network = False
        # args.test_network = False
        # args.save_test_data = False
        # args.compare_with_gurobi = False
        # args.plot_on = False
        if args.train:
            for i_episode in range(num_episode):
                with torch.no_grad():
                    index = (sum(Constant.MONTHS_LEN[:env.month - 1]) + env.day - 1) * 24 + env.current_time
                    trajectory_list = agent.explore_env(env, target_step, index)
                    steps, r_exp = update_buffer(trajectory_list)
                critic_loss, actor_loss, entropy_loss = agent.update_net(buffer, batch_size,
                                                                         repeat_times, soft_update_tau)
                loss_record['critic_loss'].append(critic_loss)
                loss_record['actor_loss'].append(actor_loss)
                loss_record['entropy_loss'].append(entropy_loss)
                with torch.no_grad():
                    episode_reward, episode_unbalance = get_episode_return(env, agent.act, agent.device, index)
                    reward_record['mean_episode_reward'].append(episode_reward)
                    reward_record['unbalance'].append(episode_unbalance)
                print(f'episode: {i_episode}, reward: {episode_reward}, unbalance: {episode_unbalance}')
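
        # With args.train forced to False above, the training loop is skipped;
        # the test phase below then relies on an actor checkpoint saved at
        # act_save_path by a previous run with args.train = True.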
        act_save_path = f'{args.cwd}/actor_llm_1015.pth'
        loss_record_path = f'{args.cwd}/loss_llm_1015.pkl'
        reward_record_path = f'{args.cwd}/reward_llm_1015.pkl'

        if args.save_network:
            with open(loss_record_path, 'wb') as tf:
                pickle.dump(loss_record, tf)
            with open(reward_record_path, 'wb') as tf:
                pickle.dump(reward_record, tf)
            torch.save(agent.act.state_dict(), act_save_path)
            print('actor parameters have been saved')

        if args.test_network:
            args.cwd = agent_name
            agent.act.load_state_dict(torch.load(act_save_path))
            print('parameters have been reloaded for testing')
            index = (sum(Constant.MONTHS_LEN[:env.month - 1]) + env.day - 1) * 24 + env.current_time
            record = test_one_episode(env, agent.act, agent.device, index)
            # re = test_llm(env, index)
            rl_data = pd.DataFrame(record['system_info'])
            # llm_data = pd.DataFrame(re['system_info'])
            rl_data.columns = ['time_step', 'price', 'netload', 'action', 'real_action', 'soc', 'battery',
                               'gen1', 'gen2', 'gen3', 'pv', 'wind', 'unbalance', 'operation_cost', 'reward']

        if args.save_test_data:
            test_data_save_path = f'{args.cwd}/test_llm_1015.pkl'
            # test_llm_save_path = f'{args.cwd}/test_only_llm_1015.pkl'
            with open(test_data_save_path, 'wb') as tf:
                pickle.dump(record, tf)
            # with open(test_llm_save_path, 'wb') as f:
            #     pickle.dump(re, f)

        '''compare with gurobi data and results'''
        if args.compare_with_gurobi:
            month = record['init_info'][0][0]
            day = record['init_info'][0][1]
            initial_soc = record['init_info'][0][3]
            base_result = optimization_base_result(env, month, day, initial_soc)
        if args.plot_on:
            from plotDRL import *
            plot_args = PlotArgs()
            plot_args.feature_change = 'llm_1015'
            args.cwd = agent_name
            plot_dir = make_dir(args.cwd, plot_args.feature_change)
            # plot_optimization_result(base_result, plot_dir)
            # plot_evaluation_information(args.cwd + '/' + 'test_llm_1015.pkl', plot_dir)
            plot_soc(base_result, args.cwd + '/' + 'test_llm_1015.pkl', plot_dir)
            plot_energy(base_result, args.cwd + '/' + 'test_llm_1015.pkl', plot_dir)

        '''compare the costs obtained from gurobi and from PPO'''
        print('rl_cost:', sum(rl_data['operation_cost']))
        print('gurobi_cost:', sum(base_result['step_cost']))
        print('ratio:', sum(rl_data['operation_cost']) / sum(base_result['step_cost']))
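
# Usage sketch (assumptions: the script is run from the repo root, and a
# checkpoint exists from a prior training run, since args.train is overridden
# to False above):
#   $ python <this_script>.py
# Requires data/results.json (the LLM action plan) and, for the test phase,
# ./AgentPPO/actor_llm_1015.pth saved by an earlier run with args.train = True.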