diff --git a/PPO_test.py b/PPO_test.py
deleted file mode 100644
index 763542e..0000000
--- a/PPO_test.py
+++ /dev/null
@@ -1,360 +0,0 @@
-import json
-import matplotlib.pyplot as plt
-from environment import ESSEnv
-import os
-import pickle
-from copy import deepcopy
-import numpy as np
-import pandas as pd
-import torch
-import torch.nn as nn
-
-os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
-
-
-class ActorPPO(nn.Module):
-    def __init__(self, mid_dim, state_dim, action_dim, layer_norm=False):
-        super().__init__()
-        self.layer_norm = layer_norm
-        self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
-                                 nn.Linear(mid_dim, mid_dim), nn.ReLU(),
-                                 nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
-                                 nn.Linear(mid_dim, action_dim))
-        # the logarithm (log) of standard deviation (std) of action, it is a trainable parameter
-        self.a_logstd = nn.Parameter(torch.zeros((1, action_dim)) - 0.5, requires_grad=True)
-        self.sqrt_2pi_log = np.log(np.sqrt(2 * np.pi))
-
-        if self.layer_norm:
-            self.apply_layer_norm()
-
-    def apply_layer_norm(self):
-        def init_weights(layer):
-            if isinstance(layer, nn.Linear):
-                nn.init.orthogonal_(layer.weight, 1.0)
-                nn.init.constant_(layer.bias, 0.0)
-
-        self.net.apply(init_weights)
-
-    def forward(self, state):
-        return self.net(state).tanh()  # action.tanh() limit the data output of action
-
-    def get_action(self, state):
-        a_avg = self.forward(state)  # too big for the action
-        a_std = self.a_logstd.exp()
-
-        noise = torch.randn_like(a_avg)
-        action = a_avg + noise * a_std
-        return action, noise
-
-    def get_logprob_entropy(self, state, action):
-        a_avg = self.forward(state)
-        a_std = self.a_logstd.exp()
-
-        delta = ((a_avg - action) / a_std).pow(2) * 0.5
-        logprob = -(self.a_logstd + self.sqrt_2pi_log + delta).sum(1)  # new_logprob
-
-        dist_entropy = (logprob.exp() * logprob).mean()  # policy entropy
-        return logprob, dist_entropy
-
-    def get_old_logprob(self, _action, noise):  # noise = action - a_noise
-        delta = noise.pow(2) * 0.5
-        return -(self.a_logstd + self.sqrt_2pi_log + delta).sum(1)  # old_logprob
-
-
-class CriticAdv(nn.Module):
-    def __init__(self, mid_dim, state_dim, _action_dim, layer_norm=False):
-        super().__init__()
-        self.layer_norm = layer_norm
-        self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
-                                 nn.Linear(mid_dim, mid_dim), nn.ReLU(),
-                                 nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
-                                 nn.Linear(mid_dim, 1))
-        if self.layer_norm:
-            self.apply_layer_norm()
-
-    def apply_layer_norm(self):
-        def init_weights(layer):
-            if isinstance(layer, nn.Linear):
-                nn.init.orthogonal_(layer.weight, 1.0)
-                nn.init.constant_(layer.bias, 0.0)
-
-        self.net.apply(init_weights)
-
-    def forward(self, state):
-        return self.net(state)  # Advantage value
-
-
-class AgentPPO:
-    def __init__(self):
-        super().__init__()
-        self.state = None
-        self.device = None
-        self.action_dim = None
-        self.get_obj_critic = None
-
-        self.criterion = torch.nn.SmoothL1Loss()
-        self.cri = self.cri_target = self.if_use_cri_target = self.cri_optim = self.ClassCri = None
-        self.act = self.act_target = self.if_use_act_target = self.act_optim = self.ClassAct = None
-
-        '''init modify'''
-        self.ClassCri = CriticAdv
-        self.ClassAct = ActorPPO
-
-        self.ratio_clip = 0.2  # ratio.clamp(1 - clip, 1 + clip)
-        self.lambda_entropy = 0.02  # could be 0.01~0.05
-        self.lambda_gae_adv = 0.98  # could be 0.95~0.99, GAE (Generalized Advantage Estimation. ICLR.2016.)
-        self.get_reward_sum = None  # self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw
-        self.trajectory_list = None
-
-    def init(self, net_dim, state_dim, action_dim, learning_rate=1e-4, if_use_gae=False, gpu_id=0, layer_norm=False):
-        self.device = torch.device(f"cuda:{gpu_id}" if (torch.cuda.is_available() and (gpu_id >= 0)) else "cpu")
-        self.trajectory_list = list()
-        # choose whether to use gae or not
-        self.get_reward_sum = self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw
-
-        self.cri = self.ClassCri(net_dim, state_dim, action_dim, layer_norm).to(self.device)
-        self.act = self.ClassAct(net_dim, state_dim, action_dim, layer_norm).to(
-            self.device) if self.ClassAct else self.cri
-        self.cri_target = deepcopy(self.cri) if self.if_use_cri_target else self.cri
-        self.act_target = deepcopy(self.act) if self.if_use_act_target else self.act
-
-        self.cri_optim = torch.optim.Adam(self.cri.parameters(), learning_rate)
-        self.act_optim = torch.optim.Adam(self.act.parameters(), learning_rate) if self.ClassAct else self.cri
-
-    def select_action(self, state):
-        states = torch.as_tensor((state,), dtype=torch.float32, device=self.device)
-        actions, noises = self.act.get_action(states)
-        return actions[0].detach().cpu().numpy(), noises[0].detach().cpu().numpy()
-
-    @staticmethod
-    def get_llm_action(index):
-        with open('data/llm_action.json', 'r') as file:
-            data = json.load(file)
-        data_tensor = torch.tensor(data, dtype=torch.float32)
-        normalized_index = index % len(data_tensor)
-        action = data_tensor[normalized_index].detach().cpu().numpy()
-        return action
-
-    def explore_env(self, env, target_step):
-        state = self.state  # sent state to agent and then agent sent state to method
-        trajectory_temp = list()
-        last_done = 0
-        for i in range(target_step):
-            # action = self.get_llm_action(i)
-            # noise = 0
-            action, noise = self.select_action(state)
-            state, next_state, reward, done, = env.step(np.tanh(action))  # make action between -1 & 1
-            trajectory_temp.append((state, reward, done, action, noise))
-            if done:
-                state = env.reset()
-                last_done = i
-            else:
-                state = next_state
-        self.state = state
-
-        '''splice list'''
-        # store 0 trajectory information to list
-        trajectory_list = self.trajectory_list + trajectory_temp[:last_done + 1]
-        self.trajectory_list = trajectory_temp[last_done:]
-        return trajectory_list
-
-    def update_net(self, buffer, batch_size, repeat_times, soft_update_tau):
-        """put data extract and update network together"""
-        with torch.no_grad():
-            buf_len = buffer[0].shape[0]
-            # decompose buffer data
-            buf_state, buf_action, buf_noise, buf_reward, buf_mask = [ten.to(self.device) for ten in buffer]
-
-            '''get buf_r_sum, buf_logprob'''
-            bs = 4096  # set a smaller 'BatchSize' when out of GPU memory: 1024, could change to 4096
-            buf_value = [self.cri_target(buf_state[i:i + bs]) for i in range(0, buf_len, bs)]
-            buf_value = torch.cat(buf_value, dim=0)
-            buf_logprob = self.act.get_old_logprob(buf_action, buf_noise)
-
-            buf_r_sum, buf_advantage = self.get_reward_sum(buf_len, buf_reward, buf_mask, buf_value)  # detach()
-            # normalize advantage
-            buf_advantage = (buf_advantage - buf_advantage.mean()) / (buf_advantage.std() + 1e-5)
-            del buf_noise, buffer[:]
-
-        '''PPO: Surrogate objective of Trust Region'''
-        obj_critic = obj_actor = None
-        for _ in range(int(buf_len / batch_size * repeat_times)):
-            indices = torch.randint(buf_len, size=(batch_size,), requires_grad=False, device=self.device)
-
-            state = buf_state[indices]
-            action = buf_action[indices]
-            r_sum = buf_r_sum[indices]
-            logprob = buf_logprob[indices]
-            advantage = buf_advantage[indices]
-
-            new_logprob, obj_entropy = self.act.get_logprob_entropy(state, action)  # it is obj_actor
-            ratio = (new_logprob - logprob.detach()).exp()
-            surrogate1 = advantage * ratio
-            surrogate2 = advantage * ratio.clamp(1 - self.ratio_clip, 1 + self.ratio_clip)
-            obj_surrogate = -torch.min(surrogate1, surrogate2).mean()
-            obj_actor = obj_surrogate + obj_entropy * self.lambda_entropy
-            self.optim_update(self.act_optim, obj_actor)  # update actor
-
-            value = self.cri(state).squeeze(1)  # critic network predicts the reward_sum (Q value) of state
-            # use smoothloss L1 to evaluate the value loss
-            # obj_critic = self.criterion(value, r_sum) / (r_sum.std() + 1e-6)
-            obj_critic = self.criterion(value, r_sum)
-            self.optim_update(self.cri_optim, obj_critic)  # calculate and update the back propagation of value loss
-            # choose whether to use soft update
-            self.soft_update(self.cri_target, self.cri, soft_update_tau) if self.cri_target is not self.cri else None
-
-        a_std_log = getattr(self.act, 'a_std_log', torch.zeros(1))
-        return obj_critic.item(), obj_actor.item(), a_std_log.mean().item()  # logging_tuple
-
-    def get_reward_sum_raw(self, buf_len, buf_reward, buf_mask, buf_value) -> (torch.Tensor, torch.Tensor):
-        buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device)  # reward sum
-
-        pre_r_sum = 0
-        for i in range(buf_len - 1, -1, -1):
-            buf_r_sum[i] = buf_reward[i] + buf_mask[i] * pre_r_sum
-            pre_r_sum = buf_r_sum[i]
-        buf_advantage = buf_r_sum - (buf_mask * buf_value[:, 0])
-        return buf_r_sum, buf_advantage
-
-    def get_reward_sum_gae(self, buf_len, ten_reward, ten_mask, ten_value) -> (torch.Tensor, torch.Tensor):
-        buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device)  # old policy value
-        buf_advantage = torch.empty(buf_len, dtype=torch.float32, device=self.device)  # advantage value
-
-        pre_r_sum = 0
-        pre_advantage = 0  # advantage value of previous step
-        for i in range(buf_len - 1, -1, -1):
-            buf_r_sum[i] = ten_reward[i] + ten_mask[i] * pre_r_sum
-            pre_r_sum = buf_r_sum[i]
-            buf_advantage[i] = ten_reward[i] + ten_mask[i] * (pre_advantage - ten_value[i])  # fix a bug here
-            pre_advantage = ten_value[i] + buf_advantage[i] * self.lambda_gae_adv
-        return buf_r_sum, buf_advantage
-
-    @staticmethod
-    def optim_update(optimizer, objective):
-        optimizer.zero_grad()
-        objective.backward()
-        optimizer.step()
-
-    @staticmethod
-    def soft_update(target_net, current_net, tau):
-        for tar, cur in zip(target_net.parameters(), current_net.parameters()):
-            tar.data.copy_(cur.data.__mul__(tau) + tar.data.__mul__(1.0 - tau))
-
-
-class Arguments:
-    def __init__(self, agent=None, env=None):
-        self.agent = agent  # Deep Reinforcement Learning algorithm
-        self.env = env  # the environment for training
-        self.cwd = None  # current work directory. None means set automatically
-        self.if_remove = False  # remove the cwd folder? (True, False, None: ask me)
-        self.visible_gpu = '0'  # for example: os.environ['CUDA_VISIBLE_DEVICES'] = '0, 2,'
-        # self.worker_num = 2  # rollout workers number per GPU (adjust it to get high GPU usage)
-        self.num_threads = 32  # cpu_num for evaluate model, torch.set_num_threads(self.num_threads)
-
-        '''Arguments for training'''
-        self.num_episode = 1000  # to control the train episodes for PPO
-        self.gamma = 0.995  # discount factor of future rewards
-        self.learning_rate = 2 ** -14  # 2e-4
-        self.soft_update_tau = 2 ** -8  # 2 ** -8 ~= 5e-3
-
-        self.net_dim = 256  # the network width
-        self.batch_size = 4096  # num of transitions sampled from replay buffer.
-        self.repeat_times = 2 ** 3  # collect target_step, then update network
-        self.target_step = 4096  # repeatedly update network to keep critic's loss small
-        self.max_memo = self.target_step  # capacity of replay buffer
-        self.if_per_or_gae = False  # GAE for on-policy sparse reward: Generalized Advantage Estimation.
-
-        '''Arguments for evaluate'''
-        self.random_seed = 0  # initialize random seed in self.init_before_training()
-        # self.random_seed_list = [1234, 2234, 3234, 4234, 5234]
-        self.random_seed_list = [1234]
-        self.train = True
-        self.save_network = True
-        self.test_network = True
-        self.save_test_data = True
-        self.compare_with_gurobi = True
-        self.plot_on = True
-
-    def init_before_training(self, if_main):
-        if self.cwd is None:
-            agent_name = self.agent.__class__.__name__
-            self.cwd = f'./{agent_name}'
-
-        if if_main:
-            import shutil  # remove history according to bool(if_remove)
-            if self.if_remove is None:
-                self.if_remove = bool(input(f"| PRESS 'y' to REMOVE: {self.cwd}? ") == 'y')
-            elif self.if_remove:
-                shutil.rmtree(self.cwd, ignore_errors=True)
-                print(f"| Remove cwd: {self.cwd}")
-            os.makedirs(self.cwd, exist_ok=True)
-
-        np.random.seed(self.random_seed)
-        torch.manual_seed(self.random_seed)
-        torch.set_num_threads(self.num_threads)
-        torch.set_default_dtype(torch.float32)
-
-        os.environ['CUDA_VISIBLE_DEVICES'] = str(self.visible_gpu)
-
-
-def update_buffer(_trajectory):
-    _trajectory = list(map(list, zip(*_trajectory)))  # 2D-list transpose, here cut the trajectory into 5 parts
-    ten_state = torch.as_tensor(_trajectory[0])  # tensor state here
-    ten_reward = torch.as_tensor(_trajectory[1], dtype=torch.float32)
-    # _trajectory[2] = done, replace done by mask, save memory
-    ten_mask = (1.0 - torch.as_tensor(_trajectory[2], dtype=torch.float32)) * gamma
-    ten_action = torch.as_tensor(_trajectory[3])
-    ten_noise = torch.as_tensor(_trajectory[4], dtype=torch.float32)
-
-    buffer[:] = (ten_state, ten_action, ten_noise, ten_reward, ten_mask)  # list store tensors
-
-    _steps = ten_reward.shape[0]  # how many steps are collected in all trajectories
-    _r_exp = ten_reward.mean()  # the mean reward
-    return _steps, _r_exp
-
-
-def load_actions_from_json(file_path):
-    with open(file_path, 'r') as file:
-        actions = json.load(file)
-    return actions
-
-
-def simulate_with_llm_actions(env, llm_actions):
-    states, actions, rewards, unbalances = [], [], [], []
-    state = env.reset()
-
-    for action in llm_actions:
-        next_state, reward, done, info = env.step(action)
-        states.append(state)
-        actions.append(action)
-        rewards.append(reward)
-        unbalances.append(info.get('unbalance', 0))
-        state = next_state
-        if done:
-            break
-
-    return states, actions, rewards, unbalances
-
-
-if __name__ == '__main__':
-    env = ESSEnv()
-    llm_actions = load_actions_from_json('data/llm_action.json')
-    states, actions, rewards, unbalances = simulate_with_llm_actions(env, llm_actions)
-
-    fig, ax1 = plt.subplots()
-
-    color = 'tab:blue'
-    ax1.set_xlabel('Step')
-    ax1.set_ylabel('Reward', color=color)
-    ax1.plot(rewards, color=color)
-    ax1.tick_params(axis='y', labelcolor=color)
-
-    ax2 = ax1.twinx()
-    color = 'tab:red'
-    ax2.set_ylabel('Unbalance', color=color)
-    ax2.plot(unbalances, color=color)
-    ax2.tick_params(axis='y', labelcolor=color)
-
-    fig.tight_layout()
-    plt.title('Rewards and Unbalance over Steps')
-    plt.show()