From 53d3ac9ca828eca346ce980fedd887acb3007219 Mon Sep 17 00:00:00 2001
From: chenxiaodong
Date: Wed, 19 Jun 2024 09:56:11 +0800
Subject: [PATCH] edit the layer normalization

---
 .idea/DRL-for-Energy-Systems.iml |  2 +-
 .idea/deployment.xml             | 22 +++++++++++
 .idea/misc.xml                   |  2 +-
 PPO.py                           | 64 ++++++++++++++++----------------
 tools.py                         |  1 -
 5 files changed, 57 insertions(+), 34 deletions(-)
 create mode 100644 .idea/deployment.xml

diff --git a/.idea/DRL-for-Energy-Systems.iml b/.idea/DRL-for-Energy-Systems.iml
index 0a5fd4b..ab54ddc 100644
--- a/.idea/DRL-for-Energy-Systems.iml
+++ b/.idea/DRL-for-Energy-Systems.iml
@@ -2,7 +2,7 @@
 
 
 
-
+
 
 
 
diff --git a/.idea/deployment.xml b/.idea/deployment.xml
new file mode 100644
index 0000000..9af79fb
--- /dev/null
+++ b/.idea/deployment.xml
@@ -0,0 +1,22 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 94f4964..9aaae3a 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,5 +3,5 @@
 
 
 
 
-
+
\ No newline at end of file
diff --git a/PPO.py b/PPO.py
index 35d8484..842b9d6 100644
--- a/PPO.py
+++ b/PPO.py
@@ -12,33 +12,34 @@
 os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
 script_name = os.path.basename(__file__)
 
-# after adding layer normalization, it doesn't work
 class ActorPPO(nn.Module):
     def __init__(self, mid_dim, state_dim, action_dim, layer_norm=False):
         super().__init__()
+        self.layer_norm = layer_norm
         self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
                                  nn.Linear(mid_dim, mid_dim), nn.ReLU(),
                                  nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
-                                 nn.Linear(mid_dim, action_dim), )
-
+                                 nn.Linear(mid_dim, action_dim))
         # the logarithm (log) of standard deviation (std) of action, it is a trainable parameter
         self.a_logstd = nn.Parameter(torch.zeros((1, action_dim)) - 0.5, requires_grad=True)
         self.sqrt_2pi_log = np.log(np.sqrt(2 * np.pi))
-        if layer_norm:
-            self.layer_norm(self.net)
 
-    @staticmethod
-    def layer_norm(layer, std=1.0, bias_const=0.0):
-        for i in layer:
-            if hasattr(i, 'weight'):
-                torch.nn.init.orthogonal_(i.weight, std)
-                torch.nn.init.constant_(i.bias, bias_const)
+        if self.layer_norm:
+            self.apply_layer_norm()
+
+    def apply_layer_norm(self):
+        def init_weights(layer):
+            if isinstance(layer, nn.Linear):
+                nn.init.orthogonal_(layer.weight, 1.0)
+                nn.init.constant_(layer.bias, 0.0)
+
+        self.net.apply(init_weights)
 
     def forward(self, state):
-        return self.net(state).tanh()  # action.tanh()  # in this way limit the data output of action
+        return self.net(state).tanh()  # action.tanh() limit the data output of action
 
     def get_action(self, state):
-        a_avg = self.net(state)  # too big for the action
+        a_avg = self.forward(state)  # too big for the action
         a_std = self.a_logstd.exp()
 
         noise = torch.randn_like(a_avg)
@@ -46,7 +47,7 @@ class ActorPPO(nn.Module):
         return action, noise
 
     def get_logprob_entropy(self, state, action):
-        a_avg = self.net(state)
+        a_avg = self.forward(state)
         a_std = self.a_logstd.exp()
 
         delta = ((a_avg - action) / a_std).pow(2) * 0.5
@@ -63,19 +64,21 @@ class ActorPPO(nn.Module):
 class CriticAdv(nn.Module):
     def __init__(self, mid_dim, state_dim, _action_dim, layer_norm=False):
         super().__init__()
+        self.layer_norm = layer_norm
         self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
                                  nn.Linear(mid_dim, mid_dim), nn.ReLU(),
                                  nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
                                  nn.Linear(mid_dim, 1))
-        if layer_norm:
-            self.layer_norm(self.net, std=1.0)
+        if self.layer_norm:
+            self.apply_layer_norm()
 
-    @staticmethod
-    def layer_norm(layer, std=1.0, bias_const=0.0):
-        for i in layer:
-            if hasattr(i, 'weight'):
-                torch.nn.init.orthogonal_(i.weight, std)
-                torch.nn.init.constant_(i.bias, bias_const)
+    def apply_layer_norm(self):
+        def init_weights(layer):
+            if isinstance(layer, nn.Linear):
+                nn.init.orthogonal_(layer.weight, 1.0)
+                nn.init.constant_(layer.bias, 0.0)
+
+        self.net.apply(init_weights)
 
     def forward(self, state):
         return self.net(state)  # Advantage value
@@ -116,7 +119,6 @@ class AgentPPO:
 
         self.cri_optim = torch.optim.Adam(self.cri.parameters(), learning_rate)
         self.act_optim = torch.optim.Adam(self.act.parameters(), learning_rate) if self.ClassAct else self.cri
-        del self.ClassCri, self.ClassAct  # why del self.ClassCri and self.ClassAct here, to save memory?
 
     def select_action(self, state):
         states = torch.as_tensor((state,), dtype=torch.float32, device=self.device)
@@ -129,8 +131,8 @@ class AgentPPO:
         last_done = 0
         for i in range(target_step):
             action, noise = self.select_action(state)
-            state, next_state, reward, done, = env.step(
-                np.tanh(action))  # the step of cut action is finally organized into the environment.
+            # the step of cut action is finally organized into the environment
+            state, next_state, reward, done, = env.step(np.tanh(action))
             trajectory_temp.append((state, reward, done, action, noise))
             if done:
                 state = env.reset()
@@ -140,8 +142,8 @@ class AgentPPO:
         self.state = state
 
         '''splice list'''
-        trajectory_list = self.trajectory_list + trajectory_temp[
-                                                 :last_done + 1]  # store 0 trajectory information to the list
+        # store 0 trajectory information to list
+        trajectory_list = self.trajectory_list + trajectory_temp[:last_done + 1]
         self.trajectory_list = trajectory_temp[last_done:]
         return trajectory_list
 
@@ -149,12 +151,12 @@ class AgentPPO:
         """put data extract and update network together"""
         with torch.no_grad():
             buf_len = buffer[0].shape[0]
-            buf_state, buf_action, buf_noise, buf_reward, buf_mask = [ten.to(self.device) for ten in
-                                                                      buffer]  # decompose buffer data
+            # decompose buffer data
+            buf_state, buf_action, buf_noise, buf_reward, buf_mask = [ten.to(self.device) for ten in buffer]
 
             '''get buf_r_sum, buf_logprob'''
             bs = 4096  # set a smaller 'BatchSize' when out of GPU memory: 1024, could change to 4096
-            buf_value = [self.cri_target(buf_state[i:i + bs]) for i in range(0, buf_len, bs)]  #
+            buf_value = [self.cri_target(buf_state[i:i + bs]) for i in range(0, buf_len, bs)]
             buf_value = torch.cat(buf_value, dim=0)
             buf_logprob = self.act.get_old_logprob(buf_action, buf_noise)
 
@@ -317,7 +319,7 @@ if __name__ == '__main__':
     agent = args.agent
     env = args.env
     agent.init(args.net_dim, env.state_space.shape[0], env.action_space.shape[0], args.learning_rate,
               args.if_per_or_gae)
-               args.if_per_or_gae)
+               args.if_per_or_gae, layer_norm=True)
 
     cwd = args.cwd
     gamma = args.gamma
diff --git a/tools.py b/tools.py
index 824e9c7..c3c303f 100644
--- a/tools.py
+++ b/tools.py
@@ -49,7 +49,6 @@ def optimization_base_result(env, month, day, initial_soc):
     # set system variables
     on_off = m.addVars(NUM_GEN, period, vtype=GRB.BINARY, name='on_off')
     gen_output = m.addVars(NUM_GEN, period, vtype=GRB.CONTINUOUS, name='output')
-    pv = m.addVars(period, vtype=GRB.CONTINUOUS, lb=0, name='pv')
     # set charging/discharging constraints
    battery_energy_change = m.addVars(period, vtype=GRB.CONTINUOUS, lb=env.battery.max_discharge,
                                       ub=env.battery.max_charge, name='battery_action')
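
For reference, a minimal, self-contained sketch of the initialization pattern the patch switches to: orthogonal weight initialization pushed onto every nn.Linear layer through nn.Module.apply(). The layer sizes, variable names, and the final check below are illustrative assumptions, not code taken from the repository.

import torch
import torch.nn as nn


def init_weights(layer, std=1.0, bias_const=0.0):
    # nn.Module.apply() calls this on every submodule; only Linear layers are initialized
    if isinstance(layer, nn.Linear):
        nn.init.orthogonal_(layer.weight, std)
        nn.init.constant_(layer.bias, bias_const)


# placeholder actor-style MLP (dimensions chosen only for illustration)
net = nn.Sequential(nn.Linear(8, 64), nn.ReLU(),
                    nn.Linear(64, 64), nn.ReLU(),
                    nn.Linear(64, 2))
net.apply(init_weights)  # recursively visits the Sequential and each child module

# a square orthogonally-initialized weight satisfies W @ W.T ~= I
print(torch.allclose(net[2].weight @ net[2].weight.T, torch.eye(64), atol=1e-5))

Compared with the removed static helper, apply() avoids iterating over the Sequential by hand, and the isinstance check skips the activation modules instead of relying on hasattr(i, 'weight').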