diff --git a/.idea/DRL-for-Energy-Systems.iml b/.idea/DRL-for-Energy-Systems.iml index ab54ddc..0a5fd4b 100644 --- a/.idea/DRL-for-Energy-Systems.iml +++ b/.idea/DRL-for-Energy-Systems.iml @@ -2,7 +2,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 9aaae3a..94f4964 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,5 +3,5 @@ - + \ No newline at end of file diff --git a/AgentPPO/loss_data.pkl b/AgentPPO/loss_data.pkl index 0a336c8..ed9087b 100644 Binary files a/AgentPPO/loss_data.pkl and b/AgentPPO/loss_data.pkl differ diff --git a/AgentPPO/reward_data.pkl b/AgentPPO/reward_data.pkl index 2e333f7..4ea7c10 100644 Binary files a/AgentPPO/reward_data.pkl and b/AgentPPO/reward_data.pkl differ diff --git a/AgentPPO/test_data.pkl b/AgentPPO/test_data.pkl index 1dfc983..32167b4 100644 Binary files a/AgentPPO/test_data.pkl and b/AgentPPO/test_data.pkl differ diff --git a/PPO.py b/PPO.py index 33986ad..2fd8f0e 100644 --- a/PPO.py +++ b/PPO.py @@ -331,11 +331,12 @@ if __name__ == '__main__': buffer = list() '''init training parameters''' num_episode = args.num_episode - # args.train=False - # args.save_network=False - # args.test_network=False - # args.save_test_data=False - # args.compare_with_gurobi=False + # args.train = False + # args.save_network = False + # args.test_network = False + # args.save_test_data = False + # args.compare_with_gurobi = False + # args.plot_on = False if args.train: for i_episode in range(num_episode): with torch.no_grad(): diff --git a/PPO_llm.py b/PPO_llm.py index cef9381..419afb2 100644 --- a/PPO_llm.py +++ b/PPO_llm.py @@ -140,9 +140,9 @@ class AgentPPO: trajectory_temp = list() last_done = 0 for i in range(target_step): - action = self.get_llm_action(i) - noise = 0 - # action, noise = self.select_action(state) + # action = self.get_llm_action(i) + # noise = 0 + action, noise = self.select_action(state) state, next_state, reward, done, = env.step(np.tanh(action)) # make action between -1 & 1 trajectory_temp.append((state, reward, done, action, noise)) if done: diff --git a/PPO_test.py b/PPO_test.py new file mode 100644 index 0000000..763542e --- /dev/null +++ b/PPO_test.py @@ -0,0 +1,360 @@ +import json +import matplotlib.pyplot as plt +from environment import ESSEnv +import os +import pickle +from copy import deepcopy +import numpy as np +import pandas as pd +import torch +import torch.nn as nn + +os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE' + + +class ActorPPO(nn.Module): + def __init__(self, mid_dim, state_dim, action_dim, layer_norm=False): + super().__init__() + self.layer_norm = layer_norm + self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(), + nn.Linear(mid_dim, mid_dim), nn.ReLU(), + nn.Linear(mid_dim, mid_dim), nn.Hardswish(), + nn.Linear(mid_dim, action_dim)) + # the logarithm (log) of standard deviation (std) of action, it is a trainable parameter + self.a_logstd = nn.Parameter(torch.zeros((1, action_dim)) - 0.5, requires_grad=True) + self.sqrt_2pi_log = np.log(np.sqrt(2 * np.pi)) + + if self.layer_norm: + self.apply_layer_norm() + + def apply_layer_norm(self): + def init_weights(layer): + if isinstance(layer, nn.Linear): + nn.init.orthogonal_(layer.weight, 1.0) + nn.init.constant_(layer.bias, 0.0) + + self.net.apply(init_weights) + + def forward(self, state): + return self.net(state).tanh() # action.tanh() limit the data output of action + + def get_action(self, state): + a_avg = self.forward(state) # too big for the action + a_std = self.a_logstd.exp() + + noise = torch.randn_like(a_avg) + action = a_avg + noise * a_std + 
return action, noise + + def get_logprob_entropy(self, state, action): + a_avg = self.forward(state) + a_std = self.a_logstd.exp() + + delta = ((a_avg - action) / a_std).pow(2) * 0.5 + logprob = -(self.a_logstd + self.sqrt_2pi_log + delta).sum(1) # new_logprob + + dist_entropy = (logprob.exp() * logprob).mean() # policy entropy + return logprob, dist_entropy + + def get_old_logprob(self, _action, noise): # noise = action - a_noise + delta = noise.pow(2) * 0.5 + return -(self.a_logstd + self.sqrt_2pi_log + delta).sum(1) # old_logprob + + +class CriticAdv(nn.Module): + def __init__(self, mid_dim, state_dim, _action_dim, layer_norm=False): + super().__init__() + self.layer_norm = layer_norm + self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(), + nn.Linear(mid_dim, mid_dim), nn.ReLU(), + nn.Linear(mid_dim, mid_dim), nn.Hardswish(), + nn.Linear(mid_dim, 1)) + if self.layer_norm: + self.apply_layer_norm() + + def apply_layer_norm(self): + def init_weights(layer): + if isinstance(layer, nn.Linear): + nn.init.orthogonal_(layer.weight, 1.0) + nn.init.constant_(layer.bias, 0.0) + + self.net.apply(init_weights) + + def forward(self, state): + return self.net(state) # Advantage value + + +class AgentPPO: + def __init__(self): + super().__init__() + self.state = None + self.device = None + self.action_dim = None + self.get_obj_critic = None + + self.criterion = torch.nn.SmoothL1Loss() + self.cri = self.cri_target = self.if_use_cri_target = self.cri_optim = self.ClassCri = None + self.act = self.act_target = self.if_use_act_target = self.act_optim = self.ClassAct = None + + '''init modify''' + self.ClassCri = CriticAdv + self.ClassAct = ActorPPO + + self.ratio_clip = 0.2 # ratio.clamp(1 - clip, 1 + clip) + self.lambda_entropy = 0.02 # could be 0.01~0.05 + self.lambda_gae_adv = 0.98 # could be 0.95~0.99, GAE (Generalized Advantage Estimation. ICLR.2016.) 
+ self.get_reward_sum = None # self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw + self.trajectory_list = None + + def init(self, net_dim, state_dim, action_dim, learning_rate=1e-4, if_use_gae=False, gpu_id=0, layer_norm=False): + self.device = torch.device(f"cuda:{gpu_id}" if (torch.cuda.is_available() and (gpu_id >= 0)) else "cpu") + self.trajectory_list = list() + # choose whether to use gae or not + self.get_reward_sum = self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw + + self.cri = self.ClassCri(net_dim, state_dim, action_dim, layer_norm).to(self.device) + self.act = self.ClassAct(net_dim, state_dim, action_dim, layer_norm).to( + self.device) if self.ClassAct else self.cri + self.cri_target = deepcopy(self.cri) if self.if_use_cri_target else self.cri + self.act_target = deepcopy(self.act) if self.if_use_act_target else self.act + + self.cri_optim = torch.optim.Adam(self.cri.parameters(), learning_rate) + self.act_optim = torch.optim.Adam(self.act.parameters(), learning_rate) if self.ClassAct else self.cri + + def select_action(self, state): + states = torch.as_tensor((state,), dtype=torch.float32, device=self.device) + actions, noises = self.act.get_action(states) + return actions[0].detach().cpu().numpy(), noises[0].detach().cpu().numpy() + + @staticmethod + def get_llm_action(index): + with open('data/llm_action.json', 'r') as file: + data = json.load(file) + data_tensor = torch.tensor(data, dtype=torch.float32) + normalized_index = index % len(data_tensor) + action = data_tensor[normalized_index].detach().cpu().numpy() + return action + + def explore_env(self, env, target_step): + state = self.state # sent state to agent and then agent sent state to method + trajectory_temp = list() + last_done = 0 + for i in range(target_step): + # action = self.get_llm_action(i) + # noise = 0 + action, noise = self.select_action(state) + state, next_state, reward, done, = env.step(np.tanh(action)) # make action between -1 & 1 + trajectory_temp.append((state, reward, done, action, noise)) + if done: + state = env.reset() + last_done = i + else: + state = next_state + self.state = state + + '''splice list''' + # store 0 trajectory information to list + trajectory_list = self.trajectory_list + trajectory_temp[:last_done + 1] + self.trajectory_list = trajectory_temp[last_done:] + return trajectory_list + + def update_net(self, buffer, batch_size, repeat_times, soft_update_tau): + """put data extract and update network together""" + with torch.no_grad(): + buf_len = buffer[0].shape[0] + # decompose buffer data + buf_state, buf_action, buf_noise, buf_reward, buf_mask = [ten.to(self.device) for ten in buffer] + + '''get buf_r_sum, buf_logprob''' + bs = 4096 # set a smaller 'BatchSize' when out of GPU memory: 1024, could change to 4096 + buf_value = [self.cri_target(buf_state[i:i + bs]) for i in range(0, buf_len, bs)] + buf_value = torch.cat(buf_value, dim=0) + buf_logprob = self.act.get_old_logprob(buf_action, buf_noise) + + buf_r_sum, buf_advantage = self.get_reward_sum(buf_len, buf_reward, buf_mask, buf_value) # detach() + # normalize advantage + buf_advantage = (buf_advantage - buf_advantage.mean()) / (buf_advantage.std() + 1e-5) + del buf_noise, buffer[:] + + '''PPO: Surrogate objective of Trust Region''' + obj_critic = obj_actor = None + for _ in range(int(buf_len / batch_size * repeat_times)): + indices = torch.randint(buf_len, size=(batch_size,), requires_grad=False, device=self.device) + + state = buf_state[indices] + action = buf_action[indices] + r_sum = 
buf_r_sum[indices] + logprob = buf_logprob[indices] + advantage = buf_advantage[indices] + + new_logprob, obj_entropy = self.act.get_logprob_entropy(state, action) # it is obj_actor + ratio = (new_logprob - logprob.detach()).exp() + surrogate1 = advantage * ratio + surrogate2 = advantage * ratio.clamp(1 - self.ratio_clip, 1 + self.ratio_clip) + obj_surrogate = -torch.min(surrogate1, surrogate2).mean() + obj_actor = obj_surrogate + obj_entropy * self.lambda_entropy + self.optim_update(self.act_optim, obj_actor) # update actor + + value = self.cri(state).squeeze(1) # critic network predicts the reward_sum (Q value) of state + # use smoothloss L1 to evaluate the value loss + # obj_critic = self.criterion(value, r_sum) / (r_sum.std() + 1e-6) + obj_critic = self.criterion(value, r_sum) + self.optim_update(self.cri_optim, obj_critic) # calculate and update the back propogation of value loss + # choose whether to use soft update + self.soft_update(self.cri_target, self.cri, soft_update_tau) if self.cri_target is not self.cri else None + + a_std_log = getattr(self.act, 'a_std_log', torch.zeros(1)) + return obj_critic.item(), obj_actor.item(), a_std_log.mean().item() # logging_tuple + + def get_reward_sum_raw(self, buf_len, buf_reward, buf_mask, buf_value) -> (torch.Tensor, torch.Tensor): + buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device) # reward sum + + pre_r_sum = 0 + for i in range(buf_len - 1, -1, -1): + buf_r_sum[i] = buf_reward[i] + buf_mask[i] * pre_r_sum + pre_r_sum = buf_r_sum[i] + buf_advantage = buf_r_sum - (buf_mask * buf_value[:, 0]) + return buf_r_sum, buf_advantage + + def get_reward_sum_gae(self, buf_len, ten_reward, ten_mask, ten_value) -> (torch.Tensor, torch.Tensor): + buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device) # old policy value + buf_advantage = torch.empty(buf_len, dtype=torch.float32, device=self.device) # advantage value + + pre_r_sum = 0 + pre_advantage = 0 # advantage value of previous step + for i in range(buf_len - 1, -1, -1): + buf_r_sum[i] = ten_reward[i] + ten_mask[i] * pre_r_sum + pre_r_sum = buf_r_sum[i] + buf_advantage[i] = ten_reward[i] + ten_mask[i] * (pre_advantage - ten_value[i]) # fix a bug here + pre_advantage = ten_value[i] + buf_advantage[i] * self.lambda_gae_adv + return buf_r_sum, buf_advantage + + @staticmethod + def optim_update(optimizer, objective): + optimizer.zero_grad() + objective.backward() + optimizer.step() + + @staticmethod + def soft_update(target_net, current_net, tau): + for tar, cur in zip(target_net.parameters(), current_net.parameters()): + tar.data.copy_(cur.data.__mul__(tau) + tar.data.__mul__(1.0 - tau)) + + +class Arguments: + def __init__(self, agent=None, env=None): + self.agent = agent # Deep Reinforcement Learning algorithm + self.env = env # the environment for training + self.cwd = None # current work directory. None means set automatically + self.if_remove = False # remove the cwd folder? 
(True, False, None:ask me) + self.visible_gpu = '0' # for example: os.environ['CUDA_VISIBLE_DEVICES'] = '0, 2,' + # self.worker_num = 2 # rollout workers number pre GPU (adjust it to get high GPU usage) + self.num_threads = 32 # cpu_num for evaluate model, torch.set_num_threads(self.num_threads) + + '''Arguments for training''' + self.num_episode = 1000 # to control the train episodes for PPO + self.gamma = 0.995 # discount factor of future rewards + self.learning_rate = 2 ** -14 # 2e-4 + self.soft_update_tau = 2 ** -8 # 2 ** -8 ~= 5e-3 + + self.net_dim = 256 # the network width + self.batch_size = 4096 # num of transitions sampled from replay buffer. + self.repeat_times = 2 ** 3 # collect target_step, then update network + self.target_step = 4096 # repeatedly update network to keep critic's loss small + self.max_memo = self.target_step # capacity of replay buffer + self.if_per_or_gae = False # GAE for on-policy sparse reward: Generalized Advantage Estimation. + + '''Arguments for evaluate''' + self.random_seed = 0 # initialize random seed in self.init_before_training() + # self.random_seed_list = [1234, 2234, 3234, 4234, 5234] + self.random_seed_list = [1234] + self.train = True + self.save_network = True + self.test_network = True + self.save_test_data = True + self.compare_with_gurobi = True + self.plot_on = True + + def init_before_training(self, if_main): + if self.cwd is None: + agent_name = self.agent.__class__.__name__ + self.cwd = f'./{agent_name}' + + if if_main: + import shutil # remove history according to bool(if_remove) + if self.if_remove is None: + self.if_remove = bool(input(f"| PRESS 'y' to REMOVE: {self.cwd}? ") == 'y') + elif self.if_remove: + shutil.rmtree(self.cwd, ignore_errors=True) + print(f"| Remove cwd: {self.cwd}") + os.makedirs(self.cwd, exist_ok=True) + + np.random.seed(self.random_seed) + torch.manual_seed(self.random_seed) + torch.set_num_threads(self.num_threads) + torch.set_default_dtype(torch.float32) + + os.environ['CUDA_VISIBLE_DEVICES'] = str(self.visible_gpu) + + +def update_buffer(_trajectory): + _trajectory = list(map(list, zip(*_trajectory))) # 2D-list transpose, here cut the trajectory into 5 parts + ten_state = torch.as_tensor(_trajectory[0]) # tensor state here + ten_reward = torch.as_tensor(_trajectory[1], dtype=torch.float32) + # _trajectory[2] = done, replace done by mask, save memory + ten_mask = (1.0 - torch.as_tensor(_trajectory[2], dtype=torch.float32)) * gamma + ten_action = torch.as_tensor(_trajectory[3]) + ten_noise = torch.as_tensor(_trajectory[4], dtype=torch.float32) + + buffer[:] = (ten_state, ten_action, ten_noise, ten_reward, ten_mask) # list store tensors + + _steps = ten_reward.shape[0] # how many steps are collected in all trajectories + _r_exp = ten_reward.mean() # the mean reward + return _steps, _r_exp + + +def load_actions_from_json(file_path): + with open(file_path, 'r') as file: + actions = json.load(file) + return actions + + +def simulate_with_llm_actions(env, llm_actions): + states, actions, rewards, unbalances = [], [], [], [] + state = env.reset() + + for action in llm_actions: + next_state, reward, done, info = env.step(action) + states.append(state) + actions.append(action) + rewards.append(reward) + unbalances.append(info.get('unbalance', 0)) + state = next_state + if done: + break + + return states, actions, rewards, unbalances + + +if __name__ == '__main__': + env = ESSEnv() + llm_actions = load_actions_from_json('data/llm_action.json') + states, actions, rewards, unbalances = simulate_with_llm_actions(env, 
llm_actions) + + fig, ax1 = plt.subplots() + + color = 'tab:blue' + ax1.set_xlabel('Step') + ax1.set_ylabel('Reward', color=color) + ax1.plot(rewards, color=color) + ax1.tick_params(axis='y', labelcolor=color) + + ax2 = ax1.twinx() + color = 'tab:red' + ax2.set_ylabel('Unbalance', color=color) + ax2.plot(unbalances, color=color) + ax2.tick_params(axis='y', labelcolor=color) + + fig.tight_layout() + plt.title('Rewards and Unbalance over Steps') + plt.show() diff --git a/test.py b/test.py index 359def5..fd259a8 100644 --- a/test.py +++ b/test.py @@ -39,17 +39,22 @@ # data = pickle.load(f) # # print(data) -import json -import numpy as np +# import json +# import numpy as np +# +# +# def get_llm_action(index) -> np.ndarray: +# with open('data/llm_action.json', 'r') as file: +# data = json.load(file) +# normalized_index = index % len(data) +# action = np.array(data[normalized_index]) +# return action +# +# data = get_llm_action(2) +# print(data) +from environment import ESSEnv - -def get_llm_action(index) -> np.ndarray: - with open('data/llm_action.json', 'r') as file: - data = json.load(file) - normalized_index = index % len(data) - action = np.array(data[normalized_index]) - return action - - -data = get_llm_action(2) -print(data) +env = ESSEnv() +wind_speed = env.data_manager.get_series_wind_data(1, 1) +wind = [env.wind.step(ws) for ws in wind_speed] +print(wind) diff --git a/tools.py b/tools.py index a4e4926..7e8c17a 100644 --- a/tools.py +++ b/tools.py @@ -7,7 +7,7 @@ import torch from gurobipy import GRB -def optimization_base_result(env, month, day, initial_soc): +def optimization_base_result(env,month,day,initial_soc): price = env.data_manager.get_series_price_data(month, day) load = env.data_manager.get_series_load_cons_data(month, day) temperature = env.data_manager.get_series_temperature_data(month, day) @@ -50,7 +50,7 @@ def optimization_base_result(env, month, day, initial_soc): on_off = m.addVars(NUM_GEN, period, vtype=GRB.BINARY, name='on_off') gen_output = m.addVars(NUM_GEN, period, vtype=GRB.CONTINUOUS, name='output') # 设置充放电约束 - battery_energy_change = m.addVars(period, vtype=GRB.CONTINUOUS, lb=env.battery.max_discharge, + battery_energy_change = m.addVars(period, vtype=GRB.CONTINUOUS, lb=-env.battery.max_charge, ub=env.battery.max_charge, name='battery_action') # 设置外部电网与能源系统交换约束 grid_energy_import = m.addVars(period, vtype=GRB.CONTINUOUS, lb=0, ub=env.grid.exchange_ability, name='import') @@ -86,8 +86,8 @@ def optimization_base_result(env, month, day, initial_soc): m.setObjective((cost_gen + cost_grid_import - cost_grid_export), GRB.MINIMIZE) m.optimize() - output_record = {'pv': [], 'temperature': [], 'irradiance': [], 'wind_speed': [], 'price': [], 'load': [], - 'netload': [], 'soc': [], 'battery_energy_change': [], 'grid_import': [], 'grid_export': [], + output_record = {'pv': [], 'wind': [], 'price': [], 'load': [], 'netload': [], + 'soc': [], 'battery_energy_change': [], 'grid_import': [], 'grid_export': [], 'gen1': [], 'gen2': [], 'gen3': [], 'step_cost': []} for t in range(period): gen_cost = sum((on_off[g, t].x * ( @@ -95,13 +95,11 @@ def optimization_base_result(env, month, day, initial_soc): for g in range(NUM_GEN)) grid_import_cost = grid_energy_import[t].x * price[t] grid_export_cost = grid_energy_export[t].x * price[t] * env.sell_coefficient - output_record['pv'].append(pv[t].x) - output_record['temperature'].append(temperature[t]) - output_record['irradiance'].append(irradiance[t]) - output_record['wind_speed'].append(wind[t]) + output_record['pv'].append(pv[t]) 
+ output_record['wind'].append(wind[t]) output_record['price'].append(price[t]) output_record['load'].append(load[t]) - output_record['netload'].append(load[t] - pv[t].x) + output_record['netload'].append(load[t] - pv[t]) output_record['soc'].append(soc[t].x) output_record['battery_energy_change'].append(battery_energy_change[t].x) output_record['grid_import'].append(grid_energy_import[t].x) @@ -110,7 +108,6 @@ def optimization_base_result(env, month, day, initial_soc): output_record['gen2'].append(gen_output[1, t].x) output_record['gen3'].append(gen_output[2, t].x) output_record['step_cost'].append(gen_cost + grid_import_cost - grid_export_cost) - output_record_df = pd.DataFrame.from_dict(output_record) return output_record_df diff --git a/tools备份.py b/tools备份.py new file mode 100644 index 0000000..99ed610 --- /dev/null +++ b/tools备份.py @@ -0,0 +1,285 @@ +import os +import gurobipy as gp +import numpy as np +import numpy.random as rd +import pandas as pd +import torch +from gurobipy import GRB + + +def optimization_base_result(env, month, day, initial_soc): + price = env.data_manager.get_series_price_data(month, day) + load = env.data_manager.get_series_load_cons_data(month, day) + temperature = env.data_manager.get_series_temperature_data(month, day) + irradiance = env.data_manager.get_series_irradiance_data(month, day) + wind_speed = env.data_manager.get_series_wind_data(month, day) + + pv = [env.solar.step(temp, irr) for temp, irr in zip(temperature, irradiance)] + wind = [env.wind.step(ws) for ws in wind_speed] + + period = env.episode_length + DG_parameters = env.dg_parameters + + def get_dg_info(parameters): + p_max = [] + p_min = [] + ramping_up = [] + ramping_down = [] + a_para = [] + b_para = [] + c_para = [] + + for name, gen_info in parameters.items(): + p_max.append(gen_info['power_output_max']) + p_min.append(gen_info['power_output_min']) + ramping_up.append(gen_info['ramping_up']) + ramping_down.append(gen_info['ramping_down']) + a_para.append(gen_info['a']) + b_para.append(gen_info['b']) + c_para.append(gen_info['c']) + return p_max, p_min, ramping_up, ramping_down, a_para, b_para, c_para + + p_max, p_min, ramping_up, ramping_down, a_para, b_para, c_para = get_dg_info(parameters=DG_parameters) + NUM_GEN = len(DG_parameters.keys()) + battery_capacity = env.battery.capacity + battery_efficiency = env.battery.efficiency + + m = gp.Model("UC") + m.setParam('OutputFlag', 1) + + # 设置系统变量 + on_off = m.addVars(NUM_GEN, period, vtype=GRB.BINARY, name='on_off') + gen_output = m.addVars(NUM_GEN, period, vtype=GRB.CONTINUOUS, name='output') + # 设置充放电约束 + battery_energy_change = m.addVars(period, vtype=GRB.CONTINUOUS, lb=env.battery.max_discharge, + ub=env.battery.max_charge, name='battery_action') + # 设置外部电网与能源系统交换约束 + grid_energy_import = m.addVars(period, vtype=GRB.CONTINUOUS, lb=0, ub=env.grid.exchange_ability, name='import') + grid_energy_export = m.addVars(period, vtype=GRB.CONTINUOUS, lb=0, ub=env.grid.exchange_ability, name='export') + soc = m.addVars(period, vtype=GRB.CONTINUOUS, lb=0.2, ub=0.8, name='SOC') + + # 1. 添加平衡约束 + m.addConstrs(((sum(gen_output[g, t] for g in range(NUM_GEN)) + pv[t] + wind[t] + grid_energy_import[t] >= load[t] + + battery_energy_change[t] + grid_energy_export[t]) for t in range(period)), name='powerbalance') + # 2. 
添加发电机最大/最小功率约束 + m.addConstrs((gen_output[g, t] <= on_off[g, t] * p_max[g] for g in range(NUM_GEN) for t in range(period)), + 'gen_output_max') + m.addConstrs((gen_output[g, t] >= on_off[g, t] * p_min[g] for g in range(NUM_GEN) for t in range(period)), + 'gen_output_min') + # 3. 添加上升和下降约束 + m.addConstrs((gen_output[g, t + 1] - gen_output[g, t] <= ramping_up[g] for g in range(NUM_GEN) + for t in range(period - 1)), 'ramping_up') + m.addConstrs((gen_output[g, t] - gen_output[g, t + 1] <= ramping_down[g] for g in range(NUM_GEN) + for t in range(period - 1)), 'ramping_down') + # 4. 添加电池容量约束 + m.addConstr(battery_capacity * soc[0] == battery_capacity * initial_soc + + (battery_energy_change[0] * battery_efficiency), name='soc0') + m.addConstrs((battery_capacity * soc[t] == battery_capacity * soc[t - 1] + + (battery_energy_change[t] * battery_efficiency) for t in range(1, period)), name='soc update') + # 设置成本函数 + # 发电机成本 + cost_gen = gp.quicksum( + (a_para[g] * gen_output[g, t] * gen_output[g, t] + b_para[g] * gen_output[g, t] + c_para[g] * on_off[g, t]) for + t in range(period) for g in range(NUM_GEN)) + cost_grid_import = gp.quicksum(grid_energy_import[t] * price[t] for t in range(period)) + cost_grid_export = gp.quicksum(grid_energy_export[t] * price[t] * env.sell_coefficient for t in range(period)) + + m.setObjective((cost_gen + cost_grid_import - cost_grid_export), GRB.MINIMIZE) + m.optimize() + + m.computeIIS() + + output_record = {'pv': [], 'wind': [], 'price': [], 'load': [], + 'netload': [], 'soc': [], 'battery_energy_change': [], 'grid_import': [], 'grid_export': [], + 'gen1': [], 'gen2': [], 'gen3': [], 'step_cost': []} + for t in range(period): + gen_cost = sum((on_off[g, t].x * ( + a_para[g] * gen_output[g, t].x * gen_output[g, t].x + b_para[g] * gen_output[g, t].x + c_para[g])) + for g in range(NUM_GEN)) + grid_import_cost = grid_energy_import[t].x * price[t] + grid_export_cost = grid_energy_export[t].x * price[t] * env.sell_coefficient + output_record['pv'].append(pv[t]) + # output_record['temperature'].append(temperature[t]) + # output_record['irradiance'].append(irradiance[t]) + output_record['wind'].append(wind[t]) + output_record['price'].append(price[t]) + output_record['load'].append(load[t]) + output_record['netload'].append(load[t] - pv[t]) + output_record['soc'].append(soc[t].x) + output_record['battery_energy_change'].append(battery_energy_change[t].x) + output_record['grid_import'].append(grid_energy_import[t].x) + output_record['grid_export'].append(grid_energy_export[t].x) + output_record['gen1'].append(gen_output[0, t].x) + output_record['gen2'].append(gen_output[1, t].x) + output_record['gen3'].append(gen_output[2, t].x) + output_record['step_cost'].append(gen_cost + grid_import_cost - grid_export_cost) + output_record_df = pd.DataFrame.from_dict(output_record) + return output_record_df + + +class Arguments: + """revise here for our own purpose""" + + def __init__(self, agent=None, env=None): + self.agent = agent # Deep Reinforcement Learning algorithm + self.env = env # the environment for training + # self.plot_shadow_on = False # control do we need to plot all shadow figures + self.cwd = None # current work directory. None means set automatically + self.if_remove = False # remove the cwd folder? 
(True, False, None:ask me) + self.visible_gpu = '0,1,2,3' # for example: os.environ['CUDA_VISIBLE_DEVICES'] = '0, 2,' + # self.worker_num = 2 # rollout workers number pre GPU (adjust it to get high GPU usage) + self.num_threads = 32 # cpu_num for evaluate model, torch.set_num_threads(self.num_threads) + + '''Arguments for training''' + self.num_episode = 1000 + self.gamma = 0.995 # discount factor of future rewards + # self.reward_scale = 1 # an approximate target reward usually be closed to 256 + self.learning_rate = 2 ** -14 # 2 ** -14 ~= 6e-5 + self.soft_update_tau = 2 ** -8 # 2 ** -8 ~= 5e-3 + self.net_dim = 256 # the network width 256 + self.batch_size = 4096 # num of transitions sampled from replay buffer. + self.repeat_times = 2 ** 5 # repeatedly update network to keep critic's loss small + self.target_step = 4096 # collect target_step experiences, then update network, 1024 + self.max_memo = 500000 # capacity of replay buffer + self.if_per_or_gae = False # PER for off-policy sparse reward: Prioritized Experience Replay. + + '''Arguments for evaluate''' + # self.eval_gap = 2 ** 6 # evaluate the agent per eval_gap seconds + # self.eval_times = 2 # number of times that get episode return in first + self.random_seed = 0 # initialize random seed in self.init_before_training() + # self.random_seed_list = [1234, 2234, 3234, 4234, 5234] + self.random_seed_list = [1234] + '''Arguments for save and plot issues''' + self.train = True + self.save_network = True + self.test_network = True + self.save_test_data = True + self.compare_with_gurobi = True + self.plot_on = True + + def init_before_training(self, if_main): + if self.cwd is None: + agent_name = self.agent.__class__.__name__ + self.cwd = f'./{agent_name}' + + if if_main: + import shutil # remove history according to bool(if_remove) + if self.if_remove is None: + self.if_remove = bool(input(f"| PRESS 'y' to REMOVE: {self.cwd}? 
") == 'y') + elif self.if_remove: + shutil.rmtree(self.cwd, ignore_errors=True) + print(f"| Remove cwd: {self.cwd}") + os.makedirs(self.cwd, exist_ok=True) + + np.random.seed(self.random_seed) + torch.manual_seed(self.random_seed) + torch.set_num_threads(self.num_threads) + torch.set_default_dtype(torch.float32) + + os.environ['CUDA_VISIBLE_DEVICES'] = str(self.visible_gpu) # control how many GPU is used   + + +def test_one_episode(env, act, device): + """to get evaluate information, here record the unbalance of after taking action""" + record_state = [] + record_action = [] + record_reward = [] + record_output = [] + record_cost = [] + record_unbalance = [] + record_system_info = [] # [time price,netload,action,real action, output*4,soc,unbalance(exchange+penalty)] + record_init_info = [] # include month,day,time,intial soc + env.TRAIN = False + state = env.reset() + record_init_info.append([env.month, env.day, env.current_time, env.battery.current_capacity]) + print(f'current testing month is {env.month}, day is {env.day},initial_soc is {env.battery.current_capacity}') + for i in range(24): + s_tensor = torch.as_tensor((state,), device=device) + a_tensor = act(s_tensor) + action = a_tensor.detach().cpu().numpy()[0] # not need detach(), because with torch.no_grad() outside + real_action = action + state, next_state, reward, done = env.step(action) + + record_system_info.append([state[0], state[1], state[3], action, real_action, env.battery.SOC(), + env.battery.energy_change, next_state[4], next_state[5], next_state[6], + next_state[7], next_state[8], env.unbalance, env.operation_cost]) + record_state.append(state) + record_action.append(real_action) + record_reward.append(reward) + record_output.append(env.current_output) + record_unbalance.append(env.unbalance) + state = next_state + # add information of last step dg1, dh2, dg3, soc + record_system_info[-1][7:10] = [env.final_step_outputs[0], env.final_step_outputs[1], env.final_step_outputs[2]] + record_system_info[-1][5] = env.final_step_outputs[3] + record = {'init_info': record_init_info, 'system_info': record_system_info, 'state': record_state, + 'action': record_action, 'reward': record_reward, 'cost': record_cost, 'unbalance': record_unbalance, + 'record_output': record_output} + return record + + +def get_episode_return(env, act, device): + episode_return = 0.0 # sum of rewards in an episode + episode_unbalance = 0.0 + state = env.reset() + for i in range(24): + s_tensor = torch.as_tensor((state,), device=device) + a_tensor = act(s_tensor) + action = a_tensor.detach().cpu().numpy()[0] # not need detach(), because with torch.no_grad() outside + state, next_state, reward, done, = env.step(action) + state = next_state + episode_return += reward + episode_unbalance += env.real_unbalance + if done: + break + return episode_return, episode_unbalance + + +class ReplayBuffer: + def __init__(self, max_len, state_dim, action_dim, gpu_id=0): + self.now_len = 0 + self.next_idx = 0 + self.if_full = False + self.max_len = max_len + self.data_type = torch.float32 + self.action_dim = action_dim + self.device = torch.device(f"cuda:{gpu_id}" if (torch.cuda.is_available() and (gpu_id >= 0)) else "cpu") + + other_dim = 1 + 1 + self.action_dim + self.buf_other = torch.empty(size=(max_len, other_dim), dtype=self.data_type, device=self.device) + + if isinstance(state_dim, int): # state is pixel + self.buf_state = torch.empty((max_len, state_dim), dtype=torch.float32, device=self.device) + elif isinstance(state_dim, tuple): + self.buf_state = 
torch.empty((max_len, *state_dim), dtype=torch.uint8, device=self.device) + else: + raise ValueError('state_dim') + + def extend_buffer(self, state, other): # CPU array to CPU array + size = len(other) + next_idx = self.next_idx + size + + if next_idx > self.max_len: + self.buf_state[self.next_idx:self.max_len] = state[:self.max_len - self.next_idx] + self.buf_other[self.next_idx:self.max_len] = other[:self.max_len - self.next_idx] + self.if_full = True + + next_idx = next_idx - self.max_len + self.buf_state[0:next_idx] = state[-next_idx:] + self.buf_other[0:next_idx] = other[-next_idx:] + else: + self.buf_state[self.next_idx:next_idx] = state + self.buf_other[self.next_idx:next_idx] = other + self.next_idx = next_idx + + def sample_batch(self, batch_size) -> tuple: + indices = rd.randint(self.now_len - 1, size=batch_size) + r_m_a = self.buf_other[indices] + return (r_m_a[:, 0:1], + r_m_a[:, 1:2], + r_m_a[:, 2:], + self.buf_state[indices], + self.buf_state[indices + 1]) + + def update_now_len(self): + self.now_len = self.max_len if self.if_full else self.next_idx
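
Note on the PPO_llm.py hunk above: it switches explore_env from replaying data/llm_action.json (get_llm_action) back to sampling from the learned policy (select_action) by commenting lines in and out. Below is a minimal sketch, not part of the patch, of the same rollout loop driven by an explicit flag instead. It assumes only the AgentPPO interface shown in this diff (select_action, get_llm_action, trajectory_list, state) and the (state, next_state, reward, done) return contract that env.step follows throughout the diff; the function name explore_env_with_toggle and the use_llm_actions flag are hypothetical.

import numpy as np

def explore_env_with_toggle(agent, env, target_step, use_llm_actions=False):
    # Hypothetical helper: one rollout of target_step transitions, choosing the
    # action source with a flag instead of editing comments in explore_env.
    state = agent.state
    trajectory_temp = []
    last_done = 0
    for i in range(target_step):
        if use_llm_actions:
            action = agent.get_llm_action(i)      # replay pre-generated LLM action
            noise = np.zeros_like(action)         # no exploration noise when replaying
        else:
            action, noise = agent.select_action(state)  # sample from learned Gaussian policy
        # env.step returns (state, next_state, reward, done) in this codebase;
        # tanh squashes the action into [-1, 1] as in the original loop
        state, next_state, reward, done = env.step(np.tanh(action))
        trajectory_temp.append((state, reward, done, action, noise))
        if done:
            state = env.reset()
            last_done = i
        else:
            state = next_state
    agent.state = state
    # splice: keep completed episodes for the buffer, carry the tail over
    trajectory_list = agent.trajectory_list + trajectory_temp[:last_done + 1]
    agent.trajectory_list = trajectory_temp[last_done:]
    return trajectory_list

Called as explore_env_with_toggle(agent, env, args.target_step, use_llm_actions=False) after agent.state = env.reset(), this should mirror the behaviour of the uncommented select_action branch in the PPO_llm.py hunk; with use_llm_actions=True it should mirror the commented-out get_llm_action branch.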