This commit is contained in:
chenxiaodong 2024-06-25 11:02:53 +08:00
parent 2c672d9569
commit d7feebab50
11 changed files with 681 additions and 33 deletions

View File

@@ -2,7 +2,7 @@
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Remote Python 3.9.18 (sftp://chenxd@124.16.151.196:22121/home/chenxd/miniconda3/envs/grid/bin/python3.9)" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="rl-microgrid" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">

View File

@@ -3,5 +3,5 @@
<component name="Black">
<option name="sdkName" value="rl-microgrid" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Remote Python 3.9.18 (sftp://chenxd@124.16.151.196:22121/home/chenxd/miniconda3/envs/grid/bin/python3.9)" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="rl-microgrid" project-jdk-type="Python SDK" />
</project>

Binary file not shown.

Binary file not shown.

Binary file not shown.

1
PPO.py
View File

@@ -336,6 +336,7 @@ if __name__ == '__main__':
# args.test_network = False
# args.save_test_data = False
# args.compare_with_gurobi = False
# args.plot_on = False
if args.train:
for i_episode in range(num_episode):
with torch.no_grad():

View File

@@ -140,9 +140,9 @@ class AgentPPO:
trajectory_temp = list()
last_done = 0
for i in range(target_step):
action = self.get_llm_action(i)
noise = 0
# action, noise = self.select_action(state)
# action = self.get_llm_action(i)
# noise = 0
action, noise = self.select_action(state)
state, next_state, reward, done, = env.step(np.tanh(action)) # make action between -1 & 1
trajectory_temp.append((state, reward, done, action, noise))
if done:

360
PPO_test.py Normal file
View File

@@ -0,0 +1,360 @@
import json
import matplotlib.pyplot as plt
from environment import ESSEnv
import os
import pickle
from copy import deepcopy
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
class ActorPPO(nn.Module):
def __init__(self, mid_dim, state_dim, action_dim, layer_norm=False):
super().__init__()
self.layer_norm = layer_norm
self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
nn.Linear(mid_dim, action_dim))
# the logarithm (log) of the standard deviation (std) of the action; a trainable parameter
self.a_logstd = nn.Parameter(torch.zeros((1, action_dim)) - 0.5, requires_grad=True)
self.sqrt_2pi_log = np.log(np.sqrt(2 * np.pi))
if self.layer_norm:
self.apply_layer_norm()
def apply_layer_norm(self):
def init_weights(layer):
if isinstance(layer, nn.Linear):
nn.init.orthogonal_(layer.weight, 1.0)
nn.init.constant_(layer.bias, 0.0)
self.net.apply(init_weights)
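# Note: despite its name, apply_layer_norm() does not insert nn.LayerNorm modules; it
# re-initialises every nn.Linear layer with orthogonal weights and zero bias.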
def forward(self, state):
return self.net(state).tanh()  # tanh limits the action mean to (-1, 1)
def get_action(self, state):
a_avg = self.forward(state)  # mean of the action distribution (already tanh-squashed)
a_std = self.a_logstd.exp()
noise = torch.randn_like(a_avg)
action = a_avg + noise * a_std
return action, noise
def get_logprob_entropy(self, state, action):
a_avg = self.forward(state)
a_std = self.a_logstd.exp()
delta = ((a_avg - action) / a_std).pow(2) * 0.5
logprob = -(self.a_logstd + self.sqrt_2pi_log + delta).sum(1) # new_logprob
dist_entropy = (logprob.exp() * logprob).mean() # policy entropy
return logprob, dist_entropy
def get_old_logprob(self, _action, noise): # noise = action - a_noise
delta = noise.pow(2) * 0.5
return -(self.a_logstd + self.sqrt_2pi_log + delta).sum(1) # old_logprob
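# Why storing the noise is enough: the sampled action is a_avg + noise * a_std, so
# ((a_avg - action) / a_std) ** 2 == noise ** 2 and the old log-probability can be
# recomputed from the stored noise without keeping a_avg around.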
class CriticAdv(nn.Module):
def __init__(self, mid_dim, state_dim, _action_dim, layer_norm=False):
super().__init__()
self.layer_norm = layer_norm
self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
nn.Linear(mid_dim, 1))
if self.layer_norm:
self.apply_layer_norm()
def apply_layer_norm(self):
def init_weights(layer):
if isinstance(layer, nn.Linear):
nn.init.orthogonal_(layer.weight, 1.0)
nn.init.constant_(layer.bias, 0.0)
self.net.apply(init_weights)
def forward(self, state):
return self.net(state) # Advantage value
class AgentPPO:
def __init__(self):
super().__init__()
self.state = None
self.device = None
self.action_dim = None
self.get_obj_critic = None
self.criterion = torch.nn.SmoothL1Loss()
self.cri = self.cri_target = self.if_use_cri_target = self.cri_optim = self.ClassCri = None
self.act = self.act_target = self.if_use_act_target = self.act_optim = self.ClassAct = None
'''init modify'''
self.ClassCri = CriticAdv
self.ClassAct = ActorPPO
self.ratio_clip = 0.2 # ratio.clamp(1 - clip, 1 + clip)
self.lambda_entropy = 0.02 # could be 0.01~0.05
self.lambda_gae_adv = 0.98 # could be 0.95~0.99, GAE (Generalized Advantage Estimation. ICLR.2016.)
self.get_reward_sum = None # self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw
self.trajectory_list = None
def init(self, net_dim, state_dim, action_dim, learning_rate=1e-4, if_use_gae=False, gpu_id=0, layer_norm=False):
self.device = torch.device(f"cuda:{gpu_id}" if (torch.cuda.is_available() and (gpu_id >= 0)) else "cpu")
self.trajectory_list = list()
# choose whether to use gae or not
self.get_reward_sum = self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw
self.cri = self.ClassCri(net_dim, state_dim, action_dim, layer_norm).to(self.device)
self.act = self.ClassAct(net_dim, state_dim, action_dim, layer_norm).to(
self.device) if self.ClassAct else self.cri
self.cri_target = deepcopy(self.cri) if self.if_use_cri_target else self.cri
self.act_target = deepcopy(self.act) if self.if_use_act_target else self.act
self.cri_optim = torch.optim.Adam(self.cri.parameters(), learning_rate)
self.act_optim = torch.optim.Adam(self.act.parameters(), learning_rate) if self.ClassAct else self.cri
def select_action(self, state):
states = torch.as_tensor((state,), dtype=torch.float32, device=self.device)
actions, noises = self.act.get_action(states)
return actions[0].detach().cpu().numpy(), noises[0].detach().cpu().numpy()
@staticmethod
def get_llm_action(index):
with open('data/llm_action.json', 'r') as file:
data = json.load(file)
data_tensor = torch.tensor(data, dtype=torch.float32)
normalized_index = index % len(data_tensor)
action = data_tensor[normalized_index].detach().cpu().numpy()
return action
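# Assumed layout of data/llm_action.json (an assumption, not confirmed by this commit): a JSON
# list of fixed-length action vectors, one per time step, e.g. [[0.1, -0.3, ...], [0.0, 0.5, ...], ...].
# get_llm_action() cycles through the list with index % len(data), so any number of steps works.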
def explore_env(self, env, target_step):
state = self.state  # resume exploration from the state cached on the agent
trajectory_temp = list()
last_done = 0
for i in range(target_step):
# action = self.get_llm_action(i)
# noise = 0
action, noise = self.select_action(state)
state, next_state, reward, done, = env.step(np.tanh(action)) # make action between -1 & 1
trajectory_temp.append((state, reward, done, action, noise))
if done:
state = env.reset()
last_done = i
else:
state = next_state
self.state = state
'''splice list'''
# return the steps up to the last episode end and keep the remaining tail for the next call
trajectory_list = self.trajectory_list + trajectory_temp[:last_done + 1]
self.trajectory_list = trajectory_temp[last_done:]
return trajectory_list
def update_net(self, buffer, batch_size, repeat_times, soft_update_tau):
"""put data extract and update network together"""
with torch.no_grad():
buf_len = buffer[0].shape[0]
# decompose buffer data
buf_state, buf_action, buf_noise, buf_reward, buf_mask = [ten.to(self.device) for ten in buffer]
'''get buf_r_sum, buf_logprob'''
bs = 4096 # set a smaller 'BatchSize' when out of GPU memory: 1024, could change to 4096
buf_value = [self.cri_target(buf_state[i:i + bs]) for i in range(0, buf_len, bs)]
buf_value = torch.cat(buf_value, dim=0)
buf_logprob = self.act.get_old_logprob(buf_action, buf_noise)
buf_r_sum, buf_advantage = self.get_reward_sum(buf_len, buf_reward, buf_mask, buf_value) # detach()
# normalize advantage
buf_advantage = (buf_advantage - buf_advantage.mean()) / (buf_advantage.std() + 1e-5)
del buf_noise, buffer[:]
'''PPO: Surrogate objective of Trust Region'''
obj_critic = obj_actor = None
for _ in range(int(buf_len / batch_size * repeat_times)):
indices = torch.randint(buf_len, size=(batch_size,), requires_grad=False, device=self.device)
state = buf_state[indices]
action = buf_action[indices]
r_sum = buf_r_sum[indices]
logprob = buf_logprob[indices]
advantage = buf_advantage[indices]
new_logprob, obj_entropy = self.act.get_logprob_entropy(state, action) # it is obj_actor
ratio = (new_logprob - logprob.detach()).exp()
surrogate1 = advantage * ratio
surrogate2 = advantage * ratio.clamp(1 - self.ratio_clip, 1 + self.ratio_clip)
obj_surrogate = -torch.min(surrogate1, surrogate2).mean()
obj_actor = obj_surrogate + obj_entropy * self.lambda_entropy
self.optim_update(self.act_optim, obj_actor) # update actor
value = self.cri(state).squeeze(1)  # critic predicts the discounted reward sum (state value) of each state
# use SmoothL1Loss to evaluate the value loss
# obj_critic = self.criterion(value, r_sum) / (r_sum.std() + 1e-6)
obj_critic = self.criterion(value, r_sum)
self.optim_update(self.cri_optim, obj_critic)  # backpropagate the value loss and update the critic
# choose whether to use soft update
self.soft_update(self.cri_target, self.cri, soft_update_tau) if self.cri_target is not self.cri else None
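# Note: init() leaves if_use_cri_target unset (None), so cri_target aliases cri and the
# soft update above is effectively a no-op unless a target network is enabled explicitly.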
a_std_log = getattr(self.act, 'a_logstd', torch.zeros(1))  # the actor stores its log-std as a_logstd
return obj_critic.item(), obj_actor.item(), a_std_log.mean().item() # logging_tuple
def get_reward_sum_raw(self, buf_len, buf_reward, buf_mask, buf_value) -> (torch.Tensor, torch.Tensor):
buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device) # reward sum
pre_r_sum = 0
for i in range(buf_len - 1, -1, -1):
buf_r_sum[i] = buf_reward[i] + buf_mask[i] * pre_r_sum
pre_r_sum = buf_r_sum[i]
buf_advantage = buf_r_sum - (buf_mask * buf_value[:, 0])
return buf_r_sum, buf_advantage
def get_reward_sum_gae(self, buf_len, ten_reward, ten_mask, ten_value) -> (torch.Tensor, torch.Tensor):
buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device) # old policy value
buf_advantage = torch.empty(buf_len, dtype=torch.float32, device=self.device) # advantage value
pre_r_sum = 0
pre_advantage = 0 # advantage value of previous step
for i in range(buf_len - 1, -1, -1):
buf_r_sum[i] = ten_reward[i] + ten_mask[i] * pre_r_sum
pre_r_sum = buf_r_sum[i]
buf_advantage[i] = ten_reward[i] + ten_mask[i] * (pre_advantage - ten_value[i]) # fix a bug here
pre_advantage = ten_value[i] + buf_advantage[i] * self.lambda_gae_adv
return buf_r_sum, buf_advantage
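# The backward loop above is the GAE-style recursion used here, with the discount already
# folded into ten_mask (= gamma * (1 - done)) by update_buffer():
#   A_t = r_t + mask_t * (V(s_{t+1}) + lambda * A_{t+1} - V(s_t))
# buf_r_sum is the discounted return used as the critic's regression target.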
@staticmethod
def optim_update(optimizer, objective):
optimizer.zero_grad()
objective.backward()
optimizer.step()
@staticmethod
def soft_update(target_net, current_net, tau):
for tar, cur in zip(target_net.parameters(), current_net.parameters()):
tar.data.copy_(cur.data.__mul__(tau) + tar.data.__mul__(1.0 - tau))
class Arguments:
def __init__(self, agent=None, env=None):
self.agent = agent # Deep Reinforcement Learning algorithm
self.env = env # the environment for training
self.cwd = None # current work directory. None means set automatically
self.if_remove = False # remove the cwd folder? (True, False, None:ask me)
self.visible_gpu = '0' # for example: os.environ['CUDA_VISIBLE_DEVICES'] = '0, 2,'
# self.worker_num = 2 # number of rollout workers per GPU (adjust it to get high GPU usage)
self.num_threads = 32 # cpu_num for evaluate model, torch.set_num_threads(self.num_threads)
'''Arguments for training'''
self.num_episode = 1000 # to control the train episodes for PPO
self.gamma = 0.995 # discount factor of future rewards
self.learning_rate = 2 ** -14 # 2 ** -14 ~= 6e-5
self.soft_update_tau = 2 ** -8 # 2 ** -8 ~= 5e-3
self.net_dim = 256 # the network width
self.batch_size = 4096 # num of transitions sampled from replay buffer.
self.repeat_times = 2 ** 3 # repeatedly update the network to keep the critic's loss small
self.target_step = 4096 # collect target_step transitions, then update the network
self.max_memo = self.target_step # capacity of replay buffer
self.if_per_or_gae = False # GAE for on-policy sparse reward: Generalized Advantage Estimation.
'''Arguments for evaluate'''
self.random_seed = 0 # initialize random seed in self.init_before_training()
# self.random_seed_list = [1234, 2234, 3234, 4234, 5234]
self.random_seed_list = [1234]
self.train = True
self.save_network = True
self.test_network = True
self.save_test_data = True
self.compare_with_gurobi = True
self.plot_on = True
def init_before_training(self, if_main):
if self.cwd is None:
agent_name = self.agent.__class__.__name__
self.cwd = f'./{agent_name}'
if if_main:
import shutil # remove history according to bool(if_remove)
if self.if_remove is None:
self.if_remove = bool(input(f"| PRESS 'y' to REMOVE: {self.cwd}? ") == 'y')
elif self.if_remove:
shutil.rmtree(self.cwd, ignore_errors=True)
print(f"| Remove cwd: {self.cwd}")
os.makedirs(self.cwd, exist_ok=True)
np.random.seed(self.random_seed)
torch.manual_seed(self.random_seed)
torch.set_num_threads(self.num_threads)
torch.set_default_dtype(torch.float32)
os.environ['CUDA_VISIBLE_DEVICES'] = str(self.visible_gpu)
def update_buffer(_trajectory):
_trajectory = list(map(list, zip(*_trajectory)))  # transpose the 2D list, splitting the trajectory into its 5 fields (state, reward, done, action, noise)
ten_state = torch.as_tensor(_trajectory[0]) # tensor state here
ten_reward = torch.as_tensor(_trajectory[1], dtype=torch.float32)
# _trajectory[2] = done, replace done by mask, save memory
ten_mask = (1.0 - torch.as_tensor(_trajectory[2], dtype=torch.float32)) * gamma
ten_action = torch.as_tensor(_trajectory[3])
ten_noise = torch.as_tensor(_trajectory[4], dtype=torch.float32)
buffer[:] = (ten_state, ten_action, ten_noise, ten_reward, ten_mask) # list store tensors
_steps = ten_reward.shape[0] # how many steps are collected in all trajectories
_r_exp = ten_reward.mean() # the mean reward
return _steps, _r_exp
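# Caveat: update_buffer() is a module-level copy of the closure used in PPO.py's training
# loop; it reads `buffer` and `gamma` from the enclosing scope, so those names must exist
# (e.g. buffer = list(); gamma = args.gamma) before it is called here.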
def load_actions_from_json(file_path):
with open(file_path, 'r') as file:
actions = json.load(file)
return actions
def simulate_with_llm_actions(env, llm_actions):
states, actions, rewards, unbalances = [], [], [], []
state = env.reset()
for action in llm_actions:
_, next_state, reward, done = env.step(action)  # ESSEnv.step returns (state, next_state, reward, done) in this repo
states.append(state)
actions.append(action)
rewards.append(reward)
unbalances.append(env.unbalance)  # per-step unbalance attribute used elsewhere in this repo (see test_one_episode)
state = next_state
if done:
break
return states, actions, rewards, unbalances
if __name__ == '__main__':
env = ESSEnv()
llm_actions = load_actions_from_json('data/llm_action.json')
states, actions, rewards, unbalances = simulate_with_llm_actions(env, llm_actions)
fig, ax1 = plt.subplots()
color = 'tab:blue'
ax1.set_xlabel('Step')
ax1.set_ylabel('Reward', color=color)
ax1.plot(rewards, color=color)
ax1.tick_params(axis='y', labelcolor=color)
ax2 = ax1.twinx()
color = 'tab:red'
ax2.set_ylabel('Unbalance', color=color)
ax2.plot(unbalances, color=color)
ax2.tick_params(axis='y', labelcolor=color)
fig.tight_layout()
plt.title('Rewards and Unbalance over Steps')
plt.show()
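# Illustrative sanity check (an added sketch, not from the original file): exercise the ActorPPO /
# CriticAdv networks defined above with random tensors; the dims below are placeholders, not ESSEnv's real ones.
_state_dim, _action_dim, _mid_dim, _batch = 9, 4, 64, 8
_actor = ActorPPO(_mid_dim, _state_dim, _action_dim)
_critic = CriticAdv(_mid_dim, _state_dim, _action_dim)
_s = torch.randn(_batch, _state_dim)
_a, _noise = _actor.get_action(_s)                  # stochastic action and its exploration noise
_logprob, _entropy = _actor.get_logprob_entropy(_s, _a)
_old_logprob = _actor.get_old_logprob(_a, _noise)   # recomputed from the stored noise alone
_v = _critic(_s)                                    # state-value estimates, shape (_batch, 1)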

31
test.py
View File

@@ -39,17 +39,22 @@
# data = pickle.load(f)
#
# print(data)
import json
import numpy as np
# import json
# import numpy as np
#
#
# def get_llm_action(index) -> np.ndarray:
# with open('data/llm_action.json', 'r') as file:
# data = json.load(file)
# normalized_index = index % len(data)
# action = np.array(data[normalized_index])
# return action
#
# data = get_llm_action(2)
# print(data)
from environment import ESSEnv
def get_llm_action(index) -> np.ndarray:
with open('data/llm_action.json', 'r') as file:
data = json.load(file)
normalized_index = index % len(data)
action = np.array(data[normalized_index])
return action
data = get_llm_action(2)
print(data)
env = ESSEnv()
wind_speed = env.data_manager.get_series_wind_data(1, 1)
wind = [env.wind.step(ws) for ws in wind_speed]
print(wind)

View File

@@ -50,7 +50,7 @@ def optimization_base_result(env, month, day, initial_soc):
on_off = m.addVars(NUM_GEN, period, vtype=GRB.BINARY, name='on_off')
gen_output = m.addVars(NUM_GEN, period, vtype=GRB.CONTINUOUS, name='output')
# set the battery charge/discharge bounds
battery_energy_change = m.addVars(period, vtype=GRB.CONTINUOUS, lb=env.battery.max_discharge,
battery_energy_change = m.addVars(period, vtype=GRB.CONTINUOUS, lb=-env.battery.max_charge,
ub=env.battery.max_charge, name='battery_action')
# set the exchange limits between the external grid and the energy system
grid_energy_import = m.addVars(period, vtype=GRB.CONTINUOUS, lb=0, ub=env.grid.exchange_ability, name='import')
@@ -86,8 +86,8 @@ def optimization_base_result(env, month, day, initial_soc):
m.setObjective((cost_gen + cost_grid_import - cost_grid_export), GRB.MINIMIZE)
m.optimize()
output_record = {'pv': [], 'temperature': [], 'irradiance': [], 'wind_speed': [], 'price': [], 'load': [],
'netload': [], 'soc': [], 'battery_energy_change': [], 'grid_import': [], 'grid_export': [],
output_record = {'pv': [], 'wind': [], 'price': [], 'load': [], 'netload': [],
'soc': [], 'battery_energy_change': [], 'grid_import': [], 'grid_export': [],
'gen1': [], 'gen2': [], 'gen3': [], 'step_cost': []}
for t in range(period):
gen_cost = sum((on_off[g, t].x * (
@@ -95,13 +95,11 @@ def optimization_base_result(env, month, day, initial_soc):
for g in range(NUM_GEN))
grid_import_cost = grid_energy_import[t].x * price[t]
grid_export_cost = grid_energy_export[t].x * price[t] * env.sell_coefficient
output_record['pv'].append(pv[t].x)
output_record['temperature'].append(temperature[t])
output_record['irradiance'].append(irradiance[t])
output_record['wind_speed'].append(wind[t])
output_record['pv'].append(pv[t])
output_record['wind'].append(wind[t])
output_record['price'].append(price[t])
output_record['load'].append(load[t])
output_record['netload'].append(load[t] - pv[t].x)
output_record['netload'].append(load[t] - pv[t])
output_record['soc'].append(soc[t].x)
output_record['battery_energy_change'].append(battery_energy_change[t].x)
output_record['grid_import'].append(grid_energy_import[t].x)
@@ -110,7 +108,6 @@ def optimization_base_result(env, month, day, initial_soc):
output_record['gen2'].append(gen_output[1, t].x)
output_record['gen3'].append(gen_output[2, t].x)
output_record['step_cost'].append(gen_cost + grid_import_cost - grid_export_cost)
output_record_df = pd.DataFrame.from_dict(output_record)
return output_record_df
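# Illustrative usage sketch (assumes a working Gurobi licence and the ESSEnv from this repo;
# the month/day/initial_soc values below are placeholders):
#   from environment import ESSEnv
#   env = ESSEnv()
#   base_df = optimization_base_result(env, month=1, day=1, initial_soc=0.4)
#   print(base_df[['soc', 'battery_energy_change', 'step_cost']].head())
#   print('day-ahead optimal cost:', base_df['step_cost'].sum())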

285
tools备份.py Normal file
View File

@@ -0,0 +1,285 @@
import os
import gurobipy as gp
import numpy as np
import numpy.random as rd
import pandas as pd
import torch
from gurobipy import GRB
def optimization_base_result(env, month, day, initial_soc):
price = env.data_manager.get_series_price_data(month, day)
load = env.data_manager.get_series_load_cons_data(month, day)
temperature = env.data_manager.get_series_temperature_data(month, day)
irradiance = env.data_manager.get_series_irradiance_data(month, day)
wind_speed = env.data_manager.get_series_wind_data(month, day)
pv = [env.solar.step(temp, irr) for temp, irr in zip(temperature, irradiance)]
wind = [env.wind.step(ws) for ws in wind_speed]
period = env.episode_length
DG_parameters = env.dg_parameters
def get_dg_info(parameters):
p_max = []
p_min = []
ramping_up = []
ramping_down = []
a_para = []
b_para = []
c_para = []
for name, gen_info in parameters.items():
p_max.append(gen_info['power_output_max'])
p_min.append(gen_info['power_output_min'])
ramping_up.append(gen_info['ramping_up'])
ramping_down.append(gen_info['ramping_down'])
a_para.append(gen_info['a'])
b_para.append(gen_info['b'])
c_para.append(gen_info['c'])
return p_max, p_min, ramping_up, ramping_down, a_para, b_para, c_para
p_max, p_min, ramping_up, ramping_down, a_para, b_para, c_para = get_dg_info(parameters=DG_parameters)
NUM_GEN = len(DG_parameters.keys())
battery_capacity = env.battery.capacity
battery_efficiency = env.battery.efficiency
m = gp.Model("UC")
m.setParam('OutputFlag', 1)
# define the system variables
on_off = m.addVars(NUM_GEN, period, vtype=GRB.BINARY, name='on_off')
gen_output = m.addVars(NUM_GEN, period, vtype=GRB.CONTINUOUS, name='output')
# set the battery charge/discharge bounds
battery_energy_change = m.addVars(period, vtype=GRB.CONTINUOUS, lb=env.battery.max_discharge,
ub=env.battery.max_charge, name='battery_action')
# set the exchange limits between the external grid and the energy system
grid_energy_import = m.addVars(period, vtype=GRB.CONTINUOUS, lb=0, ub=env.grid.exchange_ability, name='import')
grid_energy_export = m.addVars(period, vtype=GRB.CONTINUOUS, lb=0, ub=env.grid.exchange_ability, name='export')
soc = m.addVars(period, vtype=GRB.CONTINUOUS, lb=0.2, ub=0.8, name='SOC')
# 1. add the power-balance constraint
m.addConstrs(((sum(gen_output[g, t] for g in range(NUM_GEN)) + pv[t] + wind[t] + grid_energy_import[t] >= load[t] +
battery_energy_change[t] + grid_energy_export[t]) for t in range(period)), name='powerbalance')
# 2. add the generator maximum/minimum output constraints
m.addConstrs((gen_output[g, t] <= on_off[g, t] * p_max[g] for g in range(NUM_GEN) for t in range(period)),
'gen_output_max')
m.addConstrs((gen_output[g, t] >= on_off[g, t] * p_min[g] for g in range(NUM_GEN) for t in range(period)),
'gen_output_min')
# 3. add the ramp-up and ramp-down constraints
m.addConstrs((gen_output[g, t + 1] - gen_output[g, t] <= ramping_up[g] for g in range(NUM_GEN)
for t in range(period - 1)), 'ramping_up')
m.addConstrs((gen_output[g, t] - gen_output[g, t + 1] <= ramping_down[g] for g in range(NUM_GEN)
for t in range(period - 1)), 'ramping_down')
# 4. add the battery capacity (SOC) constraints
m.addConstr(battery_capacity * soc[0] == battery_capacity * initial_soc +
(battery_energy_change[0] * battery_efficiency), name='soc0')
m.addConstrs((battery_capacity * soc[t] == battery_capacity * soc[t - 1] +
(battery_energy_change[t] * battery_efficiency) for t in range(1, period)), name='soc update')
# define the cost function
# generator cost
cost_gen = gp.quicksum(
(a_para[g] * gen_output[g, t] * gen_output[g, t] + b_para[g] * gen_output[g, t] + c_para[g] * on_off[g, t]) for
t in range(period) for g in range(NUM_GEN))
cost_grid_import = gp.quicksum(grid_energy_import[t] * price[t] for t in range(period))
cost_grid_export = gp.quicksum(grid_energy_export[t] * price[t] * env.sell_coefficient for t in range(period))
m.setObjective((cost_gen + cost_grid_import - cost_grid_export), GRB.MINIMIZE)
m.optimize()
m.computeIIS()
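# Caution: Model.computeIIS() is only valid for infeasible models; calling it right after a
# successful optimize() raises a GurobiError, so it is typically guarded by a status check
# (e.g. if m.status == GRB.INFEASIBLE).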
output_record = {'pv': [], 'wind': [], 'price': [], 'load': [],
'netload': [], 'soc': [], 'battery_energy_change': [], 'grid_import': [], 'grid_export': [],
'gen1': [], 'gen2': [], 'gen3': [], 'step_cost': []}
for t in range(period):
gen_cost = sum((on_off[g, t].x * (
a_para[g] * gen_output[g, t].x * gen_output[g, t].x + b_para[g] * gen_output[g, t].x + c_para[g]))
for g in range(NUM_GEN))
grid_import_cost = grid_energy_import[t].x * price[t]
grid_export_cost = grid_energy_export[t].x * price[t] * env.sell_coefficient
output_record['pv'].append(pv[t])
# output_record['temperature'].append(temperature[t])
# output_record['irradiance'].append(irradiance[t])
output_record['wind'].append(wind[t])
output_record['price'].append(price[t])
output_record['load'].append(load[t])
output_record['netload'].append(load[t] - pv[t])
output_record['soc'].append(soc[t].x)
output_record['battery_energy_change'].append(battery_energy_change[t].x)
output_record['grid_import'].append(grid_energy_import[t].x)
output_record['grid_export'].append(grid_energy_export[t].x)
output_record['gen1'].append(gen_output[0, t].x)
output_record['gen2'].append(gen_output[1, t].x)
output_record['gen3'].append(gen_output[2, t].x)
output_record['step_cost'].append(gen_cost + grid_import_cost - grid_export_cost)
output_record_df = pd.DataFrame.from_dict(output_record)
return output_record_df
class Arguments:
"""revise here for our own purpose"""
def __init__(self, agent=None, env=None):
self.agent = agent # Deep Reinforcement Learning algorithm
self.env = env # the environment for training
# self.plot_shadow_on = False # control whether to plot all shadow figures
self.cwd = None # current work directory. None means set automatically
self.if_remove = False # remove the cwd folder? (True, False, None:ask me)
self.visible_gpu = '0,1,2,3' # for example: os.environ['CUDA_VISIBLE_DEVICES'] = '0, 2,'
# self.worker_num = 2 # number of rollout workers per GPU (adjust it to get high GPU usage)
self.num_threads = 32 # cpu_num for evaluate model, torch.set_num_threads(self.num_threads)
'''Arguments for training'''
self.num_episode = 1000
self.gamma = 0.995 # discount factor of future rewards
# self.reward_scale = 1 # an approximate target reward, usually close to 256
self.learning_rate = 2 ** -14 # 2 ** -14 ~= 6e-5
self.soft_update_tau = 2 ** -8 # 2 ** -8 ~= 5e-3
self.net_dim = 256 # the network width 256
self.batch_size = 4096 # num of transitions sampled from replay buffer.
self.repeat_times = 2 ** 5 # repeatedly update network to keep critic's loss small
self.target_step = 4096 # collect target_step experiences, then update network, 1024
self.max_memo = 500000 # capacity of replay buffer
self.if_per_or_gae = False # PER for off-policy sparse reward: Prioritized Experience Replay.
'''Arguments for evaluate'''
# self.eval_gap = 2 ** 6 # evaluate the agent per eval_gap seconds
# self.eval_times = 2 # number of times that get episode return in first
self.random_seed = 0 # initialize random seed in self.init_before_training()
# self.random_seed_list = [1234, 2234, 3234, 4234, 5234]
self.random_seed_list = [1234]
'''Arguments for save and plot issues'''
self.train = True
self.save_network = True
self.test_network = True
self.save_test_data = True
self.compare_with_gurobi = True
self.plot_on = True
def init_before_training(self, if_main):
if self.cwd is None:
agent_name = self.agent.__class__.__name__
self.cwd = f'./{agent_name}'
if if_main:
import shutil # remove history according to bool(if_remove)
if self.if_remove is None:
self.if_remove = bool(input(f"| PRESS 'y' to REMOVE: {self.cwd}? ") == 'y')
elif self.if_remove:
shutil.rmtree(self.cwd, ignore_errors=True)
print(f"| Remove cwd: {self.cwd}")
os.makedirs(self.cwd, exist_ok=True)
np.random.seed(self.random_seed)
torch.manual_seed(self.random_seed)
torch.set_num_threads(self.num_threads)
torch.set_default_dtype(torch.float32)
os.environ['CUDA_VISIBLE_DEVICES'] = str(self.visible_gpu)  # control how many GPUs are used
def test_one_episode(env, act, device):
"""to get evaluate information, here record the unbalance of after taking action"""
record_state = []
record_action = []
record_reward = []
record_output = []
record_cost = []
record_unbalance = []
record_system_info = [] # [time price,netload,action,real action, output*4,soc,unbalance(exchange+penalty)]
record_init_info = [] # includes month, day, time, initial soc
env.TRAIN = False
state = env.reset()
record_init_info.append([env.month, env.day, env.current_time, env.battery.current_capacity])
print(f'current testing month is {env.month}, day is {env.day}, initial_soc is {env.battery.current_capacity}')
for i in range(24):
s_tensor = torch.as_tensor((state,), device=device)
a_tensor = act(s_tensor)
action = a_tensor.detach().cpu().numpy()[0]  # detach() is not strictly needed because the caller wraps this in torch.no_grad()
real_action = action
state, next_state, reward, done = env.step(action)
record_system_info.append([state[0], state[1], state[3], action, real_action, env.battery.SOC(),
env.battery.energy_change, next_state[4], next_state[5], next_state[6],
next_state[7], next_state[8], env.unbalance, env.operation_cost])
record_state.append(state)
record_action.append(real_action)
record_reward.append(reward)
record_output.append(env.current_output)
record_unbalance.append(env.unbalance)
state = next_state
# add information of the last step: dg1, dg2, dg3, soc
record_system_info[-1][7:10] = [env.final_step_outputs[0], env.final_step_outputs[1], env.final_step_outputs[2]]
record_system_info[-1][5] = env.final_step_outputs[3]
record = {'init_info': record_init_info, 'system_info': record_system_info, 'state': record_state,
'action': record_action, 'reward': record_reward, 'cost': record_cost, 'unbalance': record_unbalance,
'record_output': record_output}
return record
def get_episode_return(env, act, device):
episode_return = 0.0 # sum of rewards in an episode
episode_unbalance = 0.0
state = env.reset()
for i in range(24):
s_tensor = torch.as_tensor((state,), device=device)
a_tensor = act(s_tensor)
action = a_tensor.detach().cpu().numpy()[0]  # detach() is not strictly needed because the caller wraps this in torch.no_grad()
state, next_state, reward, done, = env.step(action)
state = next_state
episode_return += reward
episode_unbalance += env.real_unbalance
if done:
break
return episode_return, episode_unbalance
class ReplayBuffer:
def __init__(self, max_len, state_dim, action_dim, gpu_id=0):
self.now_len = 0
self.next_idx = 0
self.if_full = False
self.max_len = max_len
self.data_type = torch.float32
self.action_dim = action_dim
self.device = torch.device(f"cuda:{gpu_id}" if (torch.cuda.is_available() and (gpu_id >= 0)) else "cpu")
other_dim = 1 + 1 + self.action_dim
self.buf_other = torch.empty(size=(max_len, other_dim), dtype=self.data_type, device=self.device)
if isinstance(state_dim, int):  # low-dimensional vector state (the tuple branch below handles pixel states)
self.buf_state = torch.empty((max_len, state_dim), dtype=torch.float32, device=self.device)
elif isinstance(state_dim, tuple):
self.buf_state = torch.empty((max_len, *state_dim), dtype=torch.uint8, device=self.device)
else:
raise ValueError('state_dim')
def extend_buffer(self, state, other): # CPU array to CPU array
size = len(other)
next_idx = self.next_idx + size
if next_idx > self.max_len:
self.buf_state[self.next_idx:self.max_len] = state[:self.max_len - self.next_idx]
self.buf_other[self.next_idx:self.max_len] = other[:self.max_len - self.next_idx]
self.if_full = True
next_idx = next_idx - self.max_len
self.buf_state[0:next_idx] = state[-next_idx:]
self.buf_other[0:next_idx] = other[-next_idx:]
else:
self.buf_state[self.next_idx:next_idx] = state
self.buf_other[self.next_idx:next_idx] = other
self.next_idx = next_idx
def sample_batch(self, batch_size) -> tuple:
indices = rd.randint(self.now_len - 1, size=batch_size)
r_m_a = self.buf_other[indices]
return (r_m_a[:, 0:1],
r_m_a[:, 1:2],
r_m_a[:, 2:],
self.buf_state[indices],
self.buf_state[indices + 1])
def update_now_len(self):
self.now_len = self.max_len if self.if_full else self.next_idx
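# Note on sample_batch(): the other buffer is laid out as 1 + 1 + action_dim, so the slices
# [:, 0:1], [:, 1:2] and [:, 2:] are read back as (reward, mask, action), assuming the caller
# packs transitions in that order; the next state is taken as buf_state[indices + 1], which
# relies on transitions being stored consecutively.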