llm
This commit is contained in:
parent 2c672d9569
commit d7feebab50
@@ -2,7 +2,7 @@
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
-    <orderEntry type="jdk" jdkName="Remote Python 3.9.18 (sftp://chenxd@124.16.151.196:22121/home/chenxd/miniconda3/envs/grid/bin/python3.9)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="rl-microgrid" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyDocumentationSettings">

@@ -3,5 +3,5 @@
  <component name="Black">
    <option name="sdkName" value="rl-microgrid" />
  </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Remote Python 3.9.18 (sftp://chenxd@124.16.151.196:22121/home/chenxd/miniconda3/envs/grid/bin/python3.9)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="rl-microgrid" project-jdk-type="Python SDK" />
</project>
Binary file not shown.
Binary file not shown.
Binary file not shown.
PPO.py
@@ -331,11 +331,12 @@ if __name__ == '__main__':
    buffer = list()
    '''init training parameters'''
    num_episode = args.num_episode
-    # args.train=False
-    # args.save_network=False
-    # args.test_network=False
-    # args.save_test_data=False
-    # args.compare_with_gurobi=False
+    # args.train = False
+    # args.save_network = False
+    # args.test_network = False
+    # args.save_test_data = False
+    # args.compare_with_gurobi = False
+    # args.plot_on = False
    if args.train:
        for i_episode in range(num_episode):
            with torch.no_grad():

@@ -140,9 +140,9 @@ class AgentPPO:
        trajectory_temp = list()
        last_done = 0
        for i in range(target_step):
-            action = self.get_llm_action(i)
-            noise = 0
-            # action, noise = self.select_action(state)
+            # action = self.get_llm_action(i)
+            # noise = 0
+            action, noise = self.select_action(state)
            state, next_state, reward, done, = env.step(np.tanh(action))  # make action between -1 & 1
            trajectory_temp.append((state, reward, done, action, noise))
            if done:
@@ -0,0 +1,360 @@
import json
import matplotlib.pyplot as plt
from environment import ESSEnv
import os
import pickle
from copy import deepcopy
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'


class ActorPPO(nn.Module):
    def __init__(self, mid_dim, state_dim, action_dim, layer_norm=False):
        super().__init__()
        self.layer_norm = layer_norm
        self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
                                 nn.Linear(mid_dim, action_dim))
        # the logarithm (log) of standard deviation (std) of action, it is a trainable parameter
        self.a_logstd = nn.Parameter(torch.zeros((1, action_dim)) - 0.5, requires_grad=True)
        self.sqrt_2pi_log = np.log(np.sqrt(2 * np.pi))

        if self.layer_norm:
            self.apply_layer_norm()

    def apply_layer_norm(self):
        def init_weights(layer):
            if isinstance(layer, nn.Linear):
                nn.init.orthogonal_(layer.weight, 1.0)
                nn.init.constant_(layer.bias, 0.0)

        self.net.apply(init_weights)

    def forward(self, state):
        return self.net(state).tanh()  # action.tanh() limit the data output of action

    def get_action(self, state):
        a_avg = self.forward(state)  # too big for the action
        a_std = self.a_logstd.exp()

        noise = torch.randn_like(a_avg)
        action = a_avg + noise * a_std
        return action, noise

    def get_logprob_entropy(self, state, action):
        a_avg = self.forward(state)
        a_std = self.a_logstd.exp()

        delta = ((a_avg - action) / a_std).pow(2) * 0.5
        logprob = -(self.a_logstd + self.sqrt_2pi_log + delta).sum(1)  # new_logprob

        dist_entropy = (logprob.exp() * logprob).mean()  # policy entropy
        return logprob, dist_entropy

    def get_old_logprob(self, _action, noise):  # noise = action - a_noise
        delta = noise.pow(2) * 0.5
        return -(self.a_logstd + self.sqrt_2pi_log + delta).sum(1)  # old_logprob


class CriticAdv(nn.Module):
    def __init__(self, mid_dim, state_dim, _action_dim, layer_norm=False):
        super().__init__()
        self.layer_norm = layer_norm
        self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
                                 nn.Linear(mid_dim, 1))
        if self.layer_norm:
            self.apply_layer_norm()

    def apply_layer_norm(self):
        def init_weights(layer):
            if isinstance(layer, nn.Linear):
                nn.init.orthogonal_(layer.weight, 1.0)
                nn.init.constant_(layer.bias, 0.0)

        self.net.apply(init_weights)

    def forward(self, state):
        return self.net(state)  # Advantage value


class AgentPPO:
    def __init__(self):
        super().__init__()
        self.state = None
        self.device = None
        self.action_dim = None
        self.get_obj_critic = None

        self.criterion = torch.nn.SmoothL1Loss()
        self.cri = self.cri_target = self.if_use_cri_target = self.cri_optim = self.ClassCri = None
        self.act = self.act_target = self.if_use_act_target = self.act_optim = self.ClassAct = None

        '''init modify'''
        self.ClassCri = CriticAdv
        self.ClassAct = ActorPPO

        self.ratio_clip = 0.2  # ratio.clamp(1 - clip, 1 + clip)
        self.lambda_entropy = 0.02  # could be 0.01~0.05
        self.lambda_gae_adv = 0.98  # could be 0.95~0.99, GAE (Generalized Advantage Estimation. ICLR.2016.)
        self.get_reward_sum = None  # self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw
        self.trajectory_list = None

    def init(self, net_dim, state_dim, action_dim, learning_rate=1e-4, if_use_gae=False, gpu_id=0, layer_norm=False):
        self.device = torch.device(f"cuda:{gpu_id}" if (torch.cuda.is_available() and (gpu_id >= 0)) else "cpu")
        self.trajectory_list = list()
        # choose whether to use gae or not
        self.get_reward_sum = self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw

        self.cri = self.ClassCri(net_dim, state_dim, action_dim, layer_norm).to(self.device)
        self.act = self.ClassAct(net_dim, state_dim, action_dim, layer_norm).to(
            self.device) if self.ClassAct else self.cri
        self.cri_target = deepcopy(self.cri) if self.if_use_cri_target else self.cri
        self.act_target = deepcopy(self.act) if self.if_use_act_target else self.act

        self.cri_optim = torch.optim.Adam(self.cri.parameters(), learning_rate)
        self.act_optim = torch.optim.Adam(self.act.parameters(), learning_rate) if self.ClassAct else self.cri

    def select_action(self, state):
        states = torch.as_tensor((state,), dtype=torch.float32, device=self.device)
        actions, noises = self.act.get_action(states)
        return actions[0].detach().cpu().numpy(), noises[0].detach().cpu().numpy()

    @staticmethod
    def get_llm_action(index):
        with open('data/llm_action.json', 'r') as file:
            data = json.load(file)
        data_tensor = torch.tensor(data, dtype=torch.float32)
        normalized_index = index % len(data_tensor)
        action = data_tensor[normalized_index].detach().cpu().numpy()
        return action

    def explore_env(self, env, target_step):
        state = self.state  # sent state to agent and then agent sent state to method
        trajectory_temp = list()
        last_done = 0
        for i in range(target_step):
            # action = self.get_llm_action(i)
            # noise = 0
            action, noise = self.select_action(state)
            state, next_state, reward, done, = env.step(np.tanh(action))  # make action between -1 & 1
            trajectory_temp.append((state, reward, done, action, noise))
            if done:
                state = env.reset()
                last_done = i
            else:
                state = next_state
        self.state = state

        '''splice list'''
        # store 0 trajectory information to list
        trajectory_list = self.trajectory_list + trajectory_temp[:last_done + 1]
        self.trajectory_list = trajectory_temp[last_done:]
        return trajectory_list

    def update_net(self, buffer, batch_size, repeat_times, soft_update_tau):
        """put data extract and update network together"""
        with torch.no_grad():
            buf_len = buffer[0].shape[0]
            # decompose buffer data
            buf_state, buf_action, buf_noise, buf_reward, buf_mask = [ten.to(self.device) for ten in buffer]

            '''get buf_r_sum, buf_logprob'''
            bs = 4096  # set a smaller 'BatchSize' when out of GPU memory: 1024, could change to 4096
            buf_value = [self.cri_target(buf_state[i:i + bs]) for i in range(0, buf_len, bs)]
            buf_value = torch.cat(buf_value, dim=0)
            buf_logprob = self.act.get_old_logprob(buf_action, buf_noise)

            buf_r_sum, buf_advantage = self.get_reward_sum(buf_len, buf_reward, buf_mask, buf_value)  # detach()
            # normalize advantage
            buf_advantage = (buf_advantage - buf_advantage.mean()) / (buf_advantage.std() + 1e-5)
            del buf_noise, buffer[:]

        '''PPO: Surrogate objective of Trust Region'''
        obj_critic = obj_actor = None
        for _ in range(int(buf_len / batch_size * repeat_times)):
            indices = torch.randint(buf_len, size=(batch_size,), requires_grad=False, device=self.device)

            state = buf_state[indices]
            action = buf_action[indices]
            r_sum = buf_r_sum[indices]
            logprob = buf_logprob[indices]
            advantage = buf_advantage[indices]

            new_logprob, obj_entropy = self.act.get_logprob_entropy(state, action)  # it is obj_actor
            ratio = (new_logprob - logprob.detach()).exp()
            surrogate1 = advantage * ratio
            surrogate2 = advantage * ratio.clamp(1 - self.ratio_clip, 1 + self.ratio_clip)
            obj_surrogate = -torch.min(surrogate1, surrogate2).mean()
            obj_actor = obj_surrogate + obj_entropy * self.lambda_entropy
            self.optim_update(self.act_optim, obj_actor)  # update actor

            value = self.cri(state).squeeze(1)  # critic network predicts the reward_sum (Q value) of state
            # use smoothloss L1 to evaluate the value loss
            # obj_critic = self.criterion(value, r_sum) / (r_sum.std() + 1e-6)
            obj_critic = self.criterion(value, r_sum)
            self.optim_update(self.cri_optim, obj_critic)  # calculate and update the back propagation of value loss
            # choose whether to use soft update
            self.soft_update(self.cri_target, self.cri, soft_update_tau) if self.cri_target is not self.cri else None

        a_std_log = getattr(self.act, 'a_std_log', torch.zeros(1))
        return obj_critic.item(), obj_actor.item(), a_std_log.mean().item()  # logging_tuple

    def get_reward_sum_raw(self, buf_len, buf_reward, buf_mask, buf_value) -> (torch.Tensor, torch.Tensor):
        buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device)  # reward sum

        pre_r_sum = 0
        for i in range(buf_len - 1, -1, -1):
            buf_r_sum[i] = buf_reward[i] + buf_mask[i] * pre_r_sum
            pre_r_sum = buf_r_sum[i]
        buf_advantage = buf_r_sum - (buf_mask * buf_value[:, 0])
        return buf_r_sum, buf_advantage

    def get_reward_sum_gae(self, buf_len, ten_reward, ten_mask, ten_value) -> (torch.Tensor, torch.Tensor):
        buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device)  # old policy value
        buf_advantage = torch.empty(buf_len, dtype=torch.float32, device=self.device)  # advantage value

        pre_r_sum = 0
        pre_advantage = 0  # advantage value of previous step
        for i in range(buf_len - 1, -1, -1):
            buf_r_sum[i] = ten_reward[i] + ten_mask[i] * pre_r_sum
            pre_r_sum = buf_r_sum[i]
            buf_advantage[i] = ten_reward[i] + ten_mask[i] * (pre_advantage - ten_value[i])  # fix a bug here
            pre_advantage = ten_value[i] + buf_advantage[i] * self.lambda_gae_adv
        return buf_r_sum, buf_advantage

    @staticmethod
    def optim_update(optimizer, objective):
        optimizer.zero_grad()
        objective.backward()
        optimizer.step()

    @staticmethod
    def soft_update(target_net, current_net, tau):
        for tar, cur in zip(target_net.parameters(), current_net.parameters()):
            tar.data.copy_(cur.data.__mul__(tau) + tar.data.__mul__(1.0 - tau))


class Arguments:
    def __init__(self, agent=None, env=None):
        self.agent = agent  # Deep Reinforcement Learning algorithm
        self.env = env  # the environment for training
        self.cwd = None  # current work directory. None means set automatically
        self.if_remove = False  # remove the cwd folder? (True, False, None:ask me)
        self.visible_gpu = '0'  # for example: os.environ['CUDA_VISIBLE_DEVICES'] = '0, 2,'
        # self.worker_num = 2  # rollout workers number per GPU (adjust it to get high GPU usage)
        self.num_threads = 32  # cpu_num for evaluate model, torch.set_num_threads(self.num_threads)

        '''Arguments for training'''
        self.num_episode = 1000  # to control the train episodes for PPO
        self.gamma = 0.995  # discount factor of future rewards
        self.learning_rate = 2 ** -14  # 2 ** -14 ~= 6e-5
        self.soft_update_tau = 2 ** -8  # 2 ** -8 ~= 5e-3

        self.net_dim = 256  # the network width
        self.batch_size = 4096  # num of transitions sampled from replay buffer.
        self.repeat_times = 2 ** 3  # collect target_step, then update network
        self.target_step = 4096  # repeatedly update network to keep critic's loss small
        self.max_memo = self.target_step  # capacity of replay buffer
        self.if_per_or_gae = False  # GAE for on-policy sparse reward: Generalized Advantage Estimation.

        '''Arguments for evaluate'''
        self.random_seed = 0  # initialize random seed in self.init_before_training()
        # self.random_seed_list = [1234, 2234, 3234, 4234, 5234]
        self.random_seed_list = [1234]
        self.train = True
        self.save_network = True
        self.test_network = True
        self.save_test_data = True
        self.compare_with_gurobi = True
        self.plot_on = True

    def init_before_training(self, if_main):
        if self.cwd is None:
            agent_name = self.agent.__class__.__name__
            self.cwd = f'./{agent_name}'

        if if_main:
            import shutil  # remove history according to bool(if_remove)
            if self.if_remove is None:
                self.if_remove = bool(input(f"| PRESS 'y' to REMOVE: {self.cwd}? ") == 'y')
            elif self.if_remove:
                shutil.rmtree(self.cwd, ignore_errors=True)
                print(f"| Remove cwd: {self.cwd}")
            os.makedirs(self.cwd, exist_ok=True)

        np.random.seed(self.random_seed)
        torch.manual_seed(self.random_seed)
        torch.set_num_threads(self.num_threads)
        torch.set_default_dtype(torch.float32)

        os.environ['CUDA_VISIBLE_DEVICES'] = str(self.visible_gpu)


def update_buffer(_trajectory):
    _trajectory = list(map(list, zip(*_trajectory)))  # 2D-list transpose, here cut the trajectory into 5 parts
    ten_state = torch.as_tensor(_trajectory[0])  # tensor state here
    ten_reward = torch.as_tensor(_trajectory[1], dtype=torch.float32)
    # _trajectory[2] = done, replace done by mask, save memory
    ten_mask = (1.0 - torch.as_tensor(_trajectory[2], dtype=torch.float32)) * gamma
    ten_action = torch.as_tensor(_trajectory[3])
    ten_noise = torch.as_tensor(_trajectory[4], dtype=torch.float32)

    buffer[:] = (ten_state, ten_action, ten_noise, ten_reward, ten_mask)  # list store tensors

    _steps = ten_reward.shape[0]  # how many steps are collected in all trajectories
    _r_exp = ten_reward.mean()  # the mean reward
    return _steps, _r_exp


def load_actions_from_json(file_path):
    with open(file_path, 'r') as file:
        actions = json.load(file)
    return actions


def simulate_with_llm_actions(env, llm_actions):
    states, actions, rewards, unbalances = [], [], [], []
    state = env.reset()

    for action in llm_actions:
        next_state, reward, done, info = env.step(action)
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        unbalances.append(info.get('unbalance', 0))
        state = next_state
        if done:
            break

    return states, actions, rewards, unbalances


if __name__ == '__main__':
    env = ESSEnv()
    llm_actions = load_actions_from_json('data/llm_action.json')
    states, actions, rewards, unbalances = simulate_with_llm_actions(env, llm_actions)

    fig, ax1 = plt.subplots()

    color = 'tab:blue'
    ax1.set_xlabel('Step')
    ax1.set_ylabel('Reward', color=color)
    ax1.plot(rewards, color=color)
    ax1.tick_params(axis='y', labelcolor=color)

    ax2 = ax1.twinx()
    color = 'tab:red'
    ax2.set_ylabel('Unbalance', color=color)
    ax2.plot(unbalances, color=color)
    ax2.tick_params(axis='y', labelcolor=color)

    fig.tight_layout()
    plt.title('Rewards and Unbalance over Steps')
    plt.show()
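Both get_llm_action() and load_actions_from_json() above read data/llm_action.json and index it step by step, so the file is evidently a JSON array of per-step action vectors. The sketch below shows one way such a file could be written; it is purely illustrative, and the action dimension and values are invented rather than taken from this repository.

import json
import os

# Hypothetical sketch: produce a data/llm_action.json in the shape that
# get_llm_action() / load_actions_from_json() above appear to expect, i.e. a
# JSON list of per-step action vectors. The dimension (4) and values are made up.
example_actions = [
    [0.10, -0.30, 0.50, 0.00],  # step 0
    [0.20, -0.10, 0.40, 0.10],  # step 1
    [0.00, 0.25, -0.15, 0.05],  # step 2
]
os.makedirs('data', exist_ok=True)
with open('data/llm_action.json', 'w') as f:
    json.dump(example_actions, f)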
test.py
@@ -39,17 +39,22 @@
# data = pickle.load(f)
#
# print(data)
-import json
-import numpy as np
+# import json
+# import numpy as np
+#
+#
+# def get_llm_action(index) -> np.ndarray:
+#     with open('data/llm_action.json', 'r') as file:
+#         data = json.load(file)
+#     normalized_index = index % len(data)
+#     action = np.array(data[normalized_index])
+#     return action
+#
+# data = get_llm_action(2)
+# print(data)
+from environment import ESSEnv


-def get_llm_action(index) -> np.ndarray:
-    with open('data/llm_action.json', 'r') as file:
-        data = json.load(file)
-    normalized_index = index % len(data)
-    action = np.array(data[normalized_index])
-    return action
-
-
-data = get_llm_action(2)
-print(data)
+env = ESSEnv()
+wind_speed = env.data_manager.get_series_wind_data(1, 1)
+wind = [env.wind.step(ws) for ws in wind_speed]
+print(wind)
tools.py
@@ -7,7 +7,7 @@ import torch
from gurobipy import GRB


-def optimization_base_result(env, month, day, initial_soc):
+def optimization_base_result(env,month,day,initial_soc):
    price = env.data_manager.get_series_price_data(month, day)
    load = env.data_manager.get_series_load_cons_data(month, day)
    temperature = env.data_manager.get_series_temperature_data(month, day)

@@ -50,7 +50,7 @@ def optimization_base_result(env, month, day, initial_soc):
    on_off = m.addVars(NUM_GEN, period, vtype=GRB.BINARY, name='on_off')
    gen_output = m.addVars(NUM_GEN, period, vtype=GRB.CONTINUOUS, name='output')
    # set the battery charge/discharge limits
-    battery_energy_change = m.addVars(period, vtype=GRB.CONTINUOUS, lb=env.battery.max_discharge,
+    battery_energy_change = m.addVars(period, vtype=GRB.CONTINUOUS, lb=-env.battery.max_charge,
                                      ub=env.battery.max_charge, name='battery_action')
    # set the limits on exchange between the external grid and the energy system
    grid_energy_import = m.addVars(period, vtype=GRB.CONTINUOUS, lb=0, ub=env.grid.exchange_ability, name='import')

@@ -86,8 +86,8 @@ def optimization_base_result(env, month, day, initial_soc):
    m.setObjective((cost_gen + cost_grid_import - cost_grid_export), GRB.MINIMIZE)
    m.optimize()

-    output_record = {'pv': [], 'temperature': [], 'irradiance': [], 'wind_speed': [], 'price': [], 'load': [],
-                     'netload': [], 'soc': [], 'battery_energy_change': [], 'grid_import': [], 'grid_export': [],
+    output_record = {'pv': [], 'wind': [], 'price': [], 'load': [], 'netload': [],
+                     'soc': [], 'battery_energy_change': [], 'grid_import': [], 'grid_export': [],
                     'gen1': [], 'gen2': [], 'gen3': [], 'step_cost': []}
    for t in range(period):
        gen_cost = sum((on_off[g, t].x * (

@@ -95,13 +95,11 @@ def optimization_base_result(env, month, day, initial_soc):
                       for g in range(NUM_GEN))
        grid_import_cost = grid_energy_import[t].x * price[t]
        grid_export_cost = grid_energy_export[t].x * price[t] * env.sell_coefficient
-        output_record['pv'].append(pv[t].x)
-        output_record['temperature'].append(temperature[t])
-        output_record['irradiance'].append(irradiance[t])
-        output_record['wind_speed'].append(wind[t])
+        output_record['pv'].append(pv[t])
+        output_record['wind'].append(wind[t])
        output_record['price'].append(price[t])
        output_record['load'].append(load[t])
-        output_record['netload'].append(load[t] - pv[t].x)
+        output_record['netload'].append(load[t] - pv[t])
        output_record['soc'].append(soc[t].x)
        output_record['battery_energy_change'].append(battery_energy_change[t].x)
        output_record['grid_import'].append(grid_energy_import[t].x)

@@ -110,7 +108,6 @@ def optimization_base_result(env, month, day, initial_soc):
        output_record['gen2'].append(gen_output[1, t].x)
        output_record['gen3'].append(gen_output[2, t].x)
        output_record['step_cost'].append(gen_cost + grid_import_cost - grid_export_cost)

    output_record_df = pd.DataFrame.from_dict(output_record)
    return output_record_df
@@ -0,0 +1,285 @@
import os
import gurobipy as gp
import numpy as np
import numpy.random as rd
import pandas as pd
import torch
from gurobipy import GRB


def optimization_base_result(env, month, day, initial_soc):
    price = env.data_manager.get_series_price_data(month, day)
    load = env.data_manager.get_series_load_cons_data(month, day)
    temperature = env.data_manager.get_series_temperature_data(month, day)
    irradiance = env.data_manager.get_series_irradiance_data(month, day)
    wind_speed = env.data_manager.get_series_wind_data(month, day)

    pv = [env.solar.step(temp, irr) for temp, irr in zip(temperature, irradiance)]
    wind = [env.wind.step(ws) for ws in wind_speed]

    period = env.episode_length
    DG_parameters = env.dg_parameters

    def get_dg_info(parameters):
        p_max = []
        p_min = []
        ramping_up = []
        ramping_down = []
        a_para = []
        b_para = []
        c_para = []

        for name, gen_info in parameters.items():
            p_max.append(gen_info['power_output_max'])
            p_min.append(gen_info['power_output_min'])
            ramping_up.append(gen_info['ramping_up'])
            ramping_down.append(gen_info['ramping_down'])
            a_para.append(gen_info['a'])
            b_para.append(gen_info['b'])
            c_para.append(gen_info['c'])
        return p_max, p_min, ramping_up, ramping_down, a_para, b_para, c_para

    p_max, p_min, ramping_up, ramping_down, a_para, b_para, c_para = get_dg_info(parameters=DG_parameters)
    NUM_GEN = len(DG_parameters.keys())
    battery_capacity = env.battery.capacity
    battery_efficiency = env.battery.efficiency

    m = gp.Model("UC")
    m.setParam('OutputFlag', 1)

    # set up the system variables
    on_off = m.addVars(NUM_GEN, period, vtype=GRB.BINARY, name='on_off')
    gen_output = m.addVars(NUM_GEN, period, vtype=GRB.CONTINUOUS, name='output')
    # set the battery charge/discharge limits
    battery_energy_change = m.addVars(period, vtype=GRB.CONTINUOUS, lb=env.battery.max_discharge,
                                      ub=env.battery.max_charge, name='battery_action')
    # set the limits on exchange between the external grid and the energy system
    grid_energy_import = m.addVars(period, vtype=GRB.CONTINUOUS, lb=0, ub=env.grid.exchange_ability, name='import')
    grid_energy_export = m.addVars(period, vtype=GRB.CONTINUOUS, lb=0, ub=env.grid.exchange_ability, name='export')
    soc = m.addVars(period, vtype=GRB.CONTINUOUS, lb=0.2, ub=0.8, name='SOC')

    # 1. add the power balance constraint
    m.addConstrs(((sum(gen_output[g, t] for g in range(NUM_GEN)) + pv[t] + wind[t] + grid_energy_import[t] >= load[t] +
                   battery_energy_change[t] + grid_energy_export[t]) for t in range(period)), name='powerbalance')
    # 2. add the generator maximum/minimum output constraints
    m.addConstrs((gen_output[g, t] <= on_off[g, t] * p_max[g] for g in range(NUM_GEN) for t in range(period)),
                 'gen_output_max')
    m.addConstrs((gen_output[g, t] >= on_off[g, t] * p_min[g] for g in range(NUM_GEN) for t in range(period)),
                 'gen_output_min')
    # 3. add the ramp-up and ramp-down constraints
    m.addConstrs((gen_output[g, t + 1] - gen_output[g, t] <= ramping_up[g] for g in range(NUM_GEN)
                  for t in range(period - 1)), 'ramping_up')
    m.addConstrs((gen_output[g, t] - gen_output[g, t + 1] <= ramping_down[g] for g in range(NUM_GEN)
                  for t in range(period - 1)), 'ramping_down')
    # 4. add the battery capacity constraints
    m.addConstr(battery_capacity * soc[0] == battery_capacity * initial_soc +
                (battery_energy_change[0] * battery_efficiency), name='soc0')
    m.addConstrs((battery_capacity * soc[t] == battery_capacity * soc[t - 1] +
                  (battery_energy_change[t] * battery_efficiency) for t in range(1, period)), name='soc update')
    # set the cost function
    # generator cost
    cost_gen = gp.quicksum(
        (a_para[g] * gen_output[g, t] * gen_output[g, t] + b_para[g] * gen_output[g, t] + c_para[g] * on_off[g, t]) for
        t in range(period) for g in range(NUM_GEN))
    cost_grid_import = gp.quicksum(grid_energy_import[t] * price[t] for t in range(period))
    cost_grid_export = gp.quicksum(grid_energy_export[t] * price[t] * env.sell_coefficient for t in range(period))

    m.setObjective((cost_gen + cost_grid_import - cost_grid_export), GRB.MINIMIZE)
    m.optimize()

    m.computeIIS()

    output_record = {'pv': [], 'wind': [], 'price': [], 'load': [],
                     'netload': [], 'soc': [], 'battery_energy_change': [], 'grid_import': [], 'grid_export': [],
                     'gen1': [], 'gen2': [], 'gen3': [], 'step_cost': []}
    for t in range(period):
        gen_cost = sum((on_off[g, t].x * (
                a_para[g] * gen_output[g, t].x * gen_output[g, t].x + b_para[g] * gen_output[g, t].x + c_para[g]))
                       for g in range(NUM_GEN))
        grid_import_cost = grid_energy_import[t].x * price[t]
        grid_export_cost = grid_energy_export[t].x * price[t] * env.sell_coefficient
        output_record['pv'].append(pv[t])
        # output_record['temperature'].append(temperature[t])
        # output_record['irradiance'].append(irradiance[t])
        output_record['wind'].append(wind[t])
        output_record['price'].append(price[t])
        output_record['load'].append(load[t])
        output_record['netload'].append(load[t] - pv[t])
        output_record['soc'].append(soc[t].x)
        output_record['battery_energy_change'].append(battery_energy_change[t].x)
        output_record['grid_import'].append(grid_energy_import[t].x)
        output_record['grid_export'].append(grid_energy_export[t].x)
        output_record['gen1'].append(gen_output[0, t].x)
        output_record['gen2'].append(gen_output[1, t].x)
        output_record['gen3'].append(gen_output[2, t].x)
        output_record['step_cost'].append(gen_cost + grid_import_cost - grid_export_cost)
    output_record_df = pd.DataFrame.from_dict(output_record)
    return output_record_df


class Arguments:
    """revise here for our own purpose"""

    def __init__(self, agent=None, env=None):
        self.agent = agent  # Deep Reinforcement Learning algorithm
        self.env = env  # the environment for training
        # self.plot_shadow_on = False  # control do we need to plot all shadow figures
        self.cwd = None  # current work directory. None means set automatically
        self.if_remove = False  # remove the cwd folder? (True, False, None:ask me)
        self.visible_gpu = '0,1,2,3'  # for example: os.environ['CUDA_VISIBLE_DEVICES'] = '0, 2,'
        # self.worker_num = 2  # rollout workers number per GPU (adjust it to get high GPU usage)
        self.num_threads = 32  # cpu_num for evaluate model, torch.set_num_threads(self.num_threads)

        '''Arguments for training'''
        self.num_episode = 1000
        self.gamma = 0.995  # discount factor of future rewards
        # self.reward_scale = 1  # an approximate target reward usually be closed to 256
        self.learning_rate = 2 ** -14  # 2 ** -14 ~= 6e-5
        self.soft_update_tau = 2 ** -8  # 2 ** -8 ~= 5e-3
        self.net_dim = 256  # the network width 256
        self.batch_size = 4096  # num of transitions sampled from replay buffer.
        self.repeat_times = 2 ** 5  # repeatedly update network to keep critic's loss small
        self.target_step = 4096  # collect target_step experiences, then update network, 1024
        self.max_memo = 500000  # capacity of replay buffer
        self.if_per_or_gae = False  # PER for off-policy sparse reward: Prioritized Experience Replay.

        '''Arguments for evaluate'''
        # self.eval_gap = 2 ** 6  # evaluate the agent per eval_gap seconds
        # self.eval_times = 2  # number of times that get episode return in first
        self.random_seed = 0  # initialize random seed in self.init_before_training()
        # self.random_seed_list = [1234, 2234, 3234, 4234, 5234]
        self.random_seed_list = [1234]
        '''Arguments for save and plot issues'''
        self.train = True
        self.save_network = True
        self.test_network = True
        self.save_test_data = True
        self.compare_with_gurobi = True
        self.plot_on = True

    def init_before_training(self, if_main):
        if self.cwd is None:
            agent_name = self.agent.__class__.__name__
            self.cwd = f'./{agent_name}'

        if if_main:
            import shutil  # remove history according to bool(if_remove)
            if self.if_remove is None:
                self.if_remove = bool(input(f"| PRESS 'y' to REMOVE: {self.cwd}? ") == 'y')
            elif self.if_remove:
                shutil.rmtree(self.cwd, ignore_errors=True)
                print(f"| Remove cwd: {self.cwd}")
            os.makedirs(self.cwd, exist_ok=True)

        np.random.seed(self.random_seed)
        torch.manual_seed(self.random_seed)
        torch.set_num_threads(self.num_threads)
        torch.set_default_dtype(torch.float32)

        os.environ['CUDA_VISIBLE_DEVICES'] = str(self.visible_gpu)  # control how many GPU is used


def test_one_episode(env, act, device):
    """to get evaluate information, here record the unbalance of after taking action"""
    record_state = []
    record_action = []
    record_reward = []
    record_output = []
    record_cost = []
    record_unbalance = []
    record_system_info = []  # [time price,netload,action,real action, output*4,soc,unbalance(exchange+penalty)]
    record_init_info = []  # include month, day, time, initial soc
    env.TRAIN = False
    state = env.reset()
    record_init_info.append([env.month, env.day, env.current_time, env.battery.current_capacity])
    print(f'current testing month is {env.month}, day is {env.day},initial_soc is {env.battery.current_capacity}')
    for i in range(24):
        s_tensor = torch.as_tensor((state,), device=device)
        a_tensor = act(s_tensor)
        action = a_tensor.detach().cpu().numpy()[0]  # not need detach(), because with torch.no_grad() outside
        real_action = action
        state, next_state, reward, done = env.step(action)

        record_system_info.append([state[0], state[1], state[3], action, real_action, env.battery.SOC(),
                                   env.battery.energy_change, next_state[4], next_state[5], next_state[6],
                                   next_state[7], next_state[8], env.unbalance, env.operation_cost])
        record_state.append(state)
        record_action.append(real_action)
        record_reward.append(reward)
        record_output.append(env.current_output)
        record_unbalance.append(env.unbalance)
        state = next_state
    # add information of last step dg1, dh2, dg3, soc
    record_system_info[-1][7:10] = [env.final_step_outputs[0], env.final_step_outputs[1], env.final_step_outputs[2]]
    record_system_info[-1][5] = env.final_step_outputs[3]
    record = {'init_info': record_init_info, 'system_info': record_system_info, 'state': record_state,
              'action': record_action, 'reward': record_reward, 'cost': record_cost, 'unbalance': record_unbalance,
              'record_output': record_output}
    return record


def get_episode_return(env, act, device):
    episode_return = 0.0  # sum of rewards in an episode
    episode_unbalance = 0.0
    state = env.reset()
    for i in range(24):
        s_tensor = torch.as_tensor((state,), device=device)
        a_tensor = act(s_tensor)
        action = a_tensor.detach().cpu().numpy()[0]  # not need detach(), because with torch.no_grad() outside
        state, next_state, reward, done, = env.step(action)
        state = next_state
        episode_return += reward
        episode_unbalance += env.real_unbalance
        if done:
            break
    return episode_return, episode_unbalance


class ReplayBuffer:
    def __init__(self, max_len, state_dim, action_dim, gpu_id=0):
        self.now_len = 0
        self.next_idx = 0
        self.if_full = False
        self.max_len = max_len
        self.data_type = torch.float32
        self.action_dim = action_dim
        self.device = torch.device(f"cuda:{gpu_id}" if (torch.cuda.is_available() and (gpu_id >= 0)) else "cpu")

        other_dim = 1 + 1 + self.action_dim
        self.buf_other = torch.empty(size=(max_len, other_dim), dtype=self.data_type, device=self.device)

        if isinstance(state_dim, int):  # state is pixel
            self.buf_state = torch.empty((max_len, state_dim), dtype=torch.float32, device=self.device)
        elif isinstance(state_dim, tuple):
            self.buf_state = torch.empty((max_len, *state_dim), dtype=torch.uint8, device=self.device)
        else:
            raise ValueError('state_dim')

    def extend_buffer(self, state, other):  # CPU array to CPU array
        size = len(other)
        next_idx = self.next_idx + size

        if next_idx > self.max_len:
            self.buf_state[self.next_idx:self.max_len] = state[:self.max_len - self.next_idx]
            self.buf_other[self.next_idx:self.max_len] = other[:self.max_len - self.next_idx]
            self.if_full = True

            next_idx = next_idx - self.max_len
            self.buf_state[0:next_idx] = state[-next_idx:]
            self.buf_other[0:next_idx] = other[-next_idx:]
        else:
            self.buf_state[self.next_idx:next_idx] = state
            self.buf_other[self.next_idx:next_idx] = other
        self.next_idx = next_idx

    def sample_batch(self, batch_size) -> tuple:
        indices = rd.randint(self.now_len - 1, size=batch_size)
        r_m_a = self.buf_other[indices]
        return (r_m_a[:, 0:1],
                r_m_a[:, 1:2],
                r_m_a[:, 2:],
                self.buf_state[indices],
                self.buf_state[indices + 1])

    def update_now_len(self):
        self.now_len = self.max_len if self.if_full else self.next_idx
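For reference, a minimal usage sketch of optimization_base_result() defined above. It assumes the ESSEnv from environment.py (used elsewhere in this commit) supplies the data_manager, battery, grid, dg_parameters and sell_coefficient attributes the function reads; the month, day and initial_soc values are arbitrary examples. Note that the function calls m.computeIIS() unconditionally after m.optimize(), which Gurobi only permits on an infeasible model, so this sketch may raise an error unless that line is guarded or removed.

from environment import ESSEnv

if __name__ == '__main__':
    # arbitrary example inputs; initial_soc chosen within the SOC bounds (0.2-0.8) used above
    env = ESSEnv()
    baseline_df = optimization_base_result(env, month=1, day=1, initial_soc=0.4)
    print(baseline_df[['soc', 'battery_energy_change', 'step_cost']].head())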