update environment and agents for improved performance

Refactors the environment and agent modules to improve performance and readability:
- Standardizing the argument order of solar step function calls across the environment variants.
- Introducing a percentage-based action voltage adjustment in the solar module's
  step function to enhance control flexibility (a sketch follows the Solar diff below).
- Updating PPO and the related agent scripts to use Generalized Advantage Estimation
  (GAE) by default, improving advantage estimation and training stability (a minimal GAE sketch follows the commit details below).
chenxiaodong 2024-07-09 10:49:26 +08:00
parent ed51c79123
commit 993e062068
9 changed files with 17 additions and 28 deletions
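For context on the GAE change, here is a minimal sketch of how GAE-style advantages are typically computed over one on-policy rollout. It illustrates the technique only; the function and variable names are assumptions, not code from this repository.

import torch

def compute_gae(rewards, values, masks, gamma=0.99, lam=0.95):
    """Illustrative Generalized Advantage Estimation over one rollout.

    rewards, values and masks are 1-D tensors of equal length; masks is 0.0
    where an episode terminated and 1.0 otherwise. values holds the critic's
    estimates V(s_t); a bootstrap value of 0 is assumed after the last step.
    """
    advantages = torch.zeros_like(rewards)
    gae = 0.0
    next_value = 0.0
    for t in reversed(range(len(rewards))):
        # TD residual: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = rewards[t] + gamma * next_value * masks[t] - values[t]
        # GAE recursion: A_t = delta_t + gamma * lambda * A_{t+1}
        gae = delta + gamma * lam * masks[t] * gae
        advantages[t] = gae
        next_value = values[t]
    returns = advantages + values  # regression targets for the critic
    return advantages, returns

In the PPO diffs below, setting if_gae_or_raw = True in Arguments and forwarding it to agent.init selects this kind of advantage estimate instead of raw discounted returns, which typically lowers the variance of the policy-gradient update.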

(file name not shown)

@@ -48,16 +48,14 @@ if __name__ == '__main__':
target_step = args.target_step # how many steps one episode should run before stopping
repeat_times = args.repeat_times # how many times to update the network per batch of data
soft_update_tau = args.soft_update_tau
# get the first experience from
agent.state = env.reset()
'''collect data and train and update network'''
num_episode = args.num_episode
agent.state = env.reset()
'''collect data, train and update network'''
# args.train=False
# args.save_network=False
# args.test_network=False
# args.save_test_data=False
# args.compare_with_gurobi=False
if args.train:
collect_data = True
while collect_data:

PPO.py (13 changed lines)

@@ -249,7 +249,7 @@ class Arguments:
self.repeat_times = 2 ** 3 # collect target_step, then update network
self.target_step = 4096 # repeatedly update network to keep critic's loss small
self.max_memo = self.target_step # capacity of replay buffer
self.if_per_or_gae = False # GAE for on-policy sparse reward: Generalized Advantage Estimation.
self.if_gae_or_raw = True # GAE for on-policy sparse reward: Generalized Advantage Estimation.
'''Arguments for evaluate'''
self.random_seed = 0 # initialize random seed in self.init_before_training()
@@ -270,7 +270,7 @@ class Arguments:
if if_main:
import shutil # remove history according to bool(if_remove)
if self.if_remove is None:
self.if_remove = bool(input(f"| PRESS 'y' to REMOVE: {self.cwd}? ") == 'y')
self.if_remove = (input(f"| PRESS 'y' to REMOVE: {self.cwd}? ") == 'y')
elif self.if_remove:
shutil.rmtree(self.cwd, ignore_errors=True)
print(f"| Remove cwd: {self.cwd}")
@@ -280,7 +280,6 @@ class Arguments:
torch.manual_seed(self.random_seed)
torch.set_num_threads(self.num_threads)
torch.set_default_dtype(torch.float32)
os.environ['CUDA_VISIBLE_DEVICES'] = str(self.visible_gpu)
@@ -308,7 +307,6 @@ if __name__ == '__main__':
for seed in args.random_seed_list:
args.random_seed = seed
args.agent = AgentPPO()
agent_name = f'{args.agent.__class__.__name__}'
args.agent.cri_target = True
args.env = ESSEnv()
@@ -317,19 +315,18 @@ if __name__ == '__main__':
agent = args.agent
env = args.env
agent.init(args.net_dim, env.state_space.shape[0], env.action_space.shape[0], args.learning_rate,
args.if_per_or_gae, layer_norm=True)
args.if_gae_or_raw, layer_norm=True)
cwd = args.cwd
gamma = args.gamma
batch_size = args.batch_size # how much data should be used to update net
target_step = args.target_step # how many steps one episode should run before stopping
repeat_times = args.repeat_times # how many times to update the network per batch of data
soft_update_tau = args.soft_update_tau
num_episode = args.num_episode
agent.state = env.reset()
'''init buffer'''
buffer = list()
'''init training parameters'''
num_episode = args.num_episode
# args.train = False
# args.save_network = False
# args.test_network = False
@@ -388,7 +385,7 @@ if __name__ == '__main__':
from plotDRL import PlotArgs, make_dir, plot_evaluation_information, plot_optimization_result
plot_args = PlotArgs()
plot_args.feature_change = ''
plot_args.feature_change = 'gae'
args.cwd = agent_name
plot_dir = make_dir(args.cwd, plot_args.feature_change)
plot_optimization_result(base_result, plot_dir)

(file name not shown)

@@ -249,7 +249,7 @@ class Arguments:
self.repeat_times = 2 ** 3 # collect target_step, then update network
self.target_step = 4096 # repeatedly update network to keep critic's loss small
self.max_memo = self.target_step # capacity of replay buffer
self.if_per_or_gae = False # GAE for on-policy sparse reward: Generalized Advantage Estimation.
self.if_gae_or_raw = True # GAE for on-policy sparse reward: Generalized Advantage Estimation.
'''Arguments for evaluate'''
self.random_seed = 0 # initialize random seed in self.init_before_training()
@@ -317,7 +317,7 @@ if __name__ == '__main__':
agent = args.agent
env = args.env
agent.init(args.net_dim, env.state_space.shape[0], env.action_space.shape[0], args.learning_rate,
args.if_per_or_gae, layer_norm=True)
args.if_gae_or_raw, layer_norm=True)
cwd = args.cwd
gamma = args.gamma
@@ -389,7 +389,7 @@ if __name__ == '__main__':
from plotDRL import PlotArgs, make_dir, plot_evaluation_information, plot_optimization_result
plot_args = PlotArgs()
plot_args.feature_change = 'llm_5_no_mean'
plot_args.feature_change = 'llm'
args.cwd = agent_name
plot_dir = make_dir(args.cwd, plot_args.feature_change)
plot_optimization_result(base_result, plot_dir)

SAC.py (9 changed lines)

@@ -47,20 +47,15 @@ if __name__ == '__main__':
target_step = args.target_step # how many steps one episode should run before stopping
repeat_times = args.repeat_times # how many times to update the network per batch of data
soft_update_tau = args.soft_update_tau
agent.state = env.reset()
'''collect data and train and update network'''
num_episode = args.num_episode
'''here record real unbalance'''
##
args.train = False
args.save_network = False
# args.train = False
# args.save_network = False
# args.test_network = False
# args.save_test_data = False
# args.compare_with_gurobi = False
#
if args.train:
collect_data = True
while collect_data:

TD3.py (2 changed lines)

@@ -47,11 +47,9 @@ if __name__ == '__main__':
target_step = args.target_step # how many steps one episode should run before stopping
repeat_times = args.repeat_times # how many times to update the network per batch of data
soft_update_tau = args.soft_update_tau
agent.state = env.reset()
'''collect data and train and update network'''
num_episode = args.num_episode
# args.train=False
# args.save_network=False
# args.test_network=False

(file name not shown)

@@ -90,7 +90,7 @@ class ESSEnv(gym.Env):
self.dg1.step(action[1])
self.dg2.step(action[2])
self.dg3.step(action[3])
self.solar.step(action[4], temperature, irradiance)
self.solar.step(temperature, irradiance, action[4])
self.wind.step(wind_speed)
self.current_output = np.array((self.dg1.current_output, self.dg2.current_output, self.dg3.current_output,
-self.battery.energy_change))

(file name not shown)

@@ -93,7 +93,7 @@ class ESSEnv(gym.Env):
self.dg1.step(action[1])
self.dg2.step(action[2])
self.dg3.step(action[3])
self.solar.step(action[4], temperature, irradiance)
self.solar.step(temperature, irradiance, action[4])
self.wind.step(wind_speed)
self.current_output = np.array((self.dg1.current_output, self.dg2.current_output, self.dg3.current_output,
-self.battery.energy_change))

(file name not shown)

@@ -85,7 +85,7 @@ class ESSEnv(gym.Env):
self.dg1.step(action[1])
self.dg2.step(action[2])
self.dg3.step(action[3])
self.solar.step(action[4], temperature, irradiance)
self.solar.step(temperature, irradiance, action[4])
self.wind.step(wind_speed)
self.current_output = np.array((self.dg1.current_output, self.dg2.current_output, self.dg3.current_output,
-self.battery.energy_change, self.solar.current_power, self.wind.current_power))
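All three environment variants receive the same reordering. With the new Solar.step(temperature, irradiance, action_voltage=0) signature shown in the next file, the old call order would silently pass the action component in as the temperature, so every call site now passes the weather inputs first and the optional voltage action last. A sketch of the dispatch pattern after this commit (the helper is hypothetical; the unit names and call order come from the diffs above):

def dispatch_units(env, action, temperature, irradiance, wind_speed):
    """Hypothetical helper mirroring the inline dispatch code in ESSEnv.step."""
    env.dg1.step(action[1])
    env.dg2.step(action[2])
    env.dg3.step(action[3])
    # Old order (pre-commit): env.solar.step(action[4], temperature, irradiance)
    # New standardized order: weather inputs first, voltage action last.
    env.solar.step(temperature, irradiance, action[4])
    env.wind.step(wind_speed)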

(file name not shown)

@@ -81,13 +81,14 @@ class Solar:
self.opex_cofficient = parameters['O_c']
self.refer_irradiance = parameters['I_ref']
self.refer_temperature = parameters['T_ref']
self.change_percent = parameters['C_per']
def step(self, temperature, irradiance, action_voltage=0):
I_sc = self.sc_current * (irradiance / self.refer_irradiance)
V_oc = self.oc_voltage + self.temper_coefficient * (temperature - self.refer_temperature)
current = I_sc - (V_oc / self.sh_resistance)
self.current_power = max((1 + action_voltage) * self.base_voltage * current, 0)
self.current_power = max((1 + self.change_percent * action_voltage) * self.base_voltage * current, 0)
return self.current_power
def get_cost(self, current_power):
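To make the percentage-based adjustment concrete, here is a standalone sketch of the updated computation; the numeric defaults are illustrative assumptions, and only the formula mirrors the diff above.

def solar_power(temperature, irradiance, action_voltage,
                sc_current=8.0, oc_voltage=30.0, base_voltage=24.0,
                temper_coefficient=-0.12, sh_resistance=100.0,
                refer_irradiance=1000.0, refer_temperature=25.0,
                change_percent=0.1):
    """Standalone version of Solar.step with the percentage-based action."""
    I_sc = sc_current * (irradiance / refer_irradiance)
    V_oc = oc_voltage + temper_coefficient * (temperature - refer_temperature)
    current = I_sc - (V_oc / sh_resistance)
    # Before this commit the action scaled the voltage directly:
    #     (1 + action_voltage) * base_voltage
    # Now the action shifts the voltage by at most change_percent (C_per):
    return max((1 + change_percent * action_voltage) * base_voltage * current, 0)

# With change_percent = 0.1, an action of +1.0 raises the operating voltage by 10%
# instead of doubling it as before.
print(solar_power(temperature=30.0, irradiance=800.0, action_voltage=1.0))

Since action_voltage defaults to 0 in the new signature, calling step with only temperature and irradiance reproduces the unadjusted operating point.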