update environment and agents for improved performance

Refactors the environment and agent modules to improve performance and readability:
- Standardizing the argument order of solar step function calls across the environment variants.
- Introducing a percentage-based action voltage adjustment in the solar module's
  step function to enhance control flexibility (a sketch follows the Solar diff below).
- Updating PPO and the related agent scripts to use Generalized Advantage Estimation
  (GAE) by default, improving advantage estimation and training stability (a minimal GAE sketch follows the commit details below).
chenxiaodong 2024-07-09 10:49:26 +08:00
parent ed51c79123
commit 993e062068
9 changed files with 17 additions and 28 deletions
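For context on the GAE change, here is a minimal sketch of how GAE-style advantages are typically computed over one on-policy rollout. It illustrates the technique only; the function and variable names are assumptions, not code from this repository.

import torch

def compute_gae(rewards, values, masks, gamma=0.99, lam=0.95):
    """Illustrative Generalized Advantage Estimation over one rollout.

    rewards, values and masks are 1-D tensors of equal length; masks is 0.0
    where an episode terminated and 1.0 otherwise. values holds the critic's
    estimates V(s_t); a bootstrap value of 0 is assumed after the last step.
    """
    advantages = torch.zeros_like(rewards)
    gae = 0.0
    next_value = 0.0
    for t in reversed(range(len(rewards))):
        # TD residual: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = rewards[t] + gamma * next_value * masks[t] - values[t]
        # GAE recursion: A_t = delta_t + gamma * lambda * A_{t+1}
        gae = delta + gamma * lam * masks[t] * gae
        advantages[t] = gae
        next_value = values[t]
    returns = advantages + values  # regression targets for the critic
    return advantages, returns

In the PPO diffs below, setting if_gae_or_raw = True in Arguments and forwarding it to agent.init selects this kind of advantage estimate instead of raw discounted returns, which typically lowers the variance of the policy-gradient update.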

(file name not shown)

@@ -48,16 +48,14 @@ if __name__ == '__main__':
target_step = args.target_step # how many steps one episode should run before stopping
repeat_times = args.repeat_times # how many times to update the network per batch of data
soft_update_tau = args.soft_update_tau
# get the first experience from
agent.state = env.reset()
'''collect data and train and update network'''
num_episode = args.num_episode
agent.state = env.reset()
'''collect data, train and update network'''
# args.train=False
# args.save_network=False
# args.test_network=False
# args.save_test_data=False
# args.compare_with_gurobi=False
if args.train:
collect_data = True
while collect_data:

PPO.py (13 changed lines)

@@ -249,7 +249,7 @@ class Arguments:
self.repeat_times = 2 ** 3 # collect target_step, then update network
self.target_step = 4096 # repeatedly update network to keep critic's loss small
self.max_memo = self.target_step # capacity of replay buffer
self.if_per_or_gae = False # GAE for on-policy sparse reward: Generalized Advantage Estimation.
self.if_gae_or_raw = True # GAE for on-policy sparse reward: Generalized Advantage Estimation.
'''Arguments for evaluate'''
self.random_seed = 0 # initialize random seed in self.init_before_training()
@@ -270,7 +270,7 @@ class Arguments:
if if_main:
import shutil # remove history according to bool(if_remove)
if self.if_remove is None:
self.if_remove = bool(input(f"| PRESS 'y' to REMOVE: {self.cwd}? ") == 'y')
self.if_remove = (input(f"| PRESS 'y' to REMOVE: {self.cwd}? ") == 'y')
elif self.if_remove:
shutil.rmtree(self.cwd, ignore_errors=True)
print(f"| Remove cwd: {self.cwd}")
@@ -280,7 +280,6 @@ class Arguments:
torch.manual_seed(self.random_seed)
torch.set_num_threads(self.num_threads)
torch.set_default_dtype(torch.float32)
os.environ['CUDA_VISIBLE_DEVICES'] = str(self.visible_gpu)
@@ -308,7 +307,6 @@ if __name__ == '__main__':
for seed in args.random_seed_list:
args.random_seed = seed
args.agent = AgentPPO()
agent_name = f'{args.agent.__class__.__name__}'
args.agent.cri_target = True
args.env = ESSEnv()
@@ -317,19 +315,18 @@ if __name__ == '__main__':
agent = args.agent
env = args.env
agent.init(args.net_dim, env.state_space.shape[0], env.action_space.shape[0], args.learning_rate,
args.if_per_or_gae, layer_norm=True)
args.if_gae_or_raw, layer_norm=True)
cwd = args.cwd
gamma = args.gamma
batch_size = args.batch_size # how much data should be used to update net
target_step = args.target_step # how many steps one episode should run before stopping
repeat_times = args.repeat_times # how many times to update the network per batch of data
soft_update_tau = args.soft_update_tau
num_episode = args.num_episode
agent.state = env.reset()
'''init buffer'''
buffer = list()
'''init training parameters'''
num_episode = args.num_episode
# args.train = False
# args.save_network = False
# args.test_network = False
@@ -388,7 +385,7 @@ if __name__ == '__main__':
from plotDRL import PlotArgs, make_dir, plot_evaluation_information, plot_optimization_result
plot_args = PlotArgs()
plot_args.feature_change = ''
plot_args.feature_change = 'gae'
args.cwd = agent_name
plot_dir = make_dir(args.cwd, plot_args.feature_change)
plot_optimization_result(base_result, plot_dir)

(file name not shown)

@@ -249,7 +249,7 @@ class Arguments:
self.repeat_times = 2 ** 3 # collect target_step, then update network
self.target_step = 4096 # repeatedly update network to keep critic's loss small
self.max_memo = self.target_step # capacity of replay buffer
self.if_per_or_gae = False # GAE for on-policy sparse reward: Generalized Advantage Estimation.
self.if_gae_or_raw = True # GAE for on-policy sparse reward: Generalized Advantage Estimation.
'''Arguments for evaluate'''
self.random_seed = 0 # initialize random seed in self.init_before_training()
@@ -317,7 +317,7 @@ if __name__ == '__main__':
agent = args.agent
env = args.env
agent.init(args.net_dim, env.state_space.shape[0], env.action_space.shape[0], args.learning_rate,
args.if_per_or_gae, layer_norm=True)
args.if_gae_or_raw, layer_norm=True)
cwd = args.cwd
gamma = args.gamma
@@ -389,7 +389,7 @@ if __name__ == '__main__':
from plotDRL import PlotArgs, make_dir, plot_evaluation_information, plot_optimization_result
plot_args = PlotArgs()
plot_args.feature_change = 'llm_5_no_mean'
plot_args.feature_change = 'llm'
args.cwd = agent_name
plot_dir = make_dir(args.cwd, plot_args.feature_change)
plot_optimization_result(base_result, plot_dir)

SAC.py (9 changed lines)

@@ -47,20 +47,15 @@ if __name__ == '__main__':
target_step = args.target_step # how many steps one episode should run before stopping
repeat_times = args.repeat_times # how many times to update the network per batch of data
soft_update_tau = args.soft_update_tau
agent.state = env.reset()
'''collect data and train and update network'''
num_episode = args.num_episode
'''here record real unbalance'''
##
args.train = False
args.save_network = False
# args.train = False
# args.save_network = False
# args.test_network = False
# args.save_test_data = False
# args.compare_with_gurobi = False
#
if args.train:
collect_data = True
while collect_data:

TD3.py (2 changed lines)

@@ -47,11 +47,9 @@ if __name__ == '__main__':
target_step = args.target_step # how many steps one episode should run before stopping
repeat_times = args.repeat_times # how many times to update the network per batch of data
soft_update_tau = args.soft_update_tau
agent.state = env.reset()
'''collect data and train and update network'''
num_episode = args.num_episode
# args.train=False
# args.save_network=False
# args.test_network=False

(file name not shown)

@@ -90,7 +90,7 @@ class ESSEnv(gym.Env):
self.dg1.step(action[1])
self.dg2.step(action[2])
self.dg3.step(action[3])
self.solar.step(action[4], temperature, irradiance)
self.solar.step(temperature, irradiance, action[4])
self.wind.step(wind_speed)
self.current_output = np.array((self.dg1.current_output, self.dg2.current_output, self.dg3.current_output,
-self.battery.energy_change))

(file name not shown)

@@ -93,7 +93,7 @@ class ESSEnv(gym.Env):
self.dg1.step(action[1])
self.dg2.step(action[2])
self.dg3.step(action[3])
self.solar.step(action[4], temperature, irradiance)
self.solar.step(temperature, irradiance, action[4])
self.wind.step(wind_speed)
self.current_output = np.array((self.dg1.current_output, self.dg2.current_output, self.dg3.current_output,
-self.battery.energy_change))

(file name not shown)

@@ -85,7 +85,7 @@ class ESSEnv(gym.Env):
self.dg1.step(action[1])
self.dg2.step(action[2])
self.dg3.step(action[3])
self.solar.step(action[4], temperature, irradiance)
self.solar.step(temperature, irradiance, action[4])
self.wind.step(wind_speed)
self.current_output = np.array((self.dg1.current_output, self.dg2.current_output, self.dg3.current_output,
-self.battery.energy_change, self.solar.current_power, self.wind.current_power))
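All three environment variants receive the same reordering. With the new Solar.step(temperature, irradiance, action_voltage=0) signature shown in the next file, the old call order would silently pass the action component in as the temperature, so every call site now passes the weather inputs first and the optional voltage action last. A sketch of the dispatch pattern after this commit (the helper is hypothetical; the unit names and call order come from the diffs above):

def dispatch_units(env, action, temperature, irradiance, wind_speed):
    """Hypothetical helper mirroring the inline dispatch code in ESSEnv.step."""
    env.dg1.step(action[1])
    env.dg2.step(action[2])
    env.dg3.step(action[3])
    # Old order (pre-commit): env.solar.step(action[4], temperature, irradiance)
    # New standardized order: weather inputs first, voltage action last.
    env.solar.step(temperature, irradiance, action[4])
    env.wind.step(wind_speed)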

(file name not shown)

@@ -81,13 +81,14 @@ class Solar:
self.opex_cofficient = parameters['O_c']
self.refer_irradiance = parameters['I_ref']
self.refer_temperature = parameters['T_ref']
self.change_percent = parameters['C_per']
def step(self, temperature, irradiance, action_voltage=0):
I_sc = self.sc_current * (irradiance / self.refer_irradiance)
V_oc = self.oc_voltage + self.temper_coefficient * (temperature - self.refer_temperature)
current = I_sc - (V_oc / self.sh_resistance)
self.current_power = max((1 + action_voltage) * self.base_voltage * current, 0)
self.current_power = max((1 + self.change_percent * action_voltage) * self.base_voltage * current, 0)
return self.current_power
def get_cost(self, current_power):
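To make the percentage-based adjustment concrete, here is a standalone sketch of the updated computation; the numeric defaults are illustrative assumptions, and only the formula mirrors the diff above.

def solar_power(temperature, irradiance, action_voltage,
                sc_current=8.0, oc_voltage=30.0, base_voltage=24.0,
                temper_coefficient=-0.12, sh_resistance=100.0,
                refer_irradiance=1000.0, refer_temperature=25.0,
                change_percent=0.1):
    """Standalone version of Solar.step with the percentage-based action."""
    I_sc = sc_current * (irradiance / refer_irradiance)
    V_oc = oc_voltage + temper_coefficient * (temperature - refer_temperature)
    current = I_sc - (V_oc / sh_resistance)
    # Before this commit the action scaled the voltage directly:
    #     (1 + action_voltage) * base_voltage
    # Now the action shifts the voltage by at most change_percent (C_per):
    return max((1 + change_percent * action_voltage) * base_voltage * current, 0)

# With change_percent = 0.1, an action of +1.0 raises the operating voltage by 10%
# instead of doubling it as before.
print(solar_power(temperature=30.0, irradiance=800.0, action_voltage=1.0))

Since action_voltage defaults to 0 in the new signature, calling step with only temperature and irradiance reproduces the unadjusted operating point.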