This commit is contained in:
chenxiaodong 2024-08-22 10:06:26 +08:00
parent 94f447c711
commit 012e4e0fd4
22 changed files with 4143 additions and 5928 deletions

147
7.24.py
View File

@ -1,147 +0,0 @@
import numpy as np
import pandas as pd
import time
import os
import json
from concurrent.futures import ThreadPoolExecutor
from deap import base, creator, tools, algorithms
def fitness_numpy(individual, price, load, temperature, irradiance, wind_speed, prev_soc, prev_Pg1, prev_Pg2, prev_Pg3):
individual = np.array(individual)
price = np.array(price)
load = np.array(load)
temperature = np.array(temperature)
irradiance = np.array(irradiance)
wind_speed = np.array(wind_speed)
Ac, Ag1, Ag2, Ag3, Av = individual[:5]
soc = np.clip(prev_soc + 0.2 * Ac * 0.9, 0.2, 0.8)
Pg1 = np.clip(prev_Pg1 + 100 * Ag1, 0, 150)
Pg2 = np.clip(prev_Pg2 + 100 * Ag2, 0, 375)
Pg3 = np.clip(prev_Pg3 + 200 * Ag3, 0, 500)
Pso = np.clip((0.2 * irradiance + 0.05 * temperature - 9.25) * (1 + Av), 0, None)
Pw = np.where((wind_speed >= 3) & (wind_speed < 8), wind_speed ** 3 * 172.2625 / 1000,
np.where((wind_speed >= 8) & (wind_speed < 12), 64 * 172.2625 / 125, 0))
P = Ac + Pg1 + Pg2 + Pg3 + Pso + Pw
Ee = np.where(P >= load, P - load, 0)
Es = np.where(P < load, load - P, 0)
Cb = 0.01 * Ac + 0.1 * soc
Cg1 = 0.0034 * Pg1 ** 2 + 3 * Pg1 + 30
Cg2 = 0.001 * Pg2 ** 2 + 10 * Pg2 + 40
Cg3 = 0.001 * Pg3 ** 2 + 15 * Pg3 + 70
Cs = 0.01 * Pso
Cw = 0.01 * Pw
Rs = 0.5 * price * Ee
Cp = price * Es
Pe = np.where(Ee > 100, (Ee - 100) * 50, 0)
Ps = np.where(Es > 100, (Es - 100) * 50, 0)
total_cost = np.sum(Cb + Cg1 + Cg2 + Cg3 + Cs + Cw - Rs + Cp + Pe + Ps)
reward = -total_cost / 1000
return (reward,)
def check_bounds(func):
def wrapper(*args, **kwargs):
offspring = func(*args, **kwargs)
for individual in offspring:
for i in range(len(individual)):
individual[i] = np.clip(individual[i], -1, 1)
return offspring
return wrapper
def main():
data = pd.read_csv('./data.csv')
price = data['price'].values
load = data['load'].values
temperature = data['temperature'].values
irradiance = data['irradiance'].values
wind_speed = data['wind_speed'].values
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)
toolbox = base.Toolbox()
toolbox.register("attr_float", np.random.uniform, -1, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_float, n=5)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("mate", check_bounds(tools.cxBlend), alpha=0.5)
toolbox.register("mutate", check_bounds(tools.mutGaussian), mu=0, sigma=0.1, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("evaluate", fitness_numpy)
population = toolbox.population(n=500)
prev_soc, prev_Pg1, prev_Pg2, prev_Pg3 = 0.4, 0.0, 0.0, 0.0
decision_values = []
for hour in range(8760):
start = time.time()
current_price = price[hour]
current_load = load[hour]
current_temperature = temperature[hour]
current_irradiance = irradiance[hour]
current_wind_speed = wind_speed[hour]
for gen in range(500):
offspring = toolbox.select(population, len(population))
offspring = list(map(toolbox.clone, offspring))
for child1, child2 in zip(offspring[::2], offspring[1::2]):
if np.random.rand() < 0.7:
toolbox.mate(child1, child2)
del child1.fitness.values
del child2.fitness.values
for mutant in offspring:
if np.random.rand() < 0.2:
toolbox.mutate(mutant)
del mutant.fitness.values
invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
with ThreadPoolExecutor() as executor:
futures = []
batch_size = 500
for i in range(0, len(invalid_ind), batch_size):
batch = invalid_ind[i:i + batch_size]
for ind in batch:
futures.append(
executor.submit(toolbox.evaluate, ind, current_price, current_load, current_temperature,
current_irradiance, current_wind_speed, prev_soc, prev_Pg1, prev_Pg2,
prev_Pg3))
# assign each individual the fitness returned by its own future
for ind, future in zip(invalid_ind, futures):
ind.fitness.values = future.result()
population[:] = offspring
end = time.time()
best_ind = tools.selBest(population, 1)[0]
print(f'Best individual at hour {hour + 1}: {best_ind}')
print(f'Fitness: {best_ind.fitness.values}, using {end - start}s')
decision_values.append({
'Ac': best_ind[0],
'Ag1': best_ind[1],
'Ag2': best_ind[2],
'Ag3': best_ind[3],
'Av': best_ind[4]
})
prev_soc, prev_Pg1, prev_Pg2, prev_Pg3 = best_ind[0], best_ind[1], best_ind[2], best_ind[3]
with open('decision_values.json', 'w') as f:
json.dump(decision_values, f)
if __name__ == "__main__":
main()
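Note: the hourly loop above feeds the raw best-individual values straight back in as prev_soc / prev_Pg1..3. A minimal sketch of an alternative hand-off, assuming the intent is to carry forward the clipped physical quantities that fitness_numpy computes (the helper next_state below is hypothetical, not part of the original script):

def next_state(best_ind, prev_soc, prev_Pg1, prev_Pg2, prev_Pg3):
    # Re-derive the SOC and generator outputs implied by the chosen actions,
    # mirroring the update rules inside fitness_numpy.
    Ac, Ag1, Ag2, Ag3, _ = best_ind[:5]
    soc = np.clip(prev_soc + 0.2 * Ac * 0.9, 0.2, 0.8)
    Pg1 = np.clip(prev_Pg1 + 100 * Ag1, 0, 150)
    Pg2 = np.clip(prev_Pg2 + 100 * Ag2, 0, 375)
    Pg3 = np.clip(prev_Pg3 + 200 * Ag3, 0, 500)
    return soc, Pg1, Pg2, Pg3

# usage inside the hourly loop:
# prev_soc, prev_Pg1, prev_Pg2, prev_Pg3 = next_state(best_ind, prev_soc, prev_Pg1, prev_Pg2, prev_Pg3)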

[File diff suppressed because it is too large]
[Image changed: 142 KiB → 145 KiB]

[File diff suppressed because it is too large]
[Image changed: 126 KiB → 125 KiB]

View File

@ -83,7 +83,6 @@ if __name__ == '__main__':
steps, r_exp = update_buffer(trajectory)
loss_record_path = f'{args.cwd}/loss_data.pkl'
reward_record_path = f'{args.cwd}/reward_data.pkl'
# currently only the actor corresponding to the last seed is stored
act_save_path = f'{args.cwd}/actor.pth'
with open(loss_record_path, 'wb') as tf:
@ -118,13 +117,13 @@ if __name__ == '__main__':
from plotDRL import PlotArgs, make_dir, plot_evaluation_information, plot_optimization_result
plot_args = PlotArgs()
plot_args.feature_change = ''
plot_args.feature_change = '730'
args.cwd = agent_name
plot_dir = make_dir(args.cwd, plot_args.feature_change)
plot_optimization_result(base_result, plot_dir)
plot_evaluation_information(args.cwd + '/' + 'test_data.pkl', plot_dir)
'''compare the costs obtained from gurobi and DDPG'''
ratio = sum(eval_data['operation_cost']) / sum(base_result['step_cost'])
print('operation_cost_sum:', sum(eval_data['operation_cost']))
print('step_cost_sum:', sum(base_result['step_cost']))
print('rl_cost:', sum(eval_data['operation_cost']))
print('gurobi_cost:', sum(base_result['step_cost']))
print('ratio:', ratio)

10
PPO.py
View File

@ -260,7 +260,6 @@ class Arguments:
self.cwd = None # current work directory. None means set automatically
self.if_remove = False # remove the cwd folder? (True, False, None:ask me)
self.visible_gpu = '0' # for example: os.environ['CUDA_VISIBLE_DEVICES'] = '0, 2,'
# self.worker_num = 2 # number of rollout workers per GPU (adjust it to get high GPU usage)
self.num_threads = 32 # cpu_num for evaluate model, torch.set_num_threads(self.num_threads)
'''Arguments for training'''
@ -268,7 +267,6 @@ class Arguments:
self.gamma = 0.995 # discount factor of future rewards
self.learning_rate = 2 ** -14 # about 6e-5
self.soft_update_tau = 2 ** -8 # about 4e-3
self.net_dim = 256 # the network width
self.batch_size = 4096 # num of transitions sampled from replay buffer.
self.repeat_times = 2 ** 3 # collect target_step, then update network
@ -352,8 +350,8 @@ if __name__ == '__main__':
'''init buffer'''
buffer = list()
'''init training parameters'''
# args.train = False
# args.save_network = False
args.train = False
args.save_network = False
# args.test_network = False
# args.save_test_data = False
# args.compare_with_gurobi = False
@ -417,6 +415,6 @@ if __name__ == '__main__':
plot_evaluation_information(args.cwd + '/' + 'test_data.pkl', plot_dir)
'''compare the costs obtained from gurobi and PPO'''
ratio = sum(eval_data['operation_cost']) / sum(base_result['step_cost'])
print('operation_cost_sum:', sum(eval_data['operation_cost']))
print('step_cost_sum:', sum(base_result['step_cost']))
print('rl_cost:', sum(eval_data['operation_cost']))
print('gurobi_cost:', sum(base_result['step_cost']))
print('ratio:', ratio)

View File

@ -1,428 +0,0 @@
import os
import json
import pickle
from copy import deepcopy
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from environment import ESSEnv
from tools import get_episode_return, test_one_episode, optimization_base_result
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
class ActorPPO(nn.Module):
def __init__(self, mid_dim, state_dim, action_dim, layer_norm=False):
super().__init__()
self.layer_norm = layer_norm
self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
nn.Linear(mid_dim, action_dim))
# the logarithm (log) of standard deviation (std) of action, it is a trainable parameter
self.a_logstd = nn.Parameter(torch.zeros((1, action_dim)) - 0.5, requires_grad=True)
self.sqrt_2pi_log = np.log(np.sqrt(2 * np.pi))
if self.layer_norm:
self.apply_layer_norm()
def apply_layer_norm(self):
def init_weights(layer):
if isinstance(layer, nn.Linear):
nn.init.orthogonal_(layer.weight, 1.0)
nn.init.constant_(layer.bias, 0.0)
self.net.apply(init_weights)
def forward(self, state):
return self.net(state).tanh() # tanh() limits the action output to [-1, 1]
def get_action(self, state):
a_avg = self.forward(state) # mean of the action distribution (before exploration noise)
a_std = self.a_logstd.exp()
noise = torch.randn_like(a_avg)
action = a_avg + noise * a_std
return action, noise
def get_logprob_entropy(self, state, action):
a_avg = self.forward(state)
a_std = self.a_logstd.exp()
delta = ((a_avg - action) / a_std).pow(2) * 0.5
logprob = -(self.a_logstd + self.sqrt_2pi_log + delta).sum(1) # new_logprob
dist_entropy = (logprob.exp() * logprob).mean() # policy entropy
return logprob, dist_entropy
def get_old_logprob(self, _action, noise): # noise = action - a_noise
delta = noise.pow(2) * 0.5
return -(self.a_logstd + self.sqrt_2pi_log + delta).sum(1) # old_logprob
class CriticAdv(nn.Module):
def __init__(self, mid_dim, state_dim, _action_dim, layer_norm=False):
super().__init__()
self.layer_norm = layer_norm
self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
nn.Linear(mid_dim, 1))
if self.layer_norm:
self.apply_layer_norm()
def apply_layer_norm(self):
def init_weights(layer):
if isinstance(layer, nn.Linear):
nn.init.orthogonal_(layer.weight, 1.0)
nn.init.constant_(layer.bias, 0.0)
self.net.apply(init_weights)
def forward(self, state):
return self.net(state) # Advantage value
class AgentPPO:
def __init__(self):
super().__init__()
self.state = None
self.device = None
self.action_dim = None
self.get_obj_critic = None
self.current_step = 0
self.criterion = torch.nn.SmoothL1Loss()
self.cri = self.cri_target = self.if_use_cri_target = self.cri_optim = self.ClassCri = None
self.act = self.act_target = self.if_use_act_target = self.act_optim = self.ClassAct = None
'''init modify'''
self.ClassCri = CriticAdv
self.ClassAct = ActorPPO
self.ratio_clip = 0.2 # ratio.clamp(1 - clip, 1 + clip)
self.lambda_entropy = 0.02 # could be 0.01~0.05
self.lambda_gae_adv = 0.98 # could be 0.95~0.99, GAE (Generalized Advantage Estimation. ICLR.2016.)
self.get_reward_sum = None # self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw
self.trajectory_list = None
self.llm_actions = self.load_llm_actions()
def init(self, net_dim, state_dim, action_dim, learning_rate=1e-4, if_use_gae=False, gpu_id=0, layer_norm=False):
self.device = torch.device(f"cuda:{gpu_id}" if (torch.cuda.is_available() and (gpu_id >= 0)) else "cpu")
self.action_dim = action_dim
self.trajectory_list = list()
self.get_reward_sum = self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw
self.cri = self.ClassCri(net_dim, state_dim, action_dim, layer_norm).to(self.device)
self.act = self.ClassAct(net_dim, state_dim, action_dim, layer_norm).to(
self.device) if self.ClassAct else self.cri
self.cri_target = deepcopy(self.cri) if self.if_use_cri_target else self.cri
self.act_target = deepcopy(self.act) if self.if_use_act_target else self.act
self.cri_optim = torch.optim.Adam(self.cri.parameters(), learning_rate)
self.act_optim = torch.optim.Adam(self.act.parameters(), learning_rate) if self.ClassAct else self.cri
# def select_action(self, state):
# states = torch.as_tensor((state,), dtype=torch.float32, device=self.device)
# actions, noises = self.act.get_action(states)
# return actions[0].detach().cpu().numpy(), noises[0].detach().cpu().numpy()
def select_action(self, state):
states = torch.as_tensor((state,), dtype=torch.float32, device=self.device)
action_rl, noise = self.act.get_action(states[0])
action_rl = action_rl.detach().cpu().numpy().flatten()
noises = noise.detach().cpu().numpy().flatten()
index = self.current_step % len(self.llm_actions)
self.current_step += 1
action_llm = self.llm_actions[index]
action_llm = np.array(action_llm, dtype=np.float32)
action_combined = 0.5 * action_rl + 0.5 * action_llm
if action_combined.shape[0] != self.action_dim:
raise ValueError("Combined action dimension mismatch. Check the action generation process.")
return action_combined, noises
@staticmethod
def load_llm_actions():
with open('data/llm_action.json', 'r') as file:
llm_actions = json.load(file)
return llm_actions
def explore_env(self, env, target_step):
state = self.state # environment state carried over from the previous call
trajectory_temp = list()
last_done = 0
for i in range(target_step):
action, noise = self.select_action(state)
action = np.tanh(action) # make action between -1 & 1
# print(f"Action at step {i}: {action}")
if len(action) < 2:
raise ValueError("Action dimension is less than expected. Check the action generation process.")
state, next_state, reward, done = env.step(action)
trajectory_temp.append((state, reward, done, action, noise))
if done:
state = env.reset()
self.current_step = 0
last_done = i
else:
state = next_state
self.state = state
'''splice list'''
# store 0 trajectory information to list
trajectory_list = self.trajectory_list + trajectory_temp[:last_done + 1]
self.trajectory_list = trajectory_temp[last_done:]
return trajectory_list
def update_net(self, buffer, batch_size, repeat_times, soft_update_tau):
"""put data extract and update network together"""
with torch.no_grad():
buf_len = buffer[0].shape[0]
# decompose buffer data
buf_state, buf_action, buf_noise, buf_reward, buf_mask = [ten.to(self.device) for ten in buffer]
'''get buf_r_sum, buf_logprob'''
bs = 4096 # set a smaller 'BatchSize' when out of GPU memory: 1024, could change to 4096
buf_value = [self.cri_target(buf_state[i:i + bs]) for i in range(0, buf_len, bs)]
buf_value = torch.cat(buf_value, dim=0)
buf_logprob = self.act.get_old_logprob(buf_action, buf_noise)
buf_r_sum, buf_advantage = self.get_reward_sum(buf_len, buf_reward, buf_mask, buf_value) # detach()
# normalize advantage
buf_advantage = (buf_advantage - buf_advantage.mean()) / (buf_advantage.std() + 1e-5)
del buf_noise, buffer[:]
'''PPO: Surrogate objective of Trust Region'''
obj_critic = obj_actor = None
for _ in range(int(buf_len / batch_size * repeat_times)):
indices = torch.randint(buf_len, size=(batch_size,), requires_grad=False, device=self.device)
state = buf_state[indices]
action = buf_action[indices]
r_sum = buf_r_sum[indices]
logprob = buf_logprob[indices]
advantage = buf_advantage[indices]
new_logprob, obj_entropy = self.act.get_logprob_entropy(state, action) # it is obj_actor
ratio = (new_logprob - logprob.detach()).exp()
surrogate1 = advantage * ratio
surrogate2 = advantage * ratio.clamp(1 - self.ratio_clip, 1 + self.ratio_clip)
obj_surrogate = -torch.min(surrogate1, surrogate2).mean()
obj_actor = obj_surrogate + obj_entropy * self.lambda_entropy
self.optim_update(self.act_optim, obj_actor) # update actor
value = self.cri(state).squeeze(1) # critic network predicts the reward_sum (Q value) of state
# use smoothloss L1 to evaluate the value loss
# obj_critic = self.criterion(value, r_sum) / (r_sum.std() + 1e-6)
obj_critic = self.criterion(value, r_sum)
self.optim_update(self.cri_optim, obj_critic) # backpropagate and update the value loss
# soft-update the target critic only when a separate target network is used
if self.cri_target is not self.cri:
self.soft_update(self.cri_target, self.cri, soft_update_tau)
a_std_log = getattr(self.act, 'a_logstd', torch.zeros(1)) # the actor stores its log-std as a_logstd
return obj_critic.item(), obj_actor.item(), a_std_log.mean().item() # logging_tuple
def get_reward_sum_raw(self, buf_len, buf_reward, buf_mask, buf_value) -> (torch.Tensor, torch.Tensor):
buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device) # reward sum
pre_r_sum = 0
for i in range(buf_len - 1, -1, -1):
buf_r_sum[i] = buf_reward[i] + buf_mask[i] * pre_r_sum
pre_r_sum = buf_r_sum[i]
buf_advantage = buf_r_sum - (buf_mask * buf_value[:, 0])
return buf_r_sum, buf_advantage
def get_reward_sum_gae(self, buf_len, ten_reward, ten_mask, ten_value) -> (torch.Tensor, torch.Tensor):
buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device) # old policy value
buf_advantage = torch.empty(buf_len, dtype=torch.float32, device=self.device) # advantage value
pre_r_sum = 0
pre_advantage = 0 # advantage value of previous step
for i in range(buf_len - 1, -1, -1):
buf_r_sum[i] = ten_reward[i] + ten_mask[i] * pre_r_sum
pre_r_sum = buf_r_sum[i]
buf_advantage[i] = ten_reward[i] + ten_mask[i] * (pre_advantage - ten_value[i]) # fix a bug here
pre_advantage = ten_value[i] + buf_advantage[i] * self.lambda_gae_adv
return buf_r_sum, buf_advantage
@staticmethod
def optim_update(optimizer, objective):
optimizer.zero_grad()
objective.backward()
optimizer.step()
@staticmethod
def soft_update(target_net, current_net, tau):
for tar, cur in zip(target_net.parameters(), current_net.parameters()):
tar.data.copy_(cur.data.__mul__(tau) + tar.data.__mul__(1.0 - tau))
class Arguments:
def __init__(self, agent=None, env=None):
self.agent = agent # Deep Reinforcement Learning algorithm
self.env = env # the environment for training
self.cwd = None # current work directory. None means set automatically
self.if_remove = False # remove the cwd folder? (True, False, None:ask me)
self.visible_gpu = '0' # for example: os.environ['CUDA_VISIBLE_DEVICES'] = '0, 2,'
# self.worker_num = 2 # number of rollout workers per GPU (adjust it to get high GPU usage)
self.num_threads = 32 # cpu_num for evaluate model, torch.set_num_threads(self.num_threads)
'''Arguments for training'''
self.num_episode = 1000 # to control the train episodes for PPO
self.gamma = 0.995 # discount factor of future rewards
self.learning_rate = 2 ** -14 # about 6e-5
self.soft_update_tau = 2 ** -8 # about 4e-3
self.net_dim = 256 # the network width
self.batch_size = 4096 # num of transitions sampled from replay buffer.
self.repeat_times = 2 ** 3 # collect target_step, then update network
self.target_step = 4096 # repeatedly update network to keep critic's loss small
self.max_memo = self.target_step # capacity of replay buffer
self.if_per_or_gae = False # GAE for on-policy sparse reward: Generalized Advantage Estimation.
'''Arguments for evaluate'''
self.random_seed = 0 # initialize random seed in self.init_before_training()
# self.random_seed_list = [1234, 2234, 3234, 4234, 5234]
self.random_seed_list = [1234]
self.train = True
self.save_network = True
self.test_network = True
self.save_test_data = True
self.compare_with_gurobi = True
self.plot_on = True
def init_before_training(self, if_main):
if self.cwd is None:
agent_name = self.agent.__class__.__name__
self.cwd = f'./{agent_name}'
if if_main:
import shutil # remove history according to bool(if_remove)
if self.if_remove is None:
self.if_remove = bool(input(f"| PRESS 'y' to REMOVE: {self.cwd}? ") == 'y')
elif self.if_remove:
shutil.rmtree(self.cwd, ignore_errors=True)
print(f"| Remove cwd: {self.cwd}")
os.makedirs(self.cwd, exist_ok=True)
np.random.seed(self.random_seed)
torch.manual_seed(self.random_seed)
torch.set_num_threads(self.num_threads)
torch.set_default_dtype(torch.float32)
os.environ['CUDA_VISIBLE_DEVICES'] = str(self.visible_gpu)
def update_buffer(_trajectory):
_trajectory = list(map(list, zip(*_trajectory))) # 2D-list transpose, here cut the trajectory into 5 parts
ten_state = torch.as_tensor(_trajectory[0]) # tensor state here
ten_reward = torch.as_tensor(_trajectory[1], dtype=torch.float32)
# _trajectory[2] = done, replace done by mask, save memory
ten_mask = (1.0 - torch.as_tensor(_trajectory[2], dtype=torch.float32)) * gamma
ten_action = torch.as_tensor(_trajectory[3])
ten_noise = torch.as_tensor(_trajectory[4], dtype=torch.float32)
buffer[:] = (ten_state, ten_action, ten_noise, ten_reward, ten_mask) # list store tensors
_steps = ten_reward.shape[0] # how many steps are collected in all trajectories
_r_exp = ten_reward.mean() # the mean reward
return _steps, _r_exp
if __name__ == '__main__':
args = Arguments()
reward_record = {'episode': [], 'steps': [], 'mean_episode_reward': [], 'unbalance': []}
loss_record = {'episode': [], 'steps': [], 'critic_loss': [], 'actor_loss': [], 'entropy_loss': []}
args.visible_gpu = '0'
for seed in args.random_seed_list:
args.random_seed = seed
args.agent = AgentPPO()
agent_name = f'{args.agent.__class__.__name__}'
args.agent.cri_target = True
args.env = ESSEnv()
args.init_before_training(if_main=True)
'''init agent and environment'''
agent = args.agent
env = args.env
agent.init(args.net_dim, env.state_space.shape[0], env.action_space.shape[0], args.learning_rate,
args.if_per_or_gae, layer_norm=True)
cwd = args.cwd
gamma = args.gamma
batch_size = args.batch_size # how much data should be used to update net
target_step = args.target_step # how many steps before one episode stops
repeat_times = args.repeat_times # how many times should update for one batch size data
soft_update_tau = args.soft_update_tau
agent.state = env.reset()
'''init buffer'''
buffer = list()
'''init training parameters'''
num_episode = args.num_episode
# args.train=False
# args.save_network=False
# args.test_network=False
# args.save_test_data=False
# args.compare_with_gurobi=False
if args.train:
for i_episode in range(num_episode):
with torch.no_grad():
trajectory_list = agent.explore_env(env, target_step)
steps, r_exp = update_buffer(trajectory_list)
critic_loss, actor_loss, entropy_loss = agent.update_net(buffer, batch_size, repeat_times,
soft_update_tau)
loss_record['critic_loss'].append(critic_loss)
loss_record['actor_loss'].append(actor_loss)
loss_record['entropy_loss'].append(entropy_loss)
with torch.no_grad():
episode_reward, episode_unbalance = get_episode_return(env, agent.act, agent.device)
reward_record['mean_episode_reward'].append(episode_reward)
reward_record['unbalance'].append(episode_unbalance)
print(f'current episode is {i_episode}, reward:{episode_reward}, unbalance:{episode_unbalance}')
act_save_path = f'{args.cwd}/actor.pth'
loss_record_path = f'{args.cwd}/loss_data.pkl'
reward_record_path = f'{args.cwd}/reward_data.pkl'
with open(loss_record_path, 'wb') as tf:
pickle.dump(loss_record, tf)
with open(reward_record_path, 'wb') as tf:
pickle.dump(reward_record, tf)
if args.save_network:
torch.save(agent.act.state_dict(), act_save_path)
print('actor parameters have been saved')
if args.test_network:
args.cwd = agent_name
agent.act.load_state_dict(torch.load(act_save_path))
print('parameters have been reloaded for testing')
record = test_one_episode(env, agent.act, agent.device)
eval_data = pd.DataFrame(record['system_info'])
eval_data.columns = ['time_step', 'price', 'netload', 'action', 'real_action', 'soc', 'battery', 'gen1', 'gen2',
'gen3', 'temperature', 'irradiance', 'unbalance', 'operation_cost']
if args.save_test_data:
test_data_save_path = f'{args.cwd}/test_data.pkl'
with open(test_data_save_path, 'wb') as tf:
pickle.dump(record, tf)
'''compare with gurobi data and results'''
if args.compare_with_gurobi:
month = record['init_info'][0][0]
day = record['init_info'][0][1]
initial_soc = record['init_info'][0][3]
base_result = optimization_base_result(env, month, day, initial_soc)
if args.plot_on:
from plotDRL import PlotArgs, make_dir, plot_evaluation_information, plot_optimization_result
plot_args = PlotArgs()
plot_args.feature_change = ''
args.cwd = agent_name
plot_dir = make_dir(args.cwd, plot_args.feature_change)
plot_optimization_result(base_result, plot_dir)
plot_evaluation_information(args.cwd + '/' + 'test_data.pkl', plot_dir)
'''compare the costs obtained from gurobi and PPO'''
ratio = sum(eval_data['operation_cost']) / sum(base_result['step_cost'])
print('operation_cost_sum:', sum(eval_data['operation_cost']))
print('step_cost_sum:', sum(base_result['step_cost']))
print('ratio:', ratio)

View File

@ -31,6 +31,7 @@ class ActorPPO(nn.Module):
if isinstance(layer, nn.Linear):
nn.init.orthogonal_(layer.weight, 1.0)
nn.init.constant_(layer.bias, 0.0)
self.net.apply(init_weights)
def forward(self, state):
@ -83,7 +84,7 @@ class CriticAdv(nn.Module):
class AgentPPO:
def __init__(self):
def __init__(self, file_path='data/llm_actions.json'):
super().__init__()
self.state = None
self.device = None
@ -103,6 +104,8 @@ class AgentPPO:
self.lambda_gae_adv = 0.98 # could be 0.95~0.99, GAE (Generalized Advantage Estimation. ICLR.2016.)
self.get_reward_sum = None # self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw
self.trajectory_list = None
self.llm_actions = self.load_llm_actions(file_path)
self.current_time = 0
def init(self, net_dim, state_dim, action_dim, learning_rate=1e-4, if_use_gae=False, gpu_id=0, layer_norm=False):
self.device = torch.device(f"cuda:{gpu_id}" if (torch.cuda.is_available() and (gpu_id >= 0)) else "cpu")
@ -118,10 +121,33 @@ class AgentPPO:
self.cri_optim = torch.optim.Adam(self.cri.parameters(), learning_rate)
self.act_optim = torch.optim.Adam(self.act.parameters(), learning_rate) if self.ClassAct else self.cri
@staticmethod
def load_llm_actions(file_path):
with open(file_path, 'r') as file:
data = json.load(file)
return data
@staticmethod
def combine_actions(rl_action, llm_action):
combined_action = [
(rl_action[0] + llm_action['asoc']) / 2,
(rl_action[1] + llm_action['ag1']) / 2,
(rl_action[2] + llm_action['ag2']) / 2,
(rl_action[3] + llm_action['ag3']) / 2,
(rl_action[4] + llm_action['apv']) / 2
]
return combined_action
def select_action(self, state):
states = torch.as_tensor((state,), dtype=torch.float32, device=self.device)
actions, noises = self.act.get_action(states)
return actions[0].detach().cpu().numpy(), noises[0].detach().cpu().numpy()
rl_action = actions[0].detach().cpu().numpy()
if self.current_time >= len(self.llm_actions):
self.current_time = 0
llm_action = self.llm_actions[self.current_time]
combined_action = self.combine_actions(rl_action, llm_action)
self.current_time += 1
return combined_action, noises[0].detach().cpu().numpy()
def explore_env(self, env, target_step):
state = self.state # sent state to agent and then agent sent state to method
@ -307,7 +333,7 @@ if __name__ == '__main__':
args.visible_gpu = '0'
for seed in args.random_seed_list:
args.random_seed = seed
args.agent = AgentPPO()
args.agent = AgentPPO('data/llm_actions.json')
agent_name = f'{args.agent.__class__.__name__}'
args.agent.cri_target = True
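For reference, the select_action hunk above assumes that data/llm_actions.json stores one dict per hour with the keys asoc, ag1, ag2, ag3 and apv used by combine_actions. A minimal sketch of that assumed layout and of the 50/50 averaging (the example values are placeholders, not data from this commit):

import json
import numpy as np

# Assumed schema: one entry per hour, every value already in [-1, 1].
llm_actions = [
    {"asoc": 0.10, "ag1": 0.00, "ag2": 0.20, "ag3": -0.10, "apv": 0.05},
    {"asoc": -0.05, "ag1": 0.15, "ag2": 0.00, "ag3": 0.30, "apv": 0.00},
]
with open("data/llm_actions.json", "w") as f:
    json.dump(llm_actions, f)

# Element-wise average with a PPO action, as combine_actions does:
rl_action = np.zeros(5)  # placeholder network output
keys = ("asoc", "ag1", "ag2", "ag3", "apv")
combined = [(rl_action[i] + llm_actions[0][k]) / 2 for i, k in enumerate(keys)]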

View File

@ -1,393 +0,0 @@
import os
import pickle
from copy import deepcopy
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from environment import ESSEnv
from tools import get_episode_return, test_one_episode, optimization_base_result
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
class ActorPPO(nn.Module):
def __init__(self, mid_dim, state_dim, action_dim, layer_norm=False):
super().__init__()
self.layer_norm = layer_norm
self.net = nn.Sequential(
nn.Linear(state_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
nn.Linear(mid_dim, action_dim)
)
self.a_logstd = nn.Parameter(torch.zeros((1, action_dim)) - 0.5, requires_grad=True)
self.sqrt_2pi_log = np.log(np.sqrt(2 * np.pi))
if self.layer_norm:
self.apply_layer_norm()
def apply_layer_norm(self):
def init_weights(layer):
if isinstance(layer, nn.Linear):
nn.init.orthogonal_(layer.weight, 1.0)
nn.init.constant_(layer.bias, 0.0)
self.net.apply(init_weights)
def forward(self, state):
return self.net(state).tanh()
def get_action(self, state):
a_avg = self.forward(state)
a_std = self.a_logstd.exp()
noise = torch.randn_like(a_avg)
action = a_avg + noise * a_std
return action, noise
def get_logprob_entropy(self, state, action):
a_avg = self.forward(state)
a_std = self.a_logstd.exp()
delta = ((a_avg - action) / a_std).pow(2) * 0.5
logprob = -(self.a_logstd + self.sqrt_2pi_log + delta).sum(1)
dist_entropy = (logprob.exp() * logprob).mean()
return logprob, dist_entropy
def get_old_logprob(self, _action, noise):
delta = noise.pow(2) * 0.5
return -(self.a_logstd + self.sqrt_2pi_log + delta).sum(1)
class CriticAdv(nn.Module):
def __init__(self, mid_dim, state_dim, _action_dim, layer_norm=False):
super().__init__()
self.layer_norm = layer_norm
self.net = nn.Sequential(
nn.Linear(state_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
nn.Linear(mid_dim, 1)
)
if self.layer_norm:
self.apply_layer_norm()
def apply_layer_norm(self):
def init_weights(layer):
if isinstance(layer, nn.Linear):
nn.init.orthogonal_(layer.weight, 1.0)
nn.init.constant_(layer.bias, 0.0)
self.net.apply(init_weights)
def forward(self, state):
return self.net(state)
class AgentPrimalDualPPO:
def __init__(self):
self.state = None
self.device = None
self.action_dim = None
self.get_obj_critic = None
self.criterion = torch.nn.SmoothL1Loss()
self.cri = self.cri_target = self.if_use_cri_target = self.cri_optim = self.ClassCri = None
self.act = self.act_target = self.if_use_act_target = self.act_optim = self.ClassAct = None
self.ClassCri = CriticAdv
self.ClassAct = ActorPPO
self.ratio_clip = 0.2
self.lambda_entropy = 0.02
self.lambda_gae_adv = 0.98
self.get_reward_sum = None
self.trajectory_list = None
self.lambda_cost = 1.0 # initial dual variable
self.constraint_value = 1.0 # constraint threshold, e.g. a safety-cost limit
def init(self, net_dim, state_dim, action_dim, learning_rate=1e-4, if_use_gae=False, gpu_id=0, layer_norm=False):
self.device = torch.device(f"cuda:{gpu_id}" if (torch.cuda.is_available() and (gpu_id >= 0)) else "cpu")
self.trajectory_list = list()
self.get_reward_sum = self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw
self.cri = self.ClassCri(net_dim, state_dim, action_dim, layer_norm).to(self.device)
self.act = self.ClassAct(net_dim, state_dim, action_dim, layer_norm).to(
self.device) if self.ClassAct else self.cri
self.cri_target = deepcopy(self.cri) if self.if_use_cri_target else self.cri
self.act_target = deepcopy(self.act) if self.if_use_act_target else self.act
self.cri_optim = torch.optim.Adam(self.cri.parameters(), learning_rate)
self.act_optim = torch.optim.Adam(self.act.parameters(), learning_rate) if self.ClassAct else self.cri
def select_action(self, state):
states = torch.as_tensor((state,), dtype=torch.float32, device=self.device)
actions, noises = self.act.get_action(states)
return actions[0].detach().cpu().numpy(), noises[0].detach().cpu().numpy()
def explore_env(self, env, target_step):
state = self.state
trajectory_temp = list()
last_done = 0
for i in range(target_step):
action, noise = self.select_action(state)
state, next_state, reward, done, cost = env.step(np.tanh(action))
trajectory_temp.append((state, reward, done, action, noise, cost))
if done:
state = env.reset()
last_done = i
else:
state = next_state
self.state = state
trajectory_list = self.trajectory_list + trajectory_temp[:last_done + 1]
self.trajectory_list = trajectory_temp[last_done:]
return trajectory_list
def update_net(self, buffer, batch_size, repeat_times, soft_update_tau):
with torch.no_grad():
buf_len = buffer[0].shape[0]
buf_state, buf_action, buf_noise, buf_reward, buf_mask, buf_cost = [ten.to(self.device) for ten in buffer]
buf_value = torch.cat([self.cri_target(buf_state[i:i + 4096]) for i in range(0, buf_len, 4096)], dim=0)
buf_logprob = self.act.get_old_logprob(buf_action, buf_noise)
buf_r_sum, buf_advantage = self.get_reward_sum(buf_len, buf_reward, buf_mask, buf_value)
buf_advantage = (buf_advantage - buf_advantage.mean()) / (buf_advantage.std() + 1e-5)
cost_sum = buf_cost.sum().item()
if cost_sum > self.constraint_value:
self.lambda_cost += 0.01 * (cost_sum - self.constraint_value)
else:
self.lambda_cost -= 0.01 * (self.constraint_value - cost_sum)
self.lambda_cost = max(self.lambda_cost, 0)
obj_critic = obj_actor = None
for _ in range(int(buf_len / batch_size * repeat_times)):
indices = torch.randint(buf_len, size=(batch_size,), requires_grad=False, device=self.device)
state = buf_state[indices]
action = buf_action[indices]
r_sum = buf_r_sum[indices]
logprob = buf_logprob[indices]
advantage = buf_advantage[indices]
new_logprob, obj_entropy = self.act.get_logprob_entropy(state, action)
ratio = (new_logprob - logprob.detach()).exp()
surrogate1 = advantage * ratio
surrogate2 = advantage * ratio.clamp(1 - self.ratio_clip, 1 + self.ratio_clip)
obj_surrogate = -torch.min(surrogate1, surrogate2).mean()
obj_actor = obj_surrogate + obj_entropy * self.lambda_entropy - self.lambda_cost * buf_cost[indices].mean()
self.optim_update(self.act_optim, obj_actor)
value = self.cri(state).squeeze(1)
obj_critic = self.criterion(value, r_sum)
self.optim_update(self.cri_optim, obj_critic)
if self.cri_target is not self.cri:
self.soft_update(self.cri_target, self.cri, soft_update_tau)
a_std_log = getattr(self.act, 'a_logstd', torch.zeros(1))
return obj_critic.item(), obj_actor.item(), a_std_log.mean().item()
def get_reward_sum_raw(self, buf_len, buf_reward, buf_mask, buf_value) -> (torch.Tensor, torch.Tensor):
buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device)
pre_r_sum = 0
for i in range(buf_len - 1, -1, -1):
buf_r_sum[i] = buf_reward[i] + buf_mask[i] * pre_r_sum
pre_r_sum = buf_r_sum[i]
buf_advantage = buf_r_sum - (buf_mask * buf_value[:, 0])
return buf_r_sum, buf_advantage
def get_reward_sum_gae(self, buf_len, ten_reward, ten_mask, ten_value) -> (torch.Tensor, torch.Tensor):
buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device)
buf_advantage = torch.empty(buf_len, dtype=torch.float32, device=self.device)
pre_r_sum = 0
pre_advantage = 0
for i in range(buf_len - 1, -1, -1):
buf_r_sum[i] = ten_reward[i] + ten_mask[i] * pre_r_sum
pre_r_sum = buf_r_sum[i]
buf_advantage[i] = ten_reward[i] + ten_mask[i] * (pre_advantage - ten_value[i])
pre_advantage = ten_value[i] + buf_advantage[i] * self.lambda_gae_adv
return buf_r_sum, buf_advantage
@staticmethod
def optim_update(optimizer, objective):
optimizer.zero_grad()
objective.backward()
optimizer.step()
@staticmethod
def soft_update(target_net, current_net, tau):
for tar, cur in zip(target_net.parameters(), current_net.parameters()):
tar.data.copy_(cur.data * tau + tar.data * (1.0 - tau))
class Arguments:
def __init__(self, agent=None, env=None):
self.agent = agent # Deep Reinforcement Learning algorithm
self.env = env # the environment for training
self.cwd = None # current work directory. None means set automatically
self.if_remove = False # remove the cwd folder? (True, False, None:ask me)
self.visible_gpu = '0' # for example: os.environ['CUDA_VISIBLE_DEVICES'] = '0, 2,'
# self.worker_num = 2 # number of rollout workers per GPU (adjust it to get high GPU usage)
self.num_threads = 32 # cpu_num for evaluate model, torch.set_num_threads(self.num_threads)
'''Arguments for training'''
self.num_episode = 1000 # to control the train episodes for PPO
self.gamma = 0.995 # discount factor of future rewards
self.learning_rate = 2 ** -14 # about 6e-5
self.soft_update_tau = 2 ** -8 # about 4e-3
self.net_dim = 256 # the network width
self.batch_size = 4096 # num of transitions sampled from replay buffer.
self.repeat_times = 2 ** 3 # collect target_step, then update network
self.target_step = 4096 # repeatedly update network to keep critic's loss small
self.max_memo = self.target_step # capacity of replay buffer
self.if_per_or_gae = False # GAE for on-policy sparse reward: Generalized Advantage Estimation.
'''Arguments for evaluate'''
self.random_seed = 0 # initialize random seed in self.init_before_training()
# self.random_seed_list = [1234, 2234, 3234, 4234, 5234]
self.random_seed_list = [1234]
self.train = True
self.save_network = True
self.test_network = True
self.save_test_data = True
self.compare_with_gurobi = True
self.plot_on = True
def init_before_training(self, if_main):
if self.cwd is None:
agent_name = self.agent.__class__.__name__
self.cwd = f'./{agent_name}'
if if_main:
import shutil # remove history according to bool(if_remove)
if self.if_remove is None:
self.if_remove = bool(input(f"| PRESS 'y' to REMOVE: {self.cwd}? ") == 'y')
elif self.if_remove:
shutil.rmtree(self.cwd, ignore_errors=True)
print(f"| Remove cwd: {self.cwd}")
os.makedirs(self.cwd, exist_ok=True)
np.random.seed(self.random_seed)
torch.manual_seed(self.random_seed)
torch.set_num_threads(self.num_threads)
torch.set_default_dtype(torch.float32)
os.environ['CUDA_VISIBLE_DEVICES'] = str(self.visible_gpu)
def update_buffer(_trajectory):
_trajectory = list(map(list, zip(*_trajectory))) # 2D-list transpose, here cut the trajectory into 5 parts
ten_state = torch.as_tensor(_trajectory[0]) # tensor state here
ten_reward = torch.as_tensor(_trajectory[1], dtype=torch.float32)
# _trajectory[2] = done, replace done by mask, save memory
ten_mask = (1.0 - torch.as_tensor(_trajectory[2], dtype=torch.float32)) * gamma
ten_action = torch.as_tensor(_trajectory[3])
ten_noise = torch.as_tensor(_trajectory[4], dtype=torch.float32)
buffer[:] = (ten_state, ten_action, ten_noise, ten_reward, ten_mask) # list store tensors
_steps = ten_reward.shape[0] # how many steps are collected in all trajectories
_r_exp = ten_reward.mean() # the mean reward
return _steps, _r_exp
if __name__ == '__main__':
args = Arguments()
reward_record = {'episode': [], 'steps': [], 'mean_episode_reward': [], 'unbalance': []}
loss_record = {'episode': [], 'steps': [], 'critic_loss': [], 'actor_loss': [], 'entropy_loss': []}
args.visible_gpu = '0'
for seed in args.random_seed_list:
args.random_seed = seed
args.agent = AgentPrimalDualPPO()
agent_name = f'{args.agent.__class__.__name__}'
args.agent.cri_target = True
args.env = ESSEnv()
args.init_before_training(if_main=True)
agent = args.agent
env = args.env
agent.init(args.net_dim, env.state_space.shape[0], env.action_space.shape[0], args.learning_rate,
args.if_per_or_gae, layer_norm=True)
cwd = args.cwd
gamma = args.gamma
batch_size = args.batch_size # how much data should be used to update net
target_step = args.target_step # how many steps of one episode should stop
repeat_times = args.repeat_times # how many times should update for one batch size data
soft_update_tau = args.soft_update_tau
agent.state = env.reset()
buffer = list()
num_episode = args.num_episode
if args.train:
for i_episode in range(num_episode):
with torch.no_grad():
trajectory_list = []
for _ in range(target_step):
current_obs = agent.state
action, noise = agent.select_action(current_obs)
current_obs, next_obs, reward, done, cost = env.step(np.tanh(action)) # same five return values and tanh squashing as in explore_env
trajectory_list.append((current_obs, reward, done, action, noise, cost))
agent.state = next_obs
if done:
break
steps, r_exp = update_buffer(trajectory_list)
critic_loss, actor_loss, entropy_loss = agent.update_net(buffer, batch_size, repeat_times, soft_update_tau)
loss_record['critic_loss'].append(critic_loss)
loss_record['actor_loss'].append(actor_loss)
loss_record['entropy_loss'].append(entropy_loss)
with torch.no_grad():
episode_reward, episode_unbalance = get_episode_return(env, agent.act, agent.device)
reward_record['mean_episode_reward'].append(episode_reward)
reward_record['unbalance'].append(episode_unbalance)
print(f'current episode is {i_episode}, reward: {episode_reward}, unbalance: {episode_unbalance}')
act_save_path = f'{args.cwd}/actor.pth'
loss_record_path = f'{args.cwd}/loss_data.pkl'
reward_record_path = f'{args.cwd}/reward_data.pkl'
with open(loss_record_path, 'wb') as tf:
pickle.dump(loss_record, tf)
with open(reward_record_path, 'wb') as tf:
pickle.dump(reward_record, tf)
if args.save_network:
torch.save(agent.act.state_dict(), act_save_path)
print('actor parameters have been saved')
if args.test_network:
args.cwd = agent_name
agent.act.load_state_dict(torch.load(act_save_path))
print('parameters have been reloaded for testing')
record = test_one_episode(env, agent.act, agent.device)
eval_data = pd.DataFrame(record['system_info'])
eval_data.columns = ['time_step', 'price', 'netload', 'action', 'real_action', 'soc', 'battery', 'gen1', 'gen2',
'gen3', 'temperature', 'irradiance', 'unbalance', 'operation_cost']
if args.save_test_data:
test_data_save_path = f'{args.cwd}/test_data.pkl'
with open(test_data_save_path, 'wb') as tf:
pickle.dump(record, tf)
'''compare with gurobi data and results'''
if args.compare_with_gurobi:
month = record['init_info'][0][0]
day = record['init_info'][0][1]
initial_soc = record['init_info'][0][3]
base_result = optimization_base_result(env, month, day, initial_soc)
if args.plot_on:
from plotDRL import PlotArgs, make_dir, plot_evaluation_information, plot_optimization_result
plot_args = PlotArgs()
plot_args.feature_change = 'primal_dual'
args.cwd = agent_name
plot_dir = make_dir(args.cwd, plot_args.feature_change)
plot_optimization_result(base_result, plot_dir)
plot_evaluation_information(args.cwd + '/' + 'test_data.pkl', plot_dir)
'''compare the costs obtained from gurobi and PPO'''
ratio = sum(eval_data['operation_cost']) / sum(base_result['step_cost'])
print('operation_cost_sum:', sum(eval_data['operation_cost']))
print('step_cost_sum:', sum(base_result['step_cost']))
print('ratio:', ratio)
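The dual-variable update in update_net above is a projected ascent step: lambda is pushed up when the accumulated cost exceeds the constraint value and pulled down otherwise, never dropping below zero. A standalone sketch of that rule with the same 0.01 step size (cost_sum here is a placeholder value):

lambda_cost = 1.0
constraint_value = 1.0
cost_sum = 1.8  # placeholder episode cost
# equivalent to the if/else update in the code followed by the clamp at zero
lambda_cost = max(lambda_cost + 0.01 * (cost_sum - constraint_value), 0.0)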

View File

@ -1,56 +0,0 @@
from openai import OpenAI
import os
def get_response():
client = OpenAI(
api_key="sk-f01744b2801344b1a72f89ec7e290cad",
base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # base_url of the DashScope service
)
completion = client.chat.completions.create(
model="qwen-turbo",
messages=[
{'role': 'system',
'content':
'You are an expert at debugging and controlling a smart residential energy system, and you must give the optimal action decision based on the current environment state. '
'The energy system consists of a solar plant, a wind plant, generators, a battery module, and a connection to the residential grid.'},
{'role': 'user',
'content':
'1. The current state contains 9 values: '
'Sc (battery state of charge) starts at 0.4 and ranges from 0.2 to 0.8; Sg1, Sg2 and Sg3 (the power states of generators 1-3) all start at 0; '
'Sp (electricity price), Sl (residential load), St (temperature), Si (solar irradiance) and Sw (wind speed) are all provided by the data. '
'Sc, Sg1, Sg2 and Sg3 evolve from their values at the previous time step and never fall below 0; Sg1, Sg2 or Sg3 equal to 0 means that generator is switched off. '
'2. The current action contains 5 values: '
'x denotes the value taken at the current time and ranges from -1 to 1; the total battery capacity is 500, so the initial stored energy is 500*0.4=200; Ag1, Ag2 and Ag3 are changes relative to the maximum instantaneous power of each generator. '
'Ac (battery charge/discharge action): the battery energy changes by 100*x; Ag1 (generator 1 power action): power changes by 100*x; Ag2 (generator 2 power action): power changes by 100*x; Ag3 (generator 3 power action): power changes by 200*x; Av (solar voltage action): the solar voltage becomes (1+x) times its value. '
'3. Power formulas: '
'1. Pc (battery charge/discharge power) = max(0.2, min(0.8, Sc+0.18*Ac)); 2. Pg1 (generator 1 power) = max(0, min(150, 100*Ag1+Sg1)); '
'3. Pg2 (generator 2 power) = max(0, min(375, 100*Ag2+Sg2)); 4. Pg3 (generator 3 power) = max(0, min(500, 200*Ag3+Sg3)); '
'5. Pso (solar power) = max(0, (0.45*Si+0.015*St-60.3)*(1+Av)); '
'6. Pw (wind power): if 3<=Sw<8, Pw=Sw^3*172.2625/1000; elif 8<=Sw<12, Pw=64*172.2625/125; else Pw=0. '
'4. Cost and energy-penalty formulas: '
'P (total power) = Pc+Pg1+Pg2+Pg3+Pso+Pw; '
'if P>=Sl, Ee (surplus energy) = P-Sl, else Es (shortage energy) = Sl-P; '
'Cb (battery cost) = 0.01*Ac+0.1*Sc; Cg1 (generator 1 cost) = 0.0034*Pg1^2+3*Pg1+30; '
'Cg2 (generator 2 cost) = 0.001*Pg2^2+10*Pg2+40; Cg3 (generator 3 cost) = 0.001*Pg3^2+15*Pg3+70; '
'Cs (solar cost) = 0.01*Pso; Cw (wind cost) = 0.01*Pw; '
'Rs (sales revenue) = 0.5*Sp*Ee; Cp (purchase cost) = Sp*Es; '
'Pe (surplus penalty) = (Ee-100)*50; Ps (shortage penalty) = (Es-100)*50; '
'Rs, Pe and Cp, Ps correspond to the energy-surplus and energy-shortage cases respectively and cannot occur at the same time. '
'5. Objective function: Goal = -(Cb+Cg1+Cg2+Cg3+Cs+Cw+Pe+Ps-Rs+Cp)/1000. '
'6. Control scheme and objectives: '
'(1) Prefer solar and wind generation; when they cannot meet the load, supply power from the generators and the battery; '
'(2) when supplying power, weigh the cost of each generation module; normally a later generator is only switched on when the maximum power of the previous one cannot meet the load; '
'(3) reduce trades that exceed the public-grid exchange limit, so that fluctuations of the public grid do not affect the stability of the residential supply; '
'(4) meet the residential load at the next time step, maximize the reward function and minimize the supply-demand gap (keep Ee and Es as small as possible) to improve training results and policy quality. '
'Please read the hourly Sp, Sl, St, Si and Sw data, compute the state at every time step, and then use the most suitable optimization method to give an action decision for every hour; '
'each decision has length 5, representing Ac, Ag1, Ag2, Ag3 and Av, computed once per hour. '
'The result should be in JSON format: [{{x1, x2, x3, x4, x5}}, {{x1, x2, x3, x4, x5}}, ...]; if the computation is too large, you may answer in batches.'}],
temperature=0.8,
top_p=0.8
)
print(completion.model_dump_json())
if __name__ == '__main__':
get_response()
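The prompt asks the model to reply with a JSON array of 5-value actions. A minimal post-processing sketch, assuming the reply body is plain JSON that should be written to data/llm_action.json for the agents above (the helper save_llm_actions is hypothetical, not part of this commit):

import json

def save_llm_actions(completion, path='data/llm_action.json'):
    # Assumes the assistant message is a bare JSON array such as
    # [[x1, x2, x3, x4, x5], ...]; strip any surrounding prose first if needed.
    content = completion.choices[0].message.content
    actions = json.loads(content)
    with open(path, 'w') as f:
        json.dump(actions, f)
    return actions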

8
SAC.py
View File

@ -51,8 +51,8 @@ if __name__ == '__main__':
'''collect data and train and update network'''
num_episode = args.num_episode
'''here record real unbalance'''
# args.train = False
# args.save_network = False
args.train = False
args.save_network = False
# args.test_network = False
# args.save_test_data = False
# args.compare_with_gurobi = False
@ -127,6 +127,6 @@ if __name__ == '__main__':
plot_evaluation_information(args.cwd + '/' + 'test_data.pkl', plot_dir)
'''compare the costs obtained from gurobi and SAC'''
ratio = sum(eval_data['operation_cost']) / sum(base_result['step_cost'])
print('operation_cost_sum:', sum(eval_data['operation_cost']))
print('step_cost_sum:', sum(base_result['step_cost']))
print('rl_cost:', sum(eval_data['operation_cost']))
print('gurobi_cost:', sum(base_result['step_cost']))
print('ratio:', ratio)

8
TD3.py
View File

@ -50,8 +50,8 @@ if __name__ == '__main__':
agent.state = env.reset()
'''collect data and train and update network'''
num_episode = args.num_episode
# args.train=False
# args.save_network=False
args.train = False
args.save_network = False
# args.test_network=False
# args.save_test_data=False
# args.compare_with_gurobi=False
@ -126,6 +126,6 @@ if __name__ == '__main__':
plot_evaluation_information(args.cwd + '/' + 'test_data.pkl', plot_dir)
'''compare the costs obtained from gurobi and TD3'''
ratio = sum(eval_data['operation_cost']) / sum(base_result['step_cost'])
print('operation_cost_sum:', sum(eval_data['operation_cost']))
print('step_cost_sum:', sum(base_result['step_cost']))
print('rl_cost:', sum(eval_data['operation_cost']))
print('gurobi_cost:', sum(base_result['step_cost']))
print('ratio:', ratio)

View File

@ -2,6 +2,23 @@ class Constant:
MONTHS_LEN = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
class TimeCounter:
def __init__(self):
self.month = 1
self.day = 1
def increment_day(self):
self.day += 1
if self.day > Constant.MONTHS_LEN[self.month - 1]:
self.day = 1
self.month += 1
if self.month > 12:
self.month = 1
def get_date(self):
return self.month, self.day
class DataManager:
def __init__(self) -> None:
self.Prices = []
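A small usage sketch of the TimeCounter added above (MONTHS_LEN gives February 28 days, so leap years are ignored):

tc = TimeCounter()
for _ in range(31):  # 31 increments starting from January 1
    tc.increment_day()
print(tc.get_date())  # (2, 1): the counter wraps from January 31 into February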

View File

@ -19,6 +19,7 @@ class ESSEnv(gym.Env):
self.current_output = None
self.final_step_outputs = None
self.data_manager = DataManager()
self.time_counter = TimeCounter()
self._load_year_data()
self.month = None
self.day = None
@ -44,11 +45,9 @@ class ESSEnv(gym.Env):
self.state_space = gym.spaces.Box(low=0, high=1, shape=(15,), dtype=np.float32) # shape adjusted for the LLM features
def reset(self, *args):
self.month = np.random.randint(1, 13) # choose 12 month
if self.TRAIN:
self.day = np.random.randint(1, 20)
else:
self.day = np.random.randint(20, Constant.MONTHS_LEN[self.month] - 1)
# changed to sequential training: walk through the calendar day by day
self.time_counter.increment_day()
self.month, self.day = self.time_counter.get_date()
self.current_time = 0
self.battery.reset()
self.dg1.reset()
@ -155,7 +154,7 @@ class ESSEnv(gym.Env):
irradiance_df = pd.read_csv('data/irradiance.csv', sep=',')
temperature_df = pd.read_csv('data/temper.csv', sep=',')
wind_df = pd.read_csv('data/wind.csv', sep=',')
llm_data = json.load(open('data/llm_action.json', 'r'))
# llm_data = json.load(open('data/llm_action.json', 'r'))
price = price_df['price'].to_numpy(dtype=float)
load = load_df['houseload'].to_numpy(dtype=float)
@ -174,4 +173,4 @@ class ESSEnv(gym.Env):
process_elements(irradiance, lambda x: x, self.data_manager.add_irradiance_element)
process_elements(temperature, lambda x: x - 273.15, self.data_manager.add_temperature_element)
process_elements(wind, lambda x: x, self.data_manager.add_wind_element)
process_elements(llm_data, lambda x: x, self.data_manager.add_llm_element)
# process_elements(llm_data, lambda x: x, self.data_manager.add_llm_element)

View File

@ -1,184 +0,0 @@
import gym
import numpy as np
import pandas as pd
from module import *
from parameters import *
from data_manager import *
class ESSEnv(gym.Env):
def __init__(self, **kwargs):
super(ESSEnv, self).__init__()
self.excess = None
self.shedding = None
self.unbalance = None
self.real_unbalance = None
self.operation_cost = None
self.current_output = None
self.final_step_outputs = None
self.data_manager = DataManager()
self._load_year_data()
self.month = None
self.day = None
self.TRAIN = True
self.current_time = None
self.episode_length = kwargs.get('episode_length', 24)
self.battery_parameters = kwargs.get('battery_parameters', battery_parameters)
self.dg_parameters = kwargs.get('dg_parameters', dg_parameters)
self.solar_parameters = kwargs.get('solar_parameters', solar_parameters)
self.wind_parameters = kwargs.get('wind_parameters', wind_parameters)
self.penalty_coefficient = 50 # penalty coefficient for constraint violations
self.sell_coefficient = 0.5 # profit coefficient when selling to the grid
self.grid = Grid()
self.battery = Battery(self.battery_parameters)
self.dg1 = DG(self.dg_parameters['gen_1'])
self.dg2 = DG(self.dg_parameters['gen_2'])
self.dg3 = DG(self.dg_parameters['gen_3'])
self.solar = Solar(self.solar_parameters)
self.wind = Wind(self.wind_parameters)
self.action_space = gym.spaces.Box(low=-1, high=1, shape=(5,), dtype=np.float32) # a voltage-adjustment action has been added
self.state_space = gym.spaces.Box(low=0, high=1, shape=(10,), dtype=np.float32)
def reset(self, *args):
self.month = np.random.randint(1, 13) # choose 12 month
if self.TRAIN:
self.day = np.random.randint(1, 20)
else:
self.day = np.random.randint(20, Constant.MONTHS_LEN[self.month] - 1)
self.current_time = 0
self.battery.reset()
self.dg1.reset()
self.dg2.reset()
self.dg3.reset()
self.solar.reset()
self.wind.reset()
return self._build_state()
def _build_state(self):
soc = self.battery.SOC()
dg1_output = self.dg1.current_output
dg2_output = self.dg2.current_output
dg3_output = self.dg3.current_output
time_step = self.current_time
price = self.data_manager.get_price_data(self.month, self.day, self.current_time)
houseload = self.data_manager.get_load_cons_data(self.month, self.day, self.current_time)
temperature = self.data_manager.get_temperature_data(self.month, self.day, self.current_time)
irradiance = self.data_manager.get_irradiance_data(self.month, self.day, self.current_time)
wind_speed = self.data_manager.get_wind_data(self.month, self.day, self.current_time)
obs = np.concatenate((np.float32(time_step), np.float32(price), np.float32(soc), np.float32(houseload),
np.float32(dg1_output), np.float32(dg2_output), np.float32(dg3_output),
np.float32(temperature), np.float32(irradiance), np.float32(wind_speed)), axis=None)
return obs
def step(self, action): # state transition: current_obs->take_action->get_reward->get_finish->next_obs
# apply the action to each component
current_obs = self._build_state()
temperature = current_obs[7]
irradiance = current_obs[8]
wind_speed = current_obs[9]
self.battery.step(action[0]) # perform the state transition; the battery capacity changes as well
self.dg1.step(action[1])
self.dg2.step(action[2])
self.dg3.step(action[3])
self.solar.step(temperature, irradiance, action[4])
self.wind.step(wind_speed)
self.current_output = np.array((self.dg1.current_output, self.dg2.current_output, self.dg3.current_output,
-self.battery.energy_change, self.solar.current_power, self.wind.current_power))
actual_production = sum(self.current_output)
price = current_obs[1]
houseload = current_obs[3]
unbalance = actual_production - houseload
reward = 0
excess_penalty = 0 # surplus
deficient_penalty = 0 # shortage
sell_benefit = 0
buy_cost = 0
self.excess = 0
self.shedding = 0
if unbalance >= 0: # there is currently an energy surplus
if unbalance <= self.grid.exchange_ability:
# revenue from selling to the grid is small [0.029, 0.1]
sell_benefit = self.grid.get_cost(price, unbalance) * self.sell_coefficient
else:
sell_benefit = self.grid.get_cost(price, self.grid.exchange_ability) * self.sell_coefficient
# real unbalance: the grid cannot absorb this excess either
self.excess = unbalance - self.grid.exchange_ability
excess_penalty = self.excess * self.penalty_coefficient
else: # unbalance < 0: apply the shortage penalty
if abs(unbalance) <= self.grid.exchange_ability:
buy_cost = self.grid.get_cost(price, abs(unbalance))
else:
buy_cost = self.grid.get_cost(price, self.grid.exchange_ability)
self.shedding = abs(unbalance) - self.grid.exchange_ability
deficient_penalty = self.shedding * self.penalty_coefficient
battery_cost = self.battery.get_cost(self.battery.energy_change, self.battery.current_capacity)
dg1_cost = self.dg1.get_cost(self.dg1.current_output)
dg2_cost = self.dg2.get_cost(self.dg2.current_output)
dg3_cost = self.dg3.get_cost(self.dg3.current_output)
solar_cost = self.solar.get_cost(self.solar.current_power)
wind_cost = self.wind.gen_cost(self.wind.current_power)
self.operation_cost = (battery_cost + dg1_cost + dg2_cost + dg3_cost + solar_cost + wind_cost + excess_penalty +
deficient_penalty - sell_benefit + buy_cost)
reward -= self.operation_cost / 1e3
self.unbalance = unbalance
self.real_unbalance = self.shedding + self.excess
final_step_outputs = [self.dg1.current_output, self.dg2.current_output, self.dg3.current_output,
self.battery.current_capacity, self.solar.current_power, self.wind.current_power]
self.current_time += 1
finish = (self.current_time == self.episode_length)
if finish:
self.final_step_outputs = final_step_outputs
self.current_time = 0
next_obs = self.reset()
else:
next_obs = self._build_state()
return current_obs, next_obs, float(reward), finish
# def render(self, current_obs, next_obs, reward, finish):
# print('day={},hour={:2d}, state={}, next_state={}, reward={:.4f}, terminal={}\n'.
# format(self.day, self.current_time, current_obs, next_obs, reward, finish))
def _load_year_data(self):
price_df = pd.read_csv('data/prices.csv', sep=',')
load_df = pd.read_csv('data/houseload.csv', sep=',')
irradiance_df = pd.read_csv('data/irradiance.csv', sep=',')
temperature_df = pd.read_csv('data/temper.csv', sep=',')
wind_df = pd.read_csv('data/wind.csv', sep=',')
price = price_df['price'].to_numpy(dtype=float)
load = load_df['houseload'].to_numpy(dtype=float)
irradiance = irradiance_df['irradiance'].to_numpy(dtype=float)
temperature = temperature_df['t2m'].to_numpy(dtype=float)
wind = wind_df['wind_speed'].to_numpy(dtype=float)
'''rescale the magnitudes of price, generation and demand'''
def process_elements(elements, transform_function, add_function):
for element in elements:
transformed_element = transform_function(element)
add_function(transformed_element)
process_elements(price, lambda x: max(x / 10, 0.5), self.data_manager.add_price_element)
process_elements(load, lambda x: x * 3, self.data_manager.add_load_element)
process_elements(irradiance, lambda x: x, self.data_manager.add_irradiance_element)
process_elements(temperature, lambda x: x - 273.15, self.data_manager.add_temperature_element)
process_elements(wind, lambda x: x, self.data_manager.add_wind_element)
# if __name__ == '__main__':
# env = ESSEnv()
# env.TRAIN = False
# rewards = []
# env.reset()
# tem_action = [0.1, 0.1, 0.1, 0.1, 0.1]
# for _ in range(144):
# print(f'current month is {env.month}, current day is {env.day}, current time is {env.current_time}')
# current_obs, next_obs, reward, finish = env.step(tem_action)
# env.render(current_obs, next_obs, reward, finish)
# current_obs = next_obs
# rewards.append(reward)

View File

@ -1,190 +0,0 @@
import numpy as np
import pandas as pd
import json
from scipy.optimize import minimize
from pyswarm import pso
# load the data
file_path = './data.csv'
data = pd.read_csv(file_path)
# initialize the state
initial_soc = 0.4 # initial battery state of charge
initial_sg1 = 0
initial_sg2 = 0
initial_sg3 = 0
battery_capacity = 500 * initial_soc
# parameter settings
battery_efficiency = 0.9
battery_degradation = 0.01
battery_holding = 0.1
solar_coefficient = 0.01
voltage_change_percent = 0.1
wind_coefficient = 0.01
# generator parameters
DG_parameters = {
'gen_1': {'a': 0.0034, 'b': 3, 'c': 30, 'power_output_max': 150, 'power_output_min': 10, 'ramping_up': 100,
'ramping_down': 100},
'gen_2': {'a': 0.001, 'b': 10, 'c': 40, 'power_output_max': 375, 'power_output_min': 50, 'ramping_up': 100,
'ramping_down': 100},
'gen_3': {'a': 0.001, 'b': 15, 'c': 70, 'power_output_max': 500, 'power_output_min': 100, 'ramping_up': 200,
'ramping_down': 200}
}
NUM_GEN = len(DG_parameters.keys())
period = 100 # process the first 100 data points
def get_dg_info(parameters):
p_max = []
p_min = []
ramping_up = []
ramping_down = []
a_para = []
b_para = []
c_para = []
for name, gen_info in parameters.items():
p_max.append(gen_info['power_output_max'])
p_min.append(gen_info['power_output_min'])
ramping_up.append(gen_info['ramping_up'])
ramping_down.append(gen_info['ramping_down'])
a_para.append(gen_info['a'])
b_para.append(gen_info['b'])
c_para.append(gen_info['c'])
return p_max, p_min, ramping_up, ramping_down, a_para, b_para, c_para
p_max, p_min, ramping_up, ramping_down, a_para, b_para, c_para = get_dg_info(parameters=DG_parameters)
# objective function
def objective(x, period_batch, start, initial_states):
total_cost = 0
soc, sg1, sg2, sg3 = initial_states
for t in range(period_batch):
Ac, Ag1, Ag2, Ag3, Av = x[t * 5:(t + 1) * 5]
Pc = max(0.2, min(0.8, soc + 0.2 * Ac * 0.9))
Pg1 = max(0, min(150, 100 * Ag1 + sg1))
Pg2 = max(0, min(375, 100 * Ag2 + sg2))
Pg3 = max(0, min(500, 200 * Ag3 + sg3))
Pso = max(0, (0.2 * data['irradiance'][start + t] + 0.05 * data['temperature'][start + t] - 9.25) * (1 + Av))
if 3 <= data['wind_speed'][start + t] < 8:
Pw = data['wind_speed'][start + t] ** 3 * 172.2625 / 1000
elif 8 <= data['wind_speed'][start + t] < 12:
Pw = 64 * 172.2625 / 125
else:
Pw = 0
P = Pc + Pg1 + Pg2 + Pg3 + Pso + Pw
if P >= data['load'][start + t]:
Ee = P - data['load'][start + t]
Es = 0
else:
Es = data['load'][start + t] - P
Ee = 0
Cb = 0.01 * battery_capacity + 0.1 * Pc
Cg1 = 0.0034 * Pg1 ** 2 + 3 * Pg1 + 30
Cg2 = 0.001 * Pg2 ** 2 + 10 * Pg2 + 40
Cg3 = 0.001 * Pg3 ** 2 + 15 * Pg3 + 70
Cs = 0.01 * Pso
Cw = 0.01 * Pw
Rs = 0.5 * data['price'][start + t] * Ee
Cp = data['price'][start + t] * Es
Pe = (Ee - 100) * 50 if Ee > 100 else 0
Ps = (Es - 100) * 50 if Es > 100 else 0
total_cost += (Cb + Cg1 + Cg2 + Cg3 + Cs + Cw + Pe + Ps - Rs + Cp) / 1000
soc = Pc
sg1 = Pg1
sg2 = Pg2
sg3 = Pg3
return total_cost
# Constraints
def constraint(x, period_batch, start, initial_states):
constraints = []
soc, sg1, sg2, sg3 = initial_states
for t in range(period_batch):
Ac, Ag1, Ag2, Ag3, Av = x[t * 5:(t + 1) * 5]
Pc = max(0.2, min(0.8, soc + 0.2 * Ac * 0.9))
Pg1 = max(10, min(150, 100 * Ag1 + sg1))
Pg2 = max(50, min(375, 100 * Ag2 + sg2))
Pg3 = max(100, min(500, 200 * Ag3 + sg3))
Pso = max(0, (0.2 * data['irradiance'][start + t] + 0.05 * data['temperature'][start + t] - 9.25) * (1 + Av))
if 3 <= data['wind_speed'][start + t] < 8:
Pw = data['wind_speed'][start + t] ** 3 * 172.2625 / 1000
elif 8 <= data['wind_speed'][start + t] < 12:
Pw = 64 * 172.2625 / 125
else:
Pw = 0
P = Pc + Pg1 + Pg2 + Pg3 + Pso + Pw
        # make sure the updated values do not fall below their minimums
constraints.append(Pc - 0.2)
constraints.append(Pg1 - 10)
constraints.append(Pg2 - 50)
constraints.append(Pg3 - 100)
constraints.append(P - data['load'][start + t])
soc = Pc
sg1 = Pg1
sg2 = Pg2
sg3 = Pg3
return constraints
actions = []
batch_size = 10  # process 10 data points per batch
total_batches = 10  # 100 data points in total
start = 0
for batch_num in range(total_batches):
end = min(start + batch_size, period)
period_batch = end - start
    # initialize the starting state for this batch
initial_states = (initial_soc, initial_sg1, initial_sg2, initial_sg3)
    # # Random initialization would be closer to the real situation
# x0 = np.random.uniform(0, 1, period_batch * 5)
# bounds = [(-1, 1)] * (period_batch * 5)
# cons = {'type': 'ineq', 'fun': lambda x: constraint(x, period_batch, start, initial_states)}
# result = minimize(lambda x: objective(x, period_batch, start, initial_states), x0, method='SLSQP', bounds=bounds,
# constraints=cons)
#
# actions.extend(result.x.reshape((period_batch, 5)).tolist())
    # Particle swarm optimization
lb = [-1] * (period_batch * 5)
ub = [1] * (period_batch * 5)
    # pyswarm expects a single f_ieqcons callable returning an array whose entries must all be >= 0
    xopt, fopt = pso(lambda x: objective(x, period_batch, start, initial_states), lb, ub,
                     f_ieqcons=lambda x: np.array(constraint(x, period_batch, start, initial_states)),
                     maxiter=100, swarmsize=50)
actions.extend(xopt.reshape((period_batch, 5)).tolist())
    # carry the last state of this batch forward as the initial state of the next batch
last_action = actions[-1]
initial_soc = max(0.2, min(0.8, initial_soc + 0.2 * last_action[0] * 0.9))
initial_sg1 = max(10, min(150, 100 * last_action[1] + initial_sg1))
initial_sg2 = max(50, min(375, 100 * last_action[2] + initial_sg2))
initial_sg3 = max(100, min(500, 200 * last_action[3] + initial_sg3))
start += batch_size
actions_json_path = './actions_batch_p.json'
try:
with open(actions_json_path, 'w') as f:
json.dump(actions, f)
print(f"文件保存成功:{actions_json_path}")
except Exception as e:
print(f"文件保存失败:{e}")

View File

@ -1,183 +0,0 @@
import json
import os
import time
import numpy as np
import pandas as pd
import torch
from concurrent.futures import ThreadPoolExecutor
from deap import base, creator, tools, algorithms
def fitness_torch(individuals, price, load, temperature, irradiance, wind_speed, device):
individuals_torch = torch.tensor(individuals, device=device)
num_individuals = individuals_torch.shape[0]
Ac, Ag1, Ag2, Ag3, Av = [individuals_torch[:, i * period:(i + 1) * period] for i in range(5)]
soc = torch.zeros((num_individuals, period), device=device)
Pg1, Pg2, Pg3 = [torch.zeros((num_individuals, period), device=device) for _ in range(3)]
Cb, Cg1, Cg2, Cg3, Cs, Cw, Rs, Cp, Pe, Ps, Ee, Es = [torch.zeros((num_individuals, period), device=device) for _ in
range(12)]
price_torch = torch.tensor(price, device=device)
load_torch = torch.tensor(load, device=device)
temperature_torch = torch.tensor(temperature, device=device)
irradiance_torch = torch.tensor(irradiance, device=device)
wind_speed_torch = torch.tensor(wind_speed, device=device)
for t in range(period):
if t > 0:
soc[:, t] = torch.clamp(soc[:, t - 1] + 0.2 * Ac[:, t] * 0.9, 0.2, 0.8)
Pg1[:, t] = torch.clamp(Pg1[:, t - 1] + 100 * Ag1[:, t], min=0, max=150)
Pg2[:, t] = torch.clamp(Pg2[:, t - 1] + 100 * Ag2[:, t], min=0, max=375)
Pg3[:, t] = torch.clamp(Pg3[:, t - 1] + 200 * Ag3[:, t], min=0, max=500)
else:
soc[:, t] = 0.4
Pg1[:, t] = torch.clamp(100 * Ag1[:, t], min=0, max=150)
Pg2[:, t] = torch.clamp(100 * Ag2[:, t], min=0, max=375)
Pg3[:, t] = torch.clamp(200 * Ag3[:, t], min=0, max=500)
Pso = torch.clamp((0.2 * irradiance_torch[t] + 0.05 * temperature_torch[t] - 9.25) * (1 + Av[:, t]), min=0)
Pw = torch.where(
(wind_speed_torch[t] >= 3) & (wind_speed_torch[t] < 8),
wind_speed_torch[t] ** 3 * 172.2625 / 1000,
torch.where(
(wind_speed_torch[t] >= 8) & (wind_speed_torch[t] < 12),
64 * 172.2625 / 125,
torch.zeros_like(wind_speed_torch[t])
)
)
P = Ac[:, t] + Pg1[:, t] + Pg2[:, t] + Pg3[:, t] + Pso + Pw
Ee[:, t] = torch.where(P >= load_torch[t], P - load_torch[t], torch.zeros_like(P))
Es[:, t] = torch.where(P < load_torch[t], load_torch[t] - P, torch.zeros_like(P))
Cb[:, t] = 0.01 * Ac[:, t] + 0.1 * soc[:, t]
Cg1[:, t] = 0.0034 * Pg1[:, t] ** 2 + 3 * Pg1[:, t] + 30
Cg2[:, t] = 0.001 * Pg2[:, t] ** 2 + 10 * Pg2[:, t] + 40
Cg3[:, t] = 0.001 * Pg3[:, t] ** 2 + 15 * Pg3[:, t] + 70
Cs[:, t] = 0.01 * Pso
Cw[:, t] = 0.01 * Pw
Rs[:, t] = 0.5 * price_torch[t] * Ee[:, t]
Cp[:, t] = price_torch[t] * Es[:, t]
Pe[:, t] = torch.where(Ee[:, t] > 100, (Ee[:, t] - 100) * 50, torch.zeros_like(Ee[:, t]))
Ps[:, t] = torch.where(Es[:, t] > 100, (Es[:, t] - 100) * 50, torch.zeros_like(Es[:, t]))
total_cost = torch.sum(Cb + Cg1 + Cg2 + Cg3 + Cs + Cw + Pe + Ps - Rs + Cp, dim=1)
reward = -total_cost / 1000
return reward.cpu().numpy()
def check_bounds(func):
def wrapper(*args, **kwargs):
offspring = func(*args, **kwargs)
for individual in offspring:
for i in range(len(individual)):
individual[i] = np.clip(individual[i], -1, 1)
return offspring
return wrapper
def save_progress(population, gen, filename='ga_progress.json'):
data = {
'population': [[list(ind), ind.fitness.values[0]] for ind in population],
'generation': gen
}
with open(filename, 'w') as f:
json.dump(data, f)
print(f"进度已保存到第 {gen} 代到 '{filename}'")
def load_progress(filename='ga_progress.json'):
if os.path.exists(filename):
with open(filename, 'r') as f:
data = json.load(f)
population = []
for ind_data in data['population']:
ind = creator.Individual(ind_data[0])
ind.fitness.values = (ind_data[1],)
population.append(ind)
gen = data['generation']
print(f"从第 {gen} 代加载进度。")
return population, gen
else:
return None, 0
def save_decision_values(best_ind, period, file_suffix):
decision_values = [
{"x1": best_ind[i], "x2": best_ind[i + period], "x3": best_ind[i + 2 * period], "x4": best_ind[i + 3 * period],
"x5": best_ind[i + 4 * period]} for i in range(period)]
filename = f'./decision_values_{file_suffix}.json'
with open(filename, 'w') as f:
json.dump(decision_values, f)
print(f"周期 {file_suffix}:决策值已保存到 '{filename}'")
def main():
data = pd.read_csv('./data.csv')
    # period = len(data)
    global period  # fitness_torch reads `period` at module scope, so promote it there
    period = 2
price, load, temperature, irradiance, wind_speed = [data[col].values for col in
['price', 'load', 'temperature', 'irradiance', 'wind_speed']]
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)
toolbox = base.Toolbox()
toolbox.register("attr_float", np.random.uniform, -1, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_float, n=5 * period)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("mate", check_bounds(tools.cxBlend), alpha=0.5)
toolbox.register("mutate", check_bounds(tools.mutGaussian), mu=0, sigma=0.1, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("evaluate", fitness_torch)
population, start_gen = load_progress()
if population is None:
population = toolbox.population(n=500)
start_gen = 0
NGEN, CXPB, MUTPB = 500, 0.7, 0.2
batch_size = 500
for gen in range(start_gen, NGEN):
start_time = time.time()
offspring = algorithms.varAnd(population, toolbox, cxpb=CXPB, mutpb=MUTPB)
num_individuals = len(offspring)
with ThreadPoolExecutor() as executor:
futures = []
for i in range(0, num_individuals, batch_size):
batch = offspring[i:i + batch_size]
individuals = [ind[:] for ind in batch]
futures.append(
                    executor.submit(toolbox.evaluate, individuals, price, load, temperature, irradiance, wind_speed,
                                    "cuda" if torch.cuda.is_available() else "cpu"))  # fall back to CPU when no GPU is available
for future in futures:
fitnesses = future.result()
for ind, fitness in zip(offspring, fitnesses):
ind.fitness.values = (fitness,)
population = toolbox.select(offspring, k=len(population))
end_time = time.time()
elapsed_time = end_time - start_time
print(f"{gen + 1} 代完成于 {elapsed_time:.2f}")
if (gen + 1) % 100 == 0:
best_ind = tools.selBest(population, 1)[0]
save_decision_values(best_ind, period, gen + 1)
save_progress(population, gen + 1)
best_ind = tools.selBest(population, 1)[0]
    print('Best individual:', best_ind)
    print('Fitness:', best_ind.fitness.values)
save_decision_values(best_ind, period, NGEN)
save_progress(population, NGEN)
if __name__ == "__main__":
main()
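As a quick illustration of the batched-evaluation idea behind fitness_torch above, here is a small self-contained sketch (our simplification: a single hour, solar and wind omitted, made-up price and load) that scores a whole population of action vectors in one vectorized torch pass:

import torch

def batched_reward(actions, price, load, prev_soc=0.4):
    # actions: (N, 5) tensor of [Ac, Ag1, Ag2, Ag3, Av] in [-1, 1] for one hour
    Ac, Ag1, Ag2, Ag3, Av = actions.unbind(dim=1)
    soc = torch.clamp(prev_soc + 0.2 * Ac * 0.9, 0.2, 0.8)
    Pg1 = torch.clamp(100 * Ag1, 0, 150)
    Pg2 = torch.clamp(100 * Ag2, 0, 375)
    Pg3 = torch.clamp(200 * Ag3, 0, 500)
    P = Ac + Pg1 + Pg2 + Pg3
    shortage = torch.clamp(load - P, min=0)
    cost = (0.0034 * Pg1 ** 2 + 3 * Pg1 + 30) + (0.001 * Pg2 ** 2 + 10 * Pg2 + 40) \
         + (0.001 * Pg3 ** 2 + 15 * Pg3 + 70) + 0.01 * Ac + 0.1 * soc + price * shortage
    return -cost / 1000

device = "cuda" if torch.cuda.is_available() else "cpu"
pop = torch.rand(500, 5, device=device) * 2 - 1        # 500 candidates in [-1, 1]
rewards = batched_reward(pop, price=50.0, load=300.0)  # one fused evaluation, no Python loop
print(rewards.shape)                                   # torch.Size([500])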

View File

@ -1,183 +0,0 @@
import json
import os
import time
import numpy as np
import pandas as pd
import torch
from concurrent.futures import ThreadPoolExecutor
from deap import base, creator, tools, algorithms
def fitness_torch(individuals, price, load, temperature, irradiance, wind_speed, prev_soc, prev_Pg1, prev_Pg2, prev_Pg3,
device):
individuals_torch = torch.tensor(individuals, device=device)
num = individuals_torch.shape[0]
Ac, Ag1, Ag2, Ag3, Av = [individuals_torch[:, i * period:(i + 1) * period] for i in range(5)]
# soc = torch.zeros((num, period), device=device)
# Pg1, Pg2, Pg3 = [torch.zeros((num, period), device=device) for _ in range(3)]
# Cb, Cg1, Cg2, Cg3, Cs, Cw, Rs, Cp, Pe, Ps, Ee, Es = [torch.zeros((num, period), device=device) for _ in range(12)]
soc = torch.clamp(prev_soc + 0.2 * Ac * 0.9, 0.2, 0.8)
Pg1 = torch.clamp(prev_Pg1 + 100 * Ag1, min=0, max=150)
Pg2 = torch.clamp(prev_Pg2 + 100 * Ag2, min=0, max=375)
Pg3 = torch.clamp(prev_Pg3 + 200 * Ag3, min=0, max=500)
Pso = torch.clamp((0.2 * irradiance + 0.05 * temperature - 9.25) * (1 + Av), min=0)
Pw = torch.where(
(wind_speed >= 3) & (wind_speed < 8),
wind_speed ** 3 * 172.2625 / 1000,
torch.where(
(wind_speed >= 8) & (wind_speed < 12),
64 * 172.2625 / 125,
torch.zeros_like(wind_speed)
)
)
P = Ac + Pg1 + Pg2 + Pg3 + Pso + Pw
Ee = torch.where(P >= load, P - load, torch.zeros_like(P))
Es = torch.where(P < load, load - P, torch.zeros_like(P))
Cb = 0.01 * Ac + 0.1 * soc
Cg1 = 0.0034 * Pg1 ** 2 + 3 * Pg1 + 30
Cg2 = 0.001 * Pg2 ** 2 + 10 * Pg2 + 40
Cg3 = 0.001 * Pg3 ** 2 + 15 * Pg3 + 70
Cs = 0.01 * Pso
Cw = 0.01 * Pw
Rs = 0.5 * price * Ee
Cp = price * Es
Pe = torch.where(Ee > 100, (Ee - 100) * 50, torch.zeros_like(Ee))
Ps = torch.where(Es > 100, (Es - 100) * 50, torch.zeros_like(Es))
total_cost = torch.sum(Cb + Cg1 + Cg2 + Cg3 + Cs + Cw + Pe + Ps - Rs + Cp, dim=1)
reward = -total_cost / 1000
return reward.cpu().numpy(), soc.cpu().numpy(), Pg1.cpu().numpy(), Pg2.cpu().numpy(), Pg3.cpu().numpy()
def save_decision_values(best_ind, period, index):
decisions = {
'Ac': best_ind[0],
'Ag1': best_ind[1],
'Ag2': best_ind[2],
'Ag3': best_ind[3],
'Av': best_ind[4]
}
with open(f'decision_values_{period}_index_{index}.json', 'w') as f:
json.dump(decisions, f)
def save_progress(population, period):
    population_data = [list(ind) for ind in population]  # Individual is a list subclass, so it has no .tolist()
with open(f'population_{period}.json', 'w') as f:
json.dump({'period': period, 'population': population_data}, f)
def load_progress():
if os.path.exists('population_gen_499.json'):
with open('population_gen_499.json', 'r') as f:
data = json.load(f)
return data['population'], data['period']
return None, 0
def check_bounds(func):
def wrapper(*args, **kwargs):
offspring = func(*args, **kwargs)
        if any(child is None for child in offspring):  # mutation returns a 1-tuple, so do not index offspring[1]
            print("Error: one of the offspring is None", offspring)
            raise ValueError("Offspring cannot be None.")
for child in offspring:
for i in range(len(child)):
if child[i] < -1:
child[i] = -1
elif child[i] > 1:
child[i] = 1
return offspring
return wrapper
def main():
    global period  # fitness_torch reads `period` at module scope
    period = 8760
NGEN = 500
CXPB, MUTPB = 0.7, 0.2
batch_size = 500
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
data = pd.read_csv('./data.csv')
price, load, temperature, irradiance, wind_speed = [data[col].values for col in
['price', 'load', 'temperature', 'irradiance', 'wind_speed']]
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin)
toolbox = base.Toolbox()
toolbox.register("attr_float", np.random.uniform, -1, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_float, 5)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("mate", check_bounds(tools.cxBlend), alpha=0.5)
toolbox.register("mutate", check_bounds(tools.mutGaussian), mu=0, sigma=0.1, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("evaluate", fitness_torch)
    population, _ = load_progress()  # load_progress returns a (population, period) tuple
if population is None:
population = toolbox.population(n=NGEN)
print("Initial population:", len(population), "and first individual:", population[0])
prev_soc = torch.tensor([0.4] * NGEN, device=device)
prev_Pg1 = torch.zeros(NGEN, device=device)
prev_Pg2 = torch.zeros(NGEN, device=device)
prev_Pg3 = torch.zeros(NGEN, device=device)
for index in range(period):
for gen in range(NGEN):
start_time = time.time()
offspring = algorithms.varAnd(population, toolbox, cxpb=CXPB, mutpb=MUTPB)
num_individuals = len(offspring)
with ThreadPoolExecutor() as executor:
futures = []
for i in range(0, num_individuals, batch_size):
batch = offspring[i:i + batch_size]
individuals = [ind[:] for ind in batch]
futures.append(
executor.submit(toolbox.evaluate, individuals, price[index], load[index], temperature[index],
irradiance[index], wind_speed[index], prev_soc, prev_Pg1, prev_Pg2, prev_Pg3,
device)
)
for future in futures:
                    fitnesses, socs, Pg1s, Pg2s, Pg3s = future.result()  # the evaluator already returns the five arrays
for ind, fitness in zip(offspring, fitnesses):
ind.fitness.values = (fitness,)
prev_soc[:len(socs)] = torch.tensor(socs, device=device)
prev_Pg1[:len(Pg1s)] = torch.tensor(Pg1s, device=device)
prev_Pg2[:len(Pg2s)] = torch.tensor(Pg2s, device=device)
prev_Pg3[:len(Pg3s)] = torch.tensor(Pg3s, device=device)
population = toolbox.select(offspring, k=len(population))
print("Population after selection:", population[:5])
end_time = time.time()
print(f"{index + 1}小时完成花费 {end_time - start_time:.2f}")
best_ind = tools.selBest(population, 1)[0]
        print('Best individual:', best_ind)
        print('Fitness:', best_ind.fitness.values)
save_decision_values(best_ind, period, index)
save_progress(population, period)
prev_soc.fill_(0.4)
prev_Pg1.fill_(0)
prev_Pg2.fill_(0)
prev_Pg3.fill_(0)
prev_soc[:len(best_ind)] = torch.tensor(best_ind[:period], device=device)
prev_Pg1[:len(best_ind)] = torch.tensor(best_ind[period:2 * period], device=device)
prev_Pg2[:len(best_ind)] = torch.tensor(best_ind[2 * period:3 * period], device=device)
prev_Pg3[:len(best_ind)] = torch.tensor(best_ind[3 * period:4 * period], device=device)
if __name__ == "__main__":
main()
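The hand-off of prev_soc / prev_Pg* between hours is the hardest part of this script to follow. The sketch below (our own simplification, with a random stand-in for the GA's per-hour best individual) shows the intended rolling pattern in isolation:

import numpy as np

def step_state(state, action):
    soc, pg1, pg2, pg3 = state
    ac, ag1, ag2, ag3, _av = action
    soc = np.clip(soc + 0.2 * ac * 0.9, 0.2, 0.8)
    pg1 = np.clip(pg1 + 100 * ag1, 0, 150)
    pg2 = np.clip(pg2 + 100 * ag2, 0, 375)
    pg3 = np.clip(pg3 + 200 * ag3, 0, 500)
    return soc, pg1, pg2, pg3

state = (0.4, 0.0, 0.0, 0.0)                   # initial soc and generator outputs
for hour in range(3):                          # 8760 in the full run
    best_action = np.random.uniform(-1, 1, 5)  # stand-in for the GA's best individual at this hour
    state = step_state(state, best_action)     # becomes prev_soc / prev_Pg* for the next hour
    print(hour, np.round(state, 3))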

View File

@ -42,7 +42,6 @@ class Battery:
self.capacity = parameters['capacity']
self.min_soc = parameters['min_soc']
self.max_soc = parameters['max_soc']
self.initial_capacity = parameters['initial_capacity']
self.degradation = parameters['degradation']
self.holding = parameters['holding']
self.max_charge = parameters['max_charge']

View File

@ -30,7 +30,6 @@ battery_parameters = {
'holding': 0.1,
'max_soc': 0.8,
'min_soc': 0.2,
'initial_capacity': 0.4
}
dg_parameters = {

View File

@ -67,8 +67,8 @@ def plot_optimization_result(datasource, directory): # data source is dataframe
axs[1, 0].plot(T, datasource['netload'], label='Netload', drawstyle='steps-mid', alpha=0.7)
axs[1, 0].legend(loc='upper right', bbox_to_anchor=(1.4, 1), fontsize=12, frameon=False, labelspacing=0.3)
fig.savefig(f"{directory}/rl.svg", format='svg', dpi=600, bbox_inches='tight')
    print('rl figure has been plotted and saved')
fig.savefig(f"{directory}/gurobi.svg", format='svg', dpi=600, bbox_inches='tight')
    print('gurobi figure has been plotted and saved')
def plot_evaluation_information(datasource, directory):
@ -139,8 +139,8 @@ def plot_evaluation_information(datasource, directory):
axs[1, 1].set_xlabel('Time (h)')
axs[1, 1].set_ylabel('Cost')
axs[1, 1].bar(eval_data['time_step'], eval_data['operation_cost'])
fig.savefig(f"{directory}/gurobi.svg", format='svg', dpi=600, bbox_inches='tight')
    print('gurobi figure has been plotted and saved')
fig.savefig(f"{directory}/rl.svg", format='svg', dpi=600, bbox_inches='tight')
    print('rl figure has been plotted and saved')
def make_dir(directory, feature_change):

View File

@ -1,88 +0,0 @@
from pathlib import Path
from openai import OpenAI
import os
client = OpenAI(
    api_key=os.environ.get("DASHSCOPE_API_KEY"),  # read the key from the environment rather than hard-coding it
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",  # DashScope OpenAI-compatible base_url
)
file_csv = client.files.create(file=Path("./data.csv"), purpose="file-extract")
# A newly uploaded file must first be parsed by the model, so the first response may take noticeably longer
completion = client.chat.completions.create(
model="qwen-long",
messages=[
{
'role': 'system',
'content':
                'You are an expert in large models and reinforcement learning who specializes in tuning and controlling smart residential energy systems, and you can give the optimal action decision for the current environment state. '
                'The energy system consists of a solar plant, a wind plant, generators and a battery module connected to the residence through the power grid. Glossary of symbols: '
                'Ac: battery charging action; Ag1: generator-1 power action; Ag2: generator-2 power action; Ag3: generator-3 power action; Av: solar voltage action; '
                'Sp: electricity price state; Sc: battery state of charge; Sl: residential load state; Sg1: generator-1 power state; Sg2: generator-2 power state; Sg3: generator-3 power state; St: temperature state; Si: solar irradiance state; Sw: wind speed state; '
                'Pc: battery charge/discharge amount; Pg1: generator-1 output; Pg2: generator-2 output; Pg3: generator-3 output; Pso: solar output; Pw: wind output; '
                'Cb: battery cost; Cg1: generator-1 cost; Cg2: generator-2 cost; Cg3: generator-3 cost; Cs: solar cost; Cw: wind cost; '
                'Rs: sales revenue; Cp: purchase cost; Ee: energy surplus; Es: energy shortage; Pe: surplus penalty; Ps: shortage penalty. '
                'Part 1. Control scheme and objectives: '
                '1. Prefer solar and wind generation; when they cannot cover the full load, use the generators and the battery. '
                '2. When supplying power, combine the available sources (solar, wind, generators, battery), weigh the cost of each module, meet the residential load of the next time step, and minimize the supply-demand gap. '
                '3. Limit transactions that exceed the public-grid trading cap, so that grid fluctuations do not affect the stability of the residential supply. '
                '4. Maximize the reward function and minimize the energy imbalance. '
                'Part 2. The continuous state space is a vector of length 9 with the following components: '
                'Sp (from data); Sc (initially 0.4, always between 0.2 and 0.8); Sl (from data); '
                'Sg1 (initial power 0); Sg2 (initial power 0); Sg3 (initial power 0); '
                'St (from data); Si (from data); Sw (from data). '
                'Sc, Sg1, Sg2 and Sg3 are always non-negative. '
                'Part 3. The continuous action space is a vector x of length 5 giving the values at the current time, each ranging from -1 to 1. '
                'Each value changes relative to the previous time step; Ag1, Ag2 and Ag3 are changes relative to the maximum instantaneous powers 100, 100 and 200 respectively. '
                '1. Ac: the total battery capacity is 500, the initial battery capacity is 500 * 0.4 = 200, and the capacity change is 100 * x; '
                '2. Ag1: generation change of 100 * x; '
                '3. Ag2: generation change of 100 * x; '
                '4. Ag3: generation change of 200 * x; '
                '5. Av: the solar voltage becomes (1 + x) times its previous value. '
                'Generators can be switched on or off by adjusting their power as needed: when off, keep Sg = 0; when on, the Pg constraints must be satisfied. '
                'Because of cost, a later generator is normally only switched on when the previous one at maximum power cannot cover the remaining load. '
                'Part 4. Generation formulas: '
                '1. Pc = max(0.2, min(0.8, Sc + 0.2 * action[0] * 0.9)), where 0.8 and 0.2 are the upper and lower bounds of the battery state of charge; '
                '2. Pg1 = max(0, min(150, 100 * action[1] + Sg1)), where 150 and 0 bound the output of generator 1 when it is on; '
                '3. Pg2 = max(0, min(375, 100 * action[2] + Sg2)), where 375 and 0 bound the output of generator 2 when it is on; '
                '4. Pg3 = max(0, min(500, 200 * action[3] + Sg3)), where 500 and 0 bound the output of generator 3 when it is on; '
                '5. Pso = max(0, (0.2 * Si + 0.05 * St - 9.25) * (1 + action[4])), so solar output is never negative; '
                '6. Pw depends on Ws: '
                'if 3 <= Ws < 8, Pw = Ws^3 * 172.2625 / 1000; '
                'elif 8 <= Ws < 12, Pw = 64 * 172.2625 / 125; '
                'else Pw = 0. '
                'Part 5. Cost and energy balance: '
                'P = Pc + Pg1 + Pg2 + Pg3 + Pso + Pw; '
                'if P >= Sl, Ee = P - Sl, else Es = Sl - P. '
                '1. Cb = 0.01 * Ac + 0.1 * Sc; '
                '2. Cg1 = 0.0034 * Pg1^2 + 3 * Pg1 + 30; '
                '3. Cg2 = 0.001 * Pg2^2 + 10 * Pg2 + 40; '
                '4. Cg3 = 0.001 * Pg3^2 + 15 * Pg3 + 70; '
                '5. Cs = 0.01 * Pso; '
                '6. Cw = 0.01 * Pw; '
                '7. Rs = 0.5 * Sp * Ee; '
                '8. Cp = Sp * Es; '
                '9. Pe = (Ee - 100) * 50; '
                '10. Ps = (Es - 100) * 50. '
                'Rs and Pe apply to the energy-surplus case while Cp and Ps apply to the shortage case, so only one of the two groups can be non-zero at a time. '
                'Part 6. Reward function: '
                'Reward = - (Cb + Cg1 + Cg2 + Cg3 + Cs + Cw + Pe + Ps - Rs + Cp) / 1000'
},
{
'role': 'system',
'content': f'fileid://{file_csv.id}'
},
{
'role': 'user',
'content':
                'Read the data and use the formulas above to compute the state at every time step. Use the most suitable optimization method and give the action control scheme for every time step: '
                'compute once per hour and generate an array of length 5 representing the action decisions for Ac, Ag1, Ag2, Ag3 and Av. '
                'Return the data in the following JSON format: [{x1, x2, x3, x4, x5}, {x1, x2, x3, x4, x5}, ..., {x1, x2, x3, x4, x5}]. '
                'Only the JSON data is needed, with no additional analysis.'
}
],
stream=False
)
print(completion.choices[0].message.model_dump())  # non-streaming output
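To sanity-check whatever the model returns, a small sketch like the following (ours, not part of the original script; it assumes the reply really is the pure JSON list of {x1..x5} objects requested above, and uses made-up state values) can parse the actions and score one hour with the prompt's reward formula, applying the 100-unit penalty threshold used in the repo's other scripts:

import json

def hour_reward(a, sp, sl, sc, sg1, sg2, sg3, st, si, sw):
    ac, ag1, ag2, ag3, av = a
    pc = max(0.2, min(0.8, sc + 0.2 * ac * 0.9))
    pg1 = max(0, min(150, 100 * ag1 + sg1))
    pg2 = max(0, min(375, 100 * ag2 + sg2))
    pg3 = max(0, min(500, 200 * ag3 + sg3))
    pso_ = max(0, (0.2 * si + 0.05 * st - 9.25) * (1 + av))
    pw = sw ** 3 * 172.2625 / 1000 if 3 <= sw < 8 else (64 * 172.2625 / 125 if 8 <= sw < 12 else 0)
    p = pc + pg1 + pg2 + pg3 + pso_ + pw
    ee, es = (p - sl, 0) if p >= sl else (0, sl - p)
    cost = (0.01 * ac + 0.1 * sc
            + 0.0034 * pg1 ** 2 + 3 * pg1 + 30
            + 0.001 * pg2 ** 2 + 10 * pg2 + 40
            + 0.001 * pg3 ** 2 + 15 * pg3 + 70
            + 0.01 * pso_ + 0.01 * pw
            - 0.5 * sp * ee + sp * es
            + (ee - 100) * 50 * (ee > 100) + (es - 100) * 50 * (es > 100))
    return -cost / 1000

actions = json.loads(completion.choices[0].message.content)  # assumes the reply is pure JSON
first = actions[0]
a = [first[k] for k in ("x1", "x2", "x3", "x4", "x5")]
print(hour_reward(a, sp=50, sl=300, sc=0.4, sg1=0, sg2=0, sg3=0, st=25, si=500, sw=6))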

View File

@ -85,7 +85,7 @@ def optimization_base_result(env, month, day, initial_soc):
(battery_energy_change[t] * battery_efficiency) for t in range(1, period)), name='soc update')
    # Set up the cost terms
cost_gen = gp.quicksum(
(a_para[g] * gen_output[g, t] * gen_output[g, t] + b_para[g] * gen_output[g, t] + c_para[g] * on_off[g, t]) for
(a_para[g] * gen_output[g, t] * gen_output[g, t] + b_para[g] * gen_output[g, t] + c_para[g]) * on_off[g, t] for
t in range(period) for g in range(NUM_GEN))
cost_battery = gp.quicksum(battery_energy_change[t] * battery_degradation + soc[t] * battery_holding for t in range(period))
cost_import = gp.quicksum(grid_energy_import[t] * price[t] for t in range(period))
@ -94,7 +94,6 @@ def optimization_base_result(env, month, day, initial_soc):
cost_wind = gp.quicksum(wind[t] * wind_cofficient for t in range(period))
m.setObjective((cost_gen + cost_battery + cost_import - cost_export + cost_solar + cost_wind), GRB.MINIMIZE)
m.setParam(GRB.Param.FuncNonlinear, 1)
m.optimize()
    # Record the data for plotting
@ -155,7 +154,7 @@ class Arguments:
self.num_episode = 1000
self.gamma = 0.995 # discount factor of future rewards
        # self.reward_scale = 1  # an approximate target reward, usually close to 256
self.learning_rate = 2 ** -14 # 2 ** -14 ~= 6e-5
        self.learning_rate = 2 ** -14  # 2 ** -14 ~= 6e-5
self.soft_update_tau = 2 ** -8 # 2 ** -8 ~= 5e-3
self.net_dim = 256 # the network width 256
self.batch_size = 4096 # num of transitions sampled from replay buffer.