import pickle

import numpy as np
import pandas as pd
import torch

from PPO_llm import *
from plotDRL import *

test_day = 9  # number of consecutive test days taken from each month


def test_ram_episode(env):
    """Run one 24-hour episode with uniformly random actions (random baseline)."""
    state = env.reset()
    sum_reward = 0
    sum_unbalance = 0
    for _ in range(24):
        action = np.random.uniform(env.action_space.low, env.action_space.high)
        state, next_state, reward, done = env.step(action)
        state = next_state
        sum_reward += reward
        sum_unbalance += env.real_unbalance
    record = {'reward': [sum_reward], 'unbalance': [sum_unbalance]}
    return record


def test_llm(env, llm_actions, index):
    """Replay 24 pre-computed LLM actions starting at the given hourly index."""
    sum_reward = 0
    sum_unbalance = 0
    for i in range(24):
        action = llm_actions[index + i]
        state, next_state, reward, done = env.step(action)
        sum_reward += reward
        sum_unbalance += env.real_unbalance
    record = {'reward': [sum_reward], 'unbalance': [sum_unbalance]}
    return record


def test_one_episode(env, act, device):
    """Run one 24-hour episode with the given actor network."""
    state = env.reset()
    sum_reward = 0
    sum_unbalance = 0
    for i in range(24):
        s_tensor = torch.as_tensor((state,), device=device)
        a_tensor = act(s_tensor)
        action = a_tensor.detach().cpu().numpy()[0]
        # rl_action = a_tensor.detach().cpu().numpy()[0]
        # llm_action = llm_actions[index + i]
        # action = 0.95 * np.array(rl_action) + 0.05 * np.array(llm_action)
        state, next_state, reward, done = env.step(action)
        state = next_state
        sum_reward += reward
        sum_unbalance += env.real_unbalance
    record = {'reward': [sum_reward], 'unbalance': [sum_unbalance]}
    return record


def generate_test_dates():
    """Build the list of test dates: days 20 .. 20 + test_day - 1 of every month."""
    # months_days = {1: 31, 2: 28, 3: 31, 4: 30, 5: 31, 6: 30, 7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31}
    dates = []
    for month in range(1, 13):  # iterate over the 12 months
        start_day = 20
        # days_in_month = months_days[month]
        # make sure the month has at least test_day days left after start_day:
        # if days_in_month >= start_day + test_day:
        # generate test_day consecutive days
        for day in range(start_day, start_day + test_day):
            dates.append((month, day))
    return dates


def run_multiple_tests(env, args, agent1, agent2):
    all_ppo = pd.DataFrame()
    all_llm = pd.DataFrame()
    all_lmppo = pd.DataFrame()
    all_milp = pd.DataFrame()
    all_ram = pd.DataFrame()
    test_dates = generate_test_dates()  # get the test dates
    llm_actions = load_llm_actions('data/results_day.json')
    # evaluate each group of test_day dates
    for idx in range(0, len(test_dates), test_day):
        current_dates = test_dates[idx:idx + test_day]
        print(f'Start testing from {current_dates[0]} to {current_dates[-1]}')
        ppo_rewards = []
        ppo_unbalances = []
        llm_rewards = []
        llm_unbalances = []
        lmppo_rewards = []
        lmppo_unbalances = []
        milp_rewards = []
        ram_rewards = []
        ram_unbalances = []
        # run every method on each day of the current group
        for month, day in current_dates:
            # absolute hourly index of (month, day) within the year,
            # used to look up the pre-computed LLM actions
            index = (sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24
            ppo_res = test_one_episode(env, agent1.act, agent1.device)
            base_res = optimization_base_result(env, month, day)
            llm_res = test_llm(env, llm_actions, index)
            lmppo_res = test_one_episode(env, agent2.act, agent2.device)
            ram_res = test_ram_episode(env)
            # store the results of this day
            ppo_rewards.append(ppo_res['reward'][0])
            ppo_unbalances.append(ppo_res['unbalance'][0])
            llm_rewards.append(llm_res['reward'][0])
            llm_unbalances.append(llm_res['unbalance'][0])
            lmppo_rewards.append(lmppo_res['reward'][0])
            lmppo_unbalances.append(lmppo_res['unbalance'][0])
            m_reward = -base_res['step_cost'].sum() / 1e3
            milp_rewards.append(m_reward)
            ram_rewards.append(ram_res['reward'][0])
            ram_unbalances.append(ram_res['unbalance'][0])
        # average the results over the test_day days
        avg_ppo_reward = sum(ppo_rewards) / len(ppo_rewards)
        avg_ppo_unbalance = sum(ppo_unbalances) / len(ppo_unbalances)
        avg_llm_reward = sum(llm_rewards) / len(llm_rewards)
        avg_llm_unbalance = sum(llm_unbalances) / len(llm_unbalances)
        avg_lmppo_reward = sum(lmppo_rewards) / len(lmppo_rewards)
        avg_lmppo_unbalance = sum(lmppo_unbalances) / len(lmppo_unbalances)
        avg_milp_reward = sum(milp_rewards) / len(milp_rewards)
        avg_ram_reward = sum(ram_rewards) / len(ram_rewards)
        avg_ram_unbalance = sum(ram_unbalances) / len(ram_unbalances)
        # save the averaged results of this group
        ppo_data = pd.DataFrame({'reward': [avg_ppo_reward], 'unbalance': [avg_ppo_unbalance]})
        llm_data = pd.DataFrame({'reward': [avg_llm_reward], 'unbalance': [avg_llm_unbalance]})
        lmppo_data = pd.DataFrame({'reward': [avg_lmppo_reward], 'unbalance': [avg_lmppo_unbalance]})
        milp_data = pd.DataFrame({'reward': [avg_milp_reward]})
        ram_data = pd.DataFrame({'reward': [avg_ram_reward], 'unbalance': [avg_ram_unbalance]})
        all_ppo = pd.concat([all_ppo, ppo_data], ignore_index=True)
        all_llm = pd.concat([all_llm, llm_data], ignore_index=True)
        all_lmppo = pd.concat([all_lmppo, lmppo_data], ignore_index=True)
        all_milp = pd.concat([all_milp, milp_data], ignore_index=True)
        all_ram = pd.concat([all_ram, ram_data], ignore_index=True)
    # dump the results to disk and plot the comparison
    test_ppo_path = f'{args.cwd}/melt_ppo.pkl'
    test_llm_path = f'{args.cwd}/melt_llm.pkl'
    test_lmppo_path = f'{args.cwd}/melt_lmppo.pkl'
    test_milp_path = f'{args.cwd}/melt_milp.pkl'
    test_ram_path = f'{args.cwd}/melt_ram.pkl'
    with open(test_ppo_path, 'wb') as tf:
        pickle.dump(all_ppo, tf)
    with open(test_milp_path, 'wb') as tf:
        pickle.dump(all_milp, tf)
    with open(test_llm_path, 'wb') as tf:
        pickle.dump(all_llm, tf)
    with open(test_lmppo_path, 'wb') as tf:
        pickle.dump(all_lmppo, tf)
    with open(test_ram_path, 'wb') as tf:
        pickle.dump(all_ram, tf)
    plot_args = PlotArgs()
    plot_args.feature_change = 'llm_1015'
    args.cwd = agent_name
    plot_dir = make_dir(args.cwd, plot_args.feature_change)
    plot_melt(test_ppo_path, test_llm_path, test_lmppo_path, test_milp_path, test_ram_path, plot_dir)


if __name__ == '__main__':
    args = Arguments()
    args.visible_gpu = '0'
    for seed in args.random_seed_list:
        args.random_seed = seed
        # use two separate PPO agents so that different actor weights
        # (plain PPO vs. LLM-guided PPO) can be loaded independently
        args.agent1 = AgentPPO()
        args.agent2 = AgentPPO()
        args.agent1.cri_target = args.agent2.cri_target = True
        agent_name = f'{args.agent1.__class__.__name__}'
        args.env = ESSEnv()
        args.init_before_training()
        agent1 = args.agent1
        agent2 = args.agent2
        env = args.env
        agent1.init(args.net_dim, env.state_space.shape[0], env.action_space.shape[0],
                    args.learning_rate, args.if_gae_or_raw)
        agent2.init(args.net_dim, env.state_space.shape[0], env.action_space.shape[0],
                    args.learning_rate, args.if_gae_or_raw)
        cwd = args.cwd
        gamma = args.gamma
        batch_size = args.batch_size
        target_step = args.target_step
        repeat_times = args.repeat_times
        soft_update_tau = args.soft_update_tau
        agent1.state = agent2.state = env.reset()
        buffer = list()
        num_episode = args.num_episode
        # args.test_network = False
        if args.test_network:
            args.cwd = agent_name
            ppo_act_save_path = f'{args.cwd}/actor_10.pth'
            lmppo_act_save_path = f'{args.cwd}/actor_llm_1015.pth'
            agent1.act.load_state_dict(torch.load(ppo_act_save_path))
            agent2.act.load_state_dict(torch.load(lmppo_act_save_path))
            print('Actor parameters have been reloaded for testing.')
            env.TRAIN = False
            run_multiple_tests(env, args, agent1, agent2)