import random
import pickle

import pandas as pd
import torch

from PPO_llm import *
from plotDRL import *


def generate_test_dates(num):
    """Sample `num` unique (month, day) pairs, drawing days from the 20th to the end of each month."""
    dates = []
    months_days = {1: 31, 2: 28, 3: 31, 4: 30, 5: 31, 6: 30,
                   7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31}
    while len(dates) < num:
        month = random.randint(1, 12)
        day = random.randint(20, months_days[month])
        if (month, day) not in dates:
            dates.append((month, day))
    return dates


def test_llm(env, llm_actions, index):
    """Roll out one 24-hour episode using the precomputed LLM actions starting at `index`."""
    # Note: the environment is not reset here; it continues from its current state.
    sum_reward = 0
    sum_unbalance = 0
    for i in range(24):
        action = llm_actions[index + i]
        state, next_state, reward, done = env.step(action)
        sum_reward += reward
        sum_unbalance += env.real_unbalance
    record = {'reward': [sum_reward], 'unbalance': [sum_unbalance]}
    return record


def test_one_episode(env, act, device):
    """Roll out one 24-hour episode with the given actor network and return the summed reward/unbalance."""
    state = env.reset()
    sum_reward = 0
    sum_unbalance = 0
    for i in range(24):
        s_tensor = torch.as_tensor((state,), device=device)
        a_tensor = act(s_tensor)
        action = a_tensor.detach().cpu().numpy()[0]
        # rl_action = a_tensor.detach().cpu().numpy()[0]
        # llm_action = llm_actions[index + i]
        # action = 0.95 * np.array(rl_action) + 0.05 * np.array(llm_action)
        state, next_state, reward, done = env.step(action)
        state = next_state
        sum_reward += reward
        sum_unbalance += env.real_unbalance
    record = {'reward': [sum_reward], 'unbalance': [sum_unbalance]}
    return record


def run_multiple_tests(env, args, agent1, agent2, num_tests=10):
    """Evaluate the PPO agent, the LLM schedule, the lmppo agent, and the MILP base result
    on `num_tests` random test days, pickle the per-day results, and plot the comparison."""
    all_ppo = pd.DataFrame()
    all_llm = pd.DataFrame()
    all_lmppo = pd.DataFrame()
    all_milp = pd.DataFrame()
    test_dates = generate_test_dates(num_tests)
    llm_actions = load_llm_actions('data/results.json')
    for month, day in test_dates:
        print(f'current testing month is {month}, day is {day}')
        # Hour index of 00:00 on the chosen day in the hourly, non-leap-year dataset.
        index = (sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24
        ppo_res = test_one_episode(env, agent1.act, agent1.device)
        base_res = optimization_base_result(env, month, day)
        llm_res = test_llm(env, llm_actions, index)
        lmppo_res = test_one_episode(env, agent2.act, agent2.device)
        ppo_data = pd.DataFrame(ppo_res)
        # Baseline reward: negative total step cost, scaled by 1e3.
        m_reward = -base_res['step_cost'].sum() / 1e3
        milp_data = pd.DataFrame({'reward': [m_reward]})
        llm_data = pd.DataFrame(llm_res)
        lmppo_data = pd.DataFrame(lmppo_res)
        all_ppo = pd.concat([all_ppo, ppo_data], ignore_index=True)
        all_milp = pd.concat([all_milp, milp_data], ignore_index=True)
        all_llm = pd.concat([all_llm, llm_data], ignore_index=True)
        all_lmppo = pd.concat([all_lmppo, lmppo_data], ignore_index=True)

    test_ppo_path = f'{args.cwd}/melt_ppo.pkl'
    test_llm_path = f'{args.cwd}/melt_llm.pkl'
    test_lmppo_path = f'{args.cwd}/melt_lmppo.pkl'
    test_milp_path = f'{args.cwd}/melt_milp.pkl'
    with open(test_ppo_path, 'wb') as f:
        pickle.dump(all_ppo, f)
    with open(test_milp_path, 'wb') as f:
        pickle.dump(all_milp, f)
    with open(test_llm_path, 'wb') as f:
        pickle.dump(all_llm, f)
    with open(test_lmppo_path, 'wb') as f:
        pickle.dump(all_lmppo, f)

    plot_args = PlotArgs()
    plot_args.feature_change = 'llm_1015'
    args.cwd = agent_name  # relies on the module-level `agent_name` set in the __main__ block
    plot_dir = make_dir(args.cwd, plot_args.feature_change)
    plot_melt(test_ppo_path, test_llm_path, test_lmppo_path, test_milp_path, plot_dir)


if __name__ == '__main__':
    args = Arguments()
    args.visible_gpu = '0'
    for seed in args.random_seed_list:
        args.random_seed = seed
        # Use two separate agent instances so each can load its own actor checkpoint.
        args.agent1 = AgentPPO()
        args.agent2 = AgentPPO()
        args.agent1.cri_target = args.agent2.cri_target = True
        agent_name = f'{args.agent1.__class__.__name__}'
        args.env = ESSEnv()
        args.init_before_training()
        agent1 = args.agent1
        agent2 = args.agent2
        env = args.env
        agent1.init(args.net_dim, env.state_space.shape[0], env.action_space.shape[0],
                    args.learning_rate, args.if_gae_or_raw)
        agent2.init(args.net_dim, env.state_space.shape[0],
                    env.action_space.shape[0], args.learning_rate, args.if_gae_or_raw)
        # Hyperparameters read here but not used further in this test-only script.
        cwd = args.cwd
        gamma = args.gamma
        batch_size = args.batch_size
        target_step = args.target_step
        repeat_times = args.repeat_times
        soft_update_tau = args.soft_update_tau
        agent1.state = agent2.state = env.reset()
        buffer = list()
        num_episode = args.num_episode
        # args.test_network = False
        if args.test_network:
            args.cwd = agent_name
            ppo_act_save_path = f'{args.cwd}/actor_10.pth'
            lmppo_act_save_path = f'{args.cwd}/actor_llm_1015.pth'
            # Load the plain-PPO actor and the LLM-variant (lmppo) actor checkpoints.
            agent1.act.load_state_dict(torch.load(ppo_act_save_path))
            agent2.act.load_state_dict(torch.load(lmppo_act_save_path))
            print('parameters have been reloaded for testing')
            env.TRAIN = False
            run_multiple_tests(env, args, agent1, agent2, num_tests=12)
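

# Minimal helper sketch for offline inspection of the melt_*.pkl files written by
# run_multiple_tests() above. It assumes the caller passes the directory they were saved
# to (args.cwd, which the test branch sets to agent_name) and that each pickle holds a
# pandas DataFrame with a 'reward' column, as constructed above.
def summarize_melt_results(result_dir):
    """Print the mean per-day reward stored in each melt_*.pkl result file."""
    for name in ('melt_ppo', 'melt_llm', 'melt_lmppo', 'melt_milp'):
        with open(f'{result_dir}/{name}.pkl', 'rb') as f:
            df = pickle.load(f)
        print(f'{name}: mean reward over test days = {df["reward"].mean():.3f}')
# Example usage (hypothetical directory): summarize_melt_results('AgentPPO')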