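# Trains a soft actor-critic (SAC) agent to schedule an energy storage system
# (ESSEnv) and, optionally, benchmarks the learned policy against a Gurobi
# optimization baseline.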
import pickle

import pandas as pd
import torch

from agent import AgentSAC
from environment import ESSEnv
from tools import *
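# NOTE: `Arguments`, `ReplayBuffer`, `get_episode_return`, `test_one_episode`
# and `optimization_base_result` are assumed to be provided by `tools` through
# the wildcard import above.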
def update_buffer(_trajectory):
    """Pack one exploration trajectory into tensors and append it to the
    replay buffer. Relies on the module-level `buffer` and `gamma` defined
    under __main__."""
    ten_state = torch.as_tensor([item[0] for item in _trajectory], dtype=torch.float32)
    ary_other = torch.as_tensor([item[1] for item in _trajectory])
    ary_other[:, 0] = ary_other[:, 0]  # ten_reward, stored unscaled
    ary_other[:, 1] = (1.0 - ary_other[:, 1]) * gamma  # ten_mask = (1.0 - ary_done) * gamma
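    # A terminal transition (done = 1.0) thus gets mask = 0.0, cancelling the
    # bootstrap term in the TD target; non-terminal transitions get
    # mask = gamma, so the discount never has to be recomputed at update time.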
    buffer.extend_buffer(ten_state, ary_other)

    _steps = ten_state.shape[0]
    _r_exp = ary_other[:, 0].mean()  # other = (reward, mask, action)
    return _steps, _r_exp

if __name__ == '__main__':
    args = Arguments()
    reward_record = {'episode': [], 'steps': [], 'mean_episode_reward': [], 'unbalance': []}
    loss_record = {'episode': [], 'steps': [], 'critic_loss': [], 'actor_loss': [], 'entropy_loss': []}
    args.visible_gpu = '0'
    for seed in args.random_seed_list:
        args.random_seed = seed
        args.agent = AgentSAC()
        agent_name = f'{args.agent.__class__.__name__}'
        args.agent.cri_target = True
        args.env = ESSEnv()
        args.init_before_training(if_main=True)
        '''init agent and environment'''
        agent = args.agent
        env = args.env
        agent.init(args.net_dim, env.state_space.shape[0], env.action_space.shape[0], args.learning_rate,
                   args.if_per_or_gae)

        '''init replay buffer'''
        buffer = ReplayBuffer(max_len=args.max_memo, state_dim=env.state_space.shape[0],
                              action_dim=env.action_space.shape[0])
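        # each stored transition is (state, (reward, mask, action)); the mask
        # already folds in the discount factor (see update_buffer above)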
        '''start training'''
        cwd = args.cwd
        gamma = args.gamma
        batch_size = args.batch_size  # number of transitions sampled per network update
        target_step = args.target_step  # number of environment steps collected per exploration rollout
        repeat_times = args.repeat_times  # number of update rounds performed per batch of collected data
        soft_update_tau = args.soft_update_tau
        agent.state = env.reset()

        '''collect data, then train and update the networks'''
        num_episode = args.num_episode

        '''the real unbalance is recorded here'''
        # uncomment any of the following flags to skip the corresponding stage
        # args.train = False
        # args.save_network = False
        # args.test_network = False
        # args.save_test_data = False
        # args.compare_with_gurobi = False
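        # warm-up: keep exploring until the buffer holds at least 10,000
        # transitions, since network updates only start after that threshold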
        if args.train:
            collect_data = True
            while collect_data:
                print(f'buffer:{buffer.now_len}')
                with torch.no_grad():
                    trajectory = agent.explore_env(env, target_step)
                    steps, r_exp = update_buffer(trajectory)
                buffer.update_now_len()
                if buffer.now_len >= 10000:
                    collect_data = False
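            # each "episode" below is one round of network updates followed by
            # a full evaluation episode with the current policy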
            for i_episode in range(num_episode):
                critic_loss, actor_loss, entropy_loss = agent.update_net(buffer, batch_size, repeat_times,
                                                                         soft_update_tau)
                loss_record['critic_loss'].append(critic_loss)
                loss_record['actor_loss'].append(actor_loss)
                loss_record['entropy_loss'].append(entropy_loss)
                with torch.no_grad():
                    episode_reward, episode_unbalance = get_episode_return(env, agent.act, agent.device)
                    reward_record['mean_episode_reward'].append(episode_reward)
                    reward_record['unbalance'].append(episode_unbalance)
                print(f'episode: {i_episode}, reward: {episode_reward}, unbalance: {episode_unbalance}, '
                      f'buffer_length: {buffer.now_len}')
                if i_episode % 10 == 0:
                    # refresh the buffer with target_step new transitions every 10 episodes
                    with torch.no_grad():
                        trajectory = agent.explore_env(env, target_step)
                        steps, r_exp = update_buffer(trajectory)
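    # persist training artefacts: loss/reward histories plus the trained actor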
    act_save_path = f'{args.cwd}/actor.pth'
    loss_record_path = f'{args.cwd}/loss_data.pkl'
    reward_record_path = f'{args.cwd}/reward_data.pkl'
    if args.save_network:
        with open(loss_record_path, 'wb') as tf:
            pickle.dump(loss_record, tf)
        with open(reward_record_path, 'wb') as tf:
            pickle.dump(reward_record, tf)
        torch.save(agent.act.state_dict(), act_save_path)
        print('actor parameters have been saved')
    if args.test_network:
        args.cwd = agent_name
        agent.act.load_state_dict(torch.load(act_save_path))
        print('actor parameters have been reloaded for testing')
        record = test_one_episode(env, agent.act, agent.device)
        eval_data = pd.DataFrame(record['system_info'])
        eval_data.columns = ['time_step', 'price', 'netload', 'action', 'real_action', 'soc', 'battery',
                             'gen1', 'gen2', 'gen3', 'pv', 'wind', 'unbalance', 'operation_cost', 'reward']
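        # optionally persist the full evaluation record; the plotting step
        # below reads it back from test_data.pkl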
        if args.save_test_data:
            test_data_save_path = f'{args.cwd}/test_data.pkl'
            with open(test_data_save_path, 'wb') as tf:
                pickle.dump(record, tf)
    '''compare with gurobi data and results'''
    if args.compare_with_gurobi:
        month = record['init_info'][0][0]
        day = record['init_info'][0][1]
        initial_soc = record['init_info'][0][3]
        base_result = optimization_base_result(env, month, day, initial_soc)
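        # plotting relies on both the saved test data and the Gurobi baseline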
        if args.plot_on:
            from plotDRL import PlotArgs, make_dir, plot_evaluation_information, plot_optimization_result

            plot_args = PlotArgs()
            plot_args.feature_change = '10'
            args.cwd = agent_name
            plot_dir = make_dir(args.cwd, plot_args.feature_change)
            plot_optimization_result(base_result, plot_dir)
            plot_evaluation_information(args.cwd + '/' + 'test_data.pkl', plot_dir)
        '''compare the costs obtained from Gurobi and SAC'''
        ratio = sum(eval_data['operation_cost']) / sum(base_result['step_cost'])
        print('rl_cost:', sum(eval_data['operation_cost']))
        print('gurobi_cost:', sum(base_result['step_cost']))
        print('ratio:', ratio)