save

This commit is contained in:
parent 5d923043d5
commit a03ce7b404
File diff suppressed because it is too large
@@ -0,0 +1,68 @@
class Constant:
    MONTHS_LEN = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]


class DataManager:
    def __init__(self) -> None:
        self.Prices = []
        self.Temper = []
        self.Solar = []
        self.Electricity = []
        self.Heat = []
        self.People = []

    def add_price_element(self, e): self.Prices.append(e)

    def add_temper_element(self, e): self.Temper.append(e)

    def add_solar_element(self, e): self.Solar.append(e)

    def add_electricity_element(self, e): self.Electricity.append(e)

    def add_heat_element(self, e): self.Heat.append(e)

    def add_people_element(self, e): self.People.append(e)

    # get the data for the current hour, given month, day, and day_time
    def get_price_data(self, month, day, day_time):
        return self.Prices[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + day_time]

    def get_temper_data(self, month, day, day_time):
        return self.Temper[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + day_time]

    def get_solar_data(self, month, day, day_time):
        return self.Solar[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + day_time]

    def get_load_data(self, month, day, day_time):
        return self.Electricity[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + day_time]

    def get_heat_data(self, month, day, day_time):
        return self.Heat[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + day_time]

    def get_people_data(self, month, day, day_time):
        return self.People[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + day_time]

    # get series data for one episode
    def get_series_price_data(self, month, day):
        return self.Prices[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24:
                           (sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + 24]

    def get_series_temper_data(self, month, day):
        return self.Temper[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24:
                           (sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + 24]

    def get_series_solar_data(self, month, day):
        return self.Solar[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24:
                          (sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + 24]

    def get_series_load_data(self, month, day):
        return self.Electricity[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24:
                                (sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + 24]

    def get_series_heat_data(self, month, day):
        return self.Heat[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24:
                         (sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + 24]

    def get_series_people_data(self, month, day):
        return self.People[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24:
                           (sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + 24]
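All the getters above map a calendar date to a flat index into a year-long hourly series. A minimal sketch of that arithmetic, assuming the file is importable as data_manager and using made-up placeholder values:

# Illustrative only: how the (month, day, hour) -> flat index mapping works,
# assuming a non-leap year of 8760 hourly samples.
from data_manager import Constant, DataManager

dm = DataManager()
for hour in range(8760):
    dm.add_price_element(hour % 24)              # placeholder: price equals the hour of day

month, day, day_time = 3, 15, 8                  # 8:00 on March 15
index = (sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + day_time
print(index)                                     # (31 + 28 + 14) * 24 + 8 = 1760
print(dm.get_price_data(month, day, day_time))   # 8, the 8:00 entry of that day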
@@ -0,0 +1,153 @@
import gym
import numpy as np
import pandas as pd

from data_manager import *
from module import *
from parameters import *


class WgzGym(gym.Env):
    def __init__(self, **kwargs):
        super(WgzGym, self).__init__()
        self.excess = None
        self.shedding = None
        self.unbalance = None
        self.real_unbalance = None
        self.operation_cost = None
        self.current_output = None
        self.final_step_outputs = None
        self.data_manager = DataManager()
        self._load_year_data()
        self.month = 1
        self.day = 1
        self.TRAIN = True
        self.current_time = None
        self.episode_length = 24
        self.penalty_coefficient = 50  # penalty coefficient for constraint violations
        self.sell_coefficient = 0.1  # profit coefficient for energy sold to the grid
        self.EC_parameters = kwargs.get('EC_parameters', EC_parameters)  # electrolyzer (hydrogen from electricity)
        self.HST_parameters = kwargs.get('HST_parameters', HST_parameters)  # hydrogen storage tank

        self.grid = Grid()
        self.EC = EC(self.EC_parameters)
        self.HST = HST(self.HST_parameters)

        self.action_space = gym.spaces.Box(low=-1, high=1, shape=(3,), dtype=np.float32)
        '''
        Observation (7): time, solar power, temperature (humidity not considered yet),
        electricity demand, heat demand (converted to the wattage needed for hot water),
        number of people, electricity price.
        Plus (3): electrolyzer power, grid power, hydrogen tank state of charge.
        '''
        self.state_space = gym.spaces.Box(low=0, high=1, shape=(10,), dtype=np.float32)

    def reset(self, *args):
        self.month = np.random.randint(1, 13)  # randomly pick one of the 12 months
        if self.TRAIN:
            self.day = np.random.randint(1, 20)
        else:
            self.day = np.random.randint(20, Constant.MONTHS_LEN[self.month - 1])
        self.current_time = 0
        self.EC.reset()
        self.HST.reset()
        return self._build_state()

    def _build_state(self):
        soc = self.HST.SOC()
        ec_output = self.EC.current_output
        time_step = self.current_time

        price = self.data_manager.get_price_data(self.month, self.day, self.current_time)
        temper = self.data_manager.get_temper_data(self.month, self.day, self.current_time)
        solar = self.data_manager.get_solar_data(self.month, self.day, self.current_time)
        load = self.data_manager.get_load_data(self.month, self.day, self.current_time)
        heat = self.data_manager.get_heat_data(self.month, self.day, self.current_time)
        people = self.data_manager.get_people_data(self.month, self.day, self.current_time)

        # assemble the 10-dimensional observation described in the docstring above;
        # the grid-power entry is not known before the step is taken, so 0.0 is used as a placeholder
        obs = np.concatenate((np.float32(time_step), np.float32(soc), np.float32(price),
                              np.float32(temper), np.float32(solar), np.float32(load),
                              np.float32(heat), np.float32(people), np.float32(ec_output),
                              np.float32(0.0)), axis=None)
        return obs

    def step(self, action):  # state transition: current_obs->take_action->get_reward->get_finish->next_obs
        # apply the action to each component
        current_obs = self._build_state()
        temperature = current_obs[7]
        irradiance = current_obs[8]
        self.wind.current_power = current_obs[9]
        self.battery.step(action[0])  # perform the state transition; the battery's capacity changes as well
        self.dg1.step(action[1])
        self.dg2.step(action[2])
        self.dg3.step(action[3])
        self.solar.step(temperature, irradiance, action[4])
        self.current_output = np.array((self.dg1.current_output, self.dg2.current_output, self.dg3.current_output,
                                        -self.battery.energy_change, self.solar.current_power, self.wind.current_power))
        actual_production = sum(self.current_output)
        price = current_obs[1]
        netload = current_obs[3] - self.solar.output_change
        unbalance = actual_production - netload

        # reward = 0.0
        excess_penalty = 0
        deficient_penalty = 0
        sell_benefit, buy_cost = 0, 0
        self.excess, self.shedding = 0, 0
        if unbalance >= 0:  # surplus
            if unbalance <= self.grid.exchange_ability:
                sell_benefit = self.grid.get_cost(price, unbalance) * self.sell_coefficient
            else:
                sell_benefit = self.grid.get_cost(price, self.grid.exchange_ability) * self.sell_coefficient
                # real unbalance: the part that exceeds the grid exchange limit
                self.excess = unbalance - self.grid.exchange_ability
                excess_penalty = self.excess * self.penalty_coefficient
        else:  # unbalance < 0: shortage penalty
            if abs(unbalance) <= self.grid.exchange_ability:
                buy_cost = self.grid.get_cost(price, abs(unbalance))
            else:
                buy_cost = self.grid.get_cost(price, self.grid.exchange_ability)
                self.shedding = abs(unbalance) - self.grid.exchange_ability
                deficient_penalty = self.shedding * self.penalty_coefficient
        battery_cost = self.battery.get_cost(self.battery.energy_change)
        dg1_cost = self.dg1.get_cost(self.dg1.current_output)
        dg2_cost = self.dg2.get_cost(self.dg2.current_output)
        dg3_cost = self.dg3.get_cost(self.dg3.current_output)
        solar_cost = self.solar.get_cost(self.solar.current_power)
        wind_cost = self.wind.gen_cost(self.wind.current_power)

        self.operation_cost = (battery_cost + dg1_cost + dg2_cost + dg3_cost + solar_cost + wind_cost
                               + excess_penalty + deficient_penalty - sell_benefit + buy_cost)
        reward = -self.operation_cost / 1e3
        self.unbalance = unbalance
        self.real_unbalance = self.shedding + self.excess
        final_step_outputs = [self.dg1.current_output, self.dg2.current_output, self.dg3.current_output,
                              self.battery.current_capacity, self.solar.current_power, self.wind.current_power]
        self.current_time += 1
        finish = (self.current_time == self.episode_length)
        if finish:
            self.final_step_outputs = final_step_outputs
            self.current_time = 0
            next_obs = self.reset()
        else:
            next_obs = self._build_state()
        return current_obs, next_obs, float(reward), finish

    def _load_year_data(self):
        data_df = pd.read_csv('data/all_data.csv', sep=',')
        solar = data_df['solar_power'].to_numpy(dtype=float)
        temper = data_df['temper'].to_numpy(dtype=float)
        energy = data_df['energy_demand'].to_numpy(dtype=float)
        water = data_df['water_demand'].to_numpy(dtype=float)
        people = data_df['people_count'].to_numpy(dtype=float)
        price = data_df['price'].to_numpy(dtype=float)

        '''The size of the training data can be redesigned as needed.'''

        def process_elements(elements, transform_function, add_function):
            for e in elements:
                transformed_e = transform_function(e)
                add_function(transformed_e)

        # route each series to the matching DataManager adder (water demand feeds the heat series)
        process_elements(solar, lambda x: x, self.data_manager.add_solar_element)
        process_elements(temper, lambda x: x, self.data_manager.add_temper_element)
        process_elements(energy, lambda x: x, self.data_manager.add_electricity_element)
        process_elements(water, lambda x: x, self.data_manager.add_heat_element)
        process_elements(people, lambda x: x, self.data_manager.add_people_element)
        process_elements(price, lambda x: x, self.data_manager.add_price_element)
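The surplus/shortage settlement inside step() drives the reward. A self-contained restatement of just that branch logic, written as a plain function with hypothetical numbers rather than the environment's attributes:

# Illustrative only: a hypothetical, standalone version of the settlement logic in WgzGym.step().
def settle(unbalance, price, exchange_ability=100, penalty_coefficient=50, sell_coefficient=0.1):
    sell_benefit = buy_cost = excess_penalty = deficient_penalty = 0.0
    if unbalance >= 0:  # surplus: sell up to the exchange limit, penalize the remainder
        traded = min(unbalance, exchange_ability)
        sell_benefit = price * traded * sell_coefficient
        excess_penalty = max(0.0, unbalance - exchange_ability) * penalty_coefficient
    else:  # shortage: buy up to the exchange limit, penalize the unmet remainder
        traded = min(abs(unbalance), exchange_ability)
        buy_cost = price * traded
        deficient_penalty = max(0.0, abs(unbalance) - exchange_ability) * penalty_coefficient
    return sell_benefit, buy_cost, excess_penalty, deficient_penalty

print(settle(unbalance=150, price=0.5))   # surplus beyond the limit of 100 -> penalty on the extra 50
print(settle(unbalance=-30, price=0.5))   # shortage within the limit -> pure purchase cost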
@@ -0,0 +1,78 @@
class EC:
    def __init__(self, params):
        self.current_output = None
        self.electricity_efficiency = params['electrolysis_efficiency']  # key name as defined in EC_parameters
        self.hydrogen_produce = params['hydrogen_produce']
        self.power_max = params['power_max']
        self.power_min = params['power_min']
        self.ramp = params['ramp']
        self.lifetime = params['lifetime']
        self.equipment_cost = params['equipment_cost']
        self.carbon_reduce = params['carbon_reduce']

    def step(self, action_ec):
        output = self.current_output + action_ec * self.ramp
        output = max(self.power_min, min(self.power_max, output)) if output > 0 else 0
        self.current_output = output

    def get_cost(self, price):
        return self.equipment_cost / self.lifetime + price * self.current_output

    def get_hydrogen(self):
        return self.current_output * self.electricity_efficiency * self.hydrogen_produce

    def reset(self):
        self.current_output = 0


class HST:
    def __init__(self, params):
        self.current_capacity = None
        self.hydrogen_change = None
        self.capacity = params['capacity']
        self.min_soc = params['min_soc']
        self.max_soc = params['max_soc']
        self.degradation = params['degradation']
        self.holding = params['holding']
        self.ramp = params['ramp']
        self.efficiency = params['efficiency']

    '''
    The tank's charging rate equals the electrolyzer's hydrogen production rate
    (does hydrogen production also cover the hot-water demand?).
    '''
    def step(self, action_hst):
        energy = action_hst * self.ramp
        current_energy = self.current_capacity * self.capacity
        updated_capacity = max(self.min_soc, min(self.max_soc, (current_energy + energy) / self.capacity))
        self.hydrogen_change = (updated_capacity - self.current_capacity) * self.capacity
        self.current_capacity = updated_capacity  # update capacity to the current state

    def get_cost(self, energy_change):
        cost = abs(energy_change) * self.degradation
        return cost

    def SOC(self):
        return self.current_capacity

    def reset(self):
        self.current_capacity = 0.2


class Grid:
    def __init__(self):
        self.delta = 1
        self.exchange_ability = 100

    def get_cost(self, current_price, energy_exchange):
        return current_price * energy_exchange * self.delta

    def retrieve_past_price(self):
        result = []
        # start and end indices for the past 24 hours of prices
        start_index = max(0, 24 * (self.day - 1) + self.time - 24)
        end_index = 24 * (self.day - 1) + self.time
        past_price = self.price[start_index:end_index]
        result.extend(past_price)
        # current_day_price = self.price[24 * self.day:24 * self.day + self.time]
        # result.extend(current_day_price)
        return result
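A quick numeric check of the SOC clipping in HST.step(). The degradation, holding, and ramp values below are assumptions for illustration, since HST_parameters in this commit does not define them:

# Illustrative only: clipping behaviour of HST.step() with a hypothetical parameter set.
from module import HST

params = {'capacity': 1000, 'min_soc': 0.1, 'max_soc': 0.9, 'efficiency': 0.95,
          'degradation': 0.01, 'holding': 0.0, 'ramp': 100}   # last three values are assumed
hst = HST(params)
hst.reset()              # SOC starts at 0.2
hst.step(1.0)            # charge by ramp = 100 -> SOC 0.2 + 100/1000 = 0.3
for _ in range(3):
    hst.step(-1.0)       # 0.3 -> 0.2 -> 0.1 -> clipped at min_soc
print(hst.SOC())         # 0.1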
@@ -0,0 +1,51 @@
import numpy as np
import torch
import torch.nn as nn


class ActorPPO(nn.Module):
    def __init__(self, mid_dim, state_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
                                 nn.Linear(mid_dim, action_dim))
        # the logarithm (log) of the standard deviation (std) of the action; a trainable parameter
        self.a_logstd = nn.Parameter(torch.zeros((1, action_dim)) - 0.5, requires_grad=True)
        self.sqrt_2pi_log = np.log(np.sqrt(2 * np.pi))

    def forward(self, state):
        return self.net(state).tanh()  # tanh() limits the range of the action output

    def get_action(self, state):
        a_avg = self.forward(state)  # mean of the Gaussian policy
        a_std = self.a_logstd.exp()

        noise = torch.randn_like(a_avg)
        action = a_avg + noise * a_std
        return action, noise

    def get_logprob_entropy(self, state, action):
        a_avg = self.forward(state)
        a_std = self.a_logstd.exp()
        delta = ((a_avg - action) / a_std).pow(2) * 0.5
        logprob = -(self.a_logstd + self.sqrt_2pi_log + delta).sum(1)  # new_logprob

        dist_entropy = (logprob.exp() * logprob).mean()  # policy entropy
        return logprob, dist_entropy

    def get_old_logprob(self, _action, noise):  # noise = action - a_noise
        delta = noise.pow(2) * 0.5
        return -(self.a_logstd + self.sqrt_2pi_log + delta).sum(1)  # old_logprob


class CriticAdv(nn.Module):
    def __init__(self, mid_dim, state_dim, _action_dim):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
                                 nn.Linear(mid_dim, 1))

    def forward(self, state):
        return self.net(state)  # Advantage value
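The log-probability in get_logprob_entropy is the diagonal-Gaussian density written out by hand, so it should agree with torch.distributions.Normal. A small sanity check, with arbitrary dimensions and the import path used in train.py:

# Sanity check (not part of the repository): the handwritten Gaussian log-density
# in ActorPPO.get_logprob_entropy matches torch.distributions.Normal.
import torch
from torch.distributions import Normal
from models.net import ActorPPO   # import path as used in train.py

actor = ActorPPO(mid_dim=64, state_dim=10, action_dim=3)
state = torch.randn(5, 10)
action, noise = actor.get_action(state)

logprob, _ = actor.get_logprob_entropy(state, action)
dist = Normal(actor.forward(state), actor.a_logstd.exp())
assert torch.allclose(logprob, dist.log_prob(action).sum(1), atol=1e-5)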
@@ -0,0 +1,17 @@
EC_parameters = {
    'electrolysis_efficiency': 0.8,
    'hydrogen_produce': 0.5,
    'power_max': 200,
    'power_min': 0,
    'ramp': 100,
    'lifetime': 6000,  # hour
    'equipment_cost': 10000,  # yuan
    'carbon_reduce': 1,
}

HST_parameters = {
    'capacity': 1000,
    'min_soc': 0.1,
    'max_soc': 0.9,
    'efficiency': 0.95,
}
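A small usage sketch of the EC model with the parameter set above; it relies on the electrolysis_efficiency key spelling noted in the EC class:

# Illustrative only: drive the electrolyzer with the committed EC_parameters.
from module import EC
from parameters import EC_parameters

ec = EC(EC_parameters)
ec.reset()                    # current_output starts at 0
ec.step(0.5)                  # ramp = 100, so output rises to 0.5 * 100 = 50 (within [0, 200])
print(ec.current_output)      # 50.0
print(ec.get_hydrogen())      # 50 * 0.8 * 0.5 = 20.0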
@@ -0,0 +1,55 @@
import torch


def test_one_episode(env, act, device):
    """get evaluation information; the unbalance after taking each action is recorded here"""
    record_state = []
    record_action = []
    record_reward = []
    record_unbalance = []
    record_system_info = []  # [time, price, netload, action, real action, soc, output*4, unbalance(exchange+penalty), cost]
    record_init_info = []  # includes month, day, time, initial soc
    env.TRAIN = False
    state = env.reset()
    record_init_info.append([env.month, env.day, env.current_time, env.battery.current_capacity])
    print(f'current testing month is {env.month}, day is {env.day}, initial soc is {env.battery.current_capacity}')
    for i in range(24):
        s_tensor = torch.as_tensor((state,), device=device)
        a_tensor = act(s_tensor)
        action = a_tensor.detach().cpu().numpy()[0]  # detach() is not strictly needed: torch.no_grad() is used outside
        real_action = action
        state, next_state, reward, done = env.step(action)

        record_system_info.append([state[0], state[1], state[3] + env.wind.current_power, action, real_action,
                                   env.battery.SOC(), env.battery.energy_change, next_state[4], next_state[5],
                                   next_state[6], env.solar.current_power, env.wind.current_power, env.unbalance,
                                   env.operation_cost, reward])
        record_state.append(state)
        record_action.append(real_action)
        record_reward.append(reward)
        record_unbalance.append(env.unbalance)
        state = next_state
    # add information of the last step: dg1, dg2, dg3, soc, temperature, irradiance
    record_system_info[-1][7:12] = [env.final_step_outputs[0], env.final_step_outputs[1], env.final_step_outputs[2],
                                    env.final_step_outputs[4], env.final_step_outputs[5]]
    record_system_info[-1][5] = env.final_step_outputs[3]
    record = {'init_info': record_init_info, 'system_info': record_system_info, 'state': record_state,
              'action': record_action, 'reward': record_reward, 'unbalance': record_unbalance}
    return record


def get_episode_return(env, act, device):
    episode_reward = 0.0  # sum of rewards in an episode
    episode_unbalance = 0.0
    state = env.reset()
    for i in range(24):
        s_tensor = torch.as_tensor((state,), device=device)
        a_tensor = act(s_tensor)
        action = a_tensor.detach().cpu().numpy()[0]  # detach() is not strictly needed: torch.no_grad() is used outside
        state, next_state, reward, done = env.step(action)
        state = next_state
        episode_reward += reward
        episode_unbalance += env.real_unbalance
        if done:
            break
    return episode_reward, episode_unbalance
281  train.py
@@ -0,0 +1,281 @@
import os
import pickle

os.environ['OMP_WAIT_POLICY'] = 'PASSIVE'  # make sure this is set before PyTorch is imported
from copy import deepcopy

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

from models.env import WgzGym
from models.net import ActorPPO, CriticAdv
from models.tools import get_episode_return, test_one_episode


def smooth_rewards(rewards, window=10):
    rewards = rewards.unsqueeze(0).unsqueeze(0)  # reshape rewards to [1, 1, len] for conv1d
    kernel = torch.ones(1, 1, window, device=rewards.device) / window  # uniform moving-average kernel
    smoothed_rewards = F.conv1d(rewards, kernel, padding='valid')  # moving average
    smoothed_rewards = smoothed_rewards.squeeze(0).squeeze(0)  # drop the extra dimensions
    # keep the same length as the original sequence: the first window-1 rewards are left unchanged
    return torch.cat((rewards[0, 0, :window - 1], smoothed_rewards))


def update_buffer(_trajectory):
    _trajectory = list(map(list, zip(*_trajectory)))  # 2D-list transpose, here cut the trajectory into 5 parts
    ten_state = torch.as_tensor(_trajectory[0])  # tensor state here
    ten_reward = torch.as_tensor(_trajectory[1], dtype=torch.float32)
    # _trajectory[2] = done; replace done with a mask to save memory
    ten_mask = (1.0 - torch.as_tensor(_trajectory[2], dtype=torch.float32)) * gamma
    ten_action = torch.as_tensor(_trajectory[3])
    ten_noise = torch.as_tensor(_trajectory[4], dtype=torch.float32)
    buffer[:] = (ten_state, ten_action, ten_noise, ten_reward, ten_mask)  # the list stores tensors
    _steps = ten_reward.shape[0]  # steps collected in all trajectories
    _r_exp = ten_reward.mean()  # the mean reward
    return _steps, _r_exp


class AgentPPO:
    def __init__(self):
        super().__init__()
        self.state = None
        self.device = None
        self.action_dim = None

        self.criterion = torch.nn.SmoothL1Loss()
        self.cri = self.cri_target = self.if_use_cri_target = self.cri_optim = self.ClassCri = None
        self.act = self.act_target = self.if_use_act_target = self.act_optim = self.ClassAct = None

        '''init modify'''
        self.ClassCri = CriticAdv
        self.ClassAct = ActorPPO

        self.ratio_clip = 0.2
        self.lambda_entropy = 0.02  # should be 0.01~0.05
        self.lambda_gae_adv = 0.98  # should be 0.95~0.99
        self.get_reward_sum = None
        self.trajectory_list = None

    def init(self, net_dim, state_dim, action_dim, learning_rate=1e-4, gpu_id=0):
        self.device = torch.device(f"cuda:{gpu_id}" if (torch.cuda.is_available() and (gpu_id >= 0)) else "cpu")
        self.trajectory_list = list()
        self.get_reward_sum = self.get_reward_sum_gae

        self.cri = self.ClassCri(net_dim, state_dim, action_dim).to(self.device)
        self.act = self.ClassAct(net_dim, state_dim, action_dim).to(self.device) if self.ClassAct else self.cri
        self.cri_target = deepcopy(self.cri) if self.if_use_cri_target else self.cri
        self.act_target = deepcopy(self.act) if self.if_use_act_target else self.act

        self.cri_optim = torch.optim.Adam(self.cri.parameters(), learning_rate)
        self.act_optim = torch.optim.Adam(self.act.parameters(), learning_rate) if self.ClassAct else self.cri

    def select_action(self, state):
        states = torch.as_tensor((state,), dtype=torch.float32, device=self.device)
        actions, noises = self.act.get_action(states)
        return actions[0].detach().cpu().numpy(), noises[0].detach().cpu().numpy()

    def explore_env(self, env, target_step):
        state = self.state
        trajectory_temp = list()
        last_done = 0
        for i in range(target_step):
            action, noise = self.select_action(state)
            state, next_state, reward, done = env.step(np.tanh(action))
            trajectory_temp.append((state, reward, done, action, noise))
            if done:
                state = env.reset()
                last_done = i
            else:
                state = next_state
        self.state = state

        '''splice list'''
        # keep the transitions up to (and including) the last finished episode for this update
        trajectory_list = self.trajectory_list + trajectory_temp[:last_done + 1]
        self.trajectory_list = trajectory_temp[last_done:]
        return trajectory_list

    def update_net(self, buffer, batch_size, repeat_times, soft_update_tau):
        """put data extraction and network updates together"""
        with torch.no_grad():
            buf_len = buffer[0].shape[0]
            # decompose buffer data
            buf_state, buf_action, buf_noise, buf_reward, buf_mask = [ten.to(self.device) for ten in buffer]

            '''get buf_r_sum, buf_logprob'''
            bs = batch_size
            buf_value = [self.cri_target(buf_state[i:i + bs]) for i in range(0, buf_len, bs)]
            buf_value = torch.cat(buf_value, dim=0)
            buf_logprob = self.act.get_old_logprob(buf_action, buf_noise)

            buf_r_sum, buf_advantage = self.get_reward_sum(buf_len, buf_reward, buf_mask, buf_value)  # detach()
            # normalize advantage
            buf_advantage = (buf_advantage - buf_advantage.mean()) / (buf_advantage.std() + 1e-5)
            buf_advantage = smooth_rewards(buf_advantage, window=10)
            del buf_noise, buffer[:]

        '''PPO: Surrogate objective of Trust Region'''
        obj_critic = obj_actor = None
        for _ in range(int(buf_len / batch_size * repeat_times)):
            indices = torch.randint(buf_len, size=(batch_size,), requires_grad=False, device=self.device)

            state = buf_state[indices]
            action = buf_action[indices]
            r_sum = buf_r_sum[indices]
            logprob = buf_logprob[indices]
            advantage = buf_advantage[indices]

            new_logprob, obj_entropy = self.act.get_logprob_entropy(state, action)  # it is obj_actor
            ratio = (new_logprob - logprob.detach()).exp()
            surrogate1 = advantage * ratio
            surrogate2 = advantage * ratio.clamp(1 - self.ratio_clip, 1 + self.ratio_clip)
            obj_surrogate = -torch.min(surrogate1, surrogate2).mean()
            obj_actor = obj_surrogate + obj_entropy * self.lambda_entropy
            self.optim_update(self.act_optim, obj_actor)  # update the actor

            value = self.cri(state).squeeze(1)  # the critic network predicts the reward_sum (Q value) of the state
            # use SmoothL1Loss to evaluate the value loss
            # obj_critic = self.criterion(value, r_sum) / (r_sum.std() + 1e-6)
            obj_critic = self.criterion(value, r_sum)
            self.optim_update(self.cri_optim, obj_critic)  # backpropagate the value loss and update the critic
            # optionally use a soft update
            self.soft_update(self.cri_target, self.cri, soft_update_tau) if self.cri_target is not self.cri else None

        a_std_log = getattr(self.act, 'a_logstd', torch.zeros(1))
        return obj_critic.item(), obj_actor.item(), a_std_log.mean().item()  # logging_tuple

    def get_reward_sum_gae(self, buf_len, ten_reward, ten_mask, ten_value) -> (torch.Tensor, torch.Tensor):
        buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device)  # old policy value
        buf_advantage = torch.empty(buf_len, dtype=torch.float32, device=self.device)  # advantage value
        pre_r_sum = 0.0
        pre_advantage = 0.0  # advantage value of the previous step
        for i in range(buf_len - 1, -1, -1):
            buf_r_sum[i] = ten_reward[i] + ten_mask[i] * pre_r_sum
            pre_r_sum = buf_r_sum[i]
            buf_advantage[i] = ten_reward[i] + ten_mask[i] * (pre_advantage - ten_value[i])
            pre_advantage = ten_value[i] + buf_advantage[i] * self.lambda_gae_adv
        return buf_r_sum, buf_advantage

    @staticmethod
    def optim_update(optimizer, objective):
        optimizer.zero_grad()
        objective.backward()
        optimizer.step()

    @staticmethod
    def soft_update(target_net, current_net, tau):
        for tar, cur in zip(target_net.parameters(), current_net.parameters()):
            tar.data.copy_(cur.data.__mul__(tau) + tar.data.__mul__(1.0 - tau))


class Arguments:
    def __init__(self, agent=None, env=None):
        self.agent = agent
        self.env = env
        self.cwd = None  # current working directory; None means it is set automatically
        self.if_remove = False  # remove the cwd folder? (True, False, None: ask me)
        self.visible_gpu = '0'  # os.environ['CUDA_VISIBLE_DEVICES'] = '0, 2,'
        self.num_threads = 32  # number of CPU threads for evaluating the model

        '''Arguments for training'''
        self.num_episode = 1000
        self.gamma = 0.995  # discount factor of the reward
        self.learning_rate = 1e-4  # 1e-4  2 ** -14  2e-4
        self.soft_update_tau = 2 ** -8  # 1e-3  2 ** -8

        self.net_dim = 256  # the network width
        self.batch_size = 4096  # number of transitions sampled from the replay buffer
        self.repeat_times = 2 ** 5  # collect target_step, then update the network
        self.target_step = 4096  # repeatedly update the network to keep the critic's loss small
        self.max_memo = self.target_step  # capacity of the replay buffer

        '''Arguments for evaluation'''
        self.random_seed = 1234
        # self.random_seed_list = [1234, 2234, 3234, 4234, 5234]
        self.random_seed_list = [1234]
        self.train = True
        self.save_network = True
        self.test_network = True
        self.save_test_data = True

    def init_before_training(self):
        if self.cwd is None:
            agent_name = self.agent.__class__.__name__
            self.cwd = f'./{agent_name}'

        np.random.seed(self.random_seed)
        torch.manual_seed(self.random_seed)
        torch.set_num_threads(self.num_threads)
        torch.set_default_dtype(torch.float32)
        os.environ['CUDA_VISIBLE_DEVICES'] = str(self.visible_gpu)


if __name__ == '__main__':
    args = Arguments()
    reward_record = {'episode': [], 'steps': [], 'mean_episode_reward': [], 'unbalance': []}
    loss_record = {'episode': [], 'steps': [], 'critic_loss': [], 'actor_loss': [], 'entropy_loss': []}
    args.visible_gpu = '0'
    for seed in args.random_seed_list:
        args.random_seed = seed
        args.agent = AgentPPO()
        agent_name = f'{args.agent.__class__.__name__}'
        args.agent.cri_target = True
        args.env = WgzGym()
        args.init_before_training()
        '''init agent and environment'''
        agent = args.agent
        env = args.env
        agent.init(args.net_dim, env.state_space.shape[0], env.action_space.shape[0], args.learning_rate)
        gamma = args.gamma
        batch_size = args.batch_size  # amount of data used to update the net
        target_step = args.target_step  # number of steps after which one exploration phase stops
        repeat_times = args.repeat_times  # number of updates for one batch of data
        soft_update_tau = args.soft_update_tau
        num_episode = args.num_episode
        agent.state = env.reset()
        '''init buffer'''
        buffer = list()
        '''init training params'''
        # args.train = False
        # args.save_network = False
        # args.test_network = False
        # args.save_test_data = False
        if args.train:
            for i_episode in range(num_episode):
                with torch.no_grad():
                    trajectory_list = agent.explore_env(env, target_step)
                    _steps, _r_exp = update_buffer(trajectory_list)
                critic_loss, actor_loss, entropy_loss = agent.update_net(buffer, batch_size, repeat_times,
                                                                         soft_update_tau)
                loss_record['critic_loss'].append(critic_loss)
                loss_record['actor_loss'].append(actor_loss)
                loss_record['entropy_loss'].append(entropy_loss)

                with torch.no_grad():
                    episode_reward, episode_unbalance = get_episode_return(env, agent.act, agent.device)
                    reward_record['mean_episode_reward'].append(episode_reward)
                    reward_record['unbalance'].append(episode_unbalance)
                print(f'episode: {i_episode}, reward: {episode_reward}, unbalance: {episode_unbalance}')

        act_save_path = f'{args.cwd}/actor.pth'
        loss_record_path = f'{args.cwd}/loss.pkl'
        reward_record_path = f'{args.cwd}/reward.pkl'

        if args.save_network:
            with open(loss_record_path, 'wb') as tf:
                pickle.dump(loss_record, tf)
            with open(reward_record_path, 'wb') as tf:
                pickle.dump(reward_record, tf)
            torch.save(agent.act.state_dict(), act_save_path)
            print('actor parameters have been saved')

        if args.test_network:
            args.cwd = agent_name
            agent.act.load_state_dict(torch.load(act_save_path))
            print('parameters have been reloaded for testing')
            record = test_one_episode(env, agent.act, agent.device)
            eval_data = pd.DataFrame(record['system_info'])
            eval_data.columns = ['time_step', 'price', 'load', 'action', 'real_action', 'soc', 'battery',
                                 'gen1', 'gen2', 'gen3', 'pv', 'wind', 'unbalance', 'operation_cost', 'reward']
            if args.save_test_data:
                test_data_save_path = f'{args.cwd}/test.pkl'
                with open(test_data_save_path, 'wb') as tf:
                    pickle.dump(record, tf)
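smooth_rewards keeps the sequence length constant: a 'valid' convolution shortens the series by window-1 samples, and the untouched first window-1 values are prepended to compensate. A quick shape check, assuming train.py can be imported without side effects:

# Illustrative only: length and value check for smooth_rewards.
import torch
from train import smooth_rewards

rewards = torch.arange(20, dtype=torch.float32)
smoothed = smooth_rewards(rewards, window=10)
print(smoothed.shape)    # torch.Size([20]) -- same length as the input
print(smoothed[:9])      # the first window-1 values are the raw rewards
print(smoothed[9])       # mean of rewards[0:10] = 4.5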