chenxiaodong 2024-06-18 10:49:43 +08:00
commit 5fd1ac5e14
100 changed files with 422120 additions and 0 deletions

8
.idea/.gitignore vendored Normal file

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

12
.idea/DRL-for-Energy-Systems.iml Normal file

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="rl-microgrid" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
</module>

44
.idea/csv-editor.xml Normal file

@ -0,0 +1,44 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CsvFileAttributes">
<option name="attributeMap">
<map>
<entry key="\data\houseload.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
<entry key="\data\irradiance.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
<entry key="\data\prices.csv">
<value>
<Attribute>
<option name="separator" value=";" />
</Attribute>
</value>
</entry>
<entry key="\data\temper.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
<entry key="\data\wind.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
</map>
</option>
</component>
</project>
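
Note: the separators recorded above matter when the data files are loaded outside the IDE, since prices.csv is semicolon-separated while the other files use commas. A minimal loading sketch (an illustration, not part of the commit; the paths assume the data/ directory added later in this commit, and header handling may need adjusting for the actual first rows):

import pandas as pd

prices = pd.read_csv('data/prices.csv', sep=';')    # semicolon-separated per the config above
houseload = pd.read_csv('data/houseload.csv')       # the remaining files are comma-separated
irradiance = pd.read_csv('data/irradiance.csv')
temperature = pd.read_csv('data/temper.csv')
wind = pd.read_csv('data/wind.csv')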


@ -0,0 +1,151 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="DuplicatedCode" enabled="false" level="WEAK WARNING" enabled_by_default="false">
<Languages>
<language minSize="266" name="Python" />
</Languages>
</inspection_tool>
<inspection_tool class="PyArgumentListInspection" enabled="false" level="WARNING" enabled_by_default="false" />
<inspection_tool class="PyIncorrectDocstringInspection" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="false" level="WARNING" enabled_by_default="false">
<option name="ignoredPackages">
<value>
<list size="97">
<item index="0" class="java.lang.String" itemvalue="absl-py" />
<item index="1" class="java.lang.String" itemvalue="certifi" />
<item index="2" class="java.lang.String" itemvalue="astunparse" />
<item index="3" class="java.lang.String" itemvalue="cachetools" />
<item index="4" class="java.lang.String" itemvalue="charset-normalizer" />
<item index="5" class="java.lang.String" itemvalue="cloudpickle" />
<item index="6" class="java.lang.String" itemvalue="contourpy" />
<item index="7" class="java.lang.String" itemvalue="google-pasta" />
<item index="8" class="java.lang.String" itemvalue="protobuf" />
<item index="9" class="java.lang.String" itemvalue="tensorflow-estimator" />
<item index="10" class="java.lang.String" itemvalue="joblib" />
<item index="11" class="java.lang.String" itemvalue="threadpoolctl" />
<item index="12" class="java.lang.String" itemvalue="opt-einsum" />
<item index="13" class="java.lang.String" itemvalue="scikit-learn" />
<item index="14" class="java.lang.String" itemvalue="python-dateutil" />
<item index="15" class="java.lang.String" itemvalue="setuptools" />
<item index="16" class="java.lang.String" itemvalue="cycler" />
<item index="17" class="java.lang.String" itemvalue="gast" />
<item index="18" class="java.lang.String" itemvalue="markupsafe" />
<item index="19" class="java.lang.String" itemvalue="mkl" />
<item index="20" class="java.lang.String" itemvalue="fsspec" />
<item index="21" class="java.lang.String" itemvalue="pyasn1-modules" />
<item index="22" class="java.lang.String" itemvalue="filelock" />
<item index="23" class="java.lang.String" itemvalue="oauthlib" />
<item index="24" class="java.lang.String" itemvalue="keras" />
<item index="25" class="java.lang.String" itemvalue="pyparsing" />
<item index="26" class="java.lang.String" itemvalue="sympy" />
<item index="27" class="java.lang.String" itemvalue="libclang" />
<item index="28" class="java.lang.String" itemvalue="python" />
<item index="29" class="java.lang.String" itemvalue="werkzeug" />
<item index="30" class="java.lang.String" itemvalue="gymnasium" />
<item index="31" class="java.lang.String" itemvalue="sqlite" />
<item index="32" class="java.lang.String" itemvalue="h5py" />
<item index="33" class="java.lang.String" itemvalue="stable-baselines3" />
<item index="34" class="java.lang.String" itemvalue="tensorboard-data-server" />
<item index="35" class="java.lang.String" itemvalue="wrapt" />
<item index="36" class="java.lang.String" itemvalue="kiwisolver" />
<item index="37" class="java.lang.String" itemvalue="typing-extensions" />
<item index="38" class="java.lang.String" itemvalue="vc" />
<item index="39" class="java.lang.String" itemvalue="tensorflow-intel" />
<item index="40" class="java.lang.String" itemvalue="jinja2" />
<item index="41" class="java.lang.String" itemvalue="fonttools" />
<item index="42" class="java.lang.String" itemvalue="flatbuffers" />
<item index="43" class="java.lang.String" itemvalue="tensorboard" />
<item index="44" class="java.lang.String" itemvalue="tbb" />
<item index="45" class="java.lang.String" itemvalue="matplotlib" />
<item index="46" class="java.lang.String" itemvalue="openssl" />
<item index="47" class="java.lang.String" itemvalue="ca-certificates" />
<item index="48" class="java.lang.String" itemvalue="idna" />
<item index="49" class="java.lang.String" itemvalue="mkl-service" />
<item index="50" class="java.lang.String" itemvalue="mkl_fft" />
<item index="51" class="java.lang.String" itemvalue="rsa" />
<item index="52" class="java.lang.String" itemvalue="networkx" />
<item index="53" class="java.lang.String" itemvalue="numpy" />
<item index="54" class="java.lang.String" itemvalue="pyasn1" />
<item index="55" class="java.lang.String" itemvalue="importlib-metadata" />
<item index="56" class="java.lang.String" itemvalue="requests-oauthlib" />
<item index="57" class="java.lang.String" itemvalue="seaborn" />
<item index="58" class="java.lang.String" itemvalue="zipp" />
<item index="59" class="java.lang.String" itemvalue="markdown" />
<item index="60" class="java.lang.String" itemvalue="ml-dtypes" />
<item index="61" class="java.lang.String" itemvalue="urllib3" />
<item index="62" class="java.lang.String" itemvalue="vs2015_runtime" />
<item index="63" class="java.lang.String" itemvalue="numpy-base" />
<item index="64" class="java.lang.String" itemvalue="scipy" />
<item index="65" class="java.lang.String" itemvalue="google-auth-oauthlib" />
<item index="66" class="java.lang.String" itemvalue="intel-openmp" />
<item index="67" class="java.lang.String" itemvalue="tzdata" />
<item index="68" class="java.lang.String" itemvalue="farama-notifications" />
<item index="69" class="java.lang.String" itemvalue="packaging" />
<item index="70" class="java.lang.String" itemvalue="torch" />
<item index="71" class="java.lang.String" itemvalue="et-xmlfile" />
<item index="72" class="java.lang.String" itemvalue="tensorflow-io-gcs-filesystem" />
<item index="73" class="java.lang.String" itemvalue="pandas" />
<item index="74" class="java.lang.String" itemvalue="termcolor" />
<item index="75" class="java.lang.String" itemvalue="importlib-resources" />
<item index="76" class="java.lang.String" itemvalue="mkl_random" />
<item index="77" class="java.lang.String" itemvalue="icc_rt" />
<item index="78" class="java.lang.String" itemvalue="mpmath" />
<item index="79" class="java.lang.String" itemvalue="pillow" />
<item index="80" class="java.lang.String" itemvalue="grpcio" />
<item index="81" class="java.lang.String" itemvalue="pytz" />
<item index="82" class="java.lang.String" itemvalue="google-auth" />
<item index="83" class="java.lang.String" itemvalue="openpyxl" />
<item index="84" class="java.lang.String" itemvalue="blas" />
<item index="85" class="java.lang.String" itemvalue="gym" />
<item index="86" class="java.lang.String" itemvalue="nltk" />
<item index="87" class="java.lang.String" itemvalue="pygame" />
<item index="88" class="java.lang.String" itemvalue="mo-gymnasium" />
<item index="89" class="java.lang.String" itemvalue="rich" />
<item index="90" class="java.lang.String" itemvalue="pynvml" />
<item index="91" class="java.lang.String" itemvalue="mujoco" />
<item index="92" class="java.lang.String" itemvalue="pybullet-gym" />
<item index="93" class="java.lang.String" itemvalue="mpi4py" />
<item index="94" class="java.lang.String" itemvalue="tqdm" />
<item index="95" class="java.lang.String" itemvalue="ipython" />
<item index="96" class="java.lang.String" itemvalue="tensorboardX" />
</list>
</value>
</option>
</inspection_tool>
<inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="E731" />
</list>
</option>
</inspection_tool>
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="N812" />
<option value="N806" />
<option value="N802" />
<option value="N803" />
<option value="N801" />
<option value="N813" />
</list>
</option>
</inspection_tool>
<inspection_tool class="PyShadowingNamesInspection" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
<inspection_tool class="PyUnresolvedReferencesInspection" enabled="false" level="WARNING" enabled_by_default="false">
<option name="ignoredIdentifiers">
<list>
<option value="filename" />
<option value="int.*" />
<option value="object.*" />
</list>
</option>
</inspection_tool>
<inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
<option name="processCode" value="true" />
<option name="processLiterals" value="true" />
<option name="processComments" value="true" />
</inspection_tool>
</profile>
</component>


@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

7
.idea/misc.xml Normal file

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="rl-microgrid" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="rl-microgrid" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/DRL-for-Energy-Systems.iml" filepath="$PROJECT_DIR$/.idea/DRL-for-Energy-Systems.iml" />
</modules>
</component>
</project>

6
.idea/vcs.xml Normal file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

Image added (143 KiB); file diff suppressed because it is too large.

Image added (126 KiB); file diff suppressed because it is too large.

BIN
AgentDDPG/actor.pth Normal file

Binary file not shown.

BIN
AgentDDPG/loss_data.pkl Normal file

Binary file not shown.

BIN
AgentDDPG/reward_data.pkl Normal file

Binary file not shown.

BIN
AgentDDPG/test_data.pkl Normal file

Binary file not shown.

Image added (142 KiB); file diff suppressed because it is too large.

Image added (127 KiB); file diff suppressed because it is too large.

BIN
AgentPPO/actor.pth Normal file

Binary file not shown.

BIN
AgentPPO/loss_data.pkl Normal file

Binary file not shown.

BIN
AgentPPO/reward_data.pkl Normal file

Binary file not shown.

BIN
AgentPPO/test_data.pkl Normal file

Binary file not shown.

Image added (145 KiB); file diff suppressed because it is too large.

Image added (126 KiB); file diff suppressed because it is too large.

BIN
AgentSAC/actor.pth Normal file

Binary file not shown.

BIN
AgentSAC/loss_data.pkl Normal file

Binary file not shown.

BIN
AgentSAC/reward_data.pkl Normal file

Binary file not shown.

BIN
AgentSAC/test_data.pkl Normal file

Binary file not shown.

Image added (143 KiB); file diff suppressed because it is too large.

Image added (126 KiB); file diff suppressed because it is too large.

BIN
AgentTD3/actor.pth Normal file

Binary file not shown.

BIN
AgentTD3/loss_data.pkl Normal file

Binary file not shown.

BIN
AgentTD3/reward_data.pkl Normal file

Binary file not shown.

BIN
AgentTD3/test_data.pkl Normal file

Binary file not shown.

133
DDPG.py Normal file

@ -0,0 +1,133 @@
import pickle
import pandas as pd
import torch
from agent import AgentDDPG
from environment import ESSEnv
from tools import *
def update_buffer(_trajectory):
ten_state = torch.as_tensor([item[0] for item in _trajectory], dtype=torch.float32)
ary_other = torch.as_tensor([item[1] for item in _trajectory])
ary_other[:, 0] = ary_other[:, 0] # ten_reward
ary_other[:, 1] = (1.0 - ary_other[:, 1]) * gamma # ten_mask = (1.0 - ary_done) * gamma
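# added note: folding gamma * (1 - done) into column 1 lets the critic target later be
# formed as q_label = reward + mask * next_q (see AgentDDPG.get_obj_critic in agent.py),
# which both discounts the bootstrap value and zeroes it on terminal transitions.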
buffer.extend_buffer(ten_state, ary_other)
_steps = ten_state.shape[0]
_r_exp = ary_other[:, 0].mean() # other = (reward, mask, action)
return _steps, _r_exp
if __name__ == '__main__':
args = Arguments()
'''here record real unbalance'''
reward_record = {'episode': [], 'steps': [], 'mean_episode_reward': [], 'unbalance': []}
loss_record = {'episode': [], 'steps': [], 'critic_loss': [], 'actor_loss': [], 'entropy_loss': []}
args.visible_gpu = '0'
for seed in args.random_seed_list:
args.random_seed = seed
args.agent = AgentDDPG()
agent_name = f'{args.agent.__class__.__name__}'
args.agent.cri_target = True
args.env = ESSEnv()
# create lists of lists, or create one long list?
args.init_before_training(if_main=True)
'''init agent and environment'''
agent = args.agent
env = args.env
agent.init(args.net_dim, env.state_space.shape[0], env.action_space.shape[0], args.learning_rate,
args.if_per_or_gae)
'''init replay buffer'''
buffer = ReplayBuffer(max_len=args.max_memo, state_dim=env.state_space.shape[0],
action_dim=env.action_space.shape[0])
'''start training'''
cwd = args.cwd
gamma = args.gamma
batch_size = args.batch_size
target_step = args.target_step # how many environment steps to collect before one exploration episode is cut off
repeat_times = args.repeat_times # how many update passes over each batch of collected data
soft_update_tau = args.soft_update_tau
# get the first state from the environment
agent.state = env.reset()
'''collect data and train and update network'''
num_episode = args.num_episode
# args.train=False
# args.save_network=False
# args.test_network=False
# args.save_test_data=False
# args.compare_with_pyomo=False
if args.train:
collect_data = True
while collect_data:
print(f'buffer:{buffer.now_len}')
with torch.no_grad():
trajectory = agent.explore_env(env, target_step)
steps, r_exp = update_buffer(trajectory)
buffer.update_now_len()
if buffer.now_len >= 10000:
collect_data = False
for i_episode in range(num_episode):
critic_loss, actor_loss = agent.update_net(buffer, batch_size, repeat_times, soft_update_tau)
loss_record['critic_loss'].append(critic_loss)
loss_record['actor_loss'].append(actor_loss)
with torch.no_grad():
episode_reward, episode_unbalance = get_episode_return(env, agent.act, agent.device)
reward_record['mean_episode_reward'].append(episode_reward)
reward_record['unbalance'].append(episode_unbalance)
print(f'current episode is {i_episode}, reward: {episode_reward}, '
f'unbalance: {episode_unbalance}, buffer_length: {buffer.now_len}')
if i_episode % 10 == 0:
# target_step
with torch.no_grad():
trajectory = agent.explore_env(env, target_step)
steps, r_exp = update_buffer(trajectory)
loss_record_path = f'{args.cwd}/loss_data.pkl'
reward_record_path = f'{args.cwd}/reward_data.pkl'
# currently only the actor trained with the last random seed is stored
act_save_path = f'{args.cwd}/actor.pth'
with open(loss_record_path, 'wb') as tf:
pickle.dump(loss_record, tf)
with open(reward_record_path, 'wb') as tf:
pickle.dump(reward_record, tf)
if args.save_network:
torch.save(agent.act.state_dict(), act_save_path)
print('actor parameters have been saved')
if args.test_network:
args.cwd = agent_name
agent.act.load_state_dict(torch.load(act_save_path))
print('parameters have been reloaded for testing')
record = test_one_episode(env, agent.act, agent.device)
eval_data = pd.DataFrame(record['system_info'])
eval_data.columns = ['time_step', 'price', 'netload', 'action', 'real_action', 'soc', 'battery', 'gen1', 'gen2',
'gen3', 'unbalance', 'operation_cost']
if args.save_test_data:
test_data_save_path = f'{args.cwd}/test_data.pkl'
with open(test_data_save_path, 'wb') as tf:
pickle.dump(record, tf)
'''compare with pyomo data and results'''
if args.compare_with_pyomo:
month = record['init_info'][0][0]
day = record['init_info'][0][1]
initial_soc = record['init_info'][0][3]
base_result = optimization_base_result(env, month, day, initial_soc)
if args.plot_on:
from plotDRL import PlotArgs, make_dir, plot_evaluation_information, plot_optimization_result
plot_args = PlotArgs()
plot_args.feature_change = ''
args.cwd = agent_name
plot_dir = make_dir(args.cwd, plot_args.feature_change)
plot_optimization_result(base_result, plot_dir)
plot_evaluation_information(args.cwd + '/' + 'test_data.pkl', plot_dir)
'''compare the costs obtained from Pyomo and DDPG'''
ratio = sum(eval_data['operation_cost']) / sum(base_result['step_cost'])
print('operation_cost_sum:', sum(eval_data['operation_cost']))
print('step_cost_sum:', sum(base_result['step_cost']))
print('ratio:', ratio)
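
Note: the binary artifacts committed under AgentDDPG/ (actor.pth, loss_data.pkl, reward_data.pkl, test_data.pkl) are exactly what this script writes. A minimal sketch for inspecting them offline (an illustration, not part of the commit; it assumes the snippet is run from the repository root):

import pickle
import torch

with open('AgentDDPG/reward_data.pkl', 'rb') as f:
    reward_record = pickle.load(f)          # dict with 'mean_episode_reward', 'unbalance', ...
print(len(reward_record['mean_episode_reward']), 'episode rewards logged')

actor_weights = torch.load('AgentDDPG/actor.pth', map_location='cpu')  # state_dict saved by torch.save
print(list(actor_weights.keys())[:5])       # first few parameter names of the actor network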

399
PPO.py Normal file

@ -0,0 +1,399 @@
import os
import pickle
from copy import deepcopy
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from environment import ESSEnv
from tools import get_episode_return, test_one_episode, optimization_base_result
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
script_name = os.path.basename(__file__)
# note: adding layer normalization did not help, so it is disabled by default (layer_norm=False)
class ActorPPO(nn.Module):
def __init__(self, mid_dim, state_dim, action_dim, layer_norm=False):
super().__init__()
self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
nn.Linear(mid_dim, action_dim), )
# the logarithm (log) of the standard deviation (std) of the action; it is a trainable parameter
self.a_logstd = nn.Parameter(torch.zeros((1, action_dim)) - 0.5, requires_grad=True)
self.sqrt_2pi_log = np.log(np.sqrt(2 * np.pi))
if layer_norm:
self.layer_norm(self.net)
@staticmethod
def layer_norm(layer, std=1.0, bias_const=0.0):
for i in layer:
if hasattr(i, 'weight'):
torch.nn.init.orthogonal_(i.weight, std)
torch.nn.init.constant_(i.bias, bias_const)
def forward(self, state):
return self.net(state).tanh() # action.tanh() # in this way limit the data output of action
def get_action(self, state):
a_avg = self.net(state) # unbounded mean of the action distribution (tanh is applied before the env step)
a_std = self.a_logstd.exp()
noise = torch.randn_like(a_avg)
action = a_avg + noise * a_std
return action, noise
def get_logprob_entropy(self, state, action):
a_avg = self.net(state)
a_std = self.a_logstd.exp()
delta = ((a_avg - action) / a_std).pow(2) * 0.5
logprob = -(self.a_logstd + self.sqrt_2pi_log + delta).sum(1) # new_logprob
dist_entropy = (logprob.exp() * logprob).mean() # policy entropy
return logprob, dist_entropy
def get_old_logprob(self, _action, noise): # noise = action - a_noise
delta = noise.pow(2) * 0.5
return -(self.a_logstd + self.sqrt_2pi_log + delta).sum(1) # old_logprob
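# added note: for this diagonal Gaussian policy both methods evaluate
# log pi(a|s) = -sum_i [ log sigma_i + 0.5 * log(2 * pi) + (a_i - mu_i)^2 / (2 * sigma_i^2) ].
# get_logprob_entropy recomputes it with the current parameters, while get_old_logprob
# reuses the stored exploration noise, because noise = (action - a_avg) / a_std.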
class CriticAdv(nn.Module):
def __init__(self, mid_dim, state_dim, _action_dim, layer_norm=False):
super().__init__()
self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
nn.Linear(mid_dim, 1))
if layer_norm:
self.layer_norm(self.net, std=1.0)
@staticmethod
def layer_norm(layer, std=1.0, bias_const=0.0):
for i in layer:
if hasattr(i, 'weight'):
torch.nn.init.orthogonal_(i.weight, std)
torch.nn.init.constant_(i.bias, bias_const)
def forward(self, state):
return self.net(state) # Advantage value
class AgentPPO:
def __init__(self):
super().__init__()
self.state = None
self.device = None
self.action_dim = None
self.get_obj_critic = None
self.criterion = torch.nn.SmoothL1Loss()
self.cri = self.cri_target = self.if_use_cri_target = self.cri_optim = self.ClassCri = None
self.act = self.act_target = self.if_use_act_target = self.act_optim = self.ClassAct = None
'''init modify'''
self.ClassCri = CriticAdv
self.ClassAct = ActorPPO
self.ratio_clip = 0.2 # ratio.clamp(1 - clip, 1 + clip)
self.lambda_entropy = 0.02 # could be 0.01~0.05
self.lambda_gae_adv = 0.98 # could be 0.95~0.99, GAE (Generalized Advantage Estimation. ICLR.2016.)
self.get_reward_sum = None # self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw
self.trajectory_list = None
def init(self, net_dim, state_dim, action_dim, learning_rate=1e-4, if_use_gae=False, gpu_id=0):
self.device = torch.device(f"cuda:{gpu_id}" if (torch.cuda.is_available() and (gpu_id >= 0)) else "cpu")
self.trajectory_list = list()
# choose whether to use gae or not
self.get_reward_sum = self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw
self.cri = self.ClassCri(net_dim, state_dim, action_dim).to(self.device)
self.act = self.ClassAct(net_dim, state_dim, action_dim).to(self.device) if self.ClassAct else self.cri
self.cri_target = deepcopy(self.cri) if self.if_use_cri_target else self.cri
self.act_target = deepcopy(self.act) if self.if_use_act_target else self.act
self.cri_optim = torch.optim.Adam(self.cri.parameters(), learning_rate)
self.act_optim = torch.optim.Adam(self.act.parameters(), learning_rate) if self.ClassAct else self.cri
del self.ClassCri, self.ClassAct # why del self.ClassCri and self.ClassAct here, to save memory?
def select_action(self, state):
states = torch.as_tensor((state,), dtype=torch.float32, device=self.device)
actions, noises = self.act.get_action(states)
return actions[0].detach().cpu().numpy(), noises[0].detach().cpu().numpy()
def explore_env(self, env, target_step):
state = self.state # resume exploration from the agent's stored current state
trajectory_temp = list()
last_done = 0
for i in range(target_step):
action, noise = self.select_action(state)
state, next_state, reward, done, = env.step(
np.tanh(action)) # the tanh-squashed action is what is actually passed to the environment
trajectory_temp.append((state, reward, done, action, noise))
if done:
state = env.reset()
last_done = i
else:
state = next_state
self.state = state
'''splice list'''
trajectory_list = self.trajectory_list + trajectory_temp[
:last_done + 1] # keep transitions up to and including the last finished episode
self.trajectory_list = trajectory_temp[last_done:]
return trajectory_list
def update_net(self, buffer, batch_size, repeat_times, soft_update_tau):
"""put data extract and update network together"""
with torch.no_grad():
buf_len = buffer[0].shape[0]
buf_state, buf_action, buf_noise, buf_reward, buf_mask = [ten.to(self.device) for ten in
buffer] # decompose buffer data
'''get buf_r_sum, buf_logprob'''
bs = 4096 # set a smaller 'BatchSize' when out of GPU memory: 1024, could change to 4096
buf_value = [self.cri_target(buf_state[i:i + bs]) for i in range(0, buf_len, bs)] #
buf_value = torch.cat(buf_value, dim=0)
buf_logprob = self.act.get_old_logprob(buf_action, buf_noise)
buf_r_sum, buf_advantage = self.get_reward_sum(buf_len, buf_reward, buf_mask, buf_value) # detach()
# normalize advantage
buf_advantage = (buf_advantage - buf_advantage.mean()) / (buf_advantage.std() + 1e-5)
del buf_noise, buffer[:]
'''PPO: Surrogate objective of Trust Region'''
obj_critic = obj_actor = None
for _ in range(int(buf_len / batch_size * repeat_times)):
indices = torch.randint(buf_len, size=(batch_size,), requires_grad=False, device=self.device)
state = buf_state[indices]
action = buf_action[indices]
r_sum = buf_r_sum[indices]
logprob = buf_logprob[indices]
advantage = buf_advantage[indices]
new_logprob, obj_entropy = self.act.get_logprob_entropy(state, action) # it is obj_actor
ratio = (new_logprob - logprob.detach()).exp()
surrogate1 = advantage * ratio
surrogate2 = advantage * ratio.clamp(1 - self.ratio_clip, 1 + self.ratio_clip)
obj_surrogate = -torch.min(surrogate1, surrogate2).mean()
obj_actor = obj_surrogate + obj_entropy * self.lambda_entropy
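# added note: ratio = exp(log pi_new - log pi_old); -min(ratio * A, clip(ratio, 1 - eps, 1 + eps) * A)
# is the standard PPO clipped surrogate with eps = self.ratio_clip, and lambda_entropy weights
# a small entropy regularisation term intended to keep exploration alive.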
self.optim_update(self.act_optim, obj_actor) # update actor
value = self.cri(state).squeeze(1) # critic network predicts the reward_sum (Q value) of state
# use Smooth L1 loss to evaluate the value loss
# obj_critic = self.criterion(value, r_sum) / (r_sum.std() + 1e-6)
obj_critic = self.criterion(value, r_sum)
self.optim_update(self.cri_optim, obj_critic) # backpropagate the value loss and update the critic
# choose whether to use soft update
self.soft_update(self.cri_target, self.cri, soft_update_tau) if self.cri_target is not self.cri else None
a_std_log = getattr(self.act, 'a_std_log', torch.zeros(1))
return obj_critic.item(), obj_actor.item(), a_std_log.mean().item() # logging_tuple
def get_reward_sum_raw(self, buf_len, buf_reward, buf_mask, buf_value) -> (torch.Tensor, torch.Tensor):
buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device) # reward sum
pre_r_sum = 0
for i in range(buf_len - 1, -1, -1):
buf_r_sum[i] = buf_reward[i] + buf_mask[i] * pre_r_sum
pre_r_sum = buf_r_sum[i]
buf_advantage = buf_r_sum - (buf_mask * buf_value[:, 0])
return buf_r_sum, buf_advantage
def get_reward_sum_gae(self, buf_len, ten_reward, ten_mask, ten_value) -> (torch.Tensor, torch.Tensor):
buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device) # old policy value
buf_advantage = torch.empty(buf_len, dtype=torch.float32, device=self.device) # advantage value
pre_r_sum = 0
pre_advantage = 0 # advantage value of previous step
for i in range(buf_len - 1, -1, -1):
buf_r_sum[i] = ten_reward[i] + ten_mask[i] * pre_r_sum
pre_r_sum = buf_r_sum[i]
buf_advantage[i] = ten_reward[i] + ten_mask[i] * (pre_advantage - ten_value[i]) # fix a bug here
pre_advantage = ten_value[i] + buf_advantage[i] * self.lambda_gae_adv
return buf_r_sum, buf_advantage
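# added note: the backward loop folds mask = gamma * (1 - done) into both the discounted
# return buf_r_sum and the advantage estimate; lambda_gae_adv plays the role of the GAE
# lambda that trades bias against variance in the advantage.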
@staticmethod
def optim_update(optimizer, objective):
optimizer.zero_grad()
objective.backward()
optimizer.step()
@staticmethod
def soft_update(target_net, current_net, tau):
for tar, cur in zip(target_net.parameters(), current_net.parameters()):
tar.data.copy_(cur.data.__mul__(tau) + tar.data.__mul__(1.0 - tau))
class Arguments:
def __init__(self, agent=None, env=None):
self.agent = agent # Deep Reinforcement Learning algorithm
self.env = env # the environment for training
self.cwd = None # current work directory. None means set automatically
self.if_remove = False # remove the cwd folder? (True, False, None:ask me)
self.visible_gpu = '0' # for example: os.environ['CUDA_VISIBLE_DEVICES'] = '0, 2,'
# self.worker_num = 2 # number of rollout workers per GPU (adjust it to get high GPU usage)
self.num_threads = 32 # number of CPU threads for evaluating the model; passed to torch.set_num_threads(self.num_threads)
'''Arguments for training'''
self.num_episode = 1000 # to control the train episodes for PPO
self.gamma = 0.995 # discount factor of future rewards
self.learning_rate = 2 ** -14 # about 6e-5
self.soft_update_tau = 2 ** -8 # 2 ** -8 is about 4e-3
self.net_dim = 256 # the network width
self.batch_size = 4096 # num of transitions sampled from replay buffer.
self.repeat_times = 2 ** 3 # how many update passes over each batch of collected data
self.target_step = 4096 # collect this many environment steps, then update the network
self.max_memo = self.target_step # capacity of replay buffer
self.if_per_or_gae = False # GAE for on-policy sparse reward: Generalized Advantage Estimation.
'''Arguments for evaluate'''
self.random_seed = 0 # initialize random seed in self.init_before_training()
# self.random_seed_list = [1234, 2234, 3234, 4234, 5234]
self.random_seed_list = [1234]
self.train = True
self.save_network = True
self.test_network = True
self.save_test_data = True
self.compare_with_pyomo = True
self.plot_on = True
def init_before_training(self, if_main):
if self.cwd is None:
agent_name = self.agent.__class__.__name__
self.cwd = f'./{agent_name}'
if if_main:
import shutil # remove history according to bool(if_remove)
if self.if_remove is None:
self.if_remove = bool(input(f"| PRESS 'y' to REMOVE: {self.cwd}? ") == 'y')
elif self.if_remove:
shutil.rmtree(self.cwd, ignore_errors=True)
print(f"| Remove cwd: {self.cwd}")
os.makedirs(self.cwd, exist_ok=True)
np.random.seed(self.random_seed)
torch.manual_seed(self.random_seed)
torch.set_num_threads(self.num_threads)
torch.set_default_dtype(torch.float32)
os.environ['CUDA_VISIBLE_DEVICES'] = str(self.visible_gpu)
def update_buffer(_trajectory):
_trajectory = list(map(list, zip(*_trajectory))) # 2D-list transpose, here cut the trajectory into 5 parts
ten_state = torch.as_tensor(_trajectory[0]) # tensor state here
ten_reward = torch.as_tensor(_trajectory[1], dtype=torch.float32)
# _trajectory[2] = done, replace done by mask, save memory
ten_mask = (1.0 - torch.as_tensor(_trajectory[2], dtype=torch.float32)) * gamma
ten_action = torch.as_tensor(_trajectory[3])
ten_noise = torch.as_tensor(_trajectory[4], dtype=torch.float32)
buffer[:] = (ten_state, ten_action, ten_noise, ten_reward, ten_mask) # list store tensors
_steps = ten_reward.shape[0] # how many steps are collected in all trajectories
_r_exp = ten_reward.mean() # the mean reward
return _steps, _r_exp
if __name__ == '__main__':
args = Arguments()
reward_record = {'episode': [], 'steps': [], 'mean_episode_reward': [], 'unbalance': []}
loss_record = {'episode': [], 'steps': [], 'critic_loss': [], 'actor_loss': [], 'entropy_loss': []}
args.visible_gpu = '0'
for seed in args.random_seed_list:
args.random_seed = seed
args.agent = AgentPPO()
agent_name = f'{args.agent.__class__.__name__}'
args.agent.cri_target = True
args.env = ESSEnv()
args.init_before_training(if_main=True)
'''init agent and environment'''
agent = args.agent
env = args.env
agent.init(args.net_dim, env.state_space.shape[0], env.action_space.shape[0], args.learning_rate,
args.if_per_or_gae)
cwd = args.cwd
gamma = args.gamma
batch_size = args.batch_size # how much data should be used to update net
target_step = args.target_step # how many environment steps to collect before one exploration episode is cut off
repeat_times = args.repeat_times # how many update passes over each batch of collected data
soft_update_tau = args.soft_update_tau
agent.state = env.reset()
'''init buffer'''
buffer = list()
'''init training parameters'''
num_episode = args.num_episode
# args.train=False
# args.save_network=False
# args.test_network=False
# args.save_test_data=False
# args.compare_with_pyomo=False
if args.train:
for i_episode in range(num_episode):
with torch.no_grad():
trajectory_list = agent.explore_env(env, target_step)
steps, r_exp = update_buffer(trajectory_list)
critic_loss, actor_loss, entropy_loss = agent.update_net(buffer, batch_size, repeat_times,
soft_update_tau)
loss_record['critic_loss'].append(critic_loss)
loss_record['actor_loss'].append(actor_loss)
loss_record['entropy_loss'].append(entropy_loss)
with torch.no_grad():
episode_reward, episode_unbalance = get_episode_return(env, agent.act, agent.device)
reward_record['mean_episode_reward'].append(episode_reward)
reward_record['unbalance'].append(episode_unbalance)
print(f'current episode is {i_episode}, reward: {episode_reward}, unbalance: {episode_unbalance}')
act_save_path = f'{args.cwd}/actor.pth'
loss_record_path = f'{args.cwd}/loss_data.pkl'
reward_record_path = f'{args.cwd}/reward_data.pkl'
with open(loss_record_path, 'wb') as tf:
pickle.dump(loss_record, tf)
with open(reward_record_path, 'wb') as tf:
pickle.dump(reward_record, tf)
if args.save_network:
torch.save(agent.act.state_dict(), act_save_path)
print('actor parameters have been saved')
if args.test_network:
args.cwd = agent_name + '0618'
agent.act.load_state_dict(torch.load(act_save_path))
print('parameters have been reloaded for testing')
record = test_one_episode(env, agent.act, agent.device)
eval_data = pd.DataFrame(record['information'])
eval_data.columns = ['time_step', 'price', 'netload', 'action', 'real_action', 'soc', 'battery', 'gen1', 'gen2',
'gen3', 'unbalance', 'operation_cost']
if args.save_test_data:
test_data_save_path = f'{args.cwd}/test_data.pkl'
with open(test_data_save_path, 'wb') as tf:
pickle.dump(record, tf)
'''compare with pyomo data and results'''
if args.compare_with_pyomo:
month = record['init_info'][0][0]
day = record['init_info'][0][1]
initial_soc = record['init_info'][0][3]
base_result = optimization_base_result(env, month, day, initial_soc)
if args.plot_on:
from plotDRL import PlotArgs, make_dir, plot_evaluation_information, plot_optimization_result
plot_args = PlotArgs()
plot_args.feature_change = ''
args.cwd = agent_name
plot_dir = make_dir(args.cwd, plot_args.feature_change)
plot_optimization_result(base_result, plot_dir)
plot_evaluation_information(args.cwd + '/' + 'test_data.pkl', plot_dir)
'''compare the costs obtained from Pyomo and PPO'''
ratio = sum(eval_data['operation_cost']) / sum(base_result['step_cost'])
print('operation_cost_sum:', sum(eval_data['operation_cost']))
print('step_cost_sum:', sum(base_result['step_cost']))
print('ratio:', ratio)

137
SAC.py Normal file

@ -0,0 +1,137 @@
import pickle
import pandas as pd
import torch
from agent import AgentSAC
from environment import ESSEnv
from tools import *
def update_buffer(_trajectory):
ten_state = torch.as_tensor([item[0] for item in _trajectory], dtype=torch.float32)
ary_other = torch.as_tensor([item[1] for item in _trajectory])
ary_other[:, 0] = ary_other[:, 0] # ten_reward
ary_other[:, 1] = (1.0 - ary_other[:, 1]) * gamma # ten_mask = (1.0 - ary_done) * gamma
buffer.extend_buffer(ten_state, ary_other)
_steps = ten_state.shape[0]
_r_exp = ary_other[:, 0].mean() # other = (reward, mask, action)
return _steps, _r_exp
if __name__ == '__main__':
args = Arguments()
reward_record = {'episode': [], 'steps': [], 'mean_episode_reward': [], 'unbalance': []}
loss_record = {'episode': [], 'steps': [], 'critic_loss': [], 'actor_loss': [], 'entropy_loss': []}
args.visible_gpu = '0'
for seed in args.random_seed_list:
args.random_seed = seed
args.agent = AgentSAC()
agent_name = f'{args.agent.__class__.__name__}'
args.agent.cri_target = True
args.env = ESSEnv()
args.init_before_training(if_main=True)
'''init agent and environment'''
agent = args.agent
env = args.env
agent.init(args.net_dim, env.state_space.shape[0], env.action_space.shape[0], args.learning_rate,
args.if_per_or_gae)
'''init replay buffer'''
buffer = ReplayBuffer(max_len=args.max_memo, state_dim=env.state_space.shape[0],
action_dim=env.action_space.shape[0])
'''start training'''
cwd = args.cwd
gamma = args.gamma
batch_size = args.batch_size # how much data should be used to update net
target_step = args.target_step # how many environment steps to collect before one exploration episode is cut off
repeat_times = args.repeat_times # how many update passes over each batch of collected data
soft_update_tau = args.soft_update_tau
agent.state = env.reset()
'''collect data and train and update network'''
num_episode = args.num_episode
'''here record real unbalance'''
##
# args.train=False
# args.save_network=False
# args.test_network=False
# args.save_test_data=False
# args.compare_with_pyomo=False
#
if args.train:
collect_data = True
while collect_data:
print(f'buffer:{buffer.now_len}')
with torch.no_grad():
trajectory = agent.explore_env(env, target_step)
steps, r_exp = update_buffer(trajectory)
buffer.update_now_len()
if buffer.now_len >= 10000:
collect_data = False
for i_episode in range(num_episode):
critic_loss, actor_loss, entropy_loss = agent.update_net(buffer, batch_size, repeat_times,
soft_update_tau)
loss_record['critic_loss'].append(critic_loss)
loss_record['actor_loss'].append(actor_loss)
loss_record['entropy_loss'].append(entropy_loss)
with torch.no_grad():
episode_reward, episode_unbalance = get_episode_return(env, agent.act, agent.device)
reward_record['mean_episode_reward'].append(episode_reward)
reward_record['unbalance'].append(episode_unbalance)
print(
f'current episode is {i_episode}, reward: {episode_reward}, '
f'unbalance: {episode_unbalance}, buffer_length: {buffer.now_len}')
if i_episode % 10 == 0:
# target_step
with torch.no_grad():
trajectory = agent.explore_env(env, target_step)
steps, r_exp = update_buffer(trajectory)
act_save_path = f'{args.cwd}/actor.pth'
loss_record_path = f'{args.cwd}/loss_data.pkl'
reward_record_path = f'{args.cwd}/reward_data.pkl'
with open(loss_record_path, 'wb') as tf:
pickle.dump(loss_record, tf)
with open(reward_record_path, 'wb') as tf:
pickle.dump(reward_record, tf)
if args.save_network:
torch.save(agent.act.state_dict(), act_save_path)
print('actor parameters have been saved')
if args.test_network:
args.cwd = agent_name
agent.act.load_state_dict(torch.load(act_save_path))
print('parameters have been reloaded for testing')
record = test_one_episode(env, agent.act, agent.device)
eval_data = pd.DataFrame(record['information'])
eval_data.columns = ['time_step', 'price', 'netload', 'action', 'real_action', 'soc', 'battery', 'gen1', 'gen2',
'gen3', 'unbalance', 'operation_cost']
if args.save_test_data:
test_data_save_path = f'{args.cwd}/test_data.pkl'
with open(test_data_save_path, 'wb') as tf:
pickle.dump(record, tf)
'''compare with pyomo data and results'''
if args.compare_with_pyomo:
month = record['init_info'][0][0]
day = record['init_info'][0][1]
initial_soc = record['init_info'][0][3]
base_result = optimization_base_result(env, month, day, initial_soc)
if args.plot_on:
from plotDRL import PlotArgs, make_dir, plot_evaluation_information, plot_optimization_result
plot_args = PlotArgs()
plot_args.feature_change = ''
args.cwd = agent_name
plot_dir = make_dir(args.cwd, plot_args.feature_change)
plot_optimization_result(base_result, plot_dir)
plot_evaluation_information(args.cwd + '/' + 'test_data.pkl', plot_dir)
'''compare the costs obtained from Pyomo and SAC'''
ratio = sum(eval_data['operation_cost']) / sum(base_result['step_cost'])
print('operation_cost_sum:', sum(eval_data['operation_cost']))
print('step_cost_sum:', sum(base_result['step_cost']))
print('ratio:', ratio)

133
TD3.py Normal file

@ -0,0 +1,133 @@
import pickle
import pandas as pd
import torch
from agent import AgentTD3
from environment import ESSEnv
from tools import *
def update_buffer(_trajectory):
ten_state = torch.as_tensor([item[0] for item in _trajectory], dtype=torch.float32)
ary_other = torch.as_tensor([item[1] for item in _trajectory])
ary_other[:, 0] = ary_other[:, 0] # ten_reward
ary_other[:, 1] = (1.0 - ary_other[:, 1]) * gamma # ten_mask = (1.0 - ary_done) * gamma
buffer.extend_buffer(ten_state, ary_other)
_steps = ten_state.shape[0]
_r_exp = ary_other[:, 0].mean() # other = (reward, mask, action)
return _steps, _r_exp
if __name__ == '__main__':
args = Arguments()
reward_record = {'episode': [], 'steps': [], 'mean_episode_reward': [], 'unbalance': []}
loss_record = {'episode': [], 'steps': [], 'critic_loss': [], 'actor_loss': [], 'entropy_loss': []}
args.visible_gpu = '0'
for seed in args.random_seed_list:
args.random_seed = seed
args.agent = AgentTD3()
agent_name = f'{args.agent.__class__.__name__}'
args.agent.cri_target = True
args.env = ESSEnv()
args.init_before_training(if_main=True)
'''init agent and environment'''
agent = args.agent
env = args.env
agent.init(args.net_dim, env.state_space.shape[0], env.action_space.shape[0], args.learning_rate,
args.if_per_or_gae)
'''init replay buffer'''
buffer = ReplayBuffer(max_len=args.max_memo, state_dim=env.state_space.shape[0],
action_dim=env.action_space.shape[0])
'''start training'''
cwd = args.cwd
gamma = args.gamma
batch_size = args.batch_size # how much data should be used to update net
target_step = args.target_step # how many environment steps to collect before one exploration episode is cut off
repeat_times = args.repeat_times # how many update passes over each batch of collected data
soft_update_tau = args.soft_update_tau
agent.state = env.reset()
'''collect data and train and update network'''
num_episode = args.num_episode
# args.train=False
# args.save_network=False
# args.test_network=False
# args.save_test_data=False
# args.compare_with_pyomo=False
if args.train:
collect_data = True
while collect_data:
print(f'buffer:{buffer.now_len}')
with torch.no_grad():
trajectory = agent.explore_env(env, target_step)
steps, r_exp = update_buffer(trajectory)
buffer.update_now_len()
if buffer.now_len >= 10000:
collect_data = False
for i_episode in range(num_episode):
critic_loss, actor_loss = agent.update_net(buffer, batch_size, repeat_times, soft_update_tau)
loss_record['critic_loss'].append(critic_loss)
loss_record['actor_loss'].append(actor_loss)
with torch.no_grad():
episode_reward, episode_unbalance = get_episode_return(env, agent.act, agent.device)
reward_record['mean_episode_reward'].append(episode_reward)
reward_record['unbalance'].append(episode_unbalance)
print(
f'current episode is {i_episode}, reward: {episode_reward}, '
f'unbalance: {episode_unbalance}, buffer_length: {buffer.now_len}')
if i_episode % 10 == 0:
# target_step
with torch.no_grad():
trajectory = agent.explore_env(env, target_step)
steps, r_exp = update_buffer(trajectory)
loss_record_path = f'{args.cwd}/loss_data.pkl'
reward_record_path = f'{args.cwd}/reward_data.pkl'
# currently only the actor trained with the last random seed is stored
act_save_path = f'{args.cwd}/actor.pth'
with open(loss_record_path, 'wb') as tf:
pickle.dump(loss_record, tf)
with open(reward_record_path, 'wb') as tf:
pickle.dump(reward_record, tf)
if args.save_network:
torch.save(agent.act.state_dict(), act_save_path)
print('actor parameters have been saved')
if args.test_network:
args.cwd = agent_name
agent.act.load_state_dict(torch.load(act_save_path))
print('parameters have been reloaded for testing')
record = test_one_episode(env, agent.act, agent.device)
eval_data = pd.DataFrame(record['information'])
eval_data.columns = ['time_step', 'price', 'netload', 'action', 'real_action', 'soc', 'battery', 'gen1', 'gen2',
'gen3', 'unbalance', 'operation_cost']
if args.save_test_data:
test_data_save_path = f'{args.cwd}/test_data.pkl'
with open(test_data_save_path, 'wb') as tf:
pickle.dump(record, tf)
'''compare with pyomo data and results'''
if args.compare_with_pyomo:
month = record['init_info'][0][0]
day = record['init_info'][0][1]
initial_soc = record['init_info'][0][3]
base_result = optimization_base_result(env, month, day, initial_soc)
if args.plot_on:
from plotDRL import PlotArgs, make_dir, plot_evaluation_information, plot_optimization_result
plot_args = PlotArgs()
plot_args.feature_change = ''
args.cwd = agent_name
plot_dir = make_dir(args.cwd, plot_args.feature_change)
plot_optimization_result(base_result, plot_dir)
plot_evaluation_information(args.cwd + '/' + 'test_data.pkl', plot_dir)
'''compare the costs obtained from Pyomo and TD3'''
ratio = sum(eval_data['operation_cost']) / sum(base_result['step_cost'])
print('operation_cost_sum:', sum(eval_data['operation_cost']))
print('step_cost_sum:', sum(base_result['step_cost']))
print('ratio:', ratio)

Binary files not shown (5 files, names not listed in this view).

328
agent.py Normal file

@ -0,0 +1,328 @@
import numpy.random as rd
import os
from copy import deepcopy
from net import *
class AgentBase:
def __init__(self):
self.state = None
self.device = None
self.action_dim = None
self.if_off_policy = None
self.explore_noise = None
self.trajectory_list = None
self.criterion = torch.nn.SmoothL1Loss()
self.cri = self.cri_target = self.if_use_cri_target = self.cri_optim = self.ClassCri = None
self.act = self.act_target = self.if_use_act_target = self.act_optim = self.ClassAct = None
def init(self, net_dim, state_dim, action_dim, learning_rate=1e-4, _if_per_or_gae=False, gpu_id=0):
# explicitly call self.init() for multiprocessing
self.device = torch.device(f"cuda:{gpu_id}" if (torch.cuda.is_available() and (gpu_id >= 0)) else "cpu")
self.action_dim = action_dim
self.cri = self.ClassCri(net_dim, state_dim, action_dim).to(self.device)
self.act = self.ClassAct(net_dim, state_dim, action_dim).to(self.device) if self.ClassAct else self.cri
self.cri_target = deepcopy(self.cri) if self.if_use_cri_target else self.cri
self.act_target = deepcopy(self.act) if self.if_use_act_target else self.act
self.cri_optim = torch.optim.Adam(self.cri.parameters(), learning_rate)
self.act_optim = torch.optim.Adam(self.act.parameters(), learning_rate) if self.ClassAct else self.cri
del self.ClassCri, self.ClassAct
def select_action(self, state) -> np.ndarray:
states = torch.as_tensor((state,), dtype=torch.float32, device=self.device)
action = self.act(states)[0]
action = (action + torch.randn_like(action) * self.explore_noise).clamp(-1, 1)
return action.detach().cpu().numpy()
def explore_env(self, env, target_step):
trajectory = list()
state = self.state
for _ in range(target_step):
action = self.select_action(state)
state, next_state, reward, done, = env.step(action)
trajectory.append((state, (reward, done, *action)))
state = env.reset() if done else next_state
self.state = state
return trajectory
@staticmethod
def optim_update(optimizer, objective):
optimizer.zero_grad()
objective.backward()
optimizer.step()
@staticmethod
def soft_update(target_net, current_net, tau):
for tar, cur in zip(target_net.parameters(), current_net.parameters()):
tar.data.copy_(cur.data * tau + tar.data * (1.0 - tau))
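# added note: this is Polyak averaging, theta_target <- tau * theta + (1 - tau) * theta_target,
# applied parameter by parameter; a small tau makes the target networks track the online
# networks slowly, which stabilises the bootstrapped critic targets.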
def save_or_load_agent(self, cwd, if_save):
def load_torch_file(model_or_optim, _path):
state_dict = torch.load(_path, map_location=lambda storage, loc: storage)
model_or_optim.load_state_dict(state_dict)
name_obj_list = [('actor', self.act), ('act_target', self.act_target), ('act_optim', self.act_optim),
('critic', self.cri), ('cri_target', self.cri_target), ('cri_optim', self.cri_optim), ]
name_obj_list = [(name, obj) for name, obj in name_obj_list if obj is not None]
if if_save:
for name, obj in name_obj_list:
save_path = f"{cwd}/{name}.pth"
torch.save(obj.state_dict(), save_path)
else:
for name, obj in name_obj_list:
save_path = f"{cwd}/{name}.pth"
load_torch_file(obj, save_path) if os.path.isfile(save_path) else None
class AgentDDPG(AgentBase):
def __init__(self):
super().__init__()
self.explore_noise = 0.1 # explore noise of action
self.if_use_cri_target = self.if_use_act_target = True
self.ClassCri = Critic
self.ClassAct = Actor
def update_net(self, buffer, batch_size, repeat_times, soft_update_tau) -> (float, float):
buffer.update_now_len()
obj_critic = obj_actor = None
for _ in range(int(buffer.now_len / batch_size * repeat_times)):
obj_critic, state = self.get_obj_critic(buffer, batch_size) # critic loss
self.optim_update(self.cri_optim, obj_critic)
self.soft_update(self.cri_target, self.cri, soft_update_tau)
action_pg = self.act(state) # policy gradient
obj_actor = -self.cri(state, action_pg).mean() # actor loss: minimizing it maximizes the critic's Q-value
self.optim_update(self.act_optim, obj_actor)
self.soft_update(self.act_target, self.act, soft_update_tau)
return obj_actor.item(), obj_critic.item()
def get_obj_critic(self, buffer, batch_size) -> (torch.Tensor, torch.Tensor):
with torch.no_grad():
reward, mask, action, state, next_s = buffer.sample_batch(batch_size)
next_q = self.cri_target(next_s, self.act_target(next_s))
q_label = reward + mask * next_q
q_value = self.cri(state, action)
obj_critic = self.criterion(q_value, q_label)
return obj_critic, state
class AgentTD3(AgentBase):
def __init__(self):
super().__init__()
self.explore_noise = 0.1 # standard deviation of exploration noise
self.policy_noise = 0.2 # standard deviation of policy noise
self.update_freq = 2 # delay update frequency
self.if_use_cri_target = self.if_use_act_target = True
self.ClassCri = CriticTwin
self.ClassAct = Actor
def update_net(self, buffer, batch_size, repeat_times, soft_update_tau) -> tuple:
buffer.update_now_len()
obj_critic = obj_actor = None
for update_c in range(int(buffer.now_len / batch_size * repeat_times)):
obj_critic, state = self.get_obj_critic(buffer, batch_size)
self.optim_update(self.cri_optim, obj_critic)
action_pg = self.act(state) # policy gradient
obj_actor = -self.cri_target(state, action_pg).mean() # use cri_target instead of cri for stable training
self.optim_update(self.act_optim, obj_actor)
if update_c % self.update_freq == 0: # delay update
self.soft_update(self.cri_target, self.cri, soft_update_tau)
self.soft_update(self.act_target, self.act, soft_update_tau)
return obj_critic.item() / 2, obj_actor.item()
def get_obj_critic(self, buffer, batch_size) -> (torch.Tensor, torch.Tensor):
with torch.no_grad():
reward, mask, action, state, next_s = buffer.sample_batch(batch_size)
next_a = self.act_target.get_action(next_s, self.policy_noise) # policy noise
next_q = torch.min(*self.cri_target.get_q1_q2(next_s, next_a)) # twin critics
q_label = reward + mask * next_q
q1, q2 = self.cri.get_q1_q2(state, action)
obj_critic = self.criterion(q1, q_label) + self.criterion(q2, q_label) # twin critics
return obj_critic, state
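# added note: the TD3 ingredients are visible above: twin critics combined with torch.min
# to curb Q-value overestimation, target-policy smoothing via get_action(next_s, policy_noise),
# and (in update_net) soft target updates applied only every update_freq critic updates.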
class AgentSAC(AgentBase):
def __init__(self):
super().__init__()
self.ClassCri = CriticTwin
self.ClassAct = ActorSAC
self.if_use_cri_target = True
self.if_use_act_target = False
self.alpha_log = None
self.alpha_optim = None
self.target_entropy = None
def init(self, net_dim, state_dim, action_dim, learning_rate=1e-4, _if_use_per=False, gpu_id=0, env_num=1):
super().init(net_dim, state_dim, action_dim, learning_rate, _if_use_per, gpu_id)
self.alpha_log = torch.tensor((-np.log(action_dim) * np.e,), dtype=torch.float32,
requires_grad=True, device=self.device) # trainable parameter
self.alpha_optim = torch.optim.Adam((self.alpha_log,), lr=learning_rate)
self.target_entropy = np.log(action_dim)
def select_action(self, state):
states = torch.as_tensor((state,), dtype=torch.float32, device=self.device)
actions = self.act.get_action(states)
return actions.detach().cpu().numpy()[0]
def explore_env(self, env, target_step):
trajectory = list()
state = self.state
for _ in range(target_step):
action = self.select_action(state)
state, next_state, reward, done, = env.step(action)
trajectory.append((state, (reward, done, *action)))
state = env.reset() if done else next_state
self.state = state
return trajectory
def update_net(self, buffer, batch_size, repeat_times, soft_update_tau):
buffer.update_now_len()
alpha = self.alpha_log.exp().detach()
obj_critic = obj_actor = None
for _ in range(int(buffer.now_len * repeat_times / batch_size)):
'''objective of critic (loss function of critic)'''
with torch.no_grad():
reward, mask, action, state, next_s = buffer.sample_batch(batch_size)
next_a, next_log_prob = self.act_target.get_action_logprob(next_s)
next_q = torch.min(*self.cri_target.get_q1_q2(next_s, next_a))
q_label = reward + mask * (next_q + next_log_prob * alpha)
q1, q2 = self.cri.get_q1_q2(state, action)
obj_critic = self.criterion(q1, q_label) + self.criterion(q2, q_label)
self.optim_update(self.cri_optim, obj_critic)
self.soft_update(self.cri_target, self.cri, soft_update_tau)
'''objective of alpha (temperature parameter automatic adjustment)'''
action_pg, log_prob = self.act.get_action_logprob(state) # policy gradient
obj_alpha = (self.alpha_log * (log_prob - self.target_entropy).detach()).mean()
self.optim_update(self.alpha_optim, obj_alpha)
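# added note: the temperature is stored as alpha_log = log(alpha) so alpha stays positive;
# obj_alpha adapts it by comparing the policy's log-probabilities with target_entropy,
# and alpha_log is clamped to [-20, 2] a few lines below to keep alpha in a sane range.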
'''objective of actor'''
alpha = self.alpha_log.exp().detach()
with torch.no_grad():
self.alpha_log[:] = self.alpha_log.clamp(-20, 2)
obj_actor = -(torch.min(*self.cri_target.get_q1_q2(state, action_pg)) + log_prob * alpha).mean()
self.optim_update(self.act_optim, obj_actor)
self.soft_update(self.act_target, self.act, soft_update_tau)
return obj_critic.item(), obj_actor.item(), alpha.item()
class AgentPPO(AgentBase):
def __init__(self):
super().__init__()
self.ClassCri = CriticAdv
self.ClassAct = ActorPPO
self.if_off_policy = False
self.ratio_clip = 0.2 # ratio.clamp(1 - clip, 1 + clip)
self.lambda_entropy = 0.02 # could be 0.01~0.05
self.lambda_gae_adv = 0.98 # could be 0.95~0.99, GAE (Generalized Advantage Estimation. ICLR.2016.)
self.get_reward_sum = None # self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw
def init(self, net_dim, state_dim, action_dim, learning_rate=1e-4, if_use_gae=False, gpu_id=0, env_num=1):
super().init(net_dim, state_dim, action_dim, learning_rate, if_use_gae, gpu_id)
self.trajectory_list = list()
self.get_reward_sum = self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw
def select_action(self, state):
states = torch.as_tensor((state,), dtype=torch.float32, device=self.device)
actions, noises = self.act.get_action(states)
return actions[0].detach().cpu().numpy(), noises[0].detach().cpu().numpy()
def explore_env(self, env, target_step):
state = self.state
trajectory_temp = list()
last_done = 0
for i in range(target_step):
action, noise = self.select_action(state)
next_state, reward, done, _ = env.step(np.tanh(action))
trajectory_temp.append((state, reward, done, action, noise))
if done:
state = env.reset()
last_done = i
else:
state = next_state
self.state = state
'''splice list'''
trajectory_list = self.trajectory_list + trajectory_temp[:last_done + 1]
self.trajectory_list = trajectory_temp[last_done:]
return trajectory_list
def update_net(self, buffer, batch_size, repeat_times, soft_update_tau):
with torch.no_grad():
buf_len = buffer[0].shape[0]
buf_state, buf_action, buf_noise, buf_reward, buf_mask = [ten.to(self.device) for ten in buffer]
# (ten_state, ten_action, ten_noise, ten_reward, ten_mask) = buffer
'''get buf_r_sum, buf_logprob'''
bs = 2 ** 10 # set a smaller 'BatchSize' when out of GPU memory.
buf_value = [self.cri_target(buf_state[i:i + bs]) for i in range(0, buf_len, bs)]
buf_value = torch.cat(buf_value, dim=0)
buf_logprob = self.act.get_old_logprob(buf_action, buf_noise)
buf_r_sum, buf_advantage = self.get_reward_sum(buf_len, buf_reward, buf_mask, buf_value) # detach()
buf_advantage = (buf_advantage - buf_advantage.mean()) / (buf_advantage.std() + 1e-5)
del buf_noise, buffer[:]
'''PPO: Surrogate objective of Trust Region'''
obj_critic = obj_actor = None
for _ in range(int(buf_len / batch_size * repeat_times)):
indices = torch.randint(buf_len, size=(batch_size,), requires_grad=False, device=self.device)
state = buf_state[indices]
action = buf_action[indices]
r_sum = buf_r_sum[indices]
logprob = buf_logprob[indices]
advantage = buf_advantage[indices]
new_logprob, obj_entropy = self.act.get_logprob_entropy(state, action) # it is obj_actor
ratio = (new_logprob - logprob.detach()).exp()
surrogate1 = advantage * ratio
surrogate2 = advantage * ratio.clamp(1 - self.ratio_clip, 1 + self.ratio_clip)
obj_surrogate = -torch.min(surrogate1, surrogate2).mean()
obj_actor = obj_surrogate + obj_entropy * self.lambda_entropy
self.optim_update(self.act_optim, obj_actor)
value = self.cri(state).squeeze(1) # critic network predicts the reward_sum (Q value) of state
obj_critic = self.criterion(value, r_sum) / (r_sum.std() + 1e-6)
self.optim_update(self.cri_optim, obj_critic)
self.soft_update(self.cri_target, self.cri, soft_update_tau) if self.cri_target is not self.cri else None
a_std_log = getattr(self.act, 'a_std_log', torch.zeros(1))
return obj_critic.item(), obj_actor.item(), a_std_log.mean().item() # logging_tuple
def get_reward_sum_raw(self, buf_len, buf_reward, buf_mask, buf_value) -> (torch.Tensor, torch.Tensor):
buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device) # reward sum
pre_r_sum = 0
for i in range(buf_len - 1, -1, -1):
buf_r_sum[i] = buf_reward[i] + buf_mask[i] * pre_r_sum
pre_r_sum = buf_r_sum[i]
buf_advantage = buf_r_sum - (buf_mask * buf_value[:, 0])
return buf_r_sum, buf_advantage
def get_reward_sum_gae(self, buf_len, ten_reward, ten_mask, ten_value):
"""tensor, tensor """
buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device) # old policy value
buf_advantage = torch.empty(buf_len, dtype=torch.float32, device=self.device) # advantage value
pre_r_sum = 0
pre_advantage = 0 # advantage value of previous step
for i in range(buf_len - 1, -1, -1):
buf_r_sum[i] = ten_reward[i] + ten_mask[i] * pre_r_sum
pre_r_sum = buf_r_sum[i]
buf_advantage[i] = ten_reward[i] + ten_mask[i] * (pre_advantage - ten_value[i]) # fix a bug here
pre_advantage = ten_value[i] + buf_advantage[i] * self.lambda_gae_adv
return buf_r_sum, buf_advantage

8761
data/houseload.csv Normal file

File diff suppressed because it is too large.

BIN
data/iowa.nc Normal file

Binary file not shown.

8761
data/irradiance.csv Normal file

File diff suppressed because it is too large Load Diff

8761
data/prices.csv Normal file

File diff suppressed because it is too large Load Diff

2
data/station.csv Normal file
View File

@ -0,0 +1,2 @@
state,lon,lat
iowa,93.5,41.1

8761
data/temper.csv Normal file

File diff suppressed because it is too large Load Diff

8761
data/wind.csv Normal file

File diff suppressed because it is too large Load Diff

69
data_manager.py Normal file
View File

@ -0,0 +1,69 @@
class Constant:
MONTHS_LEN = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
MAX_STEP_HOURS = 24 * 30
class DataManager:
def __init__(self) -> None:
self.Pv = []
self.Prices = []
self.Load_Consumption = []
self.Temperature = []
self.Irradiance = []
self.Wind = []
def add_pv_element(self, element): self.Pv.append(element)
def add_price_element(self, element): self.Prices.append(element)
def add_load_element(self, element): self.Load_Consumption.append(element)
def add_temperature_element(self, element): self.Temperature.append(element)
def add_irradiance_element(self, element): self.Irradiance.append(element)
def add_wind_element(self, element): self.Wind.append(element)
    # get the data for a single hour, given month, day and day_time
def get_pv_data(self, month, day, day_time):
return self.Pv[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + day_time]
def get_price_data(self, month, day, day_time):
return self.Prices[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + day_time]
def get_load_cons_data(self, month, day, day_time):
return self.Load_Consumption[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + day_time]
def get_temperature_data(self, month, day, day_time):
return self.Temperature[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + day_time]
def get_irradiance_data(self, month, day, day_time):
return self.Irradiance[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + day_time]
def get_wind_data(self, month, day, day_time):
return self.Wind[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + day_time]
# get series data for one episode
def get_series_pv_data(self, month, day):
return self.Pv[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24:
(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + 24]
def get_series_price_data(self, month, day):
return self.Prices[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24:
(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + 24]
def get_series_load_cons_data(self, month, day):
return self.Load_Consumption[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24:
(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + 24]
def get_series_temperature_data(self, month, day):
return self.Temperature[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24:
(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + 24]
def get_series_irradiance_data(self, month, day):
return self.Irradiance[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24:
(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + 24]
def get_series_wind_data(self, month, day):
return self.Wind[(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24:
(sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + 24]
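# Illustrative check (not part of the original file): every getter above maps
# (month, day, day_time) to a flat hourly index
#     (sum(MONTHS_LEN[:month - 1]) + day - 1) * 24 + day_time
# e.g. month=3, day=2, day_time=5 -> (31 + 28 + 1) * 24 + 5 = 1445.
if __name__ == '__main__':
    def hour_index(month, day, day_time):
        return (sum(Constant.MONTHS_LEN[:month - 1]) + day - 1) * 24 + day_time
    assert hour_index(1, 1, 0) == 0        # first hour of the year
    assert hour_index(3, 2, 5) == 1445
    assert hour_index(12, 31, 23) == 8759  # last hour of a non-leap year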

194
environment.py Normal file
View File

@ -0,0 +1,194 @@
import gym
import numpy as np
import pandas as pd
from gym import spaces
from module import *
from parameters import *
from data_manager import *
class ESSEnv(gym.Env):
def __init__(self, **kwargs):
super(ESSEnv, self).__init__()
self.excess = None
self.unbalance = None
self.shedding = None
self.real_unbalance = None
self.operation_cost = None
self.current_output = None
self.final_step_outputs = None
self.data_manager = DataManager()
self._load_year_data()
self.month = None
self.day = None
self.TRAIN = True
self.current_time = None
self.episode_length = kwargs.get('episode_length', 24)
self.battery_parameters = kwargs.get('battery_parameters', battery_parameters)
self.dg_parameters = kwargs.get('dg_parameters', dg_parameters)
self.solar_parameters = kwargs.get('solar_parameters', solar_parameters)
self.wind_parameters = kwargs.get('wind_parameters', wind_parameters)
        self.penalty_coefficient = 50  # controls the soft penalty constraint
self.sell_coefficient = 0.5 # control sell benefits
self.grid = Grid()
self.battery = Battery(self.battery_parameters)
self.dg1 = DG(self.dg_parameters['gen_1'])
self.dg2 = DG(self.dg_parameters['gen_2'])
self.dg3 = DG(self.dg_parameters['gen_3'])
self.solar = Solar(self.solar_parameters)
self.wind = Wind(self.wind_parameters)
        self.action_space = spaces.Box(low=-1, high=1, shape=(5,), dtype=np.float32)  # a voltage-regulation action has been added
self.state_space = spaces.Box(low=-np.inf, high=np.inf, shape=(10,), dtype=np.float32)
def reset(self, *args):
        self.month = np.random.randint(1, 13)  # randomly pick a month (1-12)
if self.TRAIN:
self.day = np.random.randint(1, 20)
else:
            self.day = np.random.randint(20, Constant.MONTHS_LEN[self.month - 1] - 1)  # MONTHS_LEN is 0-indexed
self.current_time = 0
self.battery.reset()
self.dg1.reset()
self.dg2.reset()
self.dg3.reset()
self.solar.reset()
self.wind.reset()
return self._build_state()
def _build_state(self):
soc = self.battery.SOC()
dg1_output = self.dg1.current_output
dg2_output = self.dg2.current_output
dg3_output = self.dg3.current_output
time_step = self.current_time
price = self.data_manager.get_price_data(self.month, self.day, self.current_time)
house_load = self.data_manager.get_load_cons_data(self.month, self.day, self.current_time)
temperature = self.data_manager.get_temperature_data(self.month, self.day, self.current_time)
irradiance = self.data_manager.get_irradiance_data(self.month, self.day, self.current_time)
wind_speed = self.data_manager.get_wind_data(self.month, self.day, self.current_time)
# print('house_load:', house_load)
pv_generation = self.solar.step(temperature, irradiance)
wd_generation = self.wind.step(wind_speed)
generation = pv_generation + wd_generation
net_load = house_load - generation
obs = np.concatenate((np.float32(time_step), np.float32(price), np.float32(soc), np.float32(net_load),
np.float32(dg1_output), np.float32(dg2_output), np.float32(dg3_output),
np.float32(temperature), np.float32(irradiance), np.float32(wind_speed)), axis=None)
return obs
def step(self, action): # state transition: current_obs->take_action->get_reward->get_finish->next_obs
# put action into each component
current_obs = self._build_state()
temperature = current_obs[7]
irradiance = current_obs[8]
wind_speed = current_obs[9]
self.battery.step(action[0]) # execute the state-transition part, battery.current_capacity also changed
self.dg1.step(action[1])
self.dg2.step(action[2])
self.dg3.step(action[3])
        self.solar.step(temperature, irradiance, action[4])  # argument order matches Solar.step(temperature, irradiance, action_voltage)
self.wind.step(wind_speed)
self.current_output = np.array((self.dg1.current_output, self.dg2.current_output, self.dg3.current_output,
-self.battery.energy_change, self.solar.current_power, self.wind.current_power))
actual_production = sum(self.current_output)
price = current_obs[1]
netload = current_obs[3]
# print('actual_production:', actual_production, 'netload:', netload)
unbalance = actual_production - netload
reward = 0
        excess_penalty = 0  # penalty for excess generation
        deficient_penalty = 0  # penalty for unmet load
sell_benefit = 0
buy_cost = 0
self.excess = 0
self.shedding = 0
if unbalance >= 0: # now in excess condition
if unbalance <= self.grid.exchange_ability:
                # revenue from selling to the grid is small [0.029, 0.1]
sell_benefit = self.grid.get_cost(price, unbalance) * self.sell_coefficient
else:
sell_benefit = self.grid.get_cost(price, self.grid.exchange_ability) * self.sell_coefficient
# real unbalance that even grid could not meet
self.excess = unbalance - self.grid.exchange_ability
excess_penalty = self.excess * self.penalty_coefficient
        else:  # unbalance < 0: load-shedding mode, the deficiency penalty applies
if abs(unbalance) <= self.grid.exchange_ability:
buy_cost = self.grid.get_cost(price, abs(unbalance))
else:
buy_cost = self.grid.get_cost(price, self.grid.exchange_ability)
self.shedding = abs(unbalance) - self.grid.exchange_ability
deficient_penalty = self.shedding * self.penalty_coefficient
battery_cost = self.battery.get_cost(self.battery.energy_change)
dg1_cost = self.dg1.get_cost(self.dg1.current_output)
dg2_cost = self.dg2.get_cost(self.dg2.current_output)
dg3_cost = self.dg3.get_cost(self.dg3.current_output)
solar_cost = self.solar.get_cost(self.solar.current_power)
wind_cost = self.wind.gen_cost(self.wind.current_power)
reward -= (battery_cost + dg1_cost + dg2_cost + dg3_cost + solar_cost + wind_cost + excess_penalty +
deficient_penalty - sell_benefit + buy_cost) / 1e3
self.operation_cost = (battery_cost + dg1_cost + dg2_cost + dg3_cost + solar_cost + wind_cost + excess_penalty +
deficient_penalty - sell_benefit + buy_cost)
self.unbalance = unbalance
self.real_unbalance = self.shedding + self.excess
final_step_outputs = [self.dg1.current_output, self.dg2.current_output, self.dg3.current_output,
self.battery.current_capacity, self.solar.current_power, self.wind.current_power]
self.current_time += 1
finish = (self.current_time == self.episode_length)
if finish:
self.final_step_outputs = final_step_outputs
self.current_time = 0
next_obs = self.reset()
else:
next_obs = self._build_state()
return current_obs, next_obs, float(reward), finish
# def render(self, current_obs, next_obs, reward, finish):
# print('day={},hour={:2d}, state={}, next_state={}, reward={:.4f}, terminal={}\n'.
# format(self.day, self.current_time, current_obs, next_obs, reward, finish))
def _load_year_data(self):
# pv_df = pd.read_csv('data/pv.csv', sep=',')
price_df = pd.read_csv('data/prices.csv', sep=';')
load_df = pd.read_csv('data/houseload.csv', sep=',')
irradiance_df = pd.read_csv('data/irradiance.csv', sep=',')
temperature_df = pd.read_csv('data/temper.csv', sep=',')
wind_df = pd.read_csv('data/wind.csv', sep=',')
# pv = pv_df['pv'].to_numpy(dtype=float)
price = price_df['Price'].apply(lambda x: x.replace(',', '.')).to_numpy(dtype=float)
load = load_df['houseload'].to_numpy(dtype=float)
irradiance = irradiance_df['irradiance'].to_numpy(dtype=float)
temperature = temperature_df['t2m'].to_numpy(dtype=float)
wind = wind_df['wind_speed'].to_numpy(dtype=float)
'''redesign the magnitude for price and amount of generation as well as demand'''
def process_elements(elements, transform_function, add_function):
for element in elements:
transformed_element = transform_function(element)
add_function(transformed_element)
# process_elements(pv, lambda x: x, self.data_manager.add_pv_element)
process_elements(price, lambda x: max(x / 10, 0.5), self.data_manager.add_price_element)
process_elements(load, lambda x: x * 5, self.data_manager.add_load_element)
process_elements(irradiance, lambda x: x, self.data_manager.add_irradiance_element)
process_elements(temperature, lambda x: x - 273.15, self.data_manager.add_temperature_element)
process_elements(wind, lambda x: x, self.data_manager.add_wind_element)
# if __name__ == '__main__':
# env = ESSEnv()
# env.TRAIN = False
# rewards = []
# env.reset()
# tem_action = [0.1, 0.1, 0.1, 0.1, 0.1]
# for _ in range(144):
# print(f'current month is {env.month}, current day is {env.day}, current time is {env.current_time}')
# current_obs, next_obs, reward, finish = env.step(tem_action)
# env.render(current_obs, next_obs, reward, finish)
# current_obs = next_obs
# rewards.append(reward)
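# Illustrative hand-check of the reward logic in step() above (not part of the original file),
# using penalty_coefficient=50, sell_coefficient=0.5 from __init__ and, as an assumption,
# the Grid default exchange_ability=100 from module.py. Unit costs are ignored here.
if __name__ == '__main__':
    price, netload, actual_production = 0.8, 120.0, 250.0
    unbalance = actual_production - netload        # +130 -> excess condition
    sold = min(unbalance, 100)                     # at most 100 kWh can be exported
    sell_benefit = price * sold * 0.5              # 0.8 * 100 * 0.5 = 40
    excess = max(unbalance - 100, 0.0)             # 30 kWh the grid cannot absorb
    excess_penalty = excess * 50                   # 30 * 50 = 1500
    print(-(excess_penalty - sell_benefit) / 1e3)  # reward share from the exchange terms: -1.46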

View File

@ -0,0 +1,35 @@
import pandas as pd
import numpy as np
import os
from nltk import word_tokenize
def func(i, j):
directory_path = "C:/Users/97532/Desktop/DRL-for-Energy-Systems/data/residential"
entries = os.listdir(directory_path)[i:j]
    folders = [entry for entry in entries if os.path.isfile(os.path.join(directory_path, entry))]  # keep only files
entries = [os.path.join(directory_path, folder) for folder in folders]
combined_data = pd.DataFrame()
for file_path in entries:
current_data = pd.read_csv(file_path, encoding='utf-8')
tokenize_col = current_data.columns
tokenized_sentences_list = [word_tokenize(text) for text in tokenize_col]
occurrences_electricity = np.array(
[(i, j) for i, sublist in enumerate(tokenized_sentences_list) for j, word in enumerate(sublist) if
word == 'Electricity'])[:, 0]
Electricity_consumption_wind_solar = current_data[tokenize_col[occurrences_electricity]].sum(axis=1)
current_data = Electricity_consumption_wind_solar.to_frame("Household_load")
combined_data = pd.concat([combined_data, current_data], axis=1)
Series_type = combined_data.sum(axis=1)
Series_type.to_csv('houseload.csv', index=False)
total_energy = np.array(Series_type).ravel()
return total_energy
if __name__ == '__main__':
func(0, 38)
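# Illustrative pandas-only equivalent of the column selection above (not part of the original
# file): it keeps every column whose raw header contains 'Electricity', skipping the nltk
# tokenisation step.
def sum_electricity_columns(csv_path):
    df = pd.read_csv(csv_path, encoding='utf-8')
    cols = [c for c in df.columns if 'Electricity' in c]  # match directly on the header text
    return df[cols].sum(axis=1)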

4
gurobi.log Normal file
View File

@ -0,0 +1,4 @@
Gurobi 11.0.2 (win64, Python) logging started Fri May 17 12:42:18 2024
Set parameter LogFile to value "gurobi.log"

173
module.py Normal file
View File

@ -0,0 +1,173 @@
import numpy as np
class DG:
"""simulate a simple diesel generator"""
def __init__(self, parameters):
self.current_output = None
self.name = parameters.keys()
self.a_factor = parameters['a']
self.b_factor = parameters['b']
self.c_factor = parameters['c']
self.power_output_max = parameters['power_output_max']
self.power_output_min = parameters['power_output_min']
self.ramping_up = parameters['ramping_up']
self.ramping_down = parameters['ramping_down']
self.last_step_output = None
def step(self, action_gen):
output_change = action_gen * self.ramping_up # constrain the output_change with ramping up boundary
output = self.current_output + output_change
if output > 0:
output = max(self.power_output_min, min(self.power_output_max, output)) # meet the constraint
else:
output = 0
self.current_output = output
def get_cost(self, output):
if output <= 0:
cost = 0
else:
cost = (self.a_factor * pow(output, 2) + self.b_factor * output + self.c_factor)
return cost
def reset(self):
self.current_output = 0
class Battery:
"""simulate a simple battery"""
def __init__(self, parameters):
self.current_capacity = None
self.energy_change = None
self.capacity = parameters['capacity']
self.max_soc = parameters['max_soc']
self.initial_capacity = parameters['initial_capacity']
self.min_soc = parameters['min_soc']
self.degradation = parameters['degradation'] # degradation cost 1.2
self.max_charge = parameters['max_charge'] # max charge ability
self.max_discharge = parameters['max_discharge']
self.efficiency = parameters['efficiency']
def step(self, action_battery):
energy = action_battery * self.max_charge
updated_capacity = max(self.min_soc,
min(self.max_soc, (self.current_capacity * self.capacity + energy) / self.capacity))
# if charge, positive, if discharge, negative
self.energy_change = (updated_capacity - self.current_capacity) * self.capacity
        self.current_capacity = updated_capacity  # update capacity to the current condition
def get_cost(self, energy): # cost depends on the energy change
cost = energy ** 2 * self.degradation
return cost
def SOC(self):
return self.current_capacity
def reset(self):
self.current_capacity = np.random.uniform(0.2, 0.8)
class Solar:
def __init__(self, parameters):
self.current_power = None
self.base_voltage = parameters['V_b']
self.sc_current = parameters['I_sc0']
self.oc_voltage = parameters['V_oc0']
self.s_resistance = parameters['R_s']
self.sh_resistance = parameters['R_sh']
self.temper_coefficient = parameters['k_v']
self.opex_cofficient = parameters['k_o']
self.refer_irradiance = parameters['G_ref']
self.refer_temperature = parameters['T_ref']
def step(self, temperature, irradiance, action_voltage=0):
I_sc = self.sc_current * (irradiance / self.refer_irradiance)
V_oc = self.oc_voltage + self.temper_coefficient * (temperature - self.refer_temperature)
current = I_sc - (V_oc / self.sh_resistance)
# current = I_sc
        # for _ in range(10):  # number of iterations
# new_current = I_sc - (V_oc + current * self.s_resistance) / self.sh_resistance
        #     if abs(new_current - current) < 1e-6:  # convergence criterion
# break
# current = new_current
self.current_power = max((1 + action_voltage) * self.base_voltage * current, 0)
return self.current_power
def get_cost(self, current_power):
cost = current_power * self.opex_cofficient
return cost
def reset(self):
self.current_power = 0
class Wind:
def __init__(self, parameters):
self.current_power = None
self.cutin_speed = parameters['cutin_speed']
self.cutout_speed = parameters['cutout_speed']
self.rated_speed = parameters['rated_speed']
self.air_density = parameters['air_density']
self.rotor_radius = parameters['rotor_radius']
self.power_coefficient = parameters['power_coefficient']
self.generator_efficiency = parameters['generator_efficiency']
self.opex_cofficient = parameters['opex_cofficient']
def step(self, wind_speed):
if self.cutin_speed <= wind_speed < self.rated_speed:
self.current_power = (0.5 * self.air_density * self.rotor_radius ** 2 * wind_speed ** 3 *
self.power_coefficient * self.generator_efficiency) / 1e3
elif self.rated_speed <= wind_speed < self.cutout_speed:
self.current_power = (0.5 * self.air_density * self.rotor_radius ** 2 * self.rated_speed ** 3 *
self.power_coefficient * self.generator_efficiency) / 1e3
else:
self.current_power = 0
return self.current_power
def gen_cost(self, current_power):
cost = current_power * self.opex_cofficient
return cost
def reset(self):
self.current_power = 0
class Grid:
def __init__(self):
self.on = True
self.delta = 1
if self.on:
self.exchange_ability = 100
else:
self.exchange_ability = 0
def get_cost(self, current_price, energy_exchange):
return current_price * energy_exchange * self.delta
def retrieve_past_price(self):
result = []
        # start and end indices for the past 24 hours of prices
start_index = max(0, 24 * (self.day - 1) + self.time - 24)
end_index = 24 * (self.day - 1) + self.time
past_price = self.price[start_index:end_index]
result.extend(past_price)
# current_day_price = self.price[24 * self.day:24 * self.day + self.time]
# result.extend(current_day_price)
return result
# def retrive_past_price(self):
# result = []
# if self.day < 1:
# past_price = self.past_price
# else:
# past_price = self.price[24 * (self.day - 1):24 * self.day]
# for item in past_price[(self.time - 24)::]:
# result.append(item)
# for item in self.price[24 * self.day:(24 * self.day + self.time)]:
# result.append(item)
# return result
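# Illustrative plateau-power check for Wind.step (not part of the original file), using the
# wind_parameters defined in parameters.py (air_density=1.225, rotor_radius=25,
# power_coefficient=0.5, generator_efficiency=0.9, rated_speed=8). Note that the formula above
# uses rotor_radius ** 2 directly rather than the swept area pi * r ** 2.
if __name__ == '__main__':
    plateau_kw = 0.5 * 1.225 * 25 ** 2 * 8 ** 3 * 0.5 * 0.9 / 1e3
    print(round(plateau_kw, 1))  # ~88.2 kW between rated and cut-out wind speed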

136
net.py Normal file
View File

@ -0,0 +1,136 @@
import numpy as np
import torch
import torch.nn as nn
class Actor(nn.Module):
def __init__(self, mid_dim, state_dim, action_dim):
super().__init__()
self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
nn.Linear(mid_dim, action_dim))
def forward(self, state):
return self.net(state).tanh() # action.tanh()
def get_action(self, state, action_std):
action = self.net(state).tanh()
noise = (torch.randn_like(action) * action_std).clamp(-0.5, 0.5)
return (action + noise).clamp(-1.0, 1.0)
class ActorSAC(nn.Module):
def __init__(self, mid_dim, state_dim, action_dim):
super().__init__()
self.net_state = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.ReLU(), )
self.net_a_avg = nn.Sequential(nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
nn.Linear(mid_dim, action_dim)) # the average of action
self.net_a_std = nn.Sequential(nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
nn.Linear(mid_dim, action_dim)) # the log_std of action
self.log_sqrt_2pi = np.log(np.sqrt(2 * np.pi))
def forward(self, state):
tmp = self.net_state(state)
return self.net_a_avg(tmp).tanh() # action
def get_action(self, state):
t_tmp = self.net_state(state)
a_avg = self.net_a_avg(t_tmp) # NOTICE! it is a_avg without .tanh()
a_std = self.net_a_std(t_tmp).clamp(-20, 2).exp()
return torch.normal(a_avg, a_std).tanh() # re-parameterize
def get_action_logprob(self, state):
t_tmp = self.net_state(state)
a_avg = self.net_a_avg(t_tmp) # NOTICE! it needs a_avg.tanh()
a_std_log = self.net_a_std(t_tmp).clamp(-20, 2)
a_std = a_std_log.exp()
noise = torch.randn_like(a_avg, requires_grad=True)
a_tan = (a_avg + a_std * noise).tanh() # action.tanh()
log_prob = a_std_log + self.log_sqrt_2pi + noise.pow(2).__mul__(0.5) # noise.pow(2) * 0.5
log_prob = log_prob + (-a_tan.pow(2) + 1.000001).log() # fix log_prob using the derivative of action.tanh()
return a_tan, log_prob.sum(1, keepdim=True)
class ActorPPO(nn.Module):
def __init__(self, mid_dim, state_dim, action_dim):
super().__init__()
self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
nn.Linear(mid_dim, action_dim), )
# the logarithm (log) of standard deviation (std) of action, it is a trainable parameter
self.a_std_log = nn.Parameter(torch.zeros((1, action_dim)) - 0.5, requires_grad=True)
self.sqrt_2pi_log = np.log(np.sqrt(2 * np.pi))
def forward(self, state):
        return self.net(state).tanh()  # action.tanh() limits the action output to (-1, 1)
def get_action(self, state):
a_avg = self.net(state) # mean
a_std = self.a_std_log.exp() # standard deviation
noise = torch.randn_like(a_avg)
action = a_avg + noise * a_std
return action, noise
def get_logprob_entropy(self, state, action):
a_avg = self.net(state)
a_std = self.a_std_log.exp()
delta = ((a_avg - action) / a_std).pow(2) * 0.5
logprob = -(self.a_std_log + self.sqrt_2pi_log + delta).sum(1) # new_logprob
dist_entropy = (logprob.exp() * logprob).mean() # policy entropy
return logprob, dist_entropy
def get_old_logprob(self, _action, noise): # noise = action - a_noise
delta = noise.pow(2) * 0.5
return -(self.a_std_log + self.sqrt_2pi_log + delta).sum(1) # old_logprob
class Critic(nn.Module):
def __init__(self, mid_dim, state_dim, action_dim):
super().__init__()
self.net = nn.Sequential(nn.Linear(state_dim + action_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
nn.Linear(mid_dim, 1))
def forward(self, state, action):
return self.net(torch.cat((state, action), dim=1)) # q value
class CriticAdv(nn.Module):
def __init__(self, mid_dim, state_dim, _action_dim):
super().__init__()
self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
nn.Linear(mid_dim, 1))
def forward(self, state):
return self.net(state) # advantage value
class CriticTwin(nn.Module): # shared parameter
def __init__(self, mid_dim, state_dim, action_dim):
super().__init__()
self.net_sa = nn.Sequential(nn.Linear(state_dim + action_dim, mid_dim), nn.ReLU(),
nn.Linear(mid_dim, mid_dim), nn.ReLU()) # concat(state, action)
self.net_q1 = nn.Sequential(nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
nn.Linear(mid_dim, 1)) # q1 value
self.net_q2 = nn.Sequential(nn.Linear(mid_dim, mid_dim), nn.Hardswish(),
nn.Linear(mid_dim, 1)) # q2 value
def forward(self, state, action):
tmp = self.net_sa(torch.cat((state, action), dim=1))
return self.net_q1(tmp) # one Q value
def get_q1_q2(self, state, action):
tmp = self.net_sa(torch.cat((state, action), dim=1))
return self.net_q1(tmp), self.net_q2(tmp) # two Q values
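# Illustrative usage sketch for the PPO actor/critic pair above (not part of the original file).
# The dimensions follow the 10-dimensional state and 5-dimensional action used elsewhere in
# this commit; the batch of random states is purely for shape checking.
if __name__ == '__main__':
    mid_dim, state_dim, action_dim = 256, 10, 5
    act = ActorPPO(mid_dim, state_dim, action_dim)
    cri = CriticAdv(mid_dim, state_dim, action_dim)
    state = torch.randn(4, state_dim)          # a batch of 4 random states
    action, noise = act.get_action(state)      # pre-tanh action and the noise that produced it
    logprob, entropy = act.get_logprob_entropy(state, action)
    value = cri(state)                         # (4, 1) state-value estimates
    print(action.shape, logprob.shape, value.shape)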

49
parameters.py Normal file
View File

@ -0,0 +1,49 @@
import numpy as np
solar_parameters = {
    'I_sc0': 8.0,  # short-circuit current at reference conditions (A)
    'V_b': 24,  # base voltage (V)
    'V_oc0': 36.0,  # open-circuit voltage at reference conditions (V)
    'R_s': 0.1,  # series resistance (Ω)
    'R_sh': 100.0,  # shunt resistance (Ω)
    'k_v': -0.2,  # temperature coefficient of the open-circuit voltage (V/°C)
    'k_o': 0.001,  # variable (operating) cost coefficient (CNY/kWh)
    'G_ref': 1000,  # reference irradiance (W/m²)
    'T_ref': 25,  # reference temperature (°C)
}
wind_parameters = {
'cutin_speed': 3,
'cutout_speed': 12,
'rated_speed': 8,
'air_density': 1.225,
'rotor_radius': 25,
'power_coefficient': 0.5,
'generator_efficiency': 0.9,
'opex_cofficient': 0.01,
}
battery_parameters = {
    'capacity': 500,  # kWh
'max_charge': 100, # kw
'max_discharge': 100, # kw
'efficiency': 0.9,
'degradation': 0.01, # euro/kw
'max_soc': 0.8,
'min_soc': 0.2,
'initial_capacity': 0.4
}
dg_parameters = {
'gen_1': {'a': 0.0034, 'b': 3, 'c': 30, 'd': 0.03, 'e': 4.2, 'f': 0.031,
'power_output_max': 150, 'power_output_min': 10,
'ramping_up': 100, 'ramping_down': 100, 'min_up': 2, 'min_down': 1},
'gen_2': {'a': 0.001, 'b': 10, 'c': 40, 'd': 0.03, 'e': 4.2, 'f': 0.031,
'power_output_max': 375, 'power_output_min': 50,
'ramping_up': 100, 'ramping_down': 100, 'min_up': 2, 'min_down': 1},
'gen_3': {'a': 0.001, 'b': 15, 'c': 70, 'd': 0.03, 'e': 4.2, 'f': 0.031,
'power_output_max': 500, 'power_output_min': 100,
'ramping_up': 200, 'ramping_down': 200, 'min_up': 2, 'min_down': 1}
}
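# Illustrative worked check (not part of the original file): module.py charges each DG the
# quadratic cost a * P**2 + b * P + c, so gen_1 at P = 100 kW costs
# 0.0034 * 100**2 + 3 * 100 + 30 = 364 per hour.
if __name__ == '__main__':
    gen_1 = dg_parameters['gen_1']
    power = 100
    print(gen_1['a'] * power ** 2 + gen_1['b'] * power + gen_1['c'])  # 364.0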

150
plotDRL.py Normal file
View File

@ -0,0 +1,150 @@
import os
import pickle
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
matplotlib.rc('text', usetex=True)
pd.options.display.notebook_repr_html = False
def plot_optimization_result(datasource, directory): # data source is dataframe
sns.set_theme(style='whitegrid') # "white", "dark", "whitegrid", "darkgrid", "ticks"
plt.rcParams["figure.figsize"] = (16, 9)
fig, axs = plt.subplots(2, 2)
plt.subplots_adjust(wspace=0.7, hspace=0.3)
plt.autoscale(tight=True)
T = np.array([i for i in range(24)])
# plot step cost in ax[0]
axs[0, 0].cla()
axs[0, 0].set_ylabel('Costs')
axs[0, 0].set_xlabel('Time(h)')
axs[0, 0].bar(T, datasource['step_cost'])
# axs[0,0].set_xticks([i for i in range(24)],[i for i in range(1,25)])
# plot soc and price in ax[1]
axs[0, 1].cla()
axs[0, 1].set_ylabel('Price')
axs[0, 1].set_xlabel('Time(h)')
axs[0, 1].plot(T, datasource['price'], drawstyle='steps-mid', label='Price', color='pink')
axs[0, 1] = axs[0, 1].twinx()
axs[0, 1].set_ylabel('SOC')
axs[0, 1].plot(T, datasource['soc'], drawstyle='steps-mid', label='SOC', color='grey')
# axs[0,1].set_xticks([i for i in range(24)],[i for i in range(1,25)])
axs[0, 1].legend(loc='upper right', fontsize=12, frameon=False, labelspacing=0.3)
# plot accumulated generation and consumption in ax[2]
axs[1, 0].cla()
axs[1, 0].set_ylabel('Outputs of DGs and Battery')
axs[1, 0].set_xlabel('Time(h)')
battery_positive = np.array(datasource['battery_energy_change'])
battery_negative = np.array(datasource['battery_energy_change'])
battery_negative = np.minimum(battery_negative, 0) # discharge
battery_positive = np.maximum(battery_positive, 0) # charge
# deal with power exchange within the figure
imported_from_grid = np.array(datasource['grid_import'])
exported_2_grid = np.array(datasource['grid_export'])
axs[1, 0].bar(T, datasource['gen1'], label='Gen1')
axs[1, 0].bar(T, datasource['gen2'], label='Gen2', bottom=datasource['gen1'])
axs[1, 0].bar(T, datasource['gen3'], label='Gen3', bottom=datasource['gen2'] + datasource['gen1'])
axs[1, 0].bar(T, -battery_positive, color='blue', hatch='/', label='ESS charge')
axs[1, 0].bar(T, -battery_negative, hatch='/', label='ESS discharge',
bottom=datasource['gen3'] + datasource['gen2'] + datasource['gen1'])
# import as generate
axs[1, 0].bar(T, imported_from_grid, label='Grid import',
bottom=-battery_negative + datasource['gen3'] + datasource['gen2'] + datasource['gen1'])
# export as load
axs[1, 0].bar(T, -exported_2_grid, label='Grid export', bottom=-battery_positive)
axs[1, 0].plot(T, datasource['netload'], label='Netload', drawstyle='steps-mid', alpha=0.7)
axs[1, 0].legend(loc='upper right', fontsize=12, frameon=False, labelspacing=0.3)
# axs[1,0].set_xticks([i for i in range(24)],[i for i in range(1,25)])
# plt.show()
fig.savefig(f"{directory}/optimization_information.svg", format='svg', dpi=600, bbox_inches='tight')
    print('optimization result has been plotted')
def plot_evaluation_information(datasource, directory):
sns.set_theme(style='whitegrid')
with open(datasource, 'rb') as tf:
test_data = pickle.load(tf)
# plot unbalance, and reward of each step by bar figures
plt.rcParams["figure.figsize"] = (16, 9)
fig, axs = plt.subplots(2, 2)
plt.subplots_adjust(wspace=0.7, hspace=0.3)
plt.autoscale(tight=True)
    # prepare data for evaluating the environment
eval_data = pd.DataFrame(test_data['information'])
eval_data.columns = ['time_step', 'price', 'netload', 'action', 'real_action', 'soc', 'battery', 'gen1', 'gen2',
'gen3', 'unbalance', 'operation_cost']
# plot unbalance in axs[0]
axs[0, 0].cla()
axs[0, 0].set_ylabel('Unbalance of Generation and Load')
axs[0, 0].bar(eval_data['time_step'], eval_data['unbalance'], label='Exchange with Grid', width=0.4)
axs[0, 0].bar(eval_data['time_step'] + 0.4, eval_data['netload'], label='Netload', width=0.4)
axs[0, 0].legend(loc='upper right', fontsize=12, frameon=False, labelspacing=0.5)
# axs[0,0].set_xticks([i for i in range(24)],[i for i in range(1,25)])
# plot reward in axs[1]
axs[1, 1].cla()
axs[1, 1].set_ylabel('Costs')
axs[1, 1].bar(eval_data['time_step'], eval_data['operation_cost'])
# plot generation and netload in ax[2]
axs[1, 0].cla()
axs[1, 0].set_ylabel('Outputs of Units and Netload (kWh)')
# axs[1,0].set_xticks([i for i in range(24)], [i for i in range(1, 25)])
battery_positive = np.array(eval_data['battery'])
battery_negative = np.array(eval_data['battery'])
battery_positive = np.maximum(battery_positive, 0) # charge
battery_negative = np.minimum(battery_negative, 0) # discharge
# deal with power exchange within the figure
imported_from_grid = np.minimum(np.array(eval_data['unbalance']), 0)
exported_2_grid = np.maximum(np.array(eval_data['unbalance']), 0)
x = eval_data['time_step']
axs[1, 0].bar(x, eval_data['gen1'], label='Gen1')
axs[1, 0].bar(x, eval_data['gen2'], label='Gen2', bottom=eval_data['gen1'])
axs[1, 0].bar(x, eval_data['gen3'], label='Gen3', bottom=eval_data['gen1'] + eval_data['gen2'])
axs[1, 0].bar(x, -battery_positive, color='blue', hatch='/', label='ESS charge')
axs[1, 0].bar(x, -battery_negative, label='ESS discharge', hatch='/',
bottom=eval_data['gen1'] + eval_data['gen2'] + eval_data['gen3'])
axs[1, 0].bar(x, -imported_from_grid, label='Grid import',
bottom=eval_data['gen1'] + eval_data['gen2'] + eval_data['gen3'] - battery_negative)
axs[1, 0].bar(x, -exported_2_grid, label='Grid export', bottom=-battery_positive)
axs[1, 0].plot(x, eval_data['netload'], drawstyle='steps-mid', label='Netload')
axs[1, 0].legend(loc='upper right', fontsize=12, frameon=False, labelspacing=0.3)
# plot energy charge/discharge with price in ax[3].
axs[0, 1].cla()
axs[0, 1].set_ylabel('Price')
axs[0, 1].set_xlabel('Time Steps')
axs[0, 1].plot(eval_data['time_step'], eval_data['price'], drawstyle='steps-mid', label='Price', color='pink')
axs[0, 1] = axs[0, 1].twinx()
axs[0, 1].set_ylabel('SOC')
# axs[0,1].set_xticks([i for i in range(24)], [i for i in range(1, 25)])
axs[0, 1].plot(eval_data['time_step'], eval_data['soc'], drawstyle='steps-mid', label='SOC', color='grey')
axs[0, 1].legend(loc='upper right', fontsize=12, frameon=False, labelspacing=0.3)
# plt.show()
fig.savefig(f"{directory}/Evoluation Information.svg", format='svg', dpi=600, bbox_inches='tight')
print('the evaluation figure have been plot and saved')
def make_dir(directory, feature_change):
cwd = f'{directory}/DRL_{feature_change}_plots'
os.makedirs(cwd, exist_ok=True)
return cwd
class PlotArgs:
def __init__(self) -> None:
self.cwd = None
self.feature_change = None
self.plot_on = None
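# Illustrative call pattern (not part of the original file): create the plot directory and
# carry it around in PlotArgs. The DataFrame/pickle inputs for the two plotting functions are
# produced elsewhere (e.g. by optimization_base_result in tools.py and the test routine).
if __name__ == '__main__':
    plot_args = PlotArgs()
    plot_args.feature_change = 'base'
    plot_args.cwd = make_dir('./AgentPPO', plot_args.feature_change)  # ./AgentPPO/DRL_base_plots
    plot_args.plot_on = True
    print(plot_args.cwd)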

403
read.ipynb Normal file
View File

@ -0,0 +1,403 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 34,
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-06-11T06:41:37.671133500Z",
"start_time": "2024-06-11T06:41:37.652982600Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['./data/iowa.nc']\n"
]
}
],
"source": [
"import numpy as np\n",
"import netCDF4 as nc\n",
"import os\n",
"import pandas as pd\n",
"import datetime as dt\n",
"nc_path = \"./data/\"\n",
"nc_files = [f\"{nc_path}{x}\" for x in os.listdir(nc_path) if x.endswith('.nc')]\n",
"print(nc_files)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"outputs": [],
"source": [
"stations = pd.read_csv('./data/station.csv')\n",
"stations.head()\n",
"stations.dropna(inplace=True)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-11T06:41:37.722796200Z",
"start_time": "2024-06-11T06:41:37.663028100Z"
}
},
"id": "9c4ed0ec8dc7ff7d"
},
{
"cell_type": "code",
"execution_count": 36,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['longitude', 'latitude', 'time', 't2m']\n"
]
}
],
"source": [
"nc_data = nc.Dataset(nc_files[0]) # 读文件\n",
"print(list(nc_data.variables.keys())) # keys"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-11T06:41:37.730197500Z",
"start_time": "2024-06-11T06:41:37.678192900Z"
}
},
"id": "a88ddc29d8fa3d06"
},
{
"cell_type": "code",
"execution_count": 37,
"outputs": [
{
"data": {
"text/plain": "(13, 31)"
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lat = np.asarray(nc_data['latitude'][:]).tolist()\n",
"lon = np.asarray(nc_data['longitude'][:]).tolist()\n",
"len(lat), len(lon)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-11T06:41:37.730197500Z",
"start_time": "2024-06-11T06:41:37.693749900Z"
}
},
"id": "4ec1510ef5baa4b9"
},
{
"cell_type": "code",
"execution_count": 38,
"outputs": [
{
"data": {
"text/plain": "array([1016832, 1016833, 1016834, ..., 1025613, 1025614, 1025615])"
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.asarray(nc_data['time'][:])"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-11T06:41:37.731222200Z",
"start_time": "2024-06-11T06:41:37.710255800Z"
}
},
"id": "9e1cfd59ea63dee"
},
{
"cell_type": "code",
"execution_count": 39,
"outputs": [
{
"data": {
"text/plain": "<class 'netCDF4._netCDF4.Variable'>\nint32 time(time)\n units: hours since 1900-01-01 00:00:00.0\n long_name: time\n calendar: gregorian\nunlimited dimensions: \ncurrent shape = (8784,)\nfilling on, default _FillValue of -2147483647 used"
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nc_data['time']"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-11T06:41:37.739215300Z",
"start_time": "2024-06-11T06:41:37.723796300Z"
}
},
"id": "305b595052773792"
},
{
"cell_type": "code",
"execution_count": 40,
"outputs": [
{
"data": {
"text/plain": "datetime.datetime(1900, 1, 1, 0, 0)"
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"start_date = dt.datetime(1900, 1, 1)\n",
"start_date"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-11T06:41:37.788384900Z",
"start_time": "2024-06-11T06:41:37.739215300Z"
}
},
"id": "9dc1b63dbd6e5847"
},
{
"cell_type": "code",
"execution_count": 41,
"outputs": [
{
"data": {
"text/plain": "array([[[267.10449818, 266.56524854, 265.81574602, ..., 266.35826383,\n 266.94435738, 267.10231939],\n [266.75916053, 266.21773211, 265.4017766 , ..., 266.49334859,\n 267.59908269, 267.86380524],\n [266.70360148, 265.86258993, 265.06733289, ..., 266.6197182 ,\n 267.66117811, 267.95640366],\n ...,\n [270.23759303, 270.20055367, 269.91731143, ..., 270.11231282,\n 269.99683714, 269.90750689],\n [271.34223774, 271.32153927, 271.25726507, ..., 270.55569585,\n 270.37376718, 270.24086121],\n [271.76274352, 271.85643134, 271.79215714, ..., 271.32371806,\n 270.86617291, 270.71583665]],\n\n [[267.45746158, 267.17966631, 266.73846206, ..., 266.19376546,\n 266.66983029, 266.83541806],\n [267.5337191 , 267.25374505, 266.67418786, ..., 266.32231386,\n 267.30821471, 267.57620544],\n [267.67425083, 267.02933005, 266.40183956, ..., 266.40183956,\n 267.33218136, 267.62522813],\n ...,\n [269.75172367, 269.78549485, 269.62317527, ..., 270.06873709,\n 269.95652959, 269.69289643],\n [270.76159117, 270.80080932, 270.75832299, ..., 270.33237024,\n 270.18748095, 269.91186447],\n [271.10692881, 271.19516966, 271.15268333, ..., 270.63848973,\n 270.47290197, 270.30513541]],\n\n [[267.29841017, 267.25483444, 267.06745881, ..., 265.76345515,\n 266.3157775 , 266.53692432],\n [267.55441757, 267.51084184, 267.31801925, ..., 266.01728376,\n 266.99338007, 267.2613708 ],\n [267.77338561, 267.59363573, 267.37684648, ..., 266.09898825,\n 267.00318461, 267.28860563],\n ...,\n [269.32359214, 269.41945874, 269.29091034, ..., 270.08507799,\n 269.90750689, 269.55781167],\n [270.24957636, 270.34871114, 270.35088993, ..., 270.20600063,\n 270.01317803, 269.67110857],\n [270.62105944, 270.76703813, 270.76921692, ..., 270.43695199,\n 270.18857034, 270.01099925]],\n\n ...,\n\n [[271.48168007, 271.36729379, 271.24637114, ..., 270.89776532,\n 270.98164859, 270.96857587],\n [271.81830257, 271.70282689, 271.56447396, ..., 271.43701495,\n 271.53723913, 271.69084357],\n [272.18107051, 271.92179493, 271.77690563, ..., 272.04271757,\n 272.15165689, 272.3107083 ],\n ...,\n [275.28910932, 274.99388376, 274.74441272, ..., 276.07565121,\n 276.18350114, 276.35344648],\n [275.60285456, 275.36536684, 275.34575777, ..., 276.91993095,\n 276.99618847, 277.08551871],\n [275.90788466, 275.95254978, 275.9329407 , ..., 278.0169499 ,\n 277.9058318 , 277.90365301]],\n\n [[269.88462964, 269.78658425, 269.64496313, ..., 269.72231005,\n 269.85957359, 269.92602658],\n [270.1308325 , 270.03278711, 269.93147354, ..., 270.19946427,\n 270.40427019, 270.72237301],\n [270.45656107, 270.21471578, 270.10904464, ..., 270.84329566,\n 271.05463794, 271.37056197],\n ...,\n [273.1440941 , 273.02535024, 272.86738823, ..., 274.12127981,\n 274.27488425, 274.4742432 ],\n [273.5765832 , 273.50795143, 273.58638774, ..., 274.72698243,\n 274.85553083, 275.00150951],\n [273.94697689, 274.12127981, 274.19971612, ..., 275.72704539,\n 275.64207272, 275.67148634]],\n\n [[269.74954488, 269.63951617, 269.51096777, ..., 268.99459539,\n 269.11442864, 269.11224985],\n [269.87046752, 269.76043881, 269.70596915, ..., 269.16345133,\n 269.50443141, 269.90097053],\n [269.85630541, 269.81381908, 269.78222667, ..., 269.61446012,\n 269.94672505, 270.34435357],\n ...,\n [273.29007279, 273.09833959, 273.18440165, ..., 273.11794867,\n 273.21381527, 273.48943175],\n [274.379466 , 274.08641922, 273.44258784, ..., 273.57331502,\n 273.64957255, 273.81298153],\n [273.89904359, 273.55370595, 272.90769578, ..., 274.27815243,\n 274.27488425, 274.25309638]]])"
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.asarray(nc_data['t2m'][:])"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-11T06:41:37.790393500Z",
"start_time": "2024-06-11T06:41:37.756209300Z"
}
},
"id": "d189f90ec07c8690"
},
{
"cell_type": "code",
"execution_count": 42,
"outputs": [
{
"data": {
"text/plain": "(8784, 13, 31)"
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.asarray(nc_data['t2m'][:]).shape"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-11T06:41:37.862579700Z",
"start_time": "2024-06-11T06:41:37.787379500Z"
}
},
"id": "6a90a501e4374806"
},
{
"cell_type": "code",
"execution_count": 43,
"outputs": [],
"source": [
"def find_closest_number_index(arr, target):\n",
" arr.sort() # 对数组进行排序\n",
" left, right = 0, len(arr) - 1\n",
" closest_index = 0\n",
"\n",
" while left <= right:\n",
" mid = left + (right - left) // 2\n",
" # 更新最接近数值的索引\n",
" if abs(arr[mid] - target) < abs(arr[closest_index] - target):\n",
" closest_index = mid\n",
" # 根据目标值与中间值的比较,决定搜索左半部分还是右半部分\n",
" if arr[mid] < target:\n",
" left = mid + 1\n",
" elif arr[mid] > target:\n",
" right = mid - 1\n",
" else:\n",
" return mid # 如果找到精确匹配,直接返回索引\n",
" return closest_index"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-11T06:41:37.870067500Z",
"start_time": "2024-06-11T06:41:37.820315500Z"
}
},
"id": "738d2236e3320758"
},
{
"cell_type": "code",
"execution_count": 44,
"outputs": [],
"source": [
"target_cols = ['t2m']\n",
"times = np.asarray(nc_data['time'][:]).tolist()\n",
"times = [start_date + dt.timedelta(hours=x) for x in times]"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-11T06:41:37.870067500Z",
"start_time": "2024-06-11T06:41:37.834951500Z"
}
},
"id": "336150c5fe22b4db"
},
{
"cell_type": "code",
"execution_count": 45,
"outputs": [],
"source": [
"time_index = pd.to_datetime(times * stations.shape[0])\n",
"stations['lat'] = stations['lat'].astype(float)\n",
"stations['lon'] = stations['lon'].astype(float)\n",
"stations_out = stations.copy()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-11T06:41:37.871070500Z",
"start_time": "2024-06-11T06:41:37.848726Z"
}
},
"id": "dfc81e6752e2c51a"
},
{
"cell_type": "code",
"execution_count": 46,
"outputs": [
{
"data": {
"text/plain": " state lon lat lon_index lat_index best_lon best_lat\n0 iowa 93.5 41.1 30 2 -89.0 41.0",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>state</th>\n <th>lon</th>\n <th>lat</th>\n <th>lon_index</th>\n <th>lat_index</th>\n <th>best_lon</th>\n <th>best_lat</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>iowa</td>\n <td>93.5</td>\n <td>41.1</td>\n <td>30</td>\n <td>2</td>\n <td>-89.0</td>\n <td>41.0</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lon_index = list()\n",
"lat_index = list()\n",
"best_lons = list()\n",
"best_lats = list()\n",
"for i in range(stations.shape[0]):\n",
" # 获得观测点所在的经纬度\n",
" stat_lat = stations.iloc[i]['lat']\n",
" stat_lon = stations.iloc[i]['lon']\n",
" # 获得观测点所在经纬度对应最近的网格数据的经纬度\n",
" best_lat_index = find_closest_number_index(lat, stat_lat)\n",
" best_lon_index = find_closest_number_index(lon, stat_lon)\n",
" lat_index.append(best_lat_index)\n",
" lon_index.append(best_lon_index)\n",
" best_lons.append(lon[best_lon_index])\n",
" best_lats.append(lat[best_lat_index])\n",
"stations_out['lon_index'] = lon_index\n",
"stations_out['lat_index'] = lat_index\n",
"stations_out['best_lon'] = best_lons\n",
"stations_out['best_lat'] = best_lats\n",
"stations_out"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-11T06:41:37.880042100Z",
"start_time": "2024-06-11T06:41:37.866999300Z"
}
},
"id": "3eccbc89ab82bf16"
},
{
"cell_type": "code",
"execution_count": 47,
"outputs": [],
"source": [
"for tgt in target_cols:\n",
" tmp = np.asarray(nc_data[tgt][:])\n",
" stations_out[tgt] = stations_out.apply(lambda x: tmp[:, len(lat) - x['lat_index'] - 1, x['lon_index']].squeeze(),axis=1)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-11T06:41:37.953643400Z",
"start_time": "2024-06-11T06:41:37.882051700Z"
}
},
"id": "eb2d82717535c00b"
},
{
"cell_type": "code",
"execution_count": 48,
"outputs": [],
"source": [
"result_df = stations_out.explode(column=target_cols).reset_index(drop=True)\n",
"result_df['date_time'] = time_index\n",
"result_df[['date_time', 't2m']].to_csv('./temper.csv', index=False, encoding='utf-8-sig')"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-11T06:41:37.961281200Z",
"start_time": "2024-06-11T06:41:37.913630800Z"
}
},
"id": "f347ca967bfcafce"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

111
test.ipynb Normal file
View File

@ -0,0 +1,111 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 24,
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-06-11T06:48:54.873304Z",
"start_time": "2024-06-11T06:48:54.842415500Z"
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"pv_df = pd.read_csv('data/PV.csv', sep=';')\n",
"temperature_df = pd.read_csv('data/temper.csv', sep=',')\n",
"\n",
"pv_data = pv_df['P_PV_'].apply(lambda x: x.replace(',', '.')).to_numpy(dtype=float)\n",
"temper = temperature_df['t2m'].to_numpy(dtype=float)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"outputs": [
{
"data": {
"text/plain": "array([0., 0., 0., ..., 0., 0., 0.])"
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pv_data"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-11T06:48:54.881858Z",
"start_time": "2024-06-11T06:48:54.867302600Z"
}
},
"id": "d37d5b6cd5cb7387"
},
{
"cell_type": "code",
"execution_count": 26,
"outputs": [
{
"data": {
"text/plain": "array([-3.2424931, -3.4571036, -3.5921883, ..., 3.2034465, 1.3242432,\n 0.3394317])"
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"temper -= 273.15\n",
"temper"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-11T06:48:54.906957500Z",
"start_time": "2024-06-11T06:48:54.880858700Z"
}
},
"id": "3ef2774e5245627"
},
{
"cell_type": "code",
"execution_count": 26,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-11T06:48:54.915543800Z",
"start_time": "2024-06-11T06:48:54.896403Z"
}
},
"id": "33f9458d47973337"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

41
test.py Normal file
View File

@ -0,0 +1,41 @@
# from joblib import load
#
# model = load('AgentPPO/reward_data.pkl')
#
# data = model['mean_episode_reward']
# print(data)
# import torch
#
# model_path = 'AgentDDPG/actor.pth'
#
# checkpoint = torch.load(model_path, map_location=torch.device('cuda'))
#
# # if the .pth file stores the whole model (both structure and parameters)
# model = checkpoint  # here we assume checkpoint is the model instance itself; sometimes model.load_state_dict(checkpoint['state_dict']) is needed
# if the .pth file only stores the state_dict (the model parameters)
# model = YourModelClass()  # instantiate your model
# model.load_state_dict(checkpoint)
# print(model)
import pickle
a = 'DDPG'
b = 'PPO'
c = 'SAC'
d = 'TD3'
a1 = '/reward_data.pkl'
a2 = '/loss_data.pkl'
a3 = '/test_data.pkl'
filename = './Agent' + a + a3
# open the file in 'rb' mode and read the binary data
with open(filename, 'rb') as f:
    data = pickle.load(f)
print(data)
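# Illustrative extension (not part of the original file): inspect every agent/file combination
# that actually exists on disk instead of a single hard-coded path.
import os
for agent in (a, b, c, d):                 # 'DDPG', 'PPO', 'SAC', 'TD3'
    for suffix in (a1, a2, a3):            # reward / loss / test pickles
        path = './Agent' + agent + suffix
        if os.path.exists(path):
            with open(path, 'rb') as f:
                print(path, type(pickle.load(f)))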

285
tools.py Normal file
View File

@ -0,0 +1,285 @@
import os
import gurobipy as gp
import numpy as np
import numpy.random as rd
import pandas as pd
import torch
from gurobipy import GRB
def optimization_base_result(env, month, day, initial_soc):
price = env.data_manager.get_series_price_data(month, day)
load = env.data_manager.get_series_load_cons_data(month, day)
temperature = env.data_manager.get_series_temperature_data(month, day)
irradiance = env.data_manager.get_series_irradiance_data(month, day)
wind_speed = env.data_manager.get_series_wind_data(month, day)
pv = [env.solar.step(temp, irr) for temp, irr in zip(temperature, irradiance)]
wind = [env.wind.step(ws) for ws in wind_speed]
period = env.episode_length
DG_parameters = env.dg_parameters
def get_dg_info(parameters):
p_max = []
p_min = []
ramping_up = []
ramping_down = []
a_para = []
b_para = []
c_para = []
for name, gen_info in parameters.items():
p_max.append(gen_info['power_output_max'])
p_min.append(gen_info['power_output_min'])
ramping_up.append(gen_info['ramping_up'])
ramping_down.append(gen_info['ramping_down'])
a_para.append(gen_info['a'])
b_para.append(gen_info['b'])
c_para.append(gen_info['c'])
return p_max, p_min, ramping_up, ramping_down, a_para, b_para, c_para
p_max, p_min, ramping_up, ramping_down, a_para, b_para, c_para = get_dg_info(parameters=DG_parameters)
NUM_GEN = len(DG_parameters.keys())
battery_capacity = env.battery.capacity
battery_efficiency = env.battery.efficiency
m = gp.Model("UC")
# set system variables
on_off = m.addVars(NUM_GEN, period, vtype=GRB.BINARY, name='on_off')
gen_output = m.addVars(NUM_GEN, period, vtype=GRB.CONTINUOUS, name='output')
    # set constraints for charge/discharge
    battery_energy_change = m.addVars(period, vtype=GRB.CONTINUOUS, lb=-env.battery.max_discharge,  # discharging is negative
ub=env.battery.max_charge, name='battery_action')
    # set constraints for the exchange between the external grid and the distributed energy system
grid_energy_import = m.addVars(period, vtype=GRB.CONTINUOUS, lb=0, ub=env.grid.exchange_ability, name='import')
grid_energy_export = m.addVars(period, vtype=GRB.CONTINUOUS, lb=0, ub=env.grid.exchange_ability, name='export')
soc = m.addVars(period, vtype=GRB.CONTINUOUS, lb=0.2, ub=0.8, name='SOC')
    # 1. add the balance constraint
m.addConstrs(((sum(gen_output[g, t] for g in range(NUM_GEN)) + pv[t] + wind[t] + grid_energy_import[t] >= load[t] +
battery_energy_change[t] + grid_energy_export[t]) for t in range(period)), name='powerbalance')
    # 2. add constraints for p_max / p_min
m.addConstrs((gen_output[g, t] <= on_off[g, t] * p_max[g] for g in range(NUM_GEN) for t in range(period)),
'gen_output_max')
m.addConstrs((gen_output[g, t] >= on_off[g, t] * p_min[g] for g in range(NUM_GEN) for t in range(period)),
'gen_output_min')
    # 3. add constraints for ramping up / ramping down
m.addConstrs((gen_output[g, t + 1] - gen_output[g, t] <= ramping_up[g] for g in range(NUM_GEN)
for t in range(period - 1)), 'ramping_up')
m.addConstrs((gen_output[g, t] - gen_output[g, t + 1] <= ramping_down[g] for g in range(NUM_GEN)
for t in range(period - 1)), 'ramping_down')
    # 4. add constraints for SOC
m.addConstr(battery_capacity * soc[0] == battery_capacity * initial_soc +
(battery_energy_change[0] * battery_efficiency), name='soc0')
m.addConstrs((battery_capacity * soc[t] == battery_capacity * soc[t - 1] +
(battery_energy_change[t] * battery_efficiency) for t in range(1, period)), name='soc update')
    # 5. add the constraint for pv output
m.addConstrs((pv[t] >= 0 for t in range(period)), name='pv_output_min')
# set cost function
# 1 cost of generator
cost_gen = gp.quicksum(
(a_para[g] * gen_output[g, t] * gen_output[g, t] + b_para[g] * gen_output[g, t] + c_para[g] * on_off[g, t]) for
t in range(period) for g in range(NUM_GEN))
cost_grid_import = gp.quicksum(grid_energy_import[t] * price[t] for t in range(period))
cost_grid_export = gp.quicksum(grid_energy_export[t] * price[t] * env.sell_coefficient for t in range(period))
m.setObjective((cost_gen + cost_grid_import - cost_grid_export), GRB.MINIMIZE)
m.optimize()
output_record = {'pv': [], 'temperature': [], 'irradiance': [], 'wind_speed': [], 'price': [], 'load': [],
'netload': [], 'soc': [], 'battery_energy_change': [], 'grid_import': [], 'grid_export': [],
'gen1': [], 'gen2': [], 'gen3': [], 'step_cost': []}
for t in range(period):
gen_cost = sum((on_off[g, t].x * (
a_para[g] * gen_output[g, t].x * gen_output[g, t].x + b_para[g] * gen_output[g, t].x + c_para[g]))
for g in range(NUM_GEN))
grid_import_cost = grid_energy_import[t].x * price[t]
grid_export_cost = grid_energy_export[t].x * price[t] * env.sell_coefficient
        output_record['pv'].append(pv[t])  # pv[t] is a plain float, not a Gurobi variable
output_record['temperature'].append(temperature[t])
output_record['irradiance'].append(irradiance[t])
output_record['wind_speed'].append(wind[t])
output_record['price'].append(price[t])
output_record['load'].append(load[t])
        output_record['netload'].append(load[t] - pv[t])
output_record['soc'].append(soc[t].x)
output_record['battery_energy_change'].append(battery_energy_change[t].x)
output_record['grid_import'].append(grid_energy_import[t].x)
output_record['grid_export'].append(grid_energy_export[t].x)
output_record['gen1'].append(gen_output[0, t].x)
output_record['gen2'].append(gen_output[1, t].x)
output_record['gen3'].append(gen_output[2, t].x)
output_record['step_cost'].append(gen_cost + grid_import_cost - grid_export_cost)
output_record_df = pd.DataFrame.from_dict(output_record)
return output_record_df
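# Summary of the unit-commitment model built above: it minimizes, over the 24-hour horizon,
#     sum_t sum_g (a_g * P_{g,t}**2 + b_g * P_{g,t} + c_g * u_{g,t})
#         + sum_t price_t * import_t - sell_coefficient * sum_t price_t * export_t
# subject to hourly power balance (DG + PV + wind + import >= load + battery charge + export),
# generator min/max output and ramping limits, and the battery SOC recursion with charging
# efficiency. pv[t] and wind[t] enter as fixed forecasts, not as decision variables.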
class Arguments:
"""revise here for our own purpose"""
def __init__(self, agent=None, env=None):
self.agent = agent # Deep Reinforcement Learning algorithm
self.env = env # the environment for training
# self.plot_shadow_on = False # control do we need to plot all shadow figures
self.cwd = None # current work directory. None means set automatically
self.if_remove = False # remove the cwd folder? (True, False, None:ask me)
self.visible_gpu = '0,1,2,3' # for example: os.environ['CUDA_VISIBLE_DEVICES'] = '0, 2,'
# self.worker_num = 2 # rollout workers number pre GPU (adjust it to get high GPU usage)
self.num_threads = 32 # cpu_num for evaluate model, torch.set_num_threads(self.num_threads)
'''Arguments for training'''
self.num_episode = 1000
self.gamma = 0.995 # discount factor of future rewards
# self.reward_scale = 1 # an approximate target reward usually be closed to 256
self.learning_rate = 2 ** -14 # 2 ** -14 ~= 6e-5
self.soft_update_tau = 2 ** -8 # 2 ** -8 ~= 5e-3
self.net_dim = 256 # the network width 256
self.batch_size = 4096 # num of transitions sampled from replay buffer.
self.repeat_times = 2 ** 5 # repeatedly update network to keep critic's loss small
self.target_step = 4096 # collect target_step experiences, then update network, 1024
self.max_memo = 500000 # capacity of replay buffer
self.if_per_or_gae = False # PER for off-policy sparse reward: Prioritized Experience Replay.
'''Arguments for evaluate'''
# self.eval_gap = 2 ** 6 # evaluate the agent per eval_gap seconds
# self.eval_times = 2 # number of times that get episode return in first
self.random_seed = 0 # initialize random seed in self.init_before_training()
# self.random_seed_list = [1234, 2234, 3234, 4234, 5234]
self.random_seed_list = [1234]
'''Arguments for save and plot issues'''
self.train = True
self.save_network = True
self.test_network = True
self.save_test_data = True
self.compare_with_pyomo = True
self.plot_on = True
def init_before_training(self, if_main):
if self.cwd is None:
agent_name = self.agent.__class__.__name__
self.cwd = f'./{agent_name}'
if if_main:
import shutil # remove history according to bool(if_remove)
if self.if_remove is None:
self.if_remove = bool(input(f"| PRESS 'y' to REMOVE: {self.cwd}? ") == 'y')
elif self.if_remove:
shutil.rmtree(self.cwd, ignore_errors=True)
print(f"| Remove cwd: {self.cwd}")
os.makedirs(self.cwd, exist_ok=True)
np.random.seed(self.random_seed)
torch.manual_seed(self.random_seed)
torch.set_num_threads(self.num_threads)
torch.set_default_dtype(torch.float32)
        os.environ['CUDA_VISIBLE_DEVICES'] = str(self.visible_gpu)  # control which GPUs are visible
def test_one_episode(env, act, device):
"""to get evaluate information, here record the unbalance of after taking action"""
record_state = []
record_action = []
record_reward = []
record_output = []
record_cost = []
record_unbalance = []
record_system_info = [] # [time, price, netload, action, real action, output*4, soc, unbalance (exchange + penalty)]
record_init_info = [] # includes month, day, time, initial soc
env.TRAIN = False
state = env.reset()
record_init_info.append([env.month, env.day, env.current_time, env.battery.current_capacity])
print(f'current testing month is {env.month}, day is {env.day}, initial soc is {env.battery.current_capacity}')
for i in range(24):
s_tensor = torch.as_tensor((state,), device=device)
a_tensor = act(s_tensor)
action = a_tensor.detach().cpu().numpy()[0]  # detach() is not strictly needed here because the call is wrapped in torch.no_grad() outside
real_action = action
state, next_state, reward, done = env.step(action)
record_system_info.append([state[0], state[1], state[3], action, real_action, env.battery.SOC(),
env.battery.energy_change, next_state[4], next_state[5], next_state[6],
next_state[7], next_state[8], env.unbalance, env.operation_cost])
record_state.append(state)
record_action.append(real_action)
record_reward.append(reward)
record_output.append(env.current_output)
record_unbalance.append(env.unbalance)
state = next_state
# add information of the last step: dg1, dg2, dg3 outputs and soc
record_system_info[-1][7:10] = [env.final_step_outputs[0], env.final_step_outputs[1], env.final_step_outputs[2]]
record_system_info[-1][5] = env.final_step_outputs[3]
record = {'init_info': record_init_info, 'system_info': record_system_info, 'state': record_state,
'action': record_action, 'reward': record_reward, 'cost': record_cost, 'unbalance': record_unbalance,
'record_output': record_output}
return record
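# --- Usage sketch (illustrative only) --------------------------------------
# How test_one_episode might be called for a trained actor network. `env` and
# `actor` are placeholders created elsewhere; torch.no_grad() disables gradient
# tracking during evaluation.
def _example_evaluate_one_day(env, actor, device):
    with torch.no_grad():
        record = test_one_episode(env, actor, device)
    system_info = pd.DataFrame(record['system_info'])
    print('mean unbalance over the day:', np.mean(record['unbalance']))
    return system_info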
def get_episode_return(env, act, device):
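"""Roll out one evaluation episode (up to 24 steps) with actor `act`, accumulating the episode reward and real unbalance."""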
episode_return = 0.0 # sum of rewards in an episode
episode_unbalance = 0.0
state = env.reset()
for i in range(24):
s_tensor = torch.as_tensor((state,), device=device)
a_tensor = act(s_tensor)
action = a_tensor.detach().cpu().numpy()[0]  # detach() is not strictly needed here because the call is wrapped in torch.no_grad() outside
state, next_state, reward, done = env.step(action)
state = next_state
episode_return += reward
episode_unbalance += env.real_unbalance
if done:
break
return episode_return, episode_unbalance
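# --- Usage sketch (illustrative only) --------------------------------------
# Averaging get_episode_return over several evaluation episodes; `eval_times`
# is a placeholder parameter, not part of the original training loop.
def _example_average_episode_return(env, actor, device, eval_times=4):
    returns, unbalances = [], []
    for _ in range(eval_times):
        r, u = get_episode_return(env, actor, device)
        returns.append(r)
        unbalances.append(u)
    return np.mean(returns), np.mean(unbalances)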
class ReplayBuffer:
def __init__(self, max_len, state_dim, action_dim, gpu_id=0):
self.now_len = 0
self.next_idx = 0
self.if_full = False
self.max_len = max_len
self.data_type = torch.float32
self.action_dim = action_dim
self.device = torch.device(f"cuda:{gpu_id}" if (torch.cuda.is_available() and (gpu_id >= 0)) else "cpu")
other_dim = 1 + 1 + self.action_dim  # reward + mask + action
self.buf_other = torch.empty(size=(max_len, other_dim), dtype=self.data_type, device=self.device)
if isinstance(state_dim, int):  # state is a flat feature vector
self.buf_state = torch.empty((max_len, state_dim), dtype=torch.float32, device=self.device)
elif isinstance(state_dim, tuple):  # state is an image (pixel) observation
self.buf_state = torch.empty((max_len, *state_dim), dtype=torch.uint8, device=self.device)
else:
raise ValueError('state_dim')
def extend_buffer(self, state, other): # CPU array to CPU array
size = len(other)
next_idx = self.next_idx + size
if next_idx > self.max_len:
self.buf_state[self.next_idx:self.max_len] = state[:self.max_len - self.next_idx]
self.buf_other[self.next_idx:self.max_len] = other[:self.max_len - self.next_idx]
self.if_full = True
next_idx = next_idx - self.max_len
self.buf_state[0:next_idx] = state[-next_idx:]
self.buf_other[0:next_idx] = other[-next_idx:]
else:
self.buf_state[self.next_idx:next_idx] = state
self.buf_other[self.next_idx:next_idx] = other
self.next_idx = next_idx
def sample_batch(self, batch_size) -> tuple:
indices = rd.randint(self.now_len - 1, size=batch_size)
r_m_a = self.buf_other[indices]
return (r_m_a[:, 0:1],  # reward
r_m_a[:, 1:2],  # mask
r_m_a[:, 2:],  # action
self.buf_state[indices],  # state
self.buf_state[indices + 1])  # next state
def update_now_len(self):
self.now_len = self.max_len if self.if_full else self.next_idx
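# --- Usage sketch (illustrative only) --------------------------------------
# Filling the replay buffer with random transitions and sampling a batch on the
# CPU. All dimensions below are arbitrary placeholders.
def _example_replay_buffer_usage(state_dim=10, action_dim=3):
    buffer = ReplayBuffer(max_len=1024, state_dim=state_dim, action_dim=action_dim, gpu_id=-1)
    states = torch.rand(64, state_dim)              # fake states
    others = torch.rand(64, 1 + 1 + action_dim)     # (reward, mask, action) per transition
    buffer.extend_buffer(states, others)
    buffer.update_now_len()
    reward, mask, action, state, next_state = buffer.sample_batch(batch_size=32)
    return reward.shape, state.shape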