building-agents/read.ipynb

15 KiB

In [34]:
import numpy as np
import netCDF4 as nc
import os
import pandas as pd
import datetime as dt
nc_path = "./data/"
nc_files = [f"{nc_path}{x}" for x in os.listdir(nc_path) if x.endswith('.nc')]
print(nc_files)
['./data/iowa.nc']
In [35]:
stations = pd.read_csv('./data/station.csv')
stations.head()
stations.dropna(inplace=True)
In [36]:
nc_data = nc.Dataset(nc_files[0])  # 读文件
print(list(nc_data.variables.keys())) # keys
['longitude', 'latitude', 'time', 't2m']
In [37]:
lat = np.asarray(nc_data['latitude'][:]).tolist()
lon = np.asarray(nc_data['longitude'][:]).tolist()
len(lat), len(lon)
Out[37]:
(13, 31)
In [38]:
np.asarray(nc_data['time'][:])
Out[38]:
array([1016832, 1016833, 1016834, ..., 1025613, 1025614, 1025615])
In [39]:
nc_data['time']
Out[39]:
<class 'netCDF4._netCDF4.Variable'>
int32 time(time)
    units: hours since 1900-01-01 00:00:00.0
    long_name: time
    calendar: gregorian
unlimited dimensions: 
current shape = (8784,)
filling on, default _FillValue of -2147483647 used
In [40]:
start_date = dt.datetime(1900, 1, 1)
start_date
Out[40]:
datetime.datetime(1900, 1, 1, 0, 0)
In [41]:
np.asarray(nc_data['t2m'][:])
Out[41]:
array([[[267.10449818, 266.56524854, 265.81574602, ..., 266.35826383,
         266.94435738, 267.10231939],
        [266.75916053, 266.21773211, 265.4017766 , ..., 266.49334859,
         267.59908269, 267.86380524],
        [266.70360148, 265.86258993, 265.06733289, ..., 266.6197182 ,
         267.66117811, 267.95640366],
        ...,
        [270.23759303, 270.20055367, 269.91731143, ..., 270.11231282,
         269.99683714, 269.90750689],
        [271.34223774, 271.32153927, 271.25726507, ..., 270.55569585,
         270.37376718, 270.24086121],
        [271.76274352, 271.85643134, 271.79215714, ..., 271.32371806,
         270.86617291, 270.71583665]],

       [[267.45746158, 267.17966631, 266.73846206, ..., 266.19376546,
         266.66983029, 266.83541806],
        [267.5337191 , 267.25374505, 266.67418786, ..., 266.32231386,
         267.30821471, 267.57620544],
        [267.67425083, 267.02933005, 266.40183956, ..., 266.40183956,
         267.33218136, 267.62522813],
        ...,
        [269.75172367, 269.78549485, 269.62317527, ..., 270.06873709,
         269.95652959, 269.69289643],
        [270.76159117, 270.80080932, 270.75832299, ..., 270.33237024,
         270.18748095, 269.91186447],
        [271.10692881, 271.19516966, 271.15268333, ..., 270.63848973,
         270.47290197, 270.30513541]],

       [[267.29841017, 267.25483444, 267.06745881, ..., 265.76345515,
         266.3157775 , 266.53692432],
        [267.55441757, 267.51084184, 267.31801925, ..., 266.01728376,
         266.99338007, 267.2613708 ],
        [267.77338561, 267.59363573, 267.37684648, ..., 266.09898825,
         267.00318461, 267.28860563],
        ...,
        [269.32359214, 269.41945874, 269.29091034, ..., 270.08507799,
         269.90750689, 269.55781167],
        [270.24957636, 270.34871114, 270.35088993, ..., 270.20600063,
         270.01317803, 269.67110857],
        [270.62105944, 270.76703813, 270.76921692, ..., 270.43695199,
         270.18857034, 270.01099925]],

       ...,

       [[271.48168007, 271.36729379, 271.24637114, ..., 270.89776532,
         270.98164859, 270.96857587],
        [271.81830257, 271.70282689, 271.56447396, ..., 271.43701495,
         271.53723913, 271.69084357],
        [272.18107051, 271.92179493, 271.77690563, ..., 272.04271757,
         272.15165689, 272.3107083 ],
        ...,
        [275.28910932, 274.99388376, 274.74441272, ..., 276.07565121,
         276.18350114, 276.35344648],
        [275.60285456, 275.36536684, 275.34575777, ..., 276.91993095,
         276.99618847, 277.08551871],
        [275.90788466, 275.95254978, 275.9329407 , ..., 278.0169499 ,
         277.9058318 , 277.90365301]],

       [[269.88462964, 269.78658425, 269.64496313, ..., 269.72231005,
         269.85957359, 269.92602658],
        [270.1308325 , 270.03278711, 269.93147354, ..., 270.19946427,
         270.40427019, 270.72237301],
        [270.45656107, 270.21471578, 270.10904464, ..., 270.84329566,
         271.05463794, 271.37056197],
        ...,
        [273.1440941 , 273.02535024, 272.86738823, ..., 274.12127981,
         274.27488425, 274.4742432 ],
        [273.5765832 , 273.50795143, 273.58638774, ..., 274.72698243,
         274.85553083, 275.00150951],
        [273.94697689, 274.12127981, 274.19971612, ..., 275.72704539,
         275.64207272, 275.67148634]],

       [[269.74954488, 269.63951617, 269.51096777, ..., 268.99459539,
         269.11442864, 269.11224985],
        [269.87046752, 269.76043881, 269.70596915, ..., 269.16345133,
         269.50443141, 269.90097053],
        [269.85630541, 269.81381908, 269.78222667, ..., 269.61446012,
         269.94672505, 270.34435357],
        ...,
        [273.29007279, 273.09833959, 273.18440165, ..., 273.11794867,
         273.21381527, 273.48943175],
        [274.379466  , 274.08641922, 273.44258784, ..., 273.57331502,
         273.64957255, 273.81298153],
        [273.89904359, 273.55370595, 272.90769578, ..., 274.27815243,
         274.27488425, 274.25309638]]])
In [42]:
np.asarray(nc_data['t2m'][:]).shape
Out[42]:
(8784, 13, 31)
In [43]:
def find_closest_number_index(arr, target):
    arr.sort()  # 对数组进行排序
    left, right = 0, len(arr) - 1
    closest_index = 0

    while left <= right:
        mid = left + (right - left) // 2
        # 更新最接近数值的索引
        if abs(arr[mid] - target) < abs(arr[closest_index] - target):
            closest_index = mid
        # 根据目标值与中间值的比较,决定搜索左半部分还是右半部分
        if arr[mid] < target:
            left = mid + 1
        elif arr[mid] > target:
            right = mid - 1
        else:
            return mid  # 如果找到精确匹配,直接返回索引
    return closest_index
In [44]:
target_cols = ['t2m']
times = np.asarray(nc_data['time'][:]).tolist()
times = [start_date + dt.timedelta(hours=x) for x in times]
In [45]:
time_index = pd.to_datetime(times * stations.shape[0])
stations['lat'] = stations['lat'].astype(float)
stations['lon'] = stations['lon'].astype(float)
stations_out = stations.copy()
In [46]:
lon_index = list()
lat_index = list()
best_lons = list()
best_lats = list()
for i in range(stations.shape[0]):
    # 获得观测点所在的经纬度
    stat_lat = stations.iloc[i]['lat']
    stat_lon = stations.iloc[i]['lon']
    # 获得观测点所在经纬度对应最近的网格数据的经纬度
    best_lat_index = find_closest_number_index(lat, stat_lat)
    best_lon_index = find_closest_number_index(lon, stat_lon)
    lat_index.append(best_lat_index)
    lon_index.append(best_lon_index)
    best_lons.append(lon[best_lon_index])
    best_lats.append(lat[best_lat_index])
stations_out['lon_index'] = lon_index
stations_out['lat_index'] = lat_index
stations_out['best_lon'] = best_lons
stations_out['best_lat'] = best_lats
stations_out
Out[46]:
state lon lat lon_index lat_index best_lon best_lat
0 iowa 93.5 41.1 30 2 -89.0 41.0
In [47]:
for tgt in target_cols:
    tmp = np.asarray(nc_data[tgt][:])
    stations_out[tgt] = stations_out.apply(lambda x: tmp[:, len(lat) - x['lat_index'] - 1, x['lon_index']].squeeze(),axis=1)
In [48]:
result_df = stations_out.explode(column=target_cols).reset_index(drop=True)
result_df['date_time'] = time_index
result_df[['date_time', 't2m']].to_csv('./temper.csv', index=False, encoding='utf-8-sig')