MAE_ATMO/build_data.ipynb

21 KiB

数据集构建

用滑动窗口切分每个数组，筛掉含空值（NaN）的窗口，并按文件划分为训练集、验证集和测试集，用于构建数据集。

In [1]:
import numpy as np
In [2]:
import os
In [3]:
# os.listdir returns entries in arbitrary order; sort so that npy_list[0] and
# any split derived from this list are the same on every run and machine.
npy_list = sorted(os.listdir('./np_data/'))
In [4]:
len(npy_list)
Out[4]:
361
In [5]:
len(os.listdir('./out_mat/96/'))
Out[5]:
5
In [6]:
def sliding_window(matrix, window_size, drop_last=3):
    """Yield every square window of ``matrix``, scanning row by row.

    Args:
        matrix: 3-D array indexed as (row, column, channel). It must expose
            ``.shape`` and support NumPy-style multi-axis slicing.
        window_size: Side length of each square window.
        drop_last: Number of trailing channels removed from every window.
            Defaults to 3, which reproduces the previously hard-coded
            ``:-3`` slice. Pass 0 to keep all channels.

    Yields:
        Slices of ``matrix`` with shape
        ``(window_size, window_size, channels - drop_last)``, with the column
        offset varying fastest. Nothing is yielded when the window is larger
        than either spatial dimension.
    """
    n_rows, n_cols, n_channels = matrix.shape[:3]
    # An explicit stop index (instead of ``:-drop_last``) keeps drop_last=0
    # meaningful; ``:-0`` would select no channels at all.
    channel_stop = max(n_channels - drop_last, 0)

    for i in range(n_rows - window_size + 1):
        for j in range(n_cols - window_size + 1):
            yield matrix[i : i + window_size, j : j + window_size, :channel_stop]
In [7]:
# Side length of the square windows cut by sliding_window; also used as the
# sub-folder name under ./out_mat/ when the windows are saved.
window_size = 96
In [8]:
# Load the first listed array so its shape and contents can be inspected below.
data = np.load(f"./np_data/{npy_list[0]}")
In [9]:
data.shape
Out[9]:
(110, 190, 11)
In [10]:
data[0][0]
Out[10]:
array([            nan,  2.90520200e+02,  9.77973000e+01,  2.80806000e+02,
        4.36411383e+05, -1.35540000e+00,  2.04530000e+00,             nan,
        6.93860000e+00,  0.00000000e+00,  0.00000000e+00])
In [11]:
# Split at the file level so that overlapping windows cut from the same source
# array can never end up in different subsets.
SPLIT_SEED = 42  # fixed seed so the split is identical after Restart & Run All
rng = np.random.default_rng(SPLIT_SEED)

num_samples = len(npy_list)
# 20% of the files are held out; a 10%-of-total sample of those becomes the
# test set and the remainder of the held-out files is the validation set.
valid_list = rng.choice(npy_list, size=int(num_samples * 0.2), replace=False)
test_list = rng.choice(valid_list, size=int(num_samples * 0.1), replace=False)

# Sets give O(1) membership tests instead of scanning an array per lookup.
held_out_names = set(valid_list)
test_names = set(test_list)
train_list = [x for x in npy_list if x not in held_out_names]
val_list = [x for x in valid_list if x not in test_names]

# NOTE: windows written by an earlier run with a different split are not
# removed; clear ./out_mat/<window_size>/{train,valid,test} before re-running.
for file in npy_list:
    # Decide the subset once per file rather than once per window. Test files
    # go to test/ and validation files to valid/ (these were swapped before).
    if file in test_names:
        subset = 'test'
    elif file in held_out_names:
        subset = 'valid'
    else:
        subset = 'train'
    out_dir = f'./out_mat/{window_size}/{subset}'
    os.makedirs(out_dir, exist_ok=True)

    data = np.load(f"./np_data/{file}")
    file_id = file.split('.')[0]
    for ind, mat in enumerate(sliding_window(data, window_size)):
        # Keep only windows without a single missing value.
        if np.isnan(mat).any():
            continue
        np.save(f'{out_dir}/{file_id}-{ind}.npy', mat)
In [12]:
import matplotlib.pyplot as plt

筛选 mask：统计第 0 通道缺失率为 10% 整数倍的窗口

In [13]:
import cv2
In [14]:
# Count, per missing-rate bucket ("10", "20", ..., "100"), how many windows
# have that share of NaN cells in channel 0, and make sure a folder exists for
# each bucket that occurs.
mask_list = {}
mask_root = f"./out_mat/{window_size}/mask"
for file in npy_list:
    data = np.load(f"./np_data/{file}")
    file_id = file.split('.')[0]
    for ind, mat in enumerate(sliding_window(data, window_size)):
        cur_no2 = np.isnan(mat[:,:,0])  # True where channel 0 is missing
        na_sums = int(cur_no2.sum())
        # Work with an integer percentage so the bucket test below does not
        # depend on float modulus behaving exactly.
        miss_pct = int(round(100 * na_sums / window_size**2))
        if miss_pct == 0 or miss_pct % 10 != 0:
            continue
        fold_path = str(miss_pct)
        # makedirs also creates missing parent folders and tolerates re-runs.
        os.makedirs(f"{mask_root}/{fold_path}", exist_ok=True)
        mask_list[fold_path] = mask_list.get(fold_path, 0) + 1
        msk = 1 - (cur_no2 * 1)  # 1 = value present, 0 = missing
        # NOTE(review): JPEG is lossy, so a 0/1 mask does not round-trip
        # exactly (a mask read back later in this notebook has a max of 2).
        # Consider a lossless format such as PNG if this is re-enabled.
        # cv2.imwrite(f'./out_mat/96/mask/{fold_path}/{file_id}-{ind}.jpg', msk)
In [15]:
dd = cur_no2 * 1
dd.max()
Out[15]:
1
In [16]:
dd.min()
Out[16]:
0
In [17]:
(1 - dd).max()
Out[17]:
1
In [19]:
# Read one previously saved 70%-missing mask back from disk and display it.
saved_mask_path = "./out_mat/96/mask/70/20200110-1145.jpg"
d = plt.imread(saved_mask_path)
plt.imshow(d, cmap='gray')
Out[19]:
<matplotlib.image.AxesImage at 0x7fa6680b2370>
No description has been provided for this image
In [20]:
np.argwhere(d==2)
Out[20]:
array([[ 7,  3],
       [ 7,  4],
       [ 7,  5],
       [33, 47],
       [56, 48],
       [56, 49],
       [64, 15],
       [71,  3],
       [71,  4]])
In [21]:
d.max()
Out[21]:
2
In [22]:
mask_list
Out[22]:
{'10': 7033,
 '20': 4791,
 '40': 3699,
 '30': 3849,
 '50': 4245,
 '90': 2494,
 '80': 2549,
 '60': 3831,
 '70': 3144,
 '100': 17936}
In [25]:
mask_list
Out[25]:
{'10': 7033,
 '20': 4791,
 '40': 3699,
 '30': 3849,
 '50': 4245,
 '90': 2494,
 '80': 2549,
 '60': 3831,
 '70': 3144,
 '100': 17936}
In [ ]:
mask_list
In [ ]:
# plt.imshow takes the image as its first argument (there is no window-name
# argument as in cv2.imshow), so pass the channel-0 slice directly.
plt.imshow(mat[:, :, 0])
In [27]:
(np.isnan(mat[:,:,0]) * 1).sum()
Out[27]:
4679
In [ ]: