{ "cells": [ { "cell_type": "markdown", "id": "6f914d38-ee6e-4418-bfdd-44fbb7d4e0cf", "metadata": {}, "source": [ "# 数据集构建\n", "### 写一个筛选空值的代码,用于构建数据集" ] }, { "cell_type": "code", "execution_count": 1, "id": "7f26956d-c06a-4c61-a029-2095b0372799", "metadata": {}, "outputs": [], "source": [ "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "id": "7fb503fb-b22d-4839-804c-c6326ce2a5be", "metadata": {}, "outputs": [], "source": [ "import os" ] }, { "cell_type": "code", "execution_count": 3, "id": "27f9906b-e831-4995-87ba-6178746b8b77", "metadata": {}, "outputs": [], "source": [ "npy_list = os.listdir('./np_data/')" ] }, { "cell_type": "code", "execution_count": 4, "id": "801bb7b5-ebbc-47e0-8749-0d6b76d89a68", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "361" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(npy_list)" ] }, { "cell_type": "code", "execution_count": 5, "id": "35fc93fd-93d3-48c1-8b36-d932a39d7662", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "5" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(os.listdir('./out_mat/96/'))" ] }, { "cell_type": "code", "execution_count": 6, "id": "d3c87665-b690-4ec6-82bb-8313db9b55d3", "metadata": { "tags": [] }, "outputs": [], "source": [ "def sliding_window(matrix, window_size):\n", " rows = len(matrix) - window_size + 1\n", " cols = len(matrix[0]) - window_size + 1\n", " \n", " for i in range(rows):\n", " for j in range(cols):\n", " sub_matrix = matrix[i : i+window_size, j : j+window_size, :-3]\n", " yield sub_matrix" ] }, { "cell_type": "code", "execution_count": 7, "id": "696e49df-5e49-40d0-8e44-63ac066febef", "metadata": {}, "outputs": [], "source": [ "window_size = 96" ] }, { "cell_type": "code", "execution_count": 8, "id": "204d8ee2-7668-4f47-9980-cfbd36ff3bd5", "metadata": {}, "outputs": [], "source": [ "data = np.load(f\"./np_data/{npy_list[0]}\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "275f62b5-8084-4370-a0ef-a27bcc293c12", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(110, 190, 11)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.shape" ] }, { "cell_type": "code", "execution_count": 10, "id": "4192b9d4-b66e-4fb5-97ea-380284079ca2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ nan, 2.90520200e+02, 9.77973000e+01, 2.80806000e+02,\n", " 4.36411383e+05, -1.35540000e+00, 2.04530000e+00, nan,\n", " 6.93860000e+00, 0.00000000e+00, 0.00000000e+00])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[0][0]" ] }, { "cell_type": "code", "execution_count": 11, "id": "2fe94edd-425c-43d9-8d27-3d8b7f0120e6", "metadata": {}, "outputs": [], "source": [ "num_samples = len(npy_list)\n", "valid_list = np.random.choice(npy_list, size=int(num_samples * 0.2), replace=False)\n", "train_list = [x for x in npy_list if x not in valid_list]\n", "test_list = np.random.choice(valid_list, size=int(num_samples * 0.1), replace=False)\n", "val_list = [x for x in valid_list if x not in test_list]\n", "for file in npy_list:\n", " data = np.load(f\"./np_data/{file}\")\n", " file_id = file.split('.')[0]\n", " for ind, mat in enumerate(sliding_window(data, window_size)):\n", " if (np.isnan(mat) * 1).sum() != 0:\n", " continue\n", " else:\n", " if file in train_list:\n", " np.save(f'./out_mat/{window_size}/train/{file_id}-{ind}.npy', mat)\n", " elif file in val_list:\n", " np.save(f'./out_mat/{window_size}/test/{file_id}-{ind}.npy', mat)\n", " else:\n", " np.save(f'./out_mat/{window_size}/valid/{file_id}-{ind}.npy', mat)" ] }, { "cell_type": "code", "execution_count": 12, "id": "1ddcf0c4-2c46-4b91-85f1-4181b879f723", "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt" ] }, { "cell_type": "markdown", "id": "36798a50-0890-43dd-9feb-d10dc774472b", "metadata": {}, "source": [ "筛选mask" ] }, { "cell_type": "code", "execution_count": 13, "id": "f419d8e3-8d01-4efe-81e5-60e18b40a1d7", "metadata": {}, "outputs": [], "source": [ "import cv2" ] }, { "cell_type": "code", "execution_count": 14, "id": "176eb78d-0137-4f6b-8555-e83e891fd9b8", "metadata": {}, "outputs": [], "source": [ "mask_list = {}\n", "for file in npy_list:\n", " data = np.load(f\"./np_data/{file}\")\n", " file_id = file.split('.')[0]\n", " count = 0\n", " for ind, mat in enumerate(sliding_window(data, window_size)):\n", " cur_no2 = np.isnan(mat[:,:,0])\n", " na_sums = (cur_no2 * 1).sum()\n", " miss_rate = round(na_sums / (window_size**2), 2) * 100\n", " if (miss_rate % 10 == 0) and miss_rate > 0:\n", " fold_path = str(int(miss_rate))\n", " if not os.path.exists(f\"./out_mat/96/mask/{fold_path}\"):\n", " os.mkdir(f\"./out_mat/96/mask/{fold_path}\")\n", " if fold_path not in mask_list:\n", " mask_list[fold_path] = 1\n", " else:\n", " mask_list[fold_path] += 1\n", " msk = 1 - (cur_no2 * 1)\n", " # cv2.imwrite(f'./out_mat/96/mask/{fold_path}/{file_id}-{ind}.jpg', msk)" ] }, { "cell_type": "code", "execution_count": 15, "id": "2b21b80f-d0f6-4c75-ab0c-be692b5e0cdd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dd = cur_no2 * 1\n", "dd.max()" ] }, { "cell_type": "code", "execution_count": 16, "id": "de6093f7-1296-438a-a2e5-6770350760f1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dd.min()" ] }, { "cell_type": "code", "execution_count": 17, "id": "8c610f19-ec49-4592-8647-bc957e716546", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(1 - dd).max()" ] }, { "cell_type": "code", "execution_count": 19, "id": "d220cc78-985c-4a45-be53-11039cc8d279", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "d = plt.imread(\"./out_mat/96/mask/70/20200110-1145.jpg\")\n", "plt.imshow(d, cmap='gray')" ] }, { "cell_type": "code", "execution_count": 20, "id": "c0064319-6185-4f80-9140-2f70233bd549", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 7, 3],\n", " [ 7, 4],\n", " [ 7, 5],\n", " [33, 47],\n", " [56, 48],\n", " [56, 49],\n", " [64, 15],\n", " [71, 3],\n", " [71, 4]])" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.argwhere(d==2)" ] }, { "cell_type": "code", "execution_count": 21, "id": "80881971-c661-47c5-8e08-9136528f6e22", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d.max()" ] }, { "cell_type": "code", "execution_count": 22, "id": "e110e873-7ac4-48af-8608-be18cebabbbb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'10': 7033,\n", " '20': 4791,\n", " '40': 3699,\n", " '30': 3849,\n", " '50': 4245,\n", " '90': 2494,\n", " '80': 2549,\n", " '60': 3831,\n", " '70': 3144,\n", " '100': 17936}" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mask_list" ] }, { "cell_type": "code", "execution_count": 25, "id": "d1338b0d-134b-4694-bdca-a7016c4f207f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'10': 7033,\n", " '20': 4791,\n", " '40': 3699,\n", " '30': 3849,\n", " '50': 4245,\n", " '90': 2494,\n", " '80': 2549,\n", " '60': 3831,\n", " '70': 3144,\n", " '100': 17936}" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mask_list" ] }, { "cell_type": "code", "execution_count": null, "id": "dae31feb-ce59-43ca-b736-585618437081", "metadata": {}, "outputs": [], "source": [ "mask_list" ] }, { "cell_type": "code", "execution_count": null, "id": "3de4d61f-0e3c-4303-8668-8b9fa3b51862", "metadata": {}, "outputs": [], "source": [ "plt.imshow('2', mat[:,:,0])" ] }, { "cell_type": "code", "execution_count": 27, "id": "7897f563-8c5f-4db8-9b36-b6af8b03100d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4679" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(np.isnan(mat[:,:,0]) * 1).sum()" ] }, { "cell_type": "code", "execution_count": null, "id": "116c5a81-5396-4b27-89e0-30afaf2828d4", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.16" } }, "nbformat": 4, "nbformat_minor": 5 }