coal_materials/multioutput_regression.ipynb

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"# A demo for multi-output regression\n",
"\n",
"The demo is adopted from scikit-learn:\n",
"\n",
"https://scikit-learn.org/stable/auto_examples/ensemble/plot_random_forest_regression_multioutput.html#sphx-glr-auto-examples-ensemble-plot-random-forest-regression-multioutput-py\n",
"\n",
"See :doc:`/tutorials/multioutput` for more information.\n",
"\n",
"<div class=\"alert alert-info\"><h4>Note</h4><p>The feature is experimental. For the `multi_output_tree` strategy, many features are\n",
" missing.</p></div>\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'xgboost'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 7\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmatplotlib\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m pyplot \u001b[38;5;28;01mas\u001b[39;00m plt\n\u001b[0;32m----> 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mxgboost\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mxgb\u001b[39;00m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mplot_predt\u001b[39m(y: np\u001b[38;5;241m.\u001b[39mndarray, y_predt: np\u001b[38;5;241m.\u001b[39mndarray, name: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 11\u001b[0m s \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m25\u001b[39m\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'xgboost'"
]
}
],
"source": [
"import argparse\n",
"from typing import Dict, List, Tuple\n",
"\n",
"import numpy as np\n",
"from matplotlib import pyplot as plt\n",
"\n",
"import xgboost as xgb\n",
"\n",
"\n",
"def plot_predt(y: np.ndarray, y_predt: np.ndarray, name: str) -> None:\n",
" s = 25\n",
" plt.scatter(y[:, 0], y[:, 1], c=\"navy\", s=s, edgecolor=\"black\", label=\"data\")\n",
" plt.scatter(\n",
" y_predt[:, 0], y_predt[:, 1], c=\"cornflowerblue\", s=s, edgecolor=\"black\"\n",
" )\n",
" plt.xlim([-1, 2])\n",
" plt.ylim([-1, 2])\n",
" plt.show()\n",
"\n",
"\n",
"def gen_circle() -> Tuple[np.ndarray, np.ndarray]:\n",
" \"Generate a sample dataset that y is a 2 dim circle.\"\n",
" rng = np.random.RandomState(1994)\n",
" X = np.sort(200 * rng.rand(100, 1) - 100, axis=0)\n",
" y = np.array([np.pi * np.sin(X).ravel(), np.pi * np.cos(X).ravel()]).T\n",
" y[::5, :] += 0.5 - rng.rand(20, 2)\n",
" y = y - y.min()\n",
" y = y / y.max()\n",
" return X, y\n",
"\n",
"\n",
"def rmse_model(plot_result: bool, strategy: str) -> None:\n",
" \"\"\"Draw a circle with 2-dim coordinate as target variables.\"\"\"\n",
" X, y = gen_circle()\n",
" # Train a regressor on it\n",
" reg = xgb.XGBRegressor(\n",
" tree_method=\"hist\",\n",
" n_estimators=128,\n",
" n_jobs=16,\n",
" max_depth=8,\n",
" multi_strategy=strategy,\n",
" subsample=0.6,\n",
" )\n",
" reg.fit(X, y, eval_set=[(X, y)])\n",
"\n",
" y_predt = reg.predict(X)\n",
" if plot_result:\n",
" plot_predt(y, y_predt, \"multi\")\n",
"\n",
"\n",
"def custom_rmse_model(plot_result: bool, strategy: str) -> None:\n",
" \"\"\"Train using Python implementation of Squared Error.\"\"\"\n",
"\n",
" # As the experimental support status, custom objective doesn't support matrix as\n",
" # gradient and hessian, which will be changed in future release.\n",
" def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:\n",
" \"\"\"Compute the gradient squared error.\"\"\"\n",
" y = dtrain.get_label().reshape(predt.shape)\n",
" return (predt - y).reshape(y.size)\n",
"\n",
" def hessian(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:\n",
" \"\"\"Compute the hessian for squared error.\"\"\"\n",
" return np.ones(predt.shape).reshape(predt.size)\n",
"\n",
" def squared_log(\n",
" predt: np.ndarray, dtrain: xgb.DMatrix\n",
" ) -> Tuple[np.ndarray, np.ndarray]:\n",
" grad = gradient(predt, dtrain)\n",
" hess = hessian(predt, dtrain)\n",
" return grad, hess\n",
"\n",
" def rmse(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:\n",
" y = dtrain.get_label().reshape(predt.shape)\n",
" v = np.sqrt(np.sum(np.power(y - predt, 2)))\n",
" return \"PyRMSE\", v\n",
"\n",
" X, y = gen_circle()\n",
" Xy = xgb.DMatrix(X, y)\n",
" results: Dict[str, Dict[str, List[float]]] = {}\n",
" # Make sure the `num_target` is passed to XGBoost when custom objective is used.\n",
" # When builtin objective is used, XGBoost can figure out the number of targets\n",
" # automatically.\n",
" booster = xgb.train(\n",
" {\n",
" \"tree_method\": \"hist\",\n",
" \"num_target\": y.shape[1],\n",
" \"multi_strategy\": strategy,\n",
" },\n",
" dtrain=Xy,\n",
" num_boost_round=128,\n",
" obj=squared_log,\n",
" evals=[(Xy, \"Train\")],\n",
" evals_result=results,\n",
" custom_metric=rmse,\n",
" )\n",
"\n",
" y_predt = booster.inplace_predict(X)\n",
" if plot_result:\n",
" plot_predt(y, y_predt, \"multi\")\n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
" parser = argparse.ArgumentParser()\n",
" parser.add_argument(\"--plot\", choices=[0, 1], type=int, default=1)\n",
" args = parser.parse_args()\n",
"\n",
" # Train with builtin RMSE objective\n",
" # - One model per output.\n",
" rmse_model(args.plot == 1, \"one_output_per_tree\")\n",
" # - One model for all outputs, this is still working in progress, many features are\n",
" # missing.\n",
" rmse_model(args.plot == 1, \"multi_output_tree\")\n",
"\n",
" # Train with custom objective.\n",
" # - One model per output.\n",
" custom_rmse_model(args.plot == 1, \"one_output_per_tree\")\n",
" # - One model for all outputs, this is still working in progress, many features are\n",
" # missing.\n",
" custom_rmse_model(args.plot == 1, \"multi_output_tree\")"
]
},
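{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small follow-up sketch, not part of the original demo: assuming the cell\n",
"above has run successfully (so `gen_circle` and `xgb` are in scope), it fits\n",
"the builtin squared-error objective with both strategies and prints the\n",
"prediction shapes, which should be `(n_samples, n_targets)` in both cases.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch; assumes the demo cell above executed successfully.\n",
"X, y = gen_circle()\n",
"for strategy in (\"one_output_per_tree\", \"multi_output_tree\"):\n",
"    reg = xgb.XGBRegressor(\n",
"        tree_method=\"hist\",\n",
"        n_estimators=16,\n",
"        multi_strategy=strategy,\n",
"    )\n",
"    reg.fit(X, y)\n",
"    # Both strategies emit one prediction column per target, here (100, 2).\n",
"    print(strategy, reg.predict(X).shape)"
]
},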
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.18"
}
},
"nbformat": 4,
"nbformat_minor": 4
}