From 60118e85a483e39438fcd4a73b704be2bc295b95 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9E=97=E6=9E=97?= <6969846@qq.com>
Date: Thu, 27 Mar 2025 16:19:45 +0800
Subject: [PATCH] Reproduce XGBoost
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 周家林/XGboost.py | 136 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 136 insertions(+)
 create mode 100644 周家林/XGboost.py

diff --git a/周家林/XGboost.py b/周家林/XGboost.py
new file mode 100644
index 0000000..354d121
--- /dev/null
+++ b/周家林/XGboost.py
@@ -0,0 +1,136 @@
+import xgboost as xgb  # import XGBoost
+import pandas as pd
+from sklearn import preprocessing
+import numpy as np
+from sklearn.model_selection import train_test_split
+# torch and torch.nn imports removed
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.metrics import confusion_matrix, accuracy_score, classification_report  # added evaluation metrics
+from sklearn.feature_selection import SelectKBest, chi2
+from sklearn.utils.class_weight import compute_sample_weight  # used to compute sample weights
+
+# GPU availability check (XGBoost can use a GPU, but it is configured differently; simplified to CPU here)
+# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# print(f"Using device: {device}")
+print("Using CPU for XGBoost (GPU can be configured if needed and available)")
+
+# 1. Load the dataset
+df = pd.read_csv('sensor.csv', index_col=0)
+print("Dataset loaded.")
+
+# 2. Data preprocessing
+df.drop(columns=['sensor_50', 'sensor_51', 'sensor_15'], inplace=True)
+x = df.iloc[:, 1:50].ffill()  # forward-fill NaNs; fillna(method='ffill') is deprecated in recent pandas
+
+scaler = preprocessing.MinMaxScaler()
+x_scaled = scaler.fit_transform(x)
+x_scaled = pd.DataFrame(x_scaled, columns=df.iloc[:, 1:50].columns)
+print("Data scaled and NaNs filled.")
+
+# 3. Encode the target variable
+conditions = [(df['machine_status'] == 'NORMAL'), (df['machine_status'] == 'BROKEN'), (df['machine_status'] == 'RECOVERING')]
+choices = [1, 0, 2]  # BROKEN: 0, NORMAL: 1, RECOVERING: 2
+df['Operation'] = np.select(conditions, choices, default=0)  # keep the original encoding; unmatched statuses default to 0
+# df.drop(['machine_status'], axis=1, inplace=True)  # keep the original column for inspection
+y = df['Operation'].values  # take the numpy array directly
+print("Target variable encoded.")
+print("Class distribution in y:", np.bincount(y))
+
+# 4. Feature selection (performed on the scaled data)
+selector = SelectKBest(score_func=chi2, k=20)
+# chi2 requires non-negative features, which MinMaxScaler guarantees
+x_new = selector.fit_transform(x_scaled, y)
+selected_features_indices = selector.get_support(indices=True)
+selected_features_names = x_scaled.columns[selected_features_indices]
+print(f"Selected {len(selected_features_names)} features:", selected_features_names.tolist())
+
+# 5. Build the time-series inputs (sliding windows still need to be created)
+def create_sequences(data, target, time_steps=24):
+    X, y_seq = [], []
+    print(f"Creating sequences with time_steps={time_steps}...")
+    for i in range(len(data) - time_steps):
+        X.append(data[i:i + time_steps, :])
+        # the target is the status of the point time_steps ahead
+        y_seq.append(target[i + time_steps])
+    print(f"Finished creating sequences. X shape: {np.array(X).shape}, y shape: {np.array(y_seq).shape}")
+    return np.array(X), np.array(y_seq)
+
+time_steps = 24  # size of the time window
+X_seq, y_seq = create_sequences(x_new, y, time_steps=time_steps)
+
+# *** Important: reshape the data for XGBoost ***
+# Convert (n_samples, time_steps, n_features) into (n_samples, time_steps * n_features)
+n_samples, _, n_features = X_seq.shape
+X_reshaped = X_seq.reshape(n_samples, time_steps * n_features)
+print(f"Reshaped X for XGBoost. New shape: {X_reshaped.shape}")
+
+# 6. Split the dataset (using the reshaped X and the matching y)
+X_train, X_test, y_train, y_test = train_test_split(
+    X_reshaped, y_seq, test_size=0.2, random_state=42, stratify=y_seq  # stratify preserves the class ratios
+)
+print(f"Dataset split. Train shape: {X_train.shape}, Test shape: {X_test.shape}")
+print("Class distribution in y_train:", np.bincount(y_train))
+print("Class distribution in y_test:", np.bincount(y_test))
+
+
+# Compute sample weights to handle class imbalance (optional but recommended)
+# Use scikit-learn's utility function to compute the weights
+sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)
+print("Sample weights computed for training.")
+
+# 7. Initialize the XGBoost model
+print("Initializing XGBoost Classifier...")
+# For GPU acceleration with a GPU-enabled XGBoost build, add tree_method='gpu_hist'
+# (XGBoost >= 2.0 uses device='cuda' with tree_method='hist' instead)
+model = xgb.XGBClassifier(
+    objective='multi:softprob',       # output a probability for each class
+    num_class=len(np.unique(y_seq)),  # number of classes
+    eval_metric='mlogloss',           # multi-class log loss
+    use_label_encoder=False,          # silences a warning on XGBoost 1.x; deprecated in newer releases
+    random_state=42,
+    n_estimators=100,                 # number of trees (tunable)
+    learning_rate=0.1,                # learning rate (tunable)
+    max_depth=5,                      # maximum tree depth (tunable)
+    # tree_method='gpu_hist'          # uncomment to try GPU acceleration
+    # other hyperparameters can be tuned as needed...
+)
+
+# --- PyTorch loss function and optimizer removed ---
+
+# --- PyTorch training loop removed ---
+
+# 8. Train the XGBoost model
+print("Training XGBoost model...")
+# Early stopping against an eval_set helps prevent overfitting; training is kept simple here
+# (a sketch of the early-stopping variant follows after this patch)
+# eval_set = [(X_test, y_test)]
+# model.fit(X_train, y_train, sample_weight=sample_weights, eval_set=eval_set, early_stopping_rounds=10, verbose=True)
+model.fit(X_train, y_train, sample_weight=sample_weights, verbose=True)  # train with the sample weights
+print("XGBoost training finished.")
+
+
+# 9. Evaluate the model
+print("Evaluating XGBoost model...")
+y_pred = model.predict(X_test)  # predict class labels directly
+
+# Accuracy
+accuracy = accuracy_score(y_test, y_pred)
+print(f"Test Accuracy: {accuracy * 100:.2f}%")
+
+# Detailed classification report
+print("\nClassification Report:")
+print(classification_report(y_test, y_pred, target_names=['BROKEN', 'NORMAL', 'RECOVERING']))  # names follow label order 0/1/2
+
+# Confusion matrix
+print("\nConfusion Matrix:")
+cm = confusion_matrix(y_test, y_pred)
+print(cm)
+
+# Visualize the confusion matrix
+plt.figure(figsize=(8, 6))
+sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
+            xticklabels=['BROKEN', 'NORMAL', 'RECOVERING'],  # matches the choices encoding
+            yticklabels=['BROKEN', 'NORMAL', 'RECOVERING'])
+plt.title("XGBoost Confusion Matrix")
+plt.xlabel("Predicted Label")
+plt.ylabel("True Label")
+plt.show()
\ No newline at end of file
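
Note: the commented-out fit call in the patch hints at early stopping. Below is a
minimal sketch of that variant, assuming XGBoost >= 1.6 (where early_stopping_rounds
is accepted as a constructor parameter) and reusing X_train/y_train produced by the
script; the name es_model and the 500-tree budget are illustrative, not part of the
original patch.

import xgboost as xgb
from sklearn.model_selection import train_test_split

# Hold out part of the training data as a validation set, so the stopping
# point is chosen without touching the final test set.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

es_model = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    eval_metric='mlogloss',
    n_estimators=500,            # upper bound; early stopping picks the best round
    learning_rate=0.1,
    max_depth=5,
    early_stopping_rounds=10,    # stop after 10 rounds without mlogloss improvement
    random_state=42,             # illustrative settings, mirroring the patch
)
es_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
print("Best iteration:", es_model.best_iteration)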