students_git_repo/周家林/XGboost.py

136 lines
5.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import xgboost as xgb # 导入XGBoost
import pandas as pd
from sklearn import preprocessing
import numpy as np
from sklearn.model_selection import train_test_split
# 移除了 torch 和 torch.nn 相关导入
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report # 增加评估指标
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.utils.class_weight import compute_sample_weight # 用于计算样本权重
# No GPU check is done here: XGBoost can be configured to use a GPU, but in a
# different way than PyTorch, so this script simply runs on the CPU.
print("Using CPU for XGBoost (GPU can be configured if needed and available)")

# 1. Load the dataset; the first CSV column becomes the row index.
df = pd.read_csv('sensor.csv', index_col=0)
print("Dataset loaded.")

# 2. Preprocessing
# Three sensor columns are dropped before the features are selected by position.
df.drop(columns=['sensor_50', 'sensor_51', 'sensor_15'], inplace=True)
# `fillna(method='ffill')` is deprecated in recent pandas, so `ffill()` is
# called directly. The trailing `bfill()` fills rows that precede a column's
# first valid reading, which a forward fill alone would leave as NaN (and the
# chi2 feature selection further down rejects NaN input).
x = df.iloc[:, 1:50].ffill().bfill()
# NOTE(review): the scaler is fitted on every row, before any train/test split
# happens later in the script - confirm this is acceptable for the evaluation.
scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(x)
# Wrap the scaled array in a DataFrame again so the column names are kept.
x_scaled = pd.DataFrame(x_scaled, columns=x.columns)
print("Data scaled and NaNs filled.")
# Encode the target: map each machine_status string to an integer class id
# (BROKEN: 0, NORMAL: 1, RECOVERING: 2).
status = df['machine_status']
label_by_status = {'NORMAL': 1, 'BROKEN': 0, 'RECOVERING': 2}
conditions = [status == name for name in label_by_status]
choices = list(label_by_status.values())
# NOTE(review): rows whose status matches none of the three names fall back to
# 0, i.e. they are counted as BROKEN - confirm that this is intended.
df['Operation'] = np.select(conditions, choices, default=0)
# The original 'machine_status' column is deliberately kept in df for inspection.
y = df['Operation'].values  # plain numpy array of class ids
print("Target variable encoded.")
print("Class distribution in y:", np.bincount(y))
# 4. Feature selection, performed on the scaled features.
# chi2 requires non-negative inputs; the MinMaxScaler-scaled features satisfy that.
# NOTE(review): the selector is fitted on all rows, before the train/test split
# made later in the script - confirm this is acceptable.
selector = SelectKBest(score_func=chi2, k=20)
selector.fit(x_scaled, y)
x_new = selector.transform(x_scaled)
# Positions of the kept columns, then their names for the log line below.
selected_features_indices = np.flatnonzero(selector.get_support())
selected_features_names = x_scaled.columns[selected_features_indices]
print(f"Selected {len(selected_features_names)} features:", selected_features_names.tolist())
# 3. Build the time-series input data (sliding windows still have to be created).
def create_sequences(data, target, time_steps=24):
    """Slice a feature matrix into overlapping windows with look-ahead labels.

    Window ``i`` holds rows ``i`` .. ``i + time_steps - 1`` of ``data`` and is
    labelled with ``target[i + time_steps]``, i.e. the label of the row that
    immediately follows the window.

    Args:
        data: Array-like of shape (n_rows, n_features).
        target: 1-D array-like holding one label per row of ``data``.
        time_steps: Number of consecutive rows in each window.

    Returns:
        A tuple ``(X, y_seq)`` of numpy arrays. ``X`` has shape
        (n_rows - time_steps, time_steps, n_features) and ``y_seq`` has shape
        (n_rows - time_steps,). When ``data`` is too short to form a single
        window, both are empty but ``X`` still has three dimensions, so
        callers can unpack ``X.shape`` safely.

    Raises:
        IndexError: If ``target`` is too short to label every window.
    """
    data = np.asarray(data)
    target = np.asarray(target)
    print(f"Creating sequences with time_steps={time_steps}...")

    n_windows = max(len(data) - time_steps, 0)
    if n_windows and len(target) < n_windows + time_steps:
        raise IndexError(
            f"target has {len(target)} entries but {n_windows + time_steps} are needed"
        )

    # Row indices of every window at once, shape (n_windows, time_steps).
    # Indexing with this grid builds the 3-D result in a single step instead
    # of appending slices to a Python list and converting the list twice.
    row_idx = np.arange(n_windows)[:, None] + np.arange(time_steps)[None, :]
    X = data[row_idx]
    # The label of a window is the row right after it.
    y_seq = target[time_steps:time_steps + n_windows].copy()

    print(f"Finished creating sequences. X shape: {X.shape}, y shape: {y_seq.shape}")
    return X, y_seq
# Window length: how many consecutive rows make up one sample.
time_steps = 24
X_seq, y_seq = create_sequences(x_new, y, time_steps=time_steps)

# *** Important: reshape the data for XGBoost ***
# Each window is flattened into a single row, turning
# (n_samples, time_steps, n_features) into (n_samples, time_steps * n_features).
n_samples, _, n_features = X_seq.shape
flat_shape = (n_samples, time_steps * n_features)
X_reshaped = np.reshape(X_seq, flat_shape)
print(f"Reshaped X for XGBoost. New shape: {X_reshaped.shape}")
# 4. Split into train and test sets, using the flattened X and its labels.
# NOTE(review): consecutive windows overlap in all but one row and this split
# is random, so near-identical windows can end up on both sides - confirm that
# this is acceptable for the reported scores.
split_kwargs = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y_seq,  # keep the class proportions equal in both parts
}
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y_seq, **split_kwargs)
print(f"Dataset split. Train shape: {X_train.shape}, Test shape: {X_test.shape}")
print("Class distribution in y_train:", np.bincount(y_train))
print("Class distribution in y_test:", np.bincount(y_test))

# Per-sample weights to counter class imbalance (optional but recommended),
# computed with scikit-learn's helper on the training labels only.
sample_weights = compute_sample_weight(class_weight="balanced", y=y_train)
print("Sample weights computed for training.")
# Set up the XGBoost model.
print("Initializing XGBoost Classifier...")
# For GPU acceleration (needs an XGBoost build with GPU support), add
# tree_method='gpu_hist' to the parameters below.
# NOTE(review): use_label_encoder was deprecated in later XGBoost releases -
# check it against the installed version.
xgb_params = {
    "objective": "multi:softprob",  # output one probability per class
    "num_class": len(np.unique(y_seq)),  # number of classes
    "eval_metric": "mlogloss",  # multi-class log loss
    "use_label_encoder": False,  # recommended setting, avoids a warning
    "random_state": 42,
    "n_estimators": 100,  # number of trees (tunable)
    "learning_rate": 0.1,  # learning rate (tunable)
    "max_depth": 5,  # maximum tree depth (tunable)
    # Further hyperparameters can be added here as needed.
}
model = xgb.XGBClassifier(**xgb_params)
# (The PyTorch loss function, optimizer and training loop were removed.)
# Train the XGBoost model.
print("Training XGBoost model...")
# Early stopping via an eval_set could help against overfitting; the training
# here is kept simple and does not use it.
fit_kwargs = {
    "sample_weight": sample_weights,  # class-balancing weights
    "verbose": True,
}
model.fit(X_train, y_train, **fit_kwargs)
print("XGBoost training finished.")
# Evaluate the model on the test split.
print("Evaluating XGBoost model...")
y_pred = model.predict(X_test)  # predicted class labels, not probabilities

# Class ids and their display names, in the order used by the target encoding
# (BROKEN: 0, NORMAL: 1, RECOVERING: 2).
class_ids = [0, 1, 2]
class_names = ['BROKEN', 'NORMAL', 'RECOVERING']

# Overall accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Detailed per-class report.
print("\nClassification Report:")
# `labels` pins every name to its class id. Without it scikit-learn derives
# the label set from the data, so the three names would no longer match if a
# class happened to be absent from both y_test and y_pred.
print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

# Confusion matrix, always 3x3 thanks to the explicit label list.
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
print(cm)

# Plot the confusion matrix; tick labels follow the same class order.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title("XGBoost Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()