# students_git_repo/周家林/XGboost.py

import xgboost as xgb # 导入XGBoost
import pandas as pd
from sklearn import preprocessing
import numpy as np
from sklearn.model_selection import train_test_split
# 移除了 torch 和 torch.nn 相关导入
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report # 增加评估指标
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.utils.class_weight import compute_sample_weight # 用于计算样本权重

# XGBoost can be configured to run on a GPU, but this script keeps to the CPU
# for simplicity.
print("Using CPU for XGBoost (GPU can be configured if needed and available)")

# Load the dataset; the first CSV column is used as the row index.
df = pd.read_csv('sensor.csv', index_col=0)
print("Dataset loaded.")

# Preprocessing: drop three sensor columns, then take positional columns 1..49
# as the feature matrix (column 0 is skipped).
df.drop(columns=['sensor_50', 'sensor_51', 'sensor_15'], inplace=True)
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
# Forward-fill cannot fill a gap at the very start of a column, so back-fill
# afterwards; otherwise leading NaNs would survive scaling and be rejected by
# the chi2 feature selection that follows.
x = df.iloc[:, 1:50].ffill().bfill()

# Scale every feature with MinMaxScaler (default range), keeping column names.
scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(x)
x_scaled = pd.DataFrame(x_scaled, columns=x.columns)
print("Data scaled and NaNs filled.")

# Encode the target column: BROKEN -> 0, NORMAL -> 1, RECOVERING -> 2.
status_codes = {'NORMAL': 1, 'BROKEN': 0, 'RECOVERING': 2}
conditions = [df['machine_status'] == label for label in status_codes]
choices = list(status_codes.values())
# Any status outside the three known labels falls back to 0, the same code
# as BROKEN.
df['Operation'] = np.select(conditions, choices, default=0)
# The original 'machine_status' column is deliberately kept in df for inspection.
y = df['Operation'].to_numpy()  # plain numpy array of class codes
print("Target variable encoded.")
class_counts = np.bincount(y)
print("Class distribution in y:", class_counts)

# Feature selection on the scaled data: keep the 20 features with the highest
# chi2 score against the encoded target.
# chi2 requires non-negative inputs, which the MinMaxScaler output satisfies.
selector = SelectKBest(score_func=chi2, k=20)
selector.fit(x_scaled, y)
x_new = selector.transform(x_scaled)
# Map the selected column positions back to their names for reporting.
selected_features_indices = selector.get_support(indices=True)
selected_features_names = x_scaled.columns[selected_features_indices]
print(f"Selected {len(selected_features_names)} features:", selected_features_names.tolist())

# 3. 构建时序输入数据 (仍需要创建窗口)
def create_sequences(data, target, time_steps=24):
    """Build fixed-length sliding windows and their next-step labels.

    Window ``i`` holds rows ``i .. i + time_steps - 1`` of *data*; its label
    is ``target[i + time_steps]``, i.e. the value for the row immediately
    after the window.

    Args:
        data: 2-D array-like of shape ``(n_rows, n_features)``.
        target: Array-like with at least ``n_rows`` entries.
        time_steps: Number of consecutive rows per window (non-negative).

    Returns:
        Tuple ``(X, y_seq)``. ``X`` has shape
        ``(n_windows, time_steps, n_features)`` and ``y_seq`` has
        ``n_windows`` entries, where
        ``n_windows = max(n_rows - time_steps, 0)``. When the input is too
        short for a single window, correctly shaped empty arrays are returned.

    Raises:
        ValueError: If *time_steps* is negative, *data* is not 2-D, or
            *target* is shorter than *data*.
    """
    if time_steps < 0:
        raise ValueError("time_steps must be non-negative")
    data = np.asarray(data)
    target = np.asarray(target)
    if data.ndim != 2:
        raise ValueError("data must be 2-D (n_rows, n_features)")

    n_rows, n_features = data.shape
    n_windows = max(n_rows - time_steps, 0)
    if n_windows and len(target) < n_rows:
        raise ValueError("target must have at least as many entries as data")

    print(f"Creating sequences with time_steps={time_steps}...")
    # Fill the output one window offset at a time: this takes time_steps
    # vectorised slice copies instead of one Python-level slice per row, and
    # avoids converting a large list of windows to an array more than once.
    X = np.empty((n_windows, time_steps, n_features), dtype=data.dtype)
    for step in range(time_steps):
        X[:, step, :] = data[step:step + n_windows]
    # Label of window i is the target of the row right after the window.
    y_seq = target[time_steps:time_steps + n_windows].copy()
    print(f"Finished creating sequences. X shape: {X.shape}, y shape: {y_seq.shape}")
    return X, y_seq

time_steps = 24  # sliding-window length, in rows
X_seq, y_seq = create_sequences(x_new, y, time_steps=time_steps)

# XGBoost takes 2-D input, so flatten each window:
# (n_samples, time_steps, n_features) -> (n_samples, time_steps * n_features)
n_samples, _, n_features = X_seq.shape
flat_width = time_steps * n_features
X_reshaped = X_seq.reshape((n_samples, flat_width))
print(f"Reshaped X for XGBoost. New shape: {X_reshaped.shape}")

# Hold out 20% of the windows for testing; stratify keeps the class
# proportions the same in both parts.
# NOTE(review): consecutive windows share time_steps - 1 rows, and this split
# is not chronological, so near-duplicate windows can land on both sides of
# the split and inflate the test metrics. Consider a time-ordered split.
split_options = dict(test_size=0.2, random_state=42, stratify=y_seq)
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y_seq, **split_options)
print(f"Dataset split. Train shape: {X_train.shape}, Test shape: {X_test.shape}")
print("Class distribution in y_train:", np.bincount(y_train))
print("Class distribution in y_test:", np.bincount(y_test))


# Per-sample weights from scikit-learn's 'balanced' scheme, used to counter
# the class imbalance during training.
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)
print("Sample weights computed for training.")

# Configure the XGBoost classifier. The tree count, learning rate and depth
# are tunable starting values.
print("Initializing XGBoost Classifier...")
xgb_params = {
    'objective': 'multi:softprob',         # per-class probabilities
    'num_class': np.unique(y_seq).size,    # number of distinct labels
    'eval_metric': 'mlogloss',             # multi-class log loss
    # Kept from the original to avoid a label-encoder warning.
    # NOTE(review): confirm the installed XGBoost version still accepts it.
    'use_label_encoder': False,
    'random_state': 42,
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 5,
}
model = xgb.XGBClassifier(**xgb_params)

# Train with the balanced sample weights. No eval_set or early stopping is
# used here.
print("Training XGBoost model...")
model.fit(X_train, y_train, sample_weight=sample_weights, verbose=True)
print("XGBoost training finished.")


# Evaluate the trained model on the held-out test windows.
print("Evaluating XGBoost model...")
y_pred = model.predict(X_test)  # predicted class codes

# Overall accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Class codes and display names, in the order of the target encoding
# (0 = BROKEN, 1 = NORMAL, 2 = RECOVERING). Passing the codes explicitly keeps
# the report and the matrix at three classes even when a class is missing
# from y_test / y_pred; without `labels`, classification_report raises on a
# target_names length mismatch and the confusion matrix no longer lines up
# with the tick labels below.
class_labels = [0, 1, 2]
class_names = ['BROKEN', 'NORMAL', 'RECOVERING']

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=class_labels, target_names=class_names))

# Confusion matrix: rows are true labels, columns are predicted labels.
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
print(cm)

# Heatmap of the confusion matrix.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title("XGBoost Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()