"""Train a random-forest classifier on per-user statistics and report metrics.

Reads a CSV of user statistics, predicts ``match_num`` from the other selected
columns, and prints accuracy, weighted F1, a per-class classification report,
and the model's feature importances.
"""

import numpy as np
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_recall_curve,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

# Default input file, resolved relative to the current working directory.
DATA_PATH = 'tr_user_tj.csv'

# Column the classifier is trained to predict.
TARGET_COLUMN = 'match_num'

# Columns used as model inputs, in the order they are fed to the classifier.
FEATURE_COLUMNS = [
    'star_num',
    'sign_num',
    'coll_num',
    'dna_num',
    'task_num',
    'word_num',
    'balance_amt',
    'earn_amt',
    'season_point',
    'point',
    'star_score',
    'term_amt',
]


def main(data_path: str = DATA_PATH) -> None:
    """Fit a random forest on the CSV at *data_path* and print its evaluation.

    Args:
        data_path: Path to a CSV file with a header row that contains every
            column in ``FEATURE_COLUMNS`` plus ``TARGET_COLUMN``.

    Raises:
        FileNotFoundError: If *data_path* does not exist.
        KeyError: If any required column is missing from the file.
    """
    data = pd.read_csv(data_path, header=0)

    # Separate the features from the target variable.
    features = data[FEATURE_COLUMNS]
    target = data[TARGET_COLUMN]

    # Hold out 20% of the rows for evaluation; the fixed seed keeps the split
    # reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Build and train the random-forest classifier.
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(X_train, y_train)

    # Predict hard labels for the held-out rows. Class probabilities were
    # previously computed here as well but never used, so that call is gone.
    y_pred = rf_classifier.predict(X_test)

    # Compute the evaluation metrics; the F1 score is weighted by class support.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Rank the features by the importance the fitted forest assigned to them.
    feature_importance = pd.DataFrame({
        'feature': features.columns,
        'importance': rf_classifier.feature_importances_,
    }).sort_values('importance', ascending=False)

    print("\nFeature Importance:")
    print(feature_importance)


if __name__ == '__main__':
    main()