"""Train a random-forest regressor for ``match_num`` and predict with it.

Running this file as a script trains the model from ``tr_user_tj.csv``,
writes the fitted model and its feature-name list to disk with joblib, and
then runs one sample prediction.
"""

import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Input data and output artifacts (paths are relative to the working directory).
DATA_PATH = 'tr_user_tj.csv'
MODEL_PATH = 'match_num_predictor.pkl'
FEATURE_NAMES_PATH = 'feature_names.pkl'

# Column to predict and the feature columns read from the CSV, in model order.
TARGET_COLUMN = 'match_num'
FEATURE_COLUMNS = [
    'star_num', 'sign_num', 'coll_num', 'dna_num', 'task_num', 'word_num',
    'balance_amt', 'earn_amt', 'season_point', 'point', 'star_score',
    'term_amt',
]

# Multipliers applied to selected features. Kept in one place so training and
# prediction always apply exactly the same transformation.
# NOTE(review): tree-based models are generally insensitive to per-feature
# rescaling, so this weighting likely does not change what the forest learns
# -- confirm that it has the intended effect.
FEATURE_WEIGHTS = {
    'dna_num': 2,
    'task_num': 1.5,
    'season_point': 1.5,
}


def create_weighted_features(X):
    """Return a copy of ``X`` with the weighted columns multiplied.

    Each column named in ``FEATURE_WEIGHTS`` is multiplied by its weight;
    every other column is left untouched. The input frame is not modified.

    Args:
        X: DataFrame that contains every column named in ``FEATURE_WEIGHTS``.

    Returns:
        A new DataFrame with the same columns as ``X``.

    Raises:
        KeyError: If a weighted column is absent from ``X``.
    """
    X_weighted = X.copy()
    for column, weight in FEATURE_WEIGHTS.items():
        X_weighted[column] = X_weighted[column] * weight
    return X_weighted


def train_and_save_model():
    """Train the regressor, print test metrics, and save the artifacts.

    Reads ``DATA_PATH``, applies the feature weighting, holds out 20% of the
    rows for evaluation, fits a 100-tree random forest, prints MSE / RMSE /
    MAE / R2 on the held-out rows, and writes the model and the ordered
    feature-name list to ``MODEL_PATH`` and ``FEATURE_NAMES_PATH``.

    Returns:
        A ``(model, feature_names)`` tuple: the fitted
        ``RandomForestRegressor`` and the list of feature column names in
        the order used for training.
    """
    data = pd.read_csv(DATA_PATH, header=0)

    # Separate features from the target variable.
    X = data[FEATURE_COLUMNS]
    y = data[TARGET_COLUMN]

    X_weighted = create_weighted_features(X)

    # Fixed random_state keeps the split and the forest reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X_weighted, y, test_size=0.2, random_state=42
    )

    rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_regressor.fit(X_train, y_train)

    # Evaluate on the held-out rows.
    y_pred = rf_regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("Model Training Results:")
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {rmse}")
    print(f"Mean Absolute Error: {mae}")
    print(f"R² Score: {r2}")

    joblib.dump(rf_regressor, MODEL_PATH)
    print(f"\nModel saved as '{MODEL_PATH}'")

    # Persist the column order so prediction can rebuild the same layout.
    feature_names = list(X_weighted.columns)
    joblib.dump(feature_names, FEATURE_NAMES_PATH)
    print(f"Feature names saved as '{FEATURE_NAMES_PATH}'")

    return rf_regressor, feature_names


def predict_match_num(new_data):
    """Predict ``match_num`` for new data using the saved model.

    Args:
        new_data: Either a dict describing a single sample (feature name ->
            value) or a DataFrame of samples. It is expected to contain all
            feature columns used in training; extra columns are ignored.

    Returns:
        The array returned by ``model.predict``, one value per input row.

    Raises:
        KeyError: If a column named in ``FEATURE_WEIGHTS`` is missing from
            the input.

    Warns:
        UserWarning: If any other training feature is missing from the
            input; such columns are filled with 0 before prediction.
    """
    # joblib.load unpickles arbitrary objects: only load artifacts you trust.
    model = joblib.load(MODEL_PATH)
    feature_names = joblib.load(FEATURE_NAMES_PATH)

    if isinstance(new_data, dict):
        # A dict is treated as one sample (one row).
        df = pd.DataFrame([new_data])
    else:
        df = new_data.copy()

    # Apply the same weighting that was used at training time.
    df_weighted = create_weighted_features(df)

    # Anything still absent here would be silently fabricated as 0 by the
    # reindex below, so make that visible to the caller.
    missing = [name for name in feature_names if name not in df_weighted.columns]
    if missing:
        warnings.warn(
            f"Input is missing feature column(s) {missing}; "
            "they will be filled with 0 for prediction.",
            UserWarning,
            stacklevel=2,
        )

    # Match the training column order; drops extras and fills gaps with 0.
    df_weighted = df_weighted.reindex(columns=feature_names, fill_value=0)

    return model.predict(df_weighted)


def main():
    """Train and save the model, then run one example prediction."""
    train_and_save_model()

    print("\n" + "=" * 50)
    print("预测示例:")

    # Sample input; it must contain every original feature column.
    sample_data = {
        'star_num': 5,
        'sign_num': 10,
        'coll_num': 3,
        'dna_num': 50,
        'task_num': 25,
        'word_num': 100,
        'balance_amt': 1000,
        'earn_amt': 500,
        'season_point': 200,
        'point': 150,
        'star_score': 4.5,
        'term_amt': 300,
    }

    # Top-level script boundary: report any failure instead of a traceback.
    try:
        prediction = predict_match_num(sample_data)
        print(f"输入数据: {sample_data}")
        print(f"预测的match_num值: {prediction[0]:.2f}")
    except Exception as e:
        print(f"预测过程中出现错误: {e}")


if __name__ == '__main__':
    main()