143 lines
4.1 KiB
Python
143 lines
4.1 KiB
Python
|
import pandas as pd
|
|||
|
import numpy as np
|
|||
|
from sklearn.ensemble import RandomForestRegressor
|
|||
|
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
|
|||
|
from sklearn.model_selection import train_test_split
|
|||
|
import joblib
|
|||
|
|
|||
|
|
|||
|
def create_weighted_features(X, weights=None):
    """Return a copy of ``X`` with selected feature columns scaled by weights.

    Args:
        X: pandas DataFrame holding the raw feature columns. It is copied,
            never modified in place.
        weights: Optional mapping of column name -> multiplier. When omitted,
            the default weighting is used: ``dna_num`` x2, ``task_num`` x1.5
            and ``season_point`` x1.5.

    Returns:
        A new DataFrame with the same columns as ``X`` in which every column
        named in ``weights`` has been multiplied by its factor.

    Raises:
        KeyError: If a column named in ``weights`` is not present in ``X``.
    """
    # NOTE(review): multiplying a feature by a positive constant does not
    # change which splits a tree-based model can pick, so this weighting
    # probably has no effect on a random forest -- confirm it is intended.
    if weights is None:
        weights = {'dna_num': 2, 'task_num': 1.5, 'season_point': 1.5}

    # Fail early with a readable message instead of pandas' bare KeyError.
    missing = [column for column in weights if column not in X.columns]
    if missing:
        raise KeyError(f"Missing columns required for weighting: {missing}")

    X_weighted = X.copy()
    for column, factor in weights.items():
        X_weighted[column] = X_weighted[column] * factor
    return X_weighted
|
|||
|
|
|||
|
|
|||
|
def train_and_save_model(data_path='tr_user_tj.csv',
                         model_path='match_num_predictor.pkl',
                         feature_names_path='feature_names.pkl'):
    """Train a random-forest regressor for ``match_num`` and save it to disk.

    The CSV at ``data_path`` is loaded, the feature columns are passed through
    ``create_weighted_features``, and the rows are split 80/20 into train and
    test sets (``random_state=42``). A 100-tree ``RandomForestRegressor`` is
    fitted on the training part, MSE / RMSE / MAE / R^2 on the test part are
    printed, and the model plus the ordered feature-name list are written
    with ``joblib.dump``.

    Args:
        data_path: CSV file with a header row containing the feature columns
            and the ``match_num`` target column.
        model_path: Destination file for the fitted model.
        feature_names_path: Destination file for the list of feature names,
            saved so prediction can use the same column order.

    Returns:
        Tuple ``(model, feature_names)``: the fitted regressor and the list of
        feature column names in training order.

    Raises:
        KeyError: If the CSV lacks one of the expected columns.
    """
    feature_columns = [
        'star_num', 'sign_num', 'coll_num', 'dna_num', 'task_num', 'word_num',
        'balance_amt', 'earn_amt', 'season_point', 'point', 'star_score',
        'term_amt',
    ]
    target_column = 'match_num'

    # Load the data and separate the features from the target.
    data = pd.read_csv(data_path, header=0)
    X = data[feature_columns]
    y = data[target_column]

    # Apply the feature weighting (returns a copy, so ``data`` is untouched).
    X_weighted = create_weighted_features(X)

    # Hold out 20% of the rows for evaluation; the seed keeps it reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X_weighted, y, test_size=0.2, random_state=42
    )

    # Fit the random forest.
    rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_regressor.fit(X_train, y_train)

    # Evaluate on the held-out rows.
    y_pred = rf_regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("Model Training Results:")
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {rmse}")
    print(f"Mean Absolute Error: {mae}")
    print(f"R² Score: {r2}")

    # Persist the model.
    joblib.dump(rf_regressor, model_path)
    print(f"\nModel saved as '{model_path}'")

    # Persist the feature names so prediction uses the same columns and order.
    feature_names = list(X_weighted.columns)
    joblib.dump(feature_names, feature_names_path)
    print(f"Feature names saved as '{feature_names_path}'")

    return rf_regressor, feature_names
|
|||
|
|
|||
|
|
|||
|
def predict_match_num(new_data, model_path='match_num_predictor.pkl',
                      feature_names_path='feature_names.pkl'):
    """Predict ``match_num`` with the model saved by ``train_and_save_model``.

    Args:
        new_data: A dict (one record) or a pandas DataFrame holding the raw,
            un-weighted feature values. It should contain every feature
            column used in training; extra columns are ignored.
        model_path: File the fitted model is loaded from.
        feature_names_path: File the ordered feature-name list is loaded from.

    Returns:
        The array returned by ``model.predict``, one prediction per input row.

    Warns:
        UserWarning: If ``new_data`` lacks any training feature column. Such
            columns are filled with 0 before predicting.
    """
    import warnings

    # joblib.load unpickles its input: only load files from a trusted source.
    model = joblib.load(model_path)
    feature_names = joblib.load(feature_names_path)

    # A dict is treated as a single row. A DataFrame is used as is, because
    # reindex below returns a new frame and never mutates the caller's data.
    df = pd.DataFrame([new_data]) if isinstance(new_data, dict) else new_data

    # Zero-filled features can distort the prediction, so make them visible.
    missing = [name for name in feature_names if name not in df.columns]
    if missing:
        warnings.warn(
            f"Input is missing feature columns {missing}; filling them with 0.",
            stacklevel=2,
        )

    # Align to the training columns and order BEFORE weighting, so a missing
    # weighted column gets the same 0 fill as any other missing column rather
    # than raising KeyError inside create_weighted_features.
    df = df.reindex(columns=feature_names, fill_value=0)

    # Apply the same feature weighting that was used for training.
    df_weighted = create_weighted_features(df)

    return model.predict(df_weighted)
|
|||
|
|
|||
|
|
|||
|
# Usage example
if __name__ == '__main__':
    # Train the model; this also writes the model and feature list to disk.
    model, features = train_and_save_model()

    # Show how to run a prediction with the saved model.
    banner = "=" * 50
    print("\n" + banner)
    print("预测示例:")

    # One sample record; it must carry every raw feature column.
    sample_data = dict(
        star_num=5,
        sign_num=10,
        coll_num=3,
        dna_num=50,
        task_num=25,
        word_num=100,
        balance_amt=1000,
        earn_amt=500,
        season_point=200,
        point=150,
        star_score=4.5,
        term_amt=300,
    )

    # Run the prediction; any failure is reported instead of crashing the demo.
    try:
        prediction = predict_match_num(sample_data)
        print(f"输入数据: {sample_data}")
        print(f"预测的match_num值: {prediction[0]:.2f}")
    except Exception as e:
        print(f"预测过程中出现错误: {e}")
|