unified_python/回归1.py

143 lines
4.1 KiB
Python
Raw Normal View History

2025-08-13 08:50:32 +08:00
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
import joblib
# Default multipliers applied to the features treated as most important.
_DEFAULT_FEATURE_WEIGHTS = {
    'dna_num': 2,
    'task_num': 1.5,
    'season_point': 1.5,
}


def create_weighted_features(X, weights=None):
    """Return a copy of ``X`` with selected feature columns scaled.

    Args:
        X: DataFrame holding the feature columns. It is not modified.
        weights: Optional mapping of column name -> multiplier. When omitted,
            ``dna_num`` is multiplied by 2 and ``task_num`` / ``season_point``
            by 1.5.

    Returns:
        A new DataFrame with the same columns as ``X`` in which every column
        named in ``weights`` has been multiplied by its weight.

    Raises:
        KeyError: If a column named in ``weights`` is not present in ``X``.
    """
    # NOTE(review): tree-based models split on thresholds, so a constant
    # rescaling of a feature is unlikely to change what the random forest
    # learns -- confirm that this weighting has the intended effect.
    if weights is None:
        weights = _DEFAULT_FEATURE_WEIGHTS

    X_weighted = X.copy()
    for column, factor in weights.items():
        X_weighted[column] = X_weighted[column] * factor
    return X_weighted
def train_and_save_model(data_path='tr_user_tj.csv',
                         model_path='match_num_predictor.pkl',
                         feature_names_path='feature_names.pkl'):
    """Train a random-forest regressor for ``match_num`` and persist it.

    Reads the training CSV, applies ``create_weighted_features`` to the
    feature columns, fits a ``RandomForestRegressor`` on an 80/20
    train/test split, prints evaluation metrics for the held-out part and
    dumps both the fitted model and the ordered list of feature names with
    joblib.

    Args:
        data_path: CSV file with a header row containing the feature columns
            and the ``match_num`` target column.
        model_path: Destination file for the fitted model.
        feature_names_path: Destination file for the list of feature names
            (used at prediction time to keep the column order consistent).

    Returns:
        Tuple ``(model, feature_names)``: the fitted regressor and the list
        of feature column names in training order.
    """
    target_column = 'match_num'
    feature_columns = [
        'star_num', 'sign_num', 'coll_num', 'dna_num', 'task_num',
        'word_num', 'balance_amt', 'earn_amt', 'season_point', 'point',
        'star_score', 'term_amt',
    ]

    # Load the data and separate features from the target.
    data = pd.read_csv(data_path, header=0)
    X = data[feature_columns]
    y = data[target_column]

    # Apply the feature weighting used at both training and prediction time.
    X_weighted = create_weighted_features(X)

    # Hold out 20% of the rows for evaluation; fixed seed for repeatability.
    X_train, X_test, y_train, y_test = train_test_split(
        X_weighted, y, test_size=0.2, random_state=42)

    # Fit the random-forest regressor.
    rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_regressor.fit(X_train, y_train)

    # Evaluate on the held-out rows.
    y_pred = rf_regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print("Model Training Results:")
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {rmse}")
    print(f"Mean Absolute Error: {mae}")
    print(f"R² Score: {r2}")

    # Persist the model.
    joblib.dump(rf_regressor, model_path)
    print(f"\nModel saved as '{model_path}'")

    # Persist the feature names so prediction can reproduce the column order.
    feature_names = list(X_weighted.columns)
    joblib.dump(feature_names, feature_names_path)
    print(f"Feature names saved as '{feature_names_path}'")

    return rf_regressor, feature_names
def predict_match_num(new_data,
                      model_path='match_num_predictor.pkl',
                      feature_names_path='feature_names.pkl'):
    """Predict ``match_num`` with the previously trained and saved model.

    Args:
        new_data: Either a dict describing a single sample (feature name ->
            value) or a DataFrame with one row per sample. Values must be the
            original, unweighted features; the weighting is applied here.
        model_path: File holding the model saved by ``train_and_save_model``.
        feature_names_path: File holding the saved list of feature names.

    Returns:
        The result of ``model.predict`` for the prepared input, i.e. the
        predicted ``match_num`` values (indexable per input row).

    Raises:
        TypeError: If ``new_data`` is neither a dict nor a DataFrame.
    """
    import warnings

    # SECURITY: joblib.load unpickles the file and can execute arbitrary
    # code -- only load artifacts that come from a trusted source.
    model = joblib.load(model_path)
    feature_names = joblib.load(feature_names_path)

    # Normalize the input to a DataFrame.
    if isinstance(new_data, dict):
        df = pd.DataFrame([new_data])
    elif isinstance(new_data, pd.DataFrame):
        df = new_data
    else:
        raise TypeError(
            "new_data must be a dict or a pandas DataFrame, "
            f"got {type(new_data).__name__}"
        )

    # Missing features are filled with 0 (best effort), but make that visible
    # because it can noticeably distort the prediction.
    missing = [name for name in feature_names if name not in df.columns]
    if missing:
        warnings.warn(
            f"Missing features filled with 0: {missing}",
            stacklevel=2,
        )

    # Align columns with the training order BEFORE weighting, so that a
    # missing weighted column is zero-filled like any other missing column
    # instead of raising KeyError. reindex returns a new frame, so the
    # caller's DataFrame is left untouched.
    df_aligned = df.reindex(columns=feature_names, fill_value=0)

    # Apply the same feature weighting that was used for training.
    df_weighted = create_weighted_features(df_aligned)

    prediction = model.predict(df_weighted)
    return prediction
# Usage example: train the model, then run a single sample prediction.
if __name__ == '__main__':
    # Train and persist the model.
    model, features = train_and_save_model()

    # Show how the saved model is used for prediction.
    separator = "=" * 50
    print("\n" + separator)
    print("预测示例:")

    # Sample input; it has to contain every original (unweighted) feature.
    sample_data = dict(
        star_num=5,
        sign_num=10,
        coll_num=3,
        dna_num=50,
        task_num=25,
        word_num=100,
        balance_amt=1000,
        earn_amt=500,
        season_point=200,
        point=150,
        star_score=4.5,
        term_amt=300,
    )

    # Run the prediction and report either the result or the failure.
    try:
        predicted = predict_match_num(sample_data)
        print(f"输入数据: {sample_data}")
        print(f"预测的match_num值: {predicted[0]:.2f}")
    except Exception as err:
        print(f"预测过程中出现错误: {err}")