unified_python/回归1.py
2025-08-13 08:50:32 +08:00

143 lines
4.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
import joblib
def create_weighted_features(X: pd.DataFrame, weights=None) -> pd.DataFrame:
    """Return a copy of ``X`` with selected feature columns scaled by a constant.

    Args:
        X: Feature table. It is never modified; scaling is applied to a copy.
        weights: Optional mapping of column name -> multiplier. When omitted,
            the defaults are ``dna_num`` x2, ``task_num`` x1.5 and
            ``season_point`` x1.5.

    Returns:
        A new DataFrame with the same columns as ``X`` in which every column
        named in ``weights`` has been multiplied by its factor.

    Raises:
        KeyError: If ``X`` lacks any column named in ``weights``.
    """
    # NOTE(review): the only model visible in this file is a random forest.
    # Tree splits depend on the ordering of values, so rescaling a feature by
    # a positive constant is not expected to change what such a model learns.
    # Confirm that this weighting is still wanted.
    if weights is None:
        weights = {'dna_num': 2, 'task_num': 1.5, 'season_point': 1.5}

    # Report every missing column at once instead of failing on the first one.
    missing = [column for column in weights if column not in X.columns]
    if missing:
        raise KeyError(
            f"Missing feature column(s) required for weighting: {missing}"
        )

    X_weighted = X.copy()
    for column, factor in weights.items():
        X_weighted[column] = X_weighted[column] * factor
    return X_weighted
def train_and_save_model(data_path='tr_user_tj.csv',
                         model_path='match_num_predictor.pkl',
                         feature_names_path='feature_names.pkl'):
    """Train a random-forest regressor for ``match_num`` and persist it.

    The CSV at ``data_path`` is read, the feature columns are passed through
    ``create_weighted_features``, and the rows are split 80/20 into train and
    test sets (``random_state=42``). A 100-tree ``RandomForestRegressor`` is
    fitted on the training part, MSE / RMSE / MAE / R² on the held-out part
    are printed, and both the fitted model and the ordered list of feature
    names are written with ``joblib.dump``.

    Args:
        data_path: CSV file holding the feature columns and ``match_num``.
        model_path: Destination file for the fitted model.
        feature_names_path: Destination file for the list of feature names.

    Returns:
        A ``(model, feature_names)`` tuple: the fitted regressor and the
        column names, in training order.

    Raises:
        KeyError: If the CSV lacks any of the expected columns.
    """
    target_column = 'match_num'
    feature_columns = [
        'star_num', 'sign_num', 'coll_num', 'dna_num', 'task_num', 'word_num',
        'balance_amt', 'earn_amt', 'season_point', 'point', 'star_score',
        'term_amt',
    ]

    # Load the data and separate the features from the target.
    data = pd.read_csv(data_path, header=0)
    X = data[feature_columns]
    y = data[target_column]

    # Apply the feature weighting (works on a copy, so ``data`` is untouched).
    X_weighted = create_weighted_features(X)

    # Hold out 20% of the rows for evaluation; the seed keeps the split stable.
    X_train, X_test, y_train, y_test = train_test_split(
        X_weighted, y, test_size=0.2, random_state=42
    )

    rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_regressor.fit(X_train, y_train)

    # Evaluate on the held-out rows.
    y_pred = rf_regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print("Model Training Results:")
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {rmse}")
    print(f"Mean Absolute Error: {mae}")
    print(f"R² Score: {r2}")

    # Persist the fitted model.
    joblib.dump(rf_regressor, model_path)
    print(f"\nModel saved as '{model_path}'")

    # Persist the column order so prediction can present features identically.
    feature_names = list(X_weighted.columns)
    joblib.dump(feature_names, feature_names_path)
    print(f"Feature names saved as '{feature_names_path}'")

    return rf_regressor, feature_names
def predict_match_num(new_data, model_path='match_num_predictor.pkl',
                      feature_names_path='feature_names.pkl'):
    """Predict ``match_num`` for new samples using the saved model.

    Args:
        new_data: Either a ``dict`` describing one sample (feature name ->
            value) or a DataFrame with one row per sample. It should contain
            every feature the model was trained on.
        model_path: File holding the fitted model written by training.
        feature_names_path: File holding the ordered list of feature names.

    Returns:
        Whatever ``model.predict`` returns for the prepared feature table
        (for the regressor trained in this file, one value per input row).

    Raises:
        KeyError: If a column needed by ``create_weighted_features`` is absent.

    Warns:
        UserWarning: If any trained feature is absent from ``new_data``; such
            features are filled with 0 before predicting.
    """
    import warnings

    # NOTE: joblib files are pickle-based, and unpickling can execute
    # arbitrary code. Only load artifacts that come from a trusted source.
    model = joblib.load(model_path)
    feature_names = joblib.load(feature_names_path)

    # Normalise the input to a DataFrame; a dict is treated as a single row.
    if isinstance(new_data, dict):
        df = pd.DataFrame([new_data])
    else:
        df = new_data.copy()

    # Apply the same feature weighting that was used during training.
    df_weighted = create_weighted_features(df)

    # Absent features are zero-filled below (the existing behaviour); surface
    # that instead of letting it pass silently, since it can skew predictions.
    missing = [name for name in feature_names if name not in df_weighted.columns]
    if missing:
        warnings.warn(
            f"Input is missing feature(s) {missing}; they will be filled with 0.",
            stacklevel=2,
        )

    # Match the training column order, drop extras and fill absent ones with 0.
    df_weighted = df_weighted.reindex(columns=feature_names, fill_value=0)

    return model.predict(df_weighted)
# Usage example: train the model, then run one sample prediction.
if __name__ == '__main__':
    # Train the model and write it (plus the feature names) to disk.
    model, features = train_and_save_model()

    # Demonstrate how to predict with the saved model.
    print(f"\n{'=' * 50}")
    print("预测示例:")

    # Sample input; every original feature has to be present.
    sample_data = dict(
        star_num=5,
        sign_num=10,
        coll_num=3,
        dna_num=50,
        task_num=25,
        word_num=100,
        balance_amt=1000,
        earn_amt=500,
        season_point=200,
        point=150,
        star_score=4.5,
        term_amt=300,
    )

    # Run the prediction; any failure is reported rather than raised.
    try:
        prediction = predict_match_num(sample_data)
        print(f"输入数据: {sample_data}")
        print(f"预测的match_num值: {prediction[0]:.2f}")
    except Exception as exc:
        print(f"预测过程中出现错误: {exc}")