import re
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression as LR, Lasso
import joblib
import seaborn as sns
# Directory where every fitted model is persisted with joblib.
model_save_path = r'./app/dataset/testModel/'
# exist_ok=True creates the directory when missing without the racy
# "check, then create" sequence.
os.makedirs(model_save_path, exist_ok=True)

data = pd.read_csv(r"./app/dataset/ana_result/piaofang_info.csv")
# Keep eight columns by position: the first seven become the features and
# the last one the regression target.
data = data.iloc[:, [2, 3, 4, 5, 7, 9, 10, 11]]
X = data.iloc[:, 0:7]
# Scale the target down by 10000 (the prediction messages below label values
# with "万"), then log1p-transform it; predictions are mapped back with expm1.
y = data.iloc[:, 7] / 10000
y = np.log1p(y)

# 80/20 hold-out split with a fixed seed so runs are reproducible.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=1)

# oof_df is not referenced by the code visible in this file.
oof_df = pd.DataFrame()
# One column of hold-out predictions per base model; the stacking model is
# fitted on this frame.
test_oof_df = pd.DataFrame()
def performance_metric(y_true, y_predict):
    """Return the R2 (coefficient of determination) of predictions.

    Args:
        y_true: Ground-truth target values.
        y_predict: Predicted target values, aligned with ``y_true``.

    Returns:
        The score computed by ``r2_score(y_true, y_predict)``.
    """
    return r2_score(y_true, y_predict)
def fit_dtr_model(X, y):
    """Grid-search the best ``max_depth`` for a decision-tree regressor.

    Depths 1 through 10 are compared with 5-fold cross-validation, scored
    with ``performance_metric`` (R2).

    Args:
        X: Feature matrix.
        y: Target values.

    Returns:
        The ``max_depth`` of the best estimator found by the search.
    """
    search = GridSearchCV(
        DecisionTreeRegressor(random_state=1),
        {'max_depth': list(range(1, 11))},
        scoring=make_scorer(performance_metric),
        cv=KFold(n_splits=5),
    )
    search = search.fit(X, y)
    best_params = search.best_estimator_.get_params()
    return best_params['max_depth']
def fit_decision_tree_model_forcast():
    """Train, evaluate and persist the decision-tree regressor.

    The depth search and the final fit use only the training split, so the
    hold-out rows in ``test_X`` are unseen when the model is scored.
    Previously both used the full ``X, y``, which contains the hold-out rows
    and inflates the reported metrics. The hold-out predictions are stored
    in ``test_oof_df['dtr']`` for the stacking model.

    Returns:
        RMSE on the hold-out split, in the log1p-transformed target space.
    """
    dtr_max_depth = fit_dtr_model(train_X, train_y)
    # Same seed as the estimator used during the grid search, so the final
    # tree matches the configuration that was actually tuned.
    dtr_regressor = DecisionTreeRegressor(max_depth=dtr_max_depth, random_state=1)
    dtr_regressor.fit(train_X, train_y)

    pred_y = dtr_regressor.predict(test_X)
    test_oof_df['dtr'] = pred_y

    r2_score_val = performance_metric(test_y, pred_y)
    rmse_score = np.sqrt(mean_squared_error(test_y, pred_y))
    print('决策树回归模型评价指标为:')
    print("The R2 score is ", r2_score_val)
    print('均方差', rmse_score)

    joblib.dump(dtr_regressor, model_save_path + 'dtr_model.pkl')
    return rmse_score
def fit_lasso_model_forcast():
    """Train, evaluate and persist the Lasso regressor.

    The model is fitted on the training split only, so the hold-out rows in
    ``test_X`` are unseen when it is scored. Previously it was fitted on the
    full ``X, y``, which contains the hold-out rows. The hold-out predictions
    are stored in ``test_oof_df['lasso']`` for the stacking model.

    Returns:
        RMSE on the hold-out split, in the log1p-transformed target space.
    """
    lasso_regressor = Lasso()
    lasso_regressor.fit(train_X, train_y)

    pred_y = lasso_regressor.predict(test_X)
    test_oof_df['lasso'] = pred_y

    r2_score_val = performance_metric(test_y, pred_y)
    rmse_score = np.sqrt(mean_squared_error(test_y, pred_y))
    print('Lasso 回归模型评价指标为:')
    print("The R2 score is ", r2_score_val)
    print('均方差', rmse_score)

    joblib.dump(lasso_regressor, model_save_path + 'lasso_model.pkl')
    return rmse_score
def fit_random_forest_regression_model():
    """Train, evaluate and persist the random-forest regressor.

    The model is fitted on the training split only, so the hold-out rows in
    ``test_X`` are unseen when it is scored. Previously it was fitted on the
    full ``X, y``, which contains the hold-out rows. The hold-out predictions
    are stored in ``test_oof_df['rf']`` for the stacking model.

    Returns:
        RMSE on the hold-out split, in the log1p-transformed target space.
    """
    rf_model = RandomForestRegressor()
    rf_model.fit(train_X, train_y)

    pred_y = rf_model.predict(test_X)
    test_oof_df['rf'] = pred_y

    # R2 is not symmetric: the ground truth must be the first argument.
    r2_score_val = performance_metric(test_y, pred_y)
    rmse_score = np.sqrt(mean_squared_error(test_y, pred_y))
    print('随机森林模型评价指标为:')
    print("The R2 score is ", r2_score_val)
    print('均方差', rmse_score)

    joblib.dump(rf_model, model_save_path + 'rf_model.pkl')
    return rmse_score
def fit_gdbt_model():
    """Train, evaluate and persist the gradient-boosting regressor.

    The model is fitted on the training split only, so the hold-out rows in
    ``test_X`` are unseen when it is scored. Previously it was fitted on the
    full ``X, y``, which contains the hold-out rows. The hold-out predictions
    are stored in ``test_oof_df['gdbt']`` for the stacking model.

    Returns:
        RMSE on the hold-out split, in the log1p-transformed target space.
    """
    gdbt_model = GradientBoostingRegressor()
    gdbt_model.fit(train_X, train_y)

    pred_y = gdbt_model.predict(test_X)
    test_oof_df['gdbt'] = pred_y

    # R2 is not symmetric: the ground truth must be the first argument.
    r2_score_val = performance_metric(test_y, pred_y)
    rmse_score = np.sqrt(mean_squared_error(test_y, pred_y))
    print('GDBT 模型评价指标为:')
    print("The R2 score is ", r2_score_val)
    print('均方差', rmse_score)

    joblib.dump(gdbt_model, model_save_path + 'gdbt_model.pkl')
    return rmse_score
def fit_stacking_model():
    """Fit, evaluate and persist the linear stacking model.

    The stacker is a linear regression over the base models' hold-out
    predictions collected in ``test_oof_df``. The base-model fit functions
    must have run first so that their columns exist; a missing column raises
    ``KeyError``.

    NOTE(review): the stacker is scored on the same rows it was fitted on, so
    the reported metrics are in-sample and optimistic.

    Returns:
        RMSE of the stacker on the hold-out split, in the log1p-transformed
        target space.
    """
    # Pin the feature order to the order forcast_piaofang() returns its
    # predictions in, instead of relying on the order the base models ran in.
    stack_features = test_oof_df[['dtr', 'lasso', 'rf', 'gdbt']]

    lr_model = LR()
    lr_model.fit(stack_features, test_y)
    pred_y = lr_model.predict(stack_features)

    # R2 is not symmetric: the ground truth must be the first argument.
    r2_score_val = performance_metric(test_y, pred_y)
    rmse_score = np.sqrt(mean_squared_error(test_y, pred_y))
    print('Staking 模型评价指标为:')
    print("The R2 score is ", r2_score_val)
    print('均方差', rmse_score)

    joblib.dump(lr_model, model_save_path + 'stacking_model.pkl')
    return rmse_score
def forcast_piaofang(para):
    """Predict with each persisted base model and print the results.

    Args:
        para: Feature rows, in any form accepted by ``pd.DataFrame``. Only
            the prediction for the first row is used.

    Returns:
        A single-row nested list ``[[dtr, lasso, rf, gdbt]]`` holding each
        model's raw prediction for the first row (still in log1p space),
        ready to be passed to the stacking model.
    """
    features = pd.DataFrame(para)
    # (message template, pickle file name), in the order the stacker expects.
    base_models = (
        ("决策树预测票房%s万", 'dtr_model.pkl'),
        ("Lasso 预测票房%s万", 'lasso_model.pkl'),
        ("随机森林预测票房%s万", 'rf_model.pkl'),
        ("GDBT 预测票房%s万", 'gdbt_model.pkl'),
    )

    first_row_preds = []
    for message, file_name in base_models:
        model = joblib.load(model_save_path + file_name)
        first_pred = model.predict(features)[0]
        # Undo the log1p transform only for display.
        print(message % np.expm1(first_pred))
        first_row_preds.append(first_pred)
    return [first_row_preds]
def train_model():
    """Train all base models and the stacking model, then save their RMSEs.

    The base models run first because each one adds its hold-out predictions
    to ``test_oof_df``, which the stacking model is fitted on. The RMSE
    values are written to ``rmse_result.csv`` inside ``model_save_path``.
    """
    dtr_rmse = fit_decision_tree_model_forcast()
    lasso_rmse = fit_lasso_model_forcast()
    rf_rmse = fit_random_forest_regression_model()
    gdbt_rmse = fit_gdbt_model()
    lr_rmse = fit_stacking_model()

    rmse_result = pd.DataFrame(
        {'rmse_score': [dtr_rmse, lasso_rmse, rf_rmse, gdbt_rmse, lr_rmse]},
        index=["决策树", "Lasso", "随机森林", "GDBT", "Stacking"],
    )
    # Write next to the saved models. The previous hard-coded
    # "../dataset/testModel/" path did not match model_save_path and pointed
    # at a directory this script never creates.
    # index=False is kept, so the file has only the rmse_score column, with
    # rows in the order listed above.
    rmse_result.to_csv(
        os.path.join(model_save_path, 'rmse_result.csv'),
        encoding='utf-8',
        index=False,
    )
def test_model():
    """Run the saved models on one hard-coded sample and print the results.

    Returns:
        The stacking model's prediction for the sample, mapped back from
        log1p space with ``expm1`` and rounded to two decimals.
    """
    sample_features = pd.DataFrame([[2022, 2, 8.4, 3, 1, 50, 173]])
    actual_value = 8394962 / 10000
    print("真实票房%s万" % actual_value)

    # Base-model predictions become the stacker's input row.
    base_predictions = forcast_piaofang(sample_features)
    stacker = joblib.load(model_save_path + 'stacking_model.pkl')
    stacked_log_pred = stacker.predict(base_predictions)[0]

    piaofang = round(np.expm1(stacked_log_pred), 2)
    print("Stacking 预测票房%s万" % piaofang)
    return piaofang
if __name__ == '__main__':
    # Train and persist every model, then sanity-check them on one sample.
    train_model()
    piaofang = test_model()
    # ana_columns is neither defined nor imported in the visible part of this
    # file, so calling it unconditionally raises NameError after all the
    # training work has finished. Run it only when it actually exists.
    ana_columns_fn = globals().get('ana_columns')
    if callable(ana_columns_fn):
        ana_columns_fn()