Python 数据分析：模型评估与选择实战

Python 数据分析：模型评估与选择实战 | 极客日志

sklearn

# 导入相关库
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Read the iris table from a local CSV export and report its shape.
# NOTE(review): this absolute Windows path only exists on the original
# author's machine -- point it at a local copy of iris.csv before running.
iris_csv_path = r'C:\Users\DELL\data-science-learning\seaborn-data\iris.csv'
df = pd.read_csv(iris_csv_path)
print(df.shape)

# Hold out 30% of the rows as the test set; the fixed seed keeps the split
# reproducible from run to run.
train_set, test_set = train_test_split(
    df,
    test_size=0.3,
    random_state=12345,
)
print(train_set.shape, test_set.shape)

# Load the digits dataset bundled with scikit-learn and pull out the feature
# matrix and the target vector.
digits = datasets.load_digits()
features, target = digits.data, digits.target

# The two steps of the model: a standardizer and a logistic-regression
# classifier.
stand = StandardScaler()
logistic = LogisticRegression()

# Chain them so the data is standardized first and then fed to the classifier.
pipeline = make_pipeline(stand, logistic)

# 10-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=1)

# Score the whole pipeline on every fold.
cv_results = cross_val_score(
    pipeline,
    features,
    target,
    cv=kf,
    scoring='accuracy',  # metric computed on each held-out fold
    n_jobs=-1,           # use every available CPU core
)

# Mean accuracy over the 10 folds.
print(cv_results.mean())

from sklearn.model_selection import train_test_split

# Hold out 10% of the samples as a final test set.
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.1,
    random_state=1,
)

# Learn the standardization parameters from the training set only ...
stand.fit(features_train)

# ... and apply that same transformation to both splits.  These arrays are
# meant for fitting / evaluating a final model on the hold-out split; they are
# deliberately NOT used for the cross-validation below.
features_train_std = stand.transform(features_train)
features_test_std = stand.transform(features_test)

pipeline = make_pipeline(stand, logistic)

# Cross-validate on the RAW training features.  The pipeline already contains
# the scaler, so feeding it `features_train_std` would standardize the data a
# second time, on top of statistics that were computed with every validation
# fold included.
cv_results = cross_val_score(
    pipeline,
    features_train,
    target_train,
    cv=kf,
    scoring='accuracy',
    n_jobs=-1,
)

# Mean accuracy over the folds.
print(cv_results.mean())

# 导入相关库
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Build a synthetic regression problem: 100 samples, 3 features (all of them
# informative), a single target, and Gaussian noise with standard deviation 50.
features, target = make_regression(n_samples=100,
                                   n_features=3,
                                   n_informative=3,
                                   n_targets=1,
                                   noise=50,
                                   coef=False,
                                   random_state=1)

# Ordinary least-squares regressor.
ols = LinearRegression()

# List every name accepted by the `scoring=` argument.  The old
# `metrics.SCORERS` dict has been removed from recent scikit-learn releases
# (1.3+) and raises AttributeError there; `get_scorer_names()` is the
# supported replacement.
print(metrics.get_scorer_names())

# Cross-validate with (negated) mean squared error on 5 folds.
print(cross_val_score(ols, features, target, scoring='neg_mean_squared_error', cv=5))

# Cross-validate with R^2, using the default fold count.
print(cross_val_score(ols, features, target, scoring='r2'))

from sklearn.datasets import fetch_california_housing
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split

# Fetch the California housing dataset and separate features from the target.
housing = fetch_california_housing()
features = housing.data
target = housing.target

# Split into training and test sets with a fixed seed.
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    random_state=0,
)

# Baseline: a regressor that always predicts the mean of the training target.
dummy = DummyRegressor(strategy='mean')
dummy.fit(features_train, target_train)

# Score the baseline on the held-out test set.
dummy_score = dummy.score(features_test, target_test)
print(f"Dummy Score: {dummy_score}")

# Now train a real model on the same split for comparison.
from sklearn.linear_model import LinearRegression
ols = LinearRegression().fit(features_train, target_train)

ols_score = ols.score(features_test, target_test)
print(f"Linear Regression Score: {ols_score}")

# 导入相关库
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import pandas as pd

# Load the iris data: features, integer class labels and the class names.
iris = datasets.load_iris()
features, target, class_names = iris.data, iris.target, iris.target_names

# Train/test split with a fixed seed.
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    random_state=1,
)

# Fit a logistic-regression classifier, then predict the test-set labels.
classifier = LogisticRegression()
classifier.fit(features_train, target_train)
target_predicted = classifier.predict(features_test)

# Confusion matrix wrapped in a DataFrame so the heatmap axes carry the class
# names.
matrix = confusion_matrix(target_test, target_predicted)
df = pd.DataFrame(matrix, index=class_names, columns=class_names)

# Annotated heatmap without a colour bar.
sns.heatmap(df, annot=True, cbar=None, cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predict Class')
plt.ylabel('True Class')
plt.show()

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification

# Synthetic binary classification problem: 1000 samples, 3 features, all of
# them informative and none redundant.
X, y = make_classification(
    random_state=1,
    n_samples=1000,
    n_features=3,
    n_informative=3,
    n_redundant=0,
    n_classes=2,
)

# Logistic-regression classifier to evaluate.
logit = LogisticRegression()

# Cross-validate the same model under accuracy, F1 and precision, printing the
# per-fold scores for each metric in turn.
for scoring_name in ('accuracy', 'f1', 'precision'):
    print(cross_val_score(logit, X, y, scoring=scoring_name))

# 导入相关库
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split

# Synthetic binary classification problem: 1000 samples, 10 features of which
# 3 are informative.
features, target = make_classification(
    n_samples=1000,
    n_features=10,
    n_classes=2,
    n_informative=3,
    random_state=3,
)

# Hold out 10% of the samples for testing.
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.1,
    random_state=1,
)

# Fit the classifier on the training split.
logit = LogisticRegression()
logit.fit(features_train, target_train)

# Column 1 of predict_proba holds the predicted probability of class 1.
positive_class_column = 1
target_probabilities = logit.predict_proba(features_test)[:, positive_class_column]

# False/true positive rates for every decision threshold.
false_positive_rate, true_positive_rate, thresholds = roc_curve(
    target_test,
    target_probabilities,
)

# Draw the ROC curve plus three reference lines.
plt.plot(false_positive_rate, true_positive_rate)
plt.plot([0, 1], linestyle='--')       # dashed diagonal from (0, 0) to (1, 1)
plt.plot([0, 0], [1, 0], color='.7')   # grey vertical line at x = 0
plt.plot([1, 1], color='.7')           # grey horizontal line at y = 1
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

# Area under the ROC curve.
auc_value = roc_auc_score(target_test, target_probabilities)
print(f"AUC Value: {auc_value}")

from sklearn.datasets import load_iris
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split

# Load the iris data and separate features from labels.
iris = load_iris()
features = iris.data
target = iris.target

# Train/test split with a fixed seed.
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    random_state=0,
)

# Baseline: a classifier that guesses among the classes uniformly at random.
dummy = DummyClassifier(strategy='uniform', random_state=1)
dummy.fit(features_train, target_train)
print(f"Dummy Classifier Score: {dummy.score(features_test, target_test)}")

# Now build a real model and score it on the same test split.
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(features_train, target_train)
print(f"Random Forest Score: {classifier.score(features_test, target_test)}")

import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve

# Load the digits dataset and separate features from labels.
digits = load_digits()
features = digits.data
target = digits.target

# For 50 training-set sizes between 1% and 100% of the data, compute the
# training and validation accuracy with 10-fold cross-validation.
train_sizes, train_scores, test_scores = learning_curve(
    RandomForestClassifier(),
    features,
    target,
    cv=10,
    scoring='accuracy',
    n_jobs=-1,
    train_sizes=np.linspace(0.01, 1, 50),
)

# Mean and standard deviation across the folds (axis 1) for each size.
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)

test_mean = test_scores.mean(axis=1)
test_std = test_scores.std(axis=1)

# Mean curves: dashed for training, solid for cross-validation.
plt.plot(train_sizes, train_mean, '--', color='black', label='Training score')
plt.plot(train_sizes, test_mean, color='black', label='Cross-validation score')

# Shade one standard deviation either side of each mean curve.
for band_mean, band_std in ((train_mean, train_std), (test_mean, test_std)):
    plt.fill_between(train_sizes, band_mean - band_std,
                     band_mean + band_std, color='#DDDDDD')

plt.title('Learning Curve')
plt.xlabel('Training Set Size')
plt.ylabel('Accuracy Score')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

from sklearn.metrics import classification_report

# Load the iris data: features, integer class labels and the class names.
iris = datasets.load_iris()
features, target = iris.data, iris.target
class_names = iris.target_names

# Train/test split with a fixed seed.
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    random_state=1,
)

# Fit a logistic-regression classifier and predict the test-set labels.
classifier = LogisticRegression()
model = classifier.fit(features_train, target_train)
target_predicted = model.predict(features_test)

# Build the per-class performance report, labelled with the class names,
# and print it.
report = classification_report(
    target_test,
    target_predicted,
    target_names=class_names,
)
print(report)

Python 数据分析：模型评估与选择实战

Python 数据分析：模型评估与选择实战

引言

1. 数据集划分

2. 交叉验证模型

3. 回归模型评估指标

4. 创建一个基准回归模型

5. 混淆矩阵

6. 分类评估指标

7. ROC 和 AUC

7.1 ROC 曲线

7.2 AUC 值

8. 创建一个基准分类模型

9. 可视化训练集规模的影响

10. 生成评估指标报告

总结

更多推荐文章

相关免费在线工具

Python 数据分析：模型评估与选择实战

Python 数据分析：模型评估与选择实战

引言

1. 数据集划分

2. 交叉验证模型

3. 回归模型评估指标

4. 创建一个基准回归模型

5. 混淆矩阵

6. 分类评估指标

7. ROC 和 AUC

7.1 ROC 曲线

7.2 AUC 值

8. 创建一个基准分类模型

9. 可视化训练集规模的影响

10. 生成评估指标报告

总结

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具