"""
案例:癌症预测案例,目的:演示逻辑回归相关 API。
逻辑回归:概述:它属于分类算法的一种,一般用于二分类。
原理:
1. 基于线性回归,结合特征值,计算出标签值。
2. 把上述算出来的标签值传给激活函数 (Sigmoid),映射成 (0, 1) 区间的概率值。
3. 结合手动设置的阈值,来划分区间即可。
损失函数:先基于极大似然函数计算,然后转成对数似然函数,结合梯度下降,计算最小值即可。
"""
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# --- Breast-cancer demo: load, clean, split, scale, fit, score -------------
# Load the CSV and show its schema before any cleaning.
data = pd.read_csv('./data/breast-cancer-wisconsin.csv')
data.info()

# Cells holding '?' are treated as missing: convert them to NaN first ...
data = data.replace(to_replace='?', value=np.nan)
data.info()

# ... then discard every row that still contains a NaN.
data = data.dropna(axis=0)
data.info()

# Features: every column except the first and the last.
# NOTE(review): presumably the first column is a sample id and the last one is
# the `Class` label - confirm against the CSV header.
x = data.iloc[:, 1:-1]
y = data['Class']
print(x.shape[0], y.shape[0])
print(x.head(10))
print(y.head(10))

# Hold out 20% of the rows for testing; the seed is fixed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=22,
)

# Fit the scaler on the training split only, then apply it to both splits.
transfer = StandardScaler().fit(x_train)
x_train = transfer.transform(x_train)
x_test = transfer.transform(x_test)

# Train the classifier and predict the held-out rows.
estimator = LogisticRegression().fit(x_train, y_train)
y_predict = estimator.predict(x_test)
print(f'预测值:{y_predict}')

# Accuracy computed two ways: via the estimator and via the metrics helper.
score_from_estimator = estimator.score(x_test, y_test)
print(f'准确率:{score_from_estimator}')
score_from_metric = accuracy_score(y_test, y_predict)
print(f'准确率:{score_from_metric}')
"""
案例:演示混淆矩阵和精确率、召回率、F1 值。
混淆矩阵:概述:用来描述真实值和预测值之间关系的。
图解:
预测标签 (正例) | 预测标签 (反例)
真实标签 (正例) | 真正例 (TP) | 伪反例 (FN)
真实标签 (反例) | 伪正例 (FP) | 真反例 (TN)
单词:True: 真,False: 假 (伪),Positive: 正例,Negative: 反例
结论:
1. 默认使用分类少的充当正例。
2. 精确率 = 真正例 在 预测正例中的占比,即:tp / (tp + fp)
3. 召回率 = 真正例 在 真实正例中的占比,即:tp / (tp + fn)
4. F1 值 = 2 * (精确率 * 召回率) / (精确率 + 召回率)
"""
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
# --- Confusion-matrix demo on ten hand-written labels ----------------------
# Ground truth (the variable is called y_train but is used as the true labels):
# six malignant samples followed by four benign ones.
y_train = ['恶性'] * 6 + ['良性'] * 4
# Model A: three malignant predictions, then seven benign ones.
y_pred_A = ['恶性'] * 3 + ['良性'] * 7
# Model B: malignant everywhere except the seventh sample.
y_pred_B = ['恶性'] * 6 + ['良性'] + ['恶性'] * 3

# Row/column order for the matrices: the positive class (malignant) first.
label = ['恶性', '良性']
df_label = ['恶性 (正例)', '良性 (反例)']

# Raw matrices plus labelled DataFrame views of them.
cm_A = confusion_matrix(y_train, y_pred_A, labels=label)
cm_B = confusion_matrix(y_train, y_pred_B, labels=label)
df_A = pd.DataFrame(data=cm_A, index=df_label, columns=df_label)
df_B = pd.DataFrame(data=cm_B, index=df_label, columns=df_label)

print(f'混淆矩阵 A:\n {cm_A}')
print(f'混淆矩阵 A 的 DataFrame 对象形式:\n {df_A}')
print(f'混淆矩阵 B:\n {cm_B}')
print(f'混淆矩阵 B 的 DataFrame 对象形式:\n {df_B}')

# Precision / recall / F1 for model A, with malignant as the positive class.
precision_A = precision_score(y_train, y_pred_A, pos_label='恶性')
print(f'模型 A 精确率:{precision_A}')
recall_A = recall_score(y_train, y_pred_A, pos_label='恶性')
print(f'模型 A 召回率:{recall_A}')
f1_A = f1_score(y_train, y_pred_A, pos_label='恶性')
print(f'模型 A F1 值:{f1_A}')

# The same three metrics for model B.
precision_B = precision_score(y_train, y_pred_B, pos_label='恶性')
print(f'模型 B 精确率:{precision_B}')
recall_B = recall_score(y_train, y_pred_B, pos_label='恶性')
print(f'模型 B 召回率:{recall_B}')
f1_B = f1_score(y_train, y_pred_B, pos_label='恶性')
print(f'模型 B F1 值:{f1_B}')
"""
案例:电信客户流失分析。
目的:
1. 演示逻辑回归的相关操作,主要是:二分类 (流失,不流失)
2. 演示逻辑回归的评估操作,主要是:混淆矩阵、准确率、精确率、召回率、F1 值、ROC 曲线、AUC 值、分类评估报告
"""
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
def dm01_数据预处理():
    """Walk through the churn preprocessing steps, printing the table after each.

    Reads ``./data/churn.csv``, one-hot encodes it with ``pd.get_dummies``,
    drops the ``gender_Male`` and ``Churn_No`` indicator columns, renames
    ``Churn_Yes`` to ``flag`` and finally prints the ``flag`` value counts.
    The function only prints; it returns ``None``.
    """
    raw = pd.read_csv('./data/churn.csv')
    raw.info()

    # One-hot encode the table.
    encoded = pd.get_dummies(raw)
    encoded.info()
    print(encoded.head(10))

    # NOTE(review): presumably each dropped indicator is redundant with its
    # sibling that stays in the table - confirm against the CSV.
    trimmed = encoded.drop(columns=['gender_Male', 'Churn_No'])
    print(trimmed.head(10))

    # The remaining churn indicator becomes the label column.
    labelled = trimmed.rename(columns={'Churn_Yes': 'flag'})
    print(labelled.head(10))
    print(labelled['flag'].value_counts())
def dm02_会员流失可视化情况():
    """Show a count plot of ``Contract_Month`` split by the churn ``flag``.

    Reads ``./data/churn.csv``, one-hot encodes it, drops the ``gender_Male``
    and ``Churn_No`` indicators, renames ``Churn_Yes`` to ``flag``, prints the
    ``flag`` value counts and the column index, then draws the plot and opens
    the matplotlib window. Returns ``None``.
    """
    data = pd.read_csv('./data/churn.csv')
    data = pd.get_dummies(data)
    data = data.drop(columns=['gender_Male', 'Churn_No'])
    data = data.rename(columns={'Churn_Yes': 'flag'})
    print(data.flag.value_counts())
    print(data.columns)
    # Pass the frame by keyword: `data` is only the first positional parameter
    # in newer seaborn releases, so a positional frame is not portable across
    # versions, whereas `data=` is accepted everywhere.
    sns.countplot(data=data, x='Contract_Month', hue='flag')
    plt.show()
def dm03_逻辑回归模型训练评估():
    """Train a logistic-regression churn model and print its evaluation metrics.

    Reads ``./data/churn.csv``, one-hot encodes it, drops the ``gender_Male``
    and ``Churn_No`` indicators and renames ``Churn_Yes`` to ``flag`` (the
    label). The model uses three feature columns, an 80/20 train/test split
    with a fixed seed, and prints the predictions, accuracy, precision,
    recall, F1, AUC and the classification report. Returns ``None``.
    """
    data = pd.read_csv('./data/churn.csv')
    data = pd.get_dummies(data)
    data = data.drop(columns=['gender_Male', 'Churn_No'])
    data = data.rename(columns={'Churn_Yes': 'flag'})

    x = data[['Contract_Month', 'PaymentElectronic', 'internet_other']]
    y = data['flag']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=22)

    estimator = LogisticRegression()
    estimator.fit(x_train, y_train)

    y_predict = estimator.predict(x_test)
    # AUC must be computed from continuous scores: with hard 0/1 predictions the
    # ROC curve collapses to a single operating point. Column 1 of
    # predict_proba is the probability of the greater class label, which is the
    # positive (churned) class here.
    y_score = estimator.predict_proba(x_test)[:, 1]

    print(f'预测值为:{y_predict}')
    print(f'准确率:{estimator.score(x_test, y_test)}')
    print(f'精确率:{precision_score(y_test, y_predict)}')
    print(f'召回率:{recall_score(y_test, y_predict)}')
    print(f'F1 值:{f1_score(y_test, y_predict)}')
    # roc_auc_score returns the area under the ROC curve, not the curve itself,
    # so the label says AUC.
    print(f'AUC 值:{roc_auc_score(y_test, y_score)}')
    print(f'分类评估报告:{classification_report(y_test, y_predict)}')
if __name__ == "__main__":
    # Script entry point: only the train-and-evaluate demo is run.
    dm03_逻辑回归模型训练评估()