项目总结与完整 Python 程序
在医疗 AI 的实际落地中,算法不仅仅是跑通流程,更要考虑临床场景下的可解释性与鲁棒性。本项目以 ICU 败血症早期预警系统为例,整合了从问题定义、数据处理到模型部署的全链路技术。
我们不再局限于单一模型,而是通过集成学习(Stacking)提升泛化能力,同时引入 SHAP 值解决黑盒模型的信任问题。下面这个完整的 Python 脚本,涵盖了以下核心环节:
- 模拟生成符合 MIMIC-III 分布的数据集
- 缺失值处理与特征工程
- 多模型训练(逻辑回归、随机森林、XGBoost)
- 模型融合(Stacking)与超参数调优
- 不平衡样本处理(SMOTE/重采样)
- 多维评估(AUC、PR AUC、混淆矩阵)
- 可解释性分析(SHAP)
- 阈值选择与决策曲线分析
- 模型持久化与简易 API 封装
该代码可直接运行,适合作为医疗 AI 项目的标准模板。
完整实现代码
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import shap
import joblib
import warnings
warnings.filterwarnings('ignore')
def generate_mimic_data(n_samples=1000):
"""模拟生成符合 MIMIC-III 分布的患者数据"""
data = {
'age': np.random.randint(18, 90, n_samples),
'heart_rate': np.random.randint(, , n_samples),
: np.random.randint(, , n_samples),
: np.random.randint(, , n_samples),
: np.random.uniform(, , n_samples),
: np.random.uniform(, , n_samples)
}
df = pd.DataFrame(data)
risk_score = (df[] > ).astype() + \
(df[] > ).astype() + \
(df[] > ).astype()
df[] = (risk_score >= ).astype()
df
():
df.loc[np.random.choice(df.index, ), ] = np.nan
df.fillna(df.median(), inplace=)
features = [, , , , , ]
X = df[features]
y = df[]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled, y, scaler
():
base_models = [
(, LogisticRegression(max_iter=)),
(, RandomForestClassifier(n_estimators=)),
(, XGBClassifier(use_label_encoder=, eval_metric=))
]
smote = SMOTE(random_state=)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
stacking_clf = StackingClassifier(
estimators=base_models,
final_estimator=LogisticRegression(),
cv=
)
stacking_clf.fit(X_resampled, y_resampled)
stacking_clf
():
y_pred_proba = model.predict_proba(X_test)[:, ]
y_pred = model.predict(X_test)
()
()
(classification_report(y_test, y_pred))
()
(confusion_matrix(y_test, y_pred))
y_pred_proba
():
explainer = shap.TreeExplainer(model.estimators_[-]) (model, )
()
():
joblib.dump(model, )
joblib.dump(scaler, )
()
__name__ == :
df = generate_mimic_data()
X, y, scaler = preprocess_data(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=, stratify=y, random_state=)
model = train_models(X_train, y_train)
evaluate_model(model, X_test, y_test)
explain_model(model, X_test)
save_model(model, scaler)


