feat: add SAPS-II eval to Kepler score comparison

This commit is contained in:
2026-05-05 18:35:06 +02:00
parent 935ce9750d
commit df7c0350a3

View File

@@ -308,18 +308,21 @@ def bedside_score(p):
def build_matrix(ids, primitives, cohort): def build_matrix(ids, primitives, cohort):
import numpy as np import numpy as np
X, y, scores, valid = [], [], [], [] X, y, scores, saps, valid = [], [], [], [], []
for sid in ids: for sid in ids:
p = primitives.get(sid) p = primitives.get(sid)
if p is None: continue if p is None: continue
f = formula_features(p) f = formula_features(p)
s, _ = bedside_score(p) s, _ = bedside_score(p)
if f is None or s is None: continue if f is None or s is None: continue
sap = cohort[sid].get("sapsii")
if sap is None: continue
X.append(f) X.append(f)
y.append(int(cohort[sid].get("died") or 0)) y.append(int(cohort[sid].get("died") or 0))
scores.append(s) scores.append(s)
saps.append(float(sap))
valid.append(sid) valid.append(sid)
return np.array(X), np.array(y), np.array(scores), valid return np.array(X), np.array(y), np.array(scores), np.array(saps), valid
# ── Main ──────────────────────────────────────────────────────────────────── # ── Main ────────────────────────────────────────────────────────────────────
@@ -353,8 +356,10 @@ def main():
and formula_features(primitives[s]) is not None] and formula_features(primitives[s]) is not None]
print(f" usable: {len(all_ids):,}") print(f" usable: {len(all_ids):,}")
X_all, y_all, S_all, _ = build_matrix(all_ids, primitives, cohort) X_all, y_all, S_all, SAPS_all, _ = build_matrix(all_ids, primitives, cohort)
print(f" mortality: {100*y_all.mean():.1f}%") print(f" mortality: {100*y_all.mean():.1f}%")
print(f" SAPS-II: mean={SAPS_all.mean():.1f} "
f"min={SAPS_all.min():.0f} max={SAPS_all.max():.0f}")
# ══════════════════════════════════════════════════════════════════════ # ══════════════════════════════════════════════════════════════════════
# [4] 5-fold CV — 5-term formula # [4] 5-fold CV — 5-term formula
@@ -366,34 +371,49 @@ def main():
subj_arr = np.array(subject_ids) subj_arr = np.array(subject_ids)
kf = KFold(n_splits=N_FOLDS, shuffle=False) kf = KFold(n_splits=N_FOLDS, shuffle=False)
fold_aucs, fold_aucs_score, fold_coefs = [], [], [] fold_aucs, fold_aucs_score, fold_aucs_saps, fold_coefs = [], [], [], []
for k, (tr_idx, te_idx) in enumerate(kf.split(subj_arr)): for k, (tr_idx, te_idx) in enumerate(kf.split(subj_arr)):
tr_subs = set(subj_arr[tr_idx].tolist()) tr_subs = set(subj_arr[tr_idx].tolist())
tr_ids = [s for s in all_ids if cohort[s]["subject_id"] in tr_subs] tr_ids = [s for s in all_ids if cohort[s]["subject_id"] in tr_subs]
te_ids = [s for s in all_ids if cohort[s]["subject_id"] not in tr_subs] te_ids = [s for s in all_ids if cohort[s]["subject_id"] not in tr_subs]
X_tr, y_tr, S_tr, _ = build_matrix(tr_ids, primitives, cohort) X_tr, y_tr, S_tr, _, _ = build_matrix(tr_ids, primitives, cohort)
X_te, y_te, S_te, _ = build_matrix(te_ids, primitives, cohort) X_te, y_te, S_te, SAPS_te, _ = build_matrix(te_ids, primitives, cohort)
mu = X_tr.mean(0); sd = X_tr.std(0) + 1e-9 mu = X_tr.mean(0); sd = X_tr.std(0) + 1e-9
lr = LogisticRegression(C=1.0, max_iter=1000, random_state=42) lr = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
lr.fit((X_tr - mu) / sd, y_tr) lr.fit((X_tr - mu) / sd, y_tr)
pred = lr.predict_proba((X_te - mu) / sd)[:, 1] pred = lr.predict_proba((X_te - mu) / sd)[:, 1]
auc_f = roc_auc_score(y_te, pred) auc_f = roc_auc_score(y_te, pred)
auc_s = roc_auc_score(y_te, S_te) # Ordinal score AUROC auc_s = roc_auc_score(y_te, S_te) # Ordinal score AUROC
auc_saps = roc_auc_score(y_te, SAPS_te) # SAPS-II baseline AUROC
raw_beta = lr.coef_[0] / sd raw_beta = lr.coef_[0] / sd
raw_int = lr.intercept_[0] - sum(lr.coef_[0][i]*mu[i]/sd[i] for i in range(len(sd))) raw_int = lr.intercept_[0] - sum(lr.coef_[0][i]*mu[i]/sd[i] for i in range(len(sd)))
fold_aucs.append(auc_f) fold_aucs.append(auc_f)
fold_aucs_score.append(auc_s) fold_aucs_score.append(auc_s)
fold_aucs_saps.append(auc_saps)
fold_coefs.append([raw_int] + list(raw_beta)) fold_coefs.append([raw_int] + list(raw_beta))
print(f" fold {k+1}: formula AUROC={auc_f:.4f} ordinal score AUROC={auc_s:.4f}") print(f" fold {k+1}: formula AUROC={auc_f:.4f} "
f"ordinal AUROC={auc_s:.4f} SAPS-II AUROC={auc_saps:.4f}")
fa = np.array(fold_aucs); fas = np.array(fold_aucs_score) fa = np.array(fold_aucs); fas = np.array(fold_aucs_score)
fsap = np.array(fold_aucs_saps)
print(f"\n 5-term formula CV AUROC: {fa.mean():.4f} ± {fa.std():.4f} " print(f"\n 5-term formula CV AUROC: {fa.mean():.4f} ± {fa.std():.4f} "
f"(range {fa.min():.4f}{fa.max():.4f})") f"(range {fa.min():.4f}{fa.max():.4f})")
print(f" Ordinal score CV AUROC: {fas.mean():.4f} ± {fas.std():.4f} " print(f" Ordinal score CV AUROC: {fas.mean():.4f} ± {fas.std():.4f} "
f"(range {fas.min():.4f}{fas.max():.4f})") f"(range {fas.min():.4f}{fas.max():.4f})")
print(f" SAPS-II CV AUROC: {fsap.mean():.4f} ± {fsap.std():.4f} "
f"(range {fsap.min():.4f}{fsap.max():.4f})")
print(f" Ordinal loss: {fa.mean()-fas.mean():+.4f}") print(f" Ordinal loss: {fa.mean()-fas.mean():+.4f}")
print(f" Δ vs SAPS-II (formula): {fa.mean()-fsap.mean():+.4f}")
print(f" Δ vs SAPS-II (ordinal): {fas.mean()-fsap.mean():+.4f}")
# SAPS-II is a fixed pre-computed score (no parameters fit here), so an
# in-cohort AUROC is not optimistic — report it on the full Kepler cohort
# for a single, directly comparable headline number.
saps_auc_overall = roc_auc_score(y_all, SAPS_all)
print(f"\n SAPS-II AUROC on full Kepler cohort (n={len(y_all):,}): "
f"{saps_auc_overall:.4f}")
fc = np.array(fold_coefs) fc = np.array(fold_coefs)
print(f"\n Coefficient stability (5 terms):") print(f"\n Coefficient stability (5 terms):")
@@ -500,8 +520,8 @@ def main():
tr_subs = set(subs[:n_tr]) tr_subs = set(subs[:n_tr])
tr_ids = [s for s in all_ids if cohort[s]["subject_id"] in tr_subs] tr_ids = [s for s in all_ids if cohort[s]["subject_id"] in tr_subs]
te_ids = [s for s in all_ids if cohort[s]["subject_id"] not in tr_subs] te_ids = [s for s in all_ids if cohort[s]["subject_id"] not in tr_subs]
X_tr, y_tr, _, _ = build_matrix(tr_ids, primitives, cohort) X_tr, y_tr, _, _, _ = build_matrix(tr_ids, primitives, cohort)
X_te, y_te, S_te, _ = build_matrix(te_ids, primitives, cohort) X_te, y_te, S_te, _, _ = build_matrix(te_ids, primitives, cohort)
mu = X_tr.mean(0); sd = X_tr.std(0) + 1e-9 mu = X_tr.mean(0); sd = X_tr.std(0) + 1e-9
lr_cal = LogisticRegression(C=1.0, max_iter=1000, random_state=42) lr_cal = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
lr_cal.fit((X_tr - mu) / sd, y_tr) lr_cal.fit((X_tr - mu) / sd, y_tr)
@@ -543,6 +563,11 @@ def main():
print(f"\n Ordinal bedside score (013 pts):") print(f"\n Ordinal bedside score (013 pts):")
print(f" 5-fold CV AUROC: {fas.mean():.4f} ± {fas.std():.4f}") print(f" 5-fold CV AUROC: {fas.mean():.4f} ± {fas.std():.4f}")
print(f" AUROC loss vs continuous: {fa.mean()-fas.mean():+.4f}") print(f" AUROC loss vs continuous: {fa.mean()-fas.mean():+.4f}")
print(f"\n SAPS-II baseline (same cohort):")
print(f" Overall AUROC: {saps_auc_overall:.4f}")
print(f" 5-fold CV AUROC: {fsap.mean():.4f} ± {fsap.std():.4f}")
print(f" Δ formula SAPS-II: {fa.mean()-fsap.mean():+.4f}")
print(f" Δ ordinal SAPS-II: {fas.mean()-fsap.mean():+.4f}")
print(f"\n Risk bands (ordinal score):") print(f"\n Risk bands (ordinal score):")
for bn in ["low", "mid", "high"]: for bn in ["low", "mid", "high"]:
br = band_results.get(bn, {}) br = band_results.get(bn, {})
@@ -571,6 +596,18 @@ def main():
"score_distribution": score_rows, "score_distribution": score_rows,
"risk_bands": band_results, "risk_bands": band_results,
}, },
"sapsii_baseline": {
"n": int(len(SAPS_all)),
"sapsii_mean": float(SAPS_all.mean()),
"sapsii_min": float(SAPS_all.min()),
"sapsii_max": float(SAPS_all.max()),
"auroc_overall": float(saps_auc_overall),
"cv_auroc_mean": float(fsap.mean()),
"cv_auroc_std": float(fsap.std()),
"cv_auroc_range": [float(fsap.min()), float(fsap.max())],
"delta_formula_minus_sapsii": float(fa.mean() - fsap.mean()),
"delta_ordinal_minus_sapsii": float(fas.mean() - fsap.mean()),
},
} }
with open(OUT_FILE, "w") as f: with open(OUT_FILE, "w") as f:
json.dump(output, f, indent=2, default=str) json.dump(output, f, indent=2, default=str)