feat: add SAPS-II eval to Kepler score comparison
This commit is contained in:
@@ -308,18 +308,21 @@ def bedside_score(p):
|
||||
|
||||
def build_matrix(ids, primitives, cohort):
|
||||
import numpy as np
|
||||
X, y, scores, valid = [], [], [], []
|
||||
X, y, scores, saps, valid = [], [], [], [], []
|
||||
for sid in ids:
|
||||
p = primitives.get(sid)
|
||||
if p is None: continue
|
||||
f = formula_features(p)
|
||||
s, _ = bedside_score(p)
|
||||
if f is None or s is None: continue
|
||||
sap = cohort[sid].get("sapsii")
|
||||
if sap is None: continue
|
||||
X.append(f)
|
||||
y.append(int(cohort[sid].get("died") or 0))
|
||||
scores.append(s)
|
||||
saps.append(float(sap))
|
||||
valid.append(sid)
|
||||
return np.array(X), np.array(y), np.array(scores), valid
|
||||
return np.array(X), np.array(y), np.array(scores), np.array(saps), valid
|
||||
|
||||
|
||||
# ── Main ────────────────────────────────────────────────────────────────────
|
||||
@@ -353,8 +356,10 @@ def main():
|
||||
and formula_features(primitives[s]) is not None]
|
||||
print(f" usable: {len(all_ids):,}")
|
||||
|
||||
X_all, y_all, S_all, _ = build_matrix(all_ids, primitives, cohort)
|
||||
X_all, y_all, S_all, SAPS_all, _ = build_matrix(all_ids, primitives, cohort)
|
||||
print(f" mortality: {100*y_all.mean():.1f}%")
|
||||
print(f" SAPS-II: mean={SAPS_all.mean():.1f} "
|
||||
f"min={SAPS_all.min():.0f} max={SAPS_all.max():.0f}")
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# [4] 5-fold CV — 5-term formula
|
||||
@@ -366,34 +371,49 @@ def main():
|
||||
subj_arr = np.array(subject_ids)
|
||||
kf = KFold(n_splits=N_FOLDS, shuffle=False)
|
||||
|
||||
fold_aucs, fold_aucs_score, fold_coefs = [], [], []
|
||||
fold_aucs, fold_aucs_score, fold_aucs_saps, fold_coefs = [], [], [], []
|
||||
for k, (tr_idx, te_idx) in enumerate(kf.split(subj_arr)):
|
||||
tr_subs = set(subj_arr[tr_idx].tolist())
|
||||
tr_ids = [s for s in all_ids if cohort[s]["subject_id"] in tr_subs]
|
||||
te_ids = [s for s in all_ids if cohort[s]["subject_id"] not in tr_subs]
|
||||
X_tr, y_tr, S_tr, _ = build_matrix(tr_ids, primitives, cohort)
|
||||
X_te, y_te, S_te, _ = build_matrix(te_ids, primitives, cohort)
|
||||
X_tr, y_tr, S_tr, _, _ = build_matrix(tr_ids, primitives, cohort)
|
||||
X_te, y_te, S_te, SAPS_te, _ = build_matrix(te_ids, primitives, cohort)
|
||||
|
||||
mu = X_tr.mean(0); sd = X_tr.std(0) + 1e-9
|
||||
lr = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
|
||||
lr.fit((X_tr - mu) / sd, y_tr)
|
||||
pred = lr.predict_proba((X_te - mu) / sd)[:, 1]
|
||||
auc_f = roc_auc_score(y_te, pred)
|
||||
auc_s = roc_auc_score(y_te, S_te) # Ordinal score AUROC
|
||||
auc_s = roc_auc_score(y_te, S_te) # Ordinal score AUROC
|
||||
auc_saps = roc_auc_score(y_te, SAPS_te) # SAPS-II baseline AUROC
|
||||
|
||||
raw_beta = lr.coef_[0] / sd
|
||||
raw_int = lr.intercept_[0] - sum(lr.coef_[0][i]*mu[i]/sd[i] for i in range(len(sd)))
|
||||
fold_aucs.append(auc_f)
|
||||
fold_aucs_score.append(auc_s)
|
||||
fold_aucs_saps.append(auc_saps)
|
||||
fold_coefs.append([raw_int] + list(raw_beta))
|
||||
print(f" fold {k+1}: formula AUROC={auc_f:.4f} ordinal score AUROC={auc_s:.4f}")
|
||||
print(f" fold {k+1}: formula AUROC={auc_f:.4f} "
|
||||
f"ordinal AUROC={auc_s:.4f} SAPS-II AUROC={auc_saps:.4f}")
|
||||
|
||||
fa = np.array(fold_aucs); fas = np.array(fold_aucs_score)
|
||||
fsap = np.array(fold_aucs_saps)
|
||||
print(f"\n 5-term formula CV AUROC: {fa.mean():.4f} ± {fa.std():.4f} "
|
||||
f"(range {fa.min():.4f}–{fa.max():.4f})")
|
||||
print(f" Ordinal score CV AUROC: {fas.mean():.4f} ± {fas.std():.4f} "
|
||||
f"(range {fas.min():.4f}–{fas.max():.4f})")
|
||||
print(f" SAPS-II CV AUROC: {fsap.mean():.4f} ± {fsap.std():.4f} "
|
||||
f"(range {fsap.min():.4f}–{fsap.max():.4f})")
|
||||
print(f" Ordinal loss: {fa.mean()-fas.mean():+.4f}")
|
||||
print(f" Δ vs SAPS-II (formula): {fa.mean()-fsap.mean():+.4f}")
|
||||
print(f" Δ vs SAPS-II (ordinal): {fas.mean()-fsap.mean():+.4f}")
|
||||
|
||||
# SAPS-II is a fixed pre-computed score (no parameters fit here), so an
|
||||
# in-cohort AUROC is not optimistic — report it on the full Kepler cohort
|
||||
# for a single, directly comparable headline number.
|
||||
saps_auc_overall = roc_auc_score(y_all, SAPS_all)
|
||||
print(f"\n SAPS-II AUROC on full Kepler cohort (n={len(y_all):,}): "
|
||||
f"{saps_auc_overall:.4f}")
|
||||
|
||||
fc = np.array(fold_coefs)
|
||||
print(f"\n Coefficient stability (5 terms):")
|
||||
@@ -500,8 +520,8 @@ def main():
|
||||
tr_subs = set(subs[:n_tr])
|
||||
tr_ids = [s for s in all_ids if cohort[s]["subject_id"] in tr_subs]
|
||||
te_ids = [s for s in all_ids if cohort[s]["subject_id"] not in tr_subs]
|
||||
X_tr, y_tr, _, _ = build_matrix(tr_ids, primitives, cohort)
|
||||
X_te, y_te, S_te, _ = build_matrix(te_ids, primitives, cohort)
|
||||
X_tr, y_tr, _, _, _ = build_matrix(tr_ids, primitives, cohort)
|
||||
X_te, y_te, S_te, _, _ = build_matrix(te_ids, primitives, cohort)
|
||||
mu = X_tr.mean(0); sd = X_tr.std(0) + 1e-9
|
||||
lr_cal = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
|
||||
lr_cal.fit((X_tr - mu) / sd, y_tr)
|
||||
@@ -543,6 +563,11 @@ def main():
|
||||
print(f"\n Ordinal bedside score (0–13 pts):")
|
||||
print(f" 5-fold CV AUROC: {fas.mean():.4f} ± {fas.std():.4f}")
|
||||
print(f" AUROC loss vs continuous: {fa.mean()-fas.mean():+.4f}")
|
||||
print(f"\n SAPS-II baseline (same cohort):")
|
||||
print(f" Overall AUROC: {saps_auc_overall:.4f}")
|
||||
print(f" 5-fold CV AUROC: {fsap.mean():.4f} ± {fsap.std():.4f}")
|
||||
print(f" Δ formula − SAPS-II: {fa.mean()-fsap.mean():+.4f}")
|
||||
print(f" Δ ordinal − SAPS-II: {fas.mean()-fsap.mean():+.4f}")
|
||||
print(f"\n Risk bands (ordinal score):")
|
||||
for bn in ["low", "mid", "high"]:
|
||||
br = band_results.get(bn, {})
|
||||
@@ -571,6 +596,18 @@ def main():
|
||||
"score_distribution": score_rows,
|
||||
"risk_bands": band_results,
|
||||
},
|
||||
"sapsii_baseline": {
|
||||
"n": int(len(SAPS_all)),
|
||||
"sapsii_mean": float(SAPS_all.mean()),
|
||||
"sapsii_min": float(SAPS_all.min()),
|
||||
"sapsii_max": float(SAPS_all.max()),
|
||||
"auroc_overall": float(saps_auc_overall),
|
||||
"cv_auroc_mean": float(fsap.mean()),
|
||||
"cv_auroc_std": float(fsap.std()),
|
||||
"cv_auroc_range": [float(fsap.min()), float(fsap.max())],
|
||||
"delta_formula_minus_sapsii": float(fa.mean() - fsap.mean()),
|
||||
"delta_ordinal_minus_sapsii": float(fas.mean() - fsap.mean()),
|
||||
},
|
||||
}
|
||||
with open(OUT_FILE, "w") as f:
|
||||
json.dump(output, f, indent=2, default=str)
|
||||
|
||||
Reference in New Issue
Block a user