feat: add SAPS-II eval to Kepler score comparison
This commit is contained in:
@@ -308,18 +308,21 @@ def bedside_score(p):
|
|||||||
|
|
||||||
def build_matrix(ids, primitives, cohort):
|
def build_matrix(ids, primitives, cohort):
|
||||||
import numpy as np
|
import numpy as np
|
||||||
X, y, scores, valid = [], [], [], []
|
X, y, scores, saps, valid = [], [], [], [], []
|
||||||
for sid in ids:
|
for sid in ids:
|
||||||
p = primitives.get(sid)
|
p = primitives.get(sid)
|
||||||
if p is None: continue
|
if p is None: continue
|
||||||
f = formula_features(p)
|
f = formula_features(p)
|
||||||
s, _ = bedside_score(p)
|
s, _ = bedside_score(p)
|
||||||
if f is None or s is None: continue
|
if f is None or s is None: continue
|
||||||
|
sap = cohort[sid].get("sapsii")
|
||||||
|
if sap is None: continue
|
||||||
X.append(f)
|
X.append(f)
|
||||||
y.append(int(cohort[sid].get("died") or 0))
|
y.append(int(cohort[sid].get("died") or 0))
|
||||||
scores.append(s)
|
scores.append(s)
|
||||||
|
saps.append(float(sap))
|
||||||
valid.append(sid)
|
valid.append(sid)
|
||||||
return np.array(X), np.array(y), np.array(scores), valid
|
return np.array(X), np.array(y), np.array(scores), np.array(saps), valid
|
||||||
|
|
||||||
|
|
||||||
# ── Main ────────────────────────────────────────────────────────────────────
|
# ── Main ────────────────────────────────────────────────────────────────────
|
||||||
@@ -353,8 +356,10 @@ def main():
|
|||||||
and formula_features(primitives[s]) is not None]
|
and formula_features(primitives[s]) is not None]
|
||||||
print(f" usable: {len(all_ids):,}")
|
print(f" usable: {len(all_ids):,}")
|
||||||
|
|
||||||
X_all, y_all, S_all, _ = build_matrix(all_ids, primitives, cohort)
|
X_all, y_all, S_all, SAPS_all, _ = build_matrix(all_ids, primitives, cohort)
|
||||||
print(f" mortality: {100*y_all.mean():.1f}%")
|
print(f" mortality: {100*y_all.mean():.1f}%")
|
||||||
|
print(f" SAPS-II: mean={SAPS_all.mean():.1f} "
|
||||||
|
f"min={SAPS_all.min():.0f} max={SAPS_all.max():.0f}")
|
||||||
|
|
||||||
# ══════════════════════════════════════════════════════════════════════
|
# ══════════════════════════════════════════════════════════════════════
|
||||||
# [4] 5-fold CV — 5-term formula
|
# [4] 5-fold CV — 5-term formula
|
||||||
@@ -366,34 +371,49 @@ def main():
|
|||||||
subj_arr = np.array(subject_ids)
|
subj_arr = np.array(subject_ids)
|
||||||
kf = KFold(n_splits=N_FOLDS, shuffle=False)
|
kf = KFold(n_splits=N_FOLDS, shuffle=False)
|
||||||
|
|
||||||
fold_aucs, fold_aucs_score, fold_coefs = [], [], []
|
fold_aucs, fold_aucs_score, fold_aucs_saps, fold_coefs = [], [], [], []
|
||||||
for k, (tr_idx, te_idx) in enumerate(kf.split(subj_arr)):
|
for k, (tr_idx, te_idx) in enumerate(kf.split(subj_arr)):
|
||||||
tr_subs = set(subj_arr[tr_idx].tolist())
|
tr_subs = set(subj_arr[tr_idx].tolist())
|
||||||
tr_ids = [s for s in all_ids if cohort[s]["subject_id"] in tr_subs]
|
tr_ids = [s for s in all_ids if cohort[s]["subject_id"] in tr_subs]
|
||||||
te_ids = [s for s in all_ids if cohort[s]["subject_id"] not in tr_subs]
|
te_ids = [s for s in all_ids if cohort[s]["subject_id"] not in tr_subs]
|
||||||
X_tr, y_tr, S_tr, _ = build_matrix(tr_ids, primitives, cohort)
|
X_tr, y_tr, S_tr, _, _ = build_matrix(tr_ids, primitives, cohort)
|
||||||
X_te, y_te, S_te, _ = build_matrix(te_ids, primitives, cohort)
|
X_te, y_te, S_te, SAPS_te, _ = build_matrix(te_ids, primitives, cohort)
|
||||||
|
|
||||||
mu = X_tr.mean(0); sd = X_tr.std(0) + 1e-9
|
mu = X_tr.mean(0); sd = X_tr.std(0) + 1e-9
|
||||||
lr = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
|
lr = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
|
||||||
lr.fit((X_tr - mu) / sd, y_tr)
|
lr.fit((X_tr - mu) / sd, y_tr)
|
||||||
pred = lr.predict_proba((X_te - mu) / sd)[:, 1]
|
pred = lr.predict_proba((X_te - mu) / sd)[:, 1]
|
||||||
auc_f = roc_auc_score(y_te, pred)
|
auc_f = roc_auc_score(y_te, pred)
|
||||||
auc_s = roc_auc_score(y_te, S_te) # Ordinal score AUROC
|
auc_s = roc_auc_score(y_te, S_te) # Ordinal score AUROC
|
||||||
|
auc_saps = roc_auc_score(y_te, SAPS_te) # SAPS-II baseline AUROC
|
||||||
|
|
||||||
raw_beta = lr.coef_[0] / sd
|
raw_beta = lr.coef_[0] / sd
|
||||||
raw_int = lr.intercept_[0] - sum(lr.coef_[0][i]*mu[i]/sd[i] for i in range(len(sd)))
|
raw_int = lr.intercept_[0] - sum(lr.coef_[0][i]*mu[i]/sd[i] for i in range(len(sd)))
|
||||||
fold_aucs.append(auc_f)
|
fold_aucs.append(auc_f)
|
||||||
fold_aucs_score.append(auc_s)
|
fold_aucs_score.append(auc_s)
|
||||||
|
fold_aucs_saps.append(auc_saps)
|
||||||
fold_coefs.append([raw_int] + list(raw_beta))
|
fold_coefs.append([raw_int] + list(raw_beta))
|
||||||
print(f" fold {k+1}: formula AUROC={auc_f:.4f} ordinal score AUROC={auc_s:.4f}")
|
print(f" fold {k+1}: formula AUROC={auc_f:.4f} "
|
||||||
|
f"ordinal AUROC={auc_s:.4f} SAPS-II AUROC={auc_saps:.4f}")
|
||||||
|
|
||||||
fa = np.array(fold_aucs); fas = np.array(fold_aucs_score)
|
fa = np.array(fold_aucs); fas = np.array(fold_aucs_score)
|
||||||
|
fsap = np.array(fold_aucs_saps)
|
||||||
print(f"\n 5-term formula CV AUROC: {fa.mean():.4f} ± {fa.std():.4f} "
|
print(f"\n 5-term formula CV AUROC: {fa.mean():.4f} ± {fa.std():.4f} "
|
||||||
f"(range {fa.min():.4f}–{fa.max():.4f})")
|
f"(range {fa.min():.4f}–{fa.max():.4f})")
|
||||||
print(f" Ordinal score CV AUROC: {fas.mean():.4f} ± {fas.std():.4f} "
|
print(f" Ordinal score CV AUROC: {fas.mean():.4f} ± {fas.std():.4f} "
|
||||||
f"(range {fas.min():.4f}–{fas.max():.4f})")
|
f"(range {fas.min():.4f}–{fas.max():.4f})")
|
||||||
|
print(f" SAPS-II CV AUROC: {fsap.mean():.4f} ± {fsap.std():.4f} "
|
||||||
|
f"(range {fsap.min():.4f}–{fsap.max():.4f})")
|
||||||
print(f" Ordinal loss: {fa.mean()-fas.mean():+.4f}")
|
print(f" Ordinal loss: {fa.mean()-fas.mean():+.4f}")
|
||||||
|
print(f" Δ vs SAPS-II (formula): {fa.mean()-fsap.mean():+.4f}")
|
||||||
|
print(f" Δ vs SAPS-II (ordinal): {fas.mean()-fsap.mean():+.4f}")
|
||||||
|
|
||||||
|
# SAPS-II is a fixed pre-computed score (no parameters fit here), so an
|
||||||
|
# in-cohort AUROC is not optimistic — report it on the full Kepler cohort
|
||||||
|
# for a single, directly comparable headline number.
|
||||||
|
saps_auc_overall = roc_auc_score(y_all, SAPS_all)
|
||||||
|
print(f"\n SAPS-II AUROC on full Kepler cohort (n={len(y_all):,}): "
|
||||||
|
f"{saps_auc_overall:.4f}")
|
||||||
|
|
||||||
fc = np.array(fold_coefs)
|
fc = np.array(fold_coefs)
|
||||||
print(f"\n Coefficient stability (5 terms):")
|
print(f"\n Coefficient stability (5 terms):")
|
||||||
@@ -500,8 +520,8 @@ def main():
|
|||||||
tr_subs = set(subs[:n_tr])
|
tr_subs = set(subs[:n_tr])
|
||||||
tr_ids = [s for s in all_ids if cohort[s]["subject_id"] in tr_subs]
|
tr_ids = [s for s in all_ids if cohort[s]["subject_id"] in tr_subs]
|
||||||
te_ids = [s for s in all_ids if cohort[s]["subject_id"] not in tr_subs]
|
te_ids = [s for s in all_ids if cohort[s]["subject_id"] not in tr_subs]
|
||||||
X_tr, y_tr, _, _ = build_matrix(tr_ids, primitives, cohort)
|
X_tr, y_tr, _, _, _ = build_matrix(tr_ids, primitives, cohort)
|
||||||
X_te, y_te, S_te, _ = build_matrix(te_ids, primitives, cohort)
|
X_te, y_te, S_te, _, _ = build_matrix(te_ids, primitives, cohort)
|
||||||
mu = X_tr.mean(0); sd = X_tr.std(0) + 1e-9
|
mu = X_tr.mean(0); sd = X_tr.std(0) + 1e-9
|
||||||
lr_cal = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
|
lr_cal = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
|
||||||
lr_cal.fit((X_tr - mu) / sd, y_tr)
|
lr_cal.fit((X_tr - mu) / sd, y_tr)
|
||||||
@@ -543,6 +563,11 @@ def main():
|
|||||||
print(f"\n Ordinal bedside score (0–13 pts):")
|
print(f"\n Ordinal bedside score (0–13 pts):")
|
||||||
print(f" 5-fold CV AUROC: {fas.mean():.4f} ± {fas.std():.4f}")
|
print(f" 5-fold CV AUROC: {fas.mean():.4f} ± {fas.std():.4f}")
|
||||||
print(f" AUROC loss vs continuous: {fa.mean()-fas.mean():+.4f}")
|
print(f" AUROC loss vs continuous: {fa.mean()-fas.mean():+.4f}")
|
||||||
|
print(f"\n SAPS-II baseline (same cohort):")
|
||||||
|
print(f" Overall AUROC: {saps_auc_overall:.4f}")
|
||||||
|
print(f" 5-fold CV AUROC: {fsap.mean():.4f} ± {fsap.std():.4f}")
|
||||||
|
print(f" Δ formula − SAPS-II: {fa.mean()-fsap.mean():+.4f}")
|
||||||
|
print(f" Δ ordinal − SAPS-II: {fas.mean()-fsap.mean():+.4f}")
|
||||||
print(f"\n Risk bands (ordinal score):")
|
print(f"\n Risk bands (ordinal score):")
|
||||||
for bn in ["low", "mid", "high"]:
|
for bn in ["low", "mid", "high"]:
|
||||||
br = band_results.get(bn, {})
|
br = band_results.get(bn, {})
|
||||||
@@ -571,6 +596,18 @@ def main():
|
|||||||
"score_distribution": score_rows,
|
"score_distribution": score_rows,
|
||||||
"risk_bands": band_results,
|
"risk_bands": band_results,
|
||||||
},
|
},
|
||||||
|
"sapsii_baseline": {
|
||||||
|
"n": int(len(SAPS_all)),
|
||||||
|
"sapsii_mean": float(SAPS_all.mean()),
|
||||||
|
"sapsii_min": float(SAPS_all.min()),
|
||||||
|
"sapsii_max": float(SAPS_all.max()),
|
||||||
|
"auroc_overall": float(saps_auc_overall),
|
||||||
|
"cv_auroc_mean": float(fsap.mean()),
|
||||||
|
"cv_auroc_std": float(fsap.std()),
|
||||||
|
"cv_auroc_range": [float(fsap.min()), float(fsap.max())],
|
||||||
|
"delta_formula_minus_sapsii": float(fa.mean() - fsap.mean()),
|
||||||
|
"delta_ordinal_minus_sapsii": float(fas.mean() - fsap.mean()),
|
||||||
|
},
|
||||||
}
|
}
|
||||||
with open(OUT_FILE, "w") as f:
|
with open(OUT_FILE, "w") as f:
|
||||||
json.dump(output, f, indent=2, default=str)
|
json.dump(output, f, indent=2, default=str)
|
||||||
|
|||||||
Reference in New Issue
Block a user