add: diag

feat: add SAPS-II eval to Kepler score comparison
2026-05-05 19:35:11 +02:00 · 2026-05-05 18:35:06 +02:00
2 changed files with 494 additions and 10 deletions
--- a/paper3_phase5b_icd_sepsis_prevalence.py
+++ b/paper3_phase5b_icd_sepsis_prevalence.py
@@ -0,0 +1,447 @@
 """
 ICD-coded sepsis prevalence — inclusion vs exclusion cohort
 ═════════════════════════════════════════════════════════════════════════════
 Companion analysis for paper3_phase5b_refined.py.
 The Phase 5b cohort SQL (`q_cohort()`) keeps an ICU stay only when ALL of:
    1. sepsis3.sepsis3 = TRUE                  (Sepsis-3 derived flag)
    2. ICU length-of-stay ≥ 24h                (H_SNAPSHOT)
    3. sapsii IS NOT NULL AND sapsii ≥ 48      (SAPS-II Q4)
 This script computes the prevalence of *explicit* ICD-coded sepsis on:
    (a) the INCLUSION cohort  — stays that satisfy all three filters,
    (b) the EXCLUSION cohort  — every other ICU stay in MIMIC-III (fails
        at least one of the three filters above), and
    (c) ALL ICU STAYS         — the full mimiciii.icustays universe
        (= inclusion ∪ exclusion).
 Note: stay-level totals partition cleanly (incl + excl = all), but at the
 admission and subject level a single hadm_id / subject can have ICU stays
 in both buckets, so the "all" row is computed via SQL GROUPING SETS rather
 than by summing.
 ICD-coded sepsis is evaluated at the hospital-admission level (a stay is
 "ICD-sepsis +" if its parent hadm_id carries any of the codes below):
    - Explicit sepsis (ICD-9):   995.91, 995.92, 785.52
       (matches paper2_festung_teil3.py SYNDROME_ICDS["sepsis"]["icd_9"])
    - Angus septicemia (ICD-9):  038.*   (any 038-prefixed code)
    - Any of the above (union)
 For each cohort × definition combination we report:
    n positives, prevalence, Wilson 95% CI.
 The inclusion vs exclusion difference is reported with a normal-approx
 95% CI and a Pearson χ² statistic (no scipy dependency).
 Usage:
  python paper3_phase5b_icd_sepsis_prevalence.py
 """
 import json, math, os, sys, time
 # Reuse the same env-var contract as paper3_phase5b_refined.py.
 PG_DSN         = os.environ.get("MIMIC_PG_DSN", "dbname=mimic3")
 MIMIC_SCHEMA   = os.environ.get("MIMIC_SCHEMA",  "mimiciii")
 DERIVED_SCHEMA = os.environ.get("DERIVED_SCHEMA", MIMIC_SCHEMA)
 H_SNAPSHOT  = 24      # ICU LOS threshold, hours (matches paper3 phase 5b)
 SAPSII_MIN  = 48      # SAPS-II Q4 cutoff (matches paper3 phase 5b)
 OUT_FILE    = "paper3_phase5b_icd_sepsis_prevalence.json"
 # Explicit sepsis codes (ICD-9, MIMIC-III stores them WITHOUT decimal point):
 #   995.91  →  '99591'   Sepsis
 #   995.92  →  '99592'   Severe sepsis
 #   785.52  →  '78552'   Septic shock
 EXPLICIT_SEPSIS_ICD9 = ("99591", "99592", "78552")
 # Angus-style broad septicemia bucket: any ICD-9 starting with 038.
 SEPTICEMIA_PREFIX = "038"
 _PG_CONN = None
 def _pg_conn():
    global _PG_CONN
    if _PG_CONN is None or getattr(_PG_CONN, "closed", 0):
        import psycopg2
        _PG_CONN = psycopg2.connect(PG_DSN)
        _PG_CONN.set_session(readonly=True, autocommit=True)
    return _PG_CONN
 def run_pg(sql, label=""):
    import psycopg2.extras
    conn = _pg_conn()
    t0 = time.time()
    with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute(sql)
        rows = [dict(r) for r in cur.fetchall()] if cur.description else []
    print(f"    {label:40s}  {len(rows):>8,d} rows  ({time.time()-t0:.1f}s)")
    return rows
 # ── SQL ─────────────────────────────────────────────────────────────────────
 #
 # One pass: classify every ICU stay as "inclusion" or "exclusion" using the
 # Phase 5b filter, then left-join the two ICD code sets at the hadm_id level
 # and aggregate. This mirrors q_cohort() exactly for the inclusion bucket
 # (sepsis3 = TRUE AND LOS ≥ 24h AND sapsii ≥ 48), and treats every other
 # ICU stay in mimiciii.icustays as exclusion. GROUPING SETS adds a third
 # row (cohort = NULL → 'all') aggregated over the full ICU universe so that
 # admission- and subject-level distinct counts are correct (a single hadm_id
 # may straddle both buckets, so we cannot just sum incl + excl).
 def q_prevalence():
    explicit = ",".join(f"'{c}'" for c in EXPLICIT_SEPSIS_ICD9)
    return f"""
 WITH icu AS (
    SELECT icu.icustay_id,
           icu.hadm_id,
           icu.subject_id,
           EXTRACT(EPOCH FROM (icu.outtime - icu.intime)) / 3600.0 AS los_h,
           COALESCE(s3.sepsis3, FALSE) AS is_sepsis3,
           saps.sapsii AS sapsii
      FROM {MIMIC_SCHEMA}.icustays icu
      LEFT JOIN {DERIVED_SCHEMA}.sepsis3 s3   ON s3.icustay_id   = icu.icustay_id
      LEFT JOIN {DERIVED_SCHEMA}.sapsii  saps ON saps.icustay_id = icu.icustay_id
     WHERE icu.icustay_id IS NOT NULL
       AND icu.hadm_id    IS NOT NULL
 ),
 classified AS (
    SELECT icustay_id, hadm_id, subject_id,
           is_sepsis3, los_h, sapsii,
           CASE WHEN is_sepsis3 = TRUE
                     AND los_h  >= {H_SNAPSHOT}
                     AND sapsii IS NOT NULL
                     AND sapsii >= {SAPSII_MIN}
                THEN 'inclusion' ELSE 'exclusion' END AS cohort
      FROM icu
 ),
 explicit_sepsis AS (
    SELECT DISTINCT hadm_id
      FROM {MIMIC_SCHEMA}.diagnoses_icd
     WHERE icd9_code IN ({explicit})
 ),
 septicemia AS (
    SELECT DISTINCT hadm_id
      FROM {MIMIC_SCHEMA}.diagnoses_icd
     WHERE icd9_code LIKE '{SEPTICEMIA_PREFIX}%%'
 )
 SELECT
    COALESCE(c.cohort, 'all')                   AS cohort,
    COUNT(*)                                    AS n_stays,
    COUNT(DISTINCT c.hadm_id)                   AS n_admissions,
    COUNT(DISTINCT c.subject_id)                AS n_subjects,
    SUM(CASE WHEN e.hadm_id IS NOT NULL
             THEN 1 ELSE 0 END)                 AS n_stays_explicit,
    COUNT(DISTINCT CASE WHEN e.hadm_id IS NOT NULL
                        THEN c.hadm_id END)     AS n_adm_explicit,
    SUM(CASE WHEN s.hadm_id IS NOT NULL
             THEN 1 ELSE 0 END)                 AS n_stays_septicemia,
    COUNT(DISTINCT CASE WHEN s.hadm_id IS NOT NULL
                        THEN c.hadm_id END)     AS n_adm_septicemia,
    SUM(CASE WHEN e.hadm_id IS NOT NULL
              OR s.hadm_id IS NOT NULL
             THEN 1 ELSE 0 END)                 AS n_stays_any,
    COUNT(DISTINCT CASE WHEN e.hadm_id IS NOT NULL
                          OR s.hadm_id IS NOT NULL
                        THEN c.hadm_id END)     AS n_adm_any
  FROM classified c
  LEFT JOIN explicit_sepsis e ON e.hadm_id = c.hadm_id
  LEFT JOIN septicemia      s ON s.hadm_id = c.hadm_id
 GROUP BY GROUPING SETS ((c.cohort), ())
 """
 def q_exclusion_breakdown():
    """How many excluded stays fail each individual filter (non-exclusive
    counts; an excluded stay can violate >1 criterion)."""
    return f"""
 WITH icu AS (
    SELECT icu.icustay_id,
           EXTRACT(EPOCH FROM (icu.outtime - icu.intime)) / 3600.0 AS los_h,
           COALESCE(s3.sepsis3, FALSE) AS is_sepsis3,
           saps.sapsii AS sapsii
      FROM {MIMIC_SCHEMA}.icustays icu
      LEFT JOIN {DERIVED_SCHEMA}.sepsis3 s3   ON s3.icustay_id   = icu.icustay_id
      LEFT JOIN {DERIVED_SCHEMA}.sapsii  saps ON saps.icustay_id = icu.icustay_id
     WHERE icu.icustay_id IS NOT NULL
       AND icu.hadm_id    IS NOT NULL
 )
 SELECT
    COUNT(*)                                                AS n_total,
    SUM(CASE WHEN is_sepsis3 = FALSE
             THEN 1 ELSE 0 END)                             AS n_not_sepsis3,
    SUM(CASE WHEN los_h < {H_SNAPSHOT}
             THEN 1 ELSE 0 END)                             AS n_los_short,
    SUM(CASE WHEN sapsii IS NULL
             THEN 1 ELSE 0 END)                             AS n_sapsii_null,
    SUM(CASE WHEN sapsii IS NOT NULL AND sapsii < {SAPSII_MIN}
             THEN 1 ELSE 0 END)                             AS n_sapsii_below
  FROM icu
 """
 def q_icd_sepsis_waterfall():
    """Mutually-exclusive waterfall, restricted to ICD-coded sepsis stays
    only, showing which inclusion gate eliminated them.  Uses the union
    definition (explicit sepsis codes ∪ 038.* septicemia)."""
    explicit = ",".join(f"'{c}'" for c in EXPLICIT_SEPSIS_ICD9)
    return f"""
 WITH icu AS (
    SELECT icu.icustay_id, icu.hadm_id,
           EXTRACT(EPOCH FROM (icu.outtime - icu.intime)) / 3600.0 AS los_h,
           COALESCE(s3.sepsis3, FALSE) AS is_sepsis3,
           saps.sapsii AS sapsii
      FROM {MIMIC_SCHEMA}.icustays icu
      LEFT JOIN {DERIVED_SCHEMA}.sepsis3 s3   ON s3.icustay_id   = icu.icustay_id
      LEFT JOIN {DERIVED_SCHEMA}.sapsii  saps ON saps.icustay_id = icu.icustay_id
     WHERE icu.icustay_id IS NOT NULL
       AND icu.hadm_id    IS NOT NULL
 ),
 icd_pos AS (
    SELECT DISTINCT hadm_id
      FROM {MIMIC_SCHEMA}.diagnoses_icd
     WHERE icd9_code IN ({explicit})
        OR icd9_code LIKE '{SEPTICEMIA_PREFIX}%%'
 ),
 icd_stays AS (
    SELECT i.* FROM icu i JOIN icd_pos x ON x.hadm_id = i.hadm_id
 )
 SELECT
    COUNT(*)                                                  AS n_total_stays,
    COUNT(DISTINCT hadm_id)                                   AS n_total_adm,
    -- Waterfall: each stay is counted exactly once, in the order of the
    -- inclusion filter (sepsis3 → LOS → sapsii NULL → sapsii < 48 → pass).
    SUM(CASE WHEN NOT is_sepsis3
             THEN 1 ELSE 0 END)                               AS n_fail_sepsis3,
    SUM(CASE WHEN is_sepsis3 AND los_h < {H_SNAPSHOT}
             THEN 1 ELSE 0 END)                               AS n_fail_los,
    SUM(CASE WHEN is_sepsis3 AND los_h >= {H_SNAPSHOT}
                              AND sapsii IS NULL
             THEN 1 ELSE 0 END)                               AS n_fail_sapsii_null,
    SUM(CASE WHEN is_sepsis3 AND los_h >= {H_SNAPSHOT}
                              AND sapsii IS NOT NULL
                              AND sapsii < {SAPSII_MIN}
             THEN 1 ELSE 0 END)                               AS n_fail_sapsii_below,
    SUM(CASE WHEN is_sepsis3 AND los_h >= {H_SNAPSHOT}
                              AND sapsii IS NOT NULL
                              AND sapsii >= {SAPSII_MIN}
             THEN 1 ELSE 0 END)                               AS n_pass
  FROM icd_stays
 """
 # ── Stats helpers (no scipy) ────────────────────────────────────────────────
 def wilson_ci(k, n, z=1.959963984540054):
    """Wilson score 95% CI for a binomial proportion. Returns (lo, hi)."""
    if n <= 0:
        return (float("nan"), float("nan"))
    p = k / n
    denom = 1.0 + z*z/n
    centre = (p + z*z/(2.0*n)) / denom
    half   = (z * math.sqrt(p*(1.0 - p)/n + z*z/(4.0*n*n))) / denom
    return (max(0.0, centre - half), min(1.0, centre + half))
 def diff_ci(k1, n1, k2, n2, z=1.959963984540054):
    """Normal-approx 95% CI for (p1 − p2). Returns (delta, lo, hi)."""
    if n1 <= 0 or n2 <= 0:
        return (float("nan"),) * 3
    p1, p2 = k1 / n1, k2 / n2
    se = math.sqrt(p1*(1.0 - p1)/n1 + p2*(1.0 - p2)/n2)
    d  = p1 - p2
    return (d, d - z*se, d + z*se)
 def chi2_2x2(k1, n1, k2, n2):
    """Pearson χ² for the 2×2 table (sepsis± × cohort). Returns (chi2, dof=1).
    Critical value at p=0.05 is 3.841."""
    a, b = k1, n1 - k1            # incl: sepsis+, sepsis−
    c, d = k2, n2 - k2            # excl: sepsis+, sepsis−
    n = n1 + n2
    if n == 0: return (float("nan"), 1)
    row1, row2 = a + b, c + d
    col1, col2 = a + c, b + d
    chi2 = 0.0
    for obs, r, col in ((a, row1, col1), (b, row1, col2),
                         (c, row2, col1), (d, row2, col2)):
        exp = r * col / n
        if exp > 0:
            chi2 += (obs - exp) ** 2 / exp
    return (chi2, 1)
 def fmt_pct(p):    return f"{100.0*p:5.2f}%"
 def fmt_ci(lo,hi): return f"[{100.0*lo:5.2f}, {100.0*hi:5.2f}]"
 # ── Main ────────────────────────────────────────────────────────────────────
 def main():
    print("\n" + "█"*78)
    print("  ICD-coded sepsis prevalence — Phase 5b inclusion vs exclusion")
    print("█"*78)
    print(f"\n  PG DSN:         {PG_DSN}")
    print(f"  MIMIC schema:   {MIMIC_SCHEMA}")
    print(f"  Derived schema: {DERIVED_SCHEMA}")
    print(f"  Inclusion:      sepsis3=TRUE AND LOS≥{H_SNAPSHOT}h AND SAPS-II≥{SAPSII_MIN}")
    print(f"  Explicit ICD-9: {', '.join(EXPLICIT_SEPSIS_ICD9)} "
          f"(995.91 / 995.92 / 785.52)")
    print(f"  Septicemia:     ICD-9 {SEPTICEMIA_PREFIX}.*")
    print(f"\n[1] Querying MIMIC-III...")
    rows  = run_pg(q_prevalence(),            "cohort × ICD prevalence")
    bkdwn = run_pg(q_exclusion_breakdown(),   "exclusion breakdown")
    wfall = run_pg(q_icd_sepsis_waterfall(),  "ICD-sepsis waterfall")
    if not rows:
        print("\n[ERROR] no rows returned. Check PG_DSN / schema permissions.")
        sys.exit(1)
    by = {r["cohort"]: r for r in rows}
    incl = by.get("inclusion", {})
    excl = by.get("exclusion", {})
    allc = by.get("all",       {})
    INT_KEYS = ("n_stays","n_admissions","n_subjects",
                "n_stays_explicit","n_adm_explicit",
                "n_stays_septicemia","n_adm_septicemia",
                "n_stays_any","n_adm_any")
    for c in (incl, excl, allc):
        for k in INT_KEYS:
            c[k] = int(c.get(k) or 0)
    COHORTS = (("inclusion", incl), ("exclusion", excl), ("all", allc))
    # ── [2] Cohort sizes ──────────────────────────────────────────────────
    print(f"\n[2] Cohort sizes")
    print(f"    {'cohort':12s}  {'stays':>8s}  {'admissions':>11s}  {'subjects':>9s}")
    for label, c in COHORTS:
        print(f"    {label:12s}  {c['n_stays']:>8,d}  "
              f"{c['n_admissions']:>11,d}  {c['n_subjects']:>9,d}")
    # ── [3] Why an ICU stay was excluded ──────────────────────────────────
    if bkdwn:
        b = bkdwn[0]
        print(f"\n[3] Exclusion breakdown (non-exclusive: a stay can fail >1 filter)")
        n_total = int(b['n_total'] or 0)
        for lbl, k in (("not Sepsis-3",          "n_not_sepsis3"),
                       (f"ICU LOS < {H_SNAPSHOT}h",  "n_los_short"),
                       ("SAPS-II is NULL",       "n_sapsii_null"),
                       (f"SAPS-II < {SAPSII_MIN}",   "n_sapsii_below")):
            n = int(b[k] or 0)
            pct = 100.0*n/n_total if n_total else 0.0
            print(f"    {lbl:24s}  {n:>8,d}  ({pct:5.2f}% of all ICU stays)")
        print(f"    {'all ICU stays':24s}  {n_total:>8,d}")
    # ── [3b] ICD-sepsis-positive waterfall ────────────────────────────────
    # Diagnostic: of the ICD-coded sepsis stays (explicit ∪ 038.*), which
    # inclusion gate eliminated them?  Mutually exclusive: each stay is
    # counted in the FIRST gate that would reject it, walking in the
    # inclusion-filter order (sepsis3 → LOS → SAPS-II NULL → SAPS-II<48).
    if wfall:
        w = wfall[0]
        n_w = int(w["n_total_stays"] or 0)
        n_adm = int(w["n_total_adm"]   or 0)
        print(f"\n[3b] ICD-sepsis-positive waterfall (mutually exclusive,"
              f" inclusion-filter order)")
        print(f"     {'gate':28s}  {'n stays':>9s}  {'pct':>6s}  cumulative")
        cum = 0
        steps = (
            ("rejected: not Sepsis-3",      "n_fail_sepsis3"),
            (f"rejected: LOS < {H_SNAPSHOT}h",  "n_fail_los"),
            ("rejected: SAPS-II is NULL",   "n_fail_sapsii_null"),
            (f"rejected: SAPS-II < {SAPSII_MIN}",  "n_fail_sapsii_below"),
            (f"PASS (= inclusion)",         "n_pass"),
        )
        for lbl, kn in steps:
            n = int(w[kn] or 0)
            cum += n
            pct = 100.0*n/n_w if n_w else 0.0
            print(f"     {lbl:28s}  {n:>9,d}  {pct:5.2f}%  {cum:>9,d}")
        print(f"     {'TOTAL ICD-sepsis stays':28s}  {n_w:>9,d}  "
              f"({n_adm:,} admissions)")
    # ── [4] Prevalence per definition ─────────────────────────────────────
    DEFS = (
        ("Explicit sepsis (995.91 / 995.92 / 785.52)",
         "n_stays_explicit",   "n_adm_explicit"),
        ("Angus septicemia (038.*)",
         "n_stays_septicemia", "n_adm_septicemia"),
        ("Any of the above (union)",
         "n_stays_any",        "n_adm_any"),
    )
    def _table(title, denom_key_n, denom_key_k):
        """Render the prevalence table for one denominator (stays or
        admissions) and append rows to `results[bucket]`."""
        print(f"\n{title}")
        print(f"    {'definition':45s}  {'cohort':10s}  "
              f"{'n+':>7s}  {'N':>7s}  {'prev':>7s}  {'95% CI (Wilson)':>18s}")
        out = []
        for name, sk, ak in DEFS:
            kkey = sk if denom_key_k == "n_stays" else ak
            for label, c in COHORTS:
                k_, n_ = c[kkey], c[denom_key_n]
                p = k_/n_ if n_ else float("nan")
                lo, hi = wilson_ci(k_, n_)
                print(f"    {name:45s}  {label:10s}  "
                      f"{k_:>7,d}  {n_:>7,d}  {fmt_pct(p):>7s}  "
                      f"{fmt_ci(lo,hi):>18s}")
                out.append({"definition": name, "cohort": label,
                            "k": k_, "n": n_, "prevalence": p,
                            "ci_lo": lo, "ci_hi": hi})
            # Inclusion vs exclusion comparison (the "all" row is just a
            # weighted average of the two so a Δ against it isn't meaningful).
            k1, n1 = incl[kkey], incl[denom_key_n]
            k2, n2 = excl[kkey], excl[denom_key_n]
            d, dlo, dhi = diff_ci(k1, n1, k2, n2)
            chi2, dof  = chi2_2x2(k1, n1, k2, n2)
            sig = "p<0.05" if chi2 > 3.841 else "n.s."
            print(f"    {'  Δ (incl − excl)':45s}  {'':10s}  "
                  f"{'':>7s}  {'':>7s}  {fmt_pct(d):>7s}  "
                  f"{fmt_ci(dlo,dhi):>18s}   χ²={chi2:6.2f} ({sig})")
            out.append({"definition": name, "cohort": "delta_incl_minus_excl",
                        "delta": d, "ci_lo": dlo, "ci_hi": dhi,
                        "chi2": chi2, "dof": dof})
        return out
    results = {
        "by_stays":      _table("[4] ICD-coded sepsis prevalence (denominator = ICU STAYS)",
                                "n_stays", "n_stays"),
        "by_admissions": _table("[5] ICD-coded sepsis prevalence (denominator = ADMISSIONS)",
                                "n_admissions", "n_admissions"),
    }
    # ── Save ────────────────────────────────────────────────────────────
    output = {
        "filters": {
            "h_snapshot_hours": H_SNAPSHOT,
            "sapsii_min": SAPSII_MIN,
            "explicit_icd9": list(EXPLICIT_SEPSIS_ICD9),
            "septicemia_prefix": SEPTICEMIA_PREFIX,
        },
        "cohorts": {
            "inclusion": {k: incl[k] for k in INT_KEYS},
            "exclusion": {k: excl[k] for k in INT_KEYS},
            "all":       {k: allc[k] for k in INT_KEYS},
        },
        "exclusion_breakdown":      (bkdwn[0] if bkdwn else None),
        "icd_sepsis_waterfall":     (wfall[0] if wfall else None),
        "results": results,
    }
    with open(OUT_FILE, "w") as f:
        json.dump(output, f, indent=2, default=str)
    print(f"\n  → Saved: {OUT_FILE}")
    print("\n" + "█"*78 + "\n")
 if __name__ == "__main__":
    main()
--- a/paper3_phase5b_refined.py
+++ b/paper3_phase5b_refined.py
@@ -308,18 +308,21 @@ def bedside_score(p):
 def build_matrix(ids, primitives, cohort):
    import numpy as np
-    X, y, scores, valid = [], [], [], []
+    X, y, scores, saps, valid = [], [], [], [], []
    for sid in ids:
        p = primitives.get(sid)
        if p is None: continue
        f = formula_features(p)
        s, _ = bedside_score(p)
        if f is None or s is None: continue
        sap = cohort[sid].get("sapsii")
        if sap is None: continue
        X.append(f)
        y.append(int(cohort[sid].get("died") or 0))
        scores.append(s)
        saps.append(float(sap))
        valid.append(sid)
-    return np.array(X), np.array(y), np.array(scores), valid
+    return np.array(X), np.array(y), np.array(scores), np.array(saps), valid
 # ── Main ────────────────────────────────────────────────────────────────────
@@ -353,8 +356,10 @@ def main():
               and formula_features(primitives[s]) is not None]
    print(f"    usable: {len(all_ids):,}")
-    X_all, y_all, S_all, _ = build_matrix(all_ids, primitives, cohort)
+    X_all, y_all, S_all, SAPS_all, _ = build_matrix(all_ids, primitives, cohort)
    print(f"    mortality: {100*y_all.mean():.1f}%")
    print(f"    SAPS-II:   mean={SAPS_all.mean():.1f}  "
          f"min={SAPS_all.min():.0f}  max={SAPS_all.max():.0f}")
    # ══════════════════════════════════════════════════════════════════════
    # [4] 5-fold CV — 5-term formula
@@ -366,13 +371,13 @@ def main():
    subj_arr = np.array(subject_ids)
    kf = KFold(n_splits=N_FOLDS, shuffle=False)
-    fold_aucs, fold_aucs_score, fold_coefs = [], [], []
+    fold_aucs, fold_aucs_score, fold_aucs_saps, fold_coefs = [], [], [], []
    for k, (tr_idx, te_idx) in enumerate(kf.split(subj_arr)):
        tr_subs = set(subj_arr[tr_idx].tolist())
        tr_ids = [s for s in all_ids if cohort[s]["subject_id"] in tr_subs]
        te_ids = [s for s in all_ids if cohort[s]["subject_id"] not in tr_subs]
-        X_tr, y_tr, S_tr, _ = build_matrix(tr_ids, primitives, cohort)
+        X_tr, y_tr, S_tr, _, _ = build_matrix(tr_ids, primitives, cohort)
-        X_te, y_te, S_te, _ = build_matrix(te_ids, primitives, cohort)
+        X_te, y_te, S_te, SAPS_te, _ = build_matrix(te_ids, primitives, cohort)
        mu = X_tr.mean(0); sd = X_tr.std(0) + 1e-9
        lr = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
@@ -380,20 +385,35 @@ def main():
        pred = lr.predict_proba((X_te - mu) / sd)[:, 1]
        auc_f = roc_auc_score(y_te, pred)
        auc_s = roc_auc_score(y_te, S_te)        # Ordinal score AUROC
        auc_saps = roc_auc_score(y_te, SAPS_te)  # SAPS-II baseline AUROC
        raw_beta = lr.coef_[0] / sd
        raw_int  = lr.intercept_[0] - sum(lr.coef_[0][i]*mu[i]/sd[i] for i in range(len(sd)))
        fold_aucs.append(auc_f)
        fold_aucs_score.append(auc_s)
        fold_aucs_saps.append(auc_saps)
        fold_coefs.append([raw_int] + list(raw_beta))
-        print(f"    fold {k+1}: formula AUROC={auc_f:.4f}  ordinal score AUROC={auc_s:.4f}")
+        print(f"    fold {k+1}: formula AUROC={auc_f:.4f}  "
              f"ordinal AUROC={auc_s:.4f}  SAPS-II AUROC={auc_saps:.4f}")
    fa = np.array(fold_aucs); fas = np.array(fold_aucs_score)
    fsap = np.array(fold_aucs_saps)
    print(f"\n    5-term formula CV AUROC:  {fa.mean():.4f} ± {fa.std():.4f}  "
          f"(range {fa.min():.4f}–{fa.max():.4f})")
    print(f"    Ordinal score  CV AUROC:  {fas.mean():.4f} ± {fas.std():.4f}  "
          f"(range {fas.min():.4f}–{fas.max():.4f})")
    print(f"    SAPS-II        CV AUROC:  {fsap.mean():.4f} ± {fsap.std():.4f}  "
          f"(range {fsap.min():.4f}–{fsap.max():.4f})")
    print(f"    Ordinal loss:             {fa.mean()-fas.mean():+.4f}")
    print(f"    Δ vs SAPS-II (formula):   {fa.mean()-fsap.mean():+.4f}")
    print(f"    Δ vs SAPS-II (ordinal):   {fas.mean()-fsap.mean():+.4f}")
    # SAPS-II is a fixed pre-computed score (no parameters fit here), so an
    # in-cohort AUROC is not optimistic — report it on the full Kepler cohort
    # for a single, directly comparable headline number.
    saps_auc_overall = roc_auc_score(y_all, SAPS_all)
    print(f"\n    SAPS-II AUROC on full Kepler cohort (n={len(y_all):,}): "
          f"{saps_auc_overall:.4f}")
    fc = np.array(fold_coefs)
    print(f"\n    Coefficient stability (5 terms):")
@@ -500,8 +520,8 @@ def main():
    tr_subs = set(subs[:n_tr])
    tr_ids = [s for s in all_ids if cohort[s]["subject_id"] in tr_subs]
    te_ids = [s for s in all_ids if cohort[s]["subject_id"] not in tr_subs]
-    X_tr, y_tr, _, _ = build_matrix(tr_ids, primitives, cohort)
+    X_tr, y_tr, _, _, _ = build_matrix(tr_ids, primitives, cohort)
-    X_te, y_te, S_te, _ = build_matrix(te_ids, primitives, cohort)
+    X_te, y_te, S_te, _, _ = build_matrix(te_ids, primitives, cohort)
    mu = X_tr.mean(0); sd = X_tr.std(0) + 1e-9
    lr_cal = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
    lr_cal.fit((X_tr - mu) / sd, y_tr)
@@ -543,6 +563,11 @@ def main():
    print(f"\n    Ordinal bedside score (0–13 pts):")
    print(f"      5-fold CV AUROC:  {fas.mean():.4f} ± {fas.std():.4f}")
    print(f"      AUROC loss vs continuous: {fa.mean()-fas.mean():+.4f}")
    print(f"\n    SAPS-II baseline (same cohort):")
    print(f"      Overall AUROC:    {saps_auc_overall:.4f}")
    print(f"      5-fold CV AUROC:  {fsap.mean():.4f} ± {fsap.std():.4f}")
    print(f"      Δ formula − SAPS-II: {fa.mean()-fsap.mean():+.4f}")
    print(f"      Δ ordinal − SAPS-II: {fas.mean()-fsap.mean():+.4f}")
    print(f"\n    Risk bands (ordinal score):")
    for bn in ["low", "mid", "high"]:
        br = band_results.get(bn, {})
@@ -571,6 +596,18 @@ def main():
            "score_distribution": score_rows,
            "risk_bands": band_results,
        },
        "sapsii_baseline": {
            "n": int(len(SAPS_all)),
            "sapsii_mean": float(SAPS_all.mean()),
            "sapsii_min": float(SAPS_all.min()),
            "sapsii_max": float(SAPS_all.max()),
            "auroc_overall": float(saps_auc_overall),
            "cv_auroc_mean": float(fsap.mean()),
            "cv_auroc_std": float(fsap.std()),
            "cv_auroc_range": [float(fsap.min()), float(fsap.max())],
            "delta_formula_minus_sapsii": float(fa.mean() - fsap.mean()),
            "delta_ordinal_minus_sapsii": float(fas.mean() - fsap.mean()),
        },
    }
    with open(OUT_FILE, "w") as f:
        json.dump(output, f, indent=2, default=str)
Author	SHA1	Message	Date
David Madl	5dc1d3baa0	add: diag	2026-05-05 19:35:11 +02:00
David Madl	df7c0350a3	feat: add SAPS-II eval to Kepler score comparison	2026-05-05 18:35:06 +02:00