add: diag

feat: add SAPS-II eval to Kepler score comparison
chore: port to MIMIC-III using AI
2026-05-05 19:35:11 +02:00 · 2026-05-05 18:35:06 +02:00 · 2026-05-05 18:19:11 +02:00 · 2026-05-05 18:11:50 +02:00 · 2026-05-05 13:55:32 +02:00 · 2026-05-05 10:59:07 +02:00
4 changed files with 1828 additions and 34 deletions
--- a/paper2_festung_teil3.py
+++ b/paper2_festung_teil3.py
@@ -9,12 +9,25 @@ Benutzt die Full-Pop-Prädiktionen aus dem vorherigen Lauf.
 Basilakis 2026 · chicxulub.ai
 """

-import json, sys, math, time, random
+import json, os, sys, math, time, random
 from collections import defaultdict

-BQ_PROJECT   = "goddard-gap"
-DATA_PROJECT = "physionet-data"
-NE_ITEMID    = 221906
+# PostgreSQL connection string (libpq DSN). Override with the MIMIC_PG_DSN env var,
+# e.g. "host=localhost port=5432 dbname=mimic user=postgres password=..."
+PG_DSN         = os.environ.get("MIMIC_PG_DSN", "dbname=mimic3")
+# Schema holding the stock MIMIC-III v1.3 tables (admissions, icustays,
+# labevents, chartevents, inputevents_mv, inputevents_cv, prescriptions,
+# diagnoses_icd, d_items, ...).
+MIMIC_SCHEMA   = os.environ.get("MIMIC_SCHEMA", "mimiciii")  # interpolated verbatim into SQL text
+# Schema holding the locally built derived tables (sapsii, sepsis3, ...);
+# see sql/schemas.sql.  Defaults to the same schema as MIMIC-III itself.
+DERIVED_SCHEMA = os.environ.get("DERIVED_SCHEMA", MIMIC_SCHEMA)  # interpolated verbatim into SQL text
+
+# MIMIC-III stores Norepinephrine under different itemids in CareVue (inputevents_cv: 30047, 30120)
+# and MetaVision (inputevents_mv: 221906). NOTE(review): confirm all three chart `rate` in the same unit (30047 may be mcg/min, not mcg/kg/min).
+NE_ITEMIDS_MV  = [221906]
+NE_ITEMIDS_CV  = [30047, 30120]
+
 SAPS_WINDOW  = 10
 PARAM_KEYS = ["lactate","creatinine","ph","troponin","hemoglobin",
              "heart_rate","map_bp","spo2","temperature","ne_dose"]
@@ -52,10 +65,24 @@ GALAXY_PRIORITY = ["sepsis","cardiogenic_shock","post_cardiac_arrest","ards",
    "acute_mi","aki","liver_failure","gi_bleeding","stroke","pe","dka",
    "heart_failure","pneumonia","copd","afib","post_cardiac_surgery"]

-def run_bq(sql):
-    from google.cloud import bigquery
-    client = bigquery.Client(project=BQ_PROJECT)
-    return [dict(r.items()) for r in client.query(sql).result()]
+_PG_CONN = None  # module-level cache for the single shared connection
+def _pg_conn():  # returns the cached connection, (re)connecting when it is unset or reports closed
+    global _PG_CONN
+    if _PG_CONN is None or getattr(_PG_CONN, "closed", 0):
+        import psycopg2  # imported lazily, only when a connection is actually opened
+        _PG_CONN = psycopg2.connect(PG_DSN)
+        _PG_CONN.set_session(readonly=True, autocommit=True)  # read-only session, no explicit transactions
+    return _PG_CONN
+
+def run_pg(sql):
+    """Run `sql` on the shared connection and return the result rows as plain
+    dicts, or [] when the statement produces no result set."""
+    from psycopg2.extras import RealDictCursor
+    with _pg_conn().cursor(cursor_factory=RealDictCursor) as cursor:
+        cursor.execute(sql)
+        has_result_set = cursor.description is not None
+        fetched = cursor.fetchall() if has_result_set else ()
+        return list(map(dict, fetched))

 def auc_fast(preds):
    if not preds: return 0.5
@@ -70,6 +97,25 @@ def auc_fast(preds):
        ties+=(k-j)
    return (conc+0.5*ties)/(len(pos)*len(neg))

+def auc_fast_gal(gal_preds):
+    """Stratified AUC: concordant (plus half-weighted tied) pos/neg pairs pooled over
+    all galaxies, divided by the pooled pair count; 0.5 if no comparable pair exists."""
+    n_conc = n_tied = n_pairs = 0
+    for stratum in (gal_preds or {}).values():
+        pos_scores = sorted(r["p"] for r in stratum if r["a"] == 1)
+        neg_scores = sorted(r["p"] for r in stratum if r["a"] == 0)
+        n_pairs += len(pos_scores) * len(neg_scores)  # adds 0 for one-class strata
+        lo = 0
+        for score in pos_scores:  # ascending, so `lo` only ever moves forward
+            while lo < len(neg_scores) and neg_scores[lo] < score:
+                lo += 1
+            hi = lo
+            while hi < len(neg_scores) and neg_scores[hi] == score:
+                hi += 1
+            n_conc += lo
+            n_tied += hi - lo
+    return (n_conc + 0.5 * n_tied) / n_pairs if n_pairs else 0.5
+
 def compute_centroid(pts):
    s=defaultdict(float);c=defaultdict(int)
    for p in pts:
@@ -99,33 +145,63 @@ def td(pv,centroid,weights):

 def load_all_icu():
    print("  Loading ALL ICU patients...")
+    ne_mv = ",".join(str(i) for i in NE_ITEMIDS_MV)  # MetaVision NE itemids as a SQL IN-list
+    ne_cv = ",".join(str(i) for i in NE_ITEMIDS_CV)  # CareVue NE itemids; NOTE(review): ne_all merges both tables' rates unconverted — confirm units agree
    sql=f"""WITH icu_pts AS (
        SELECT DISTINCT a.hadm_id,a.hospital_expire_flag AS died,s.sapsii,icu.intime,
            s.sapsii_prob AS saps_prob
-        FROM `{DATA_PROJECT}.mimiciv_3_1_hosp.admissions` a
-        JOIN `{DATA_PROJECT}.mimiciv_3_1_icu.icustays` icu ON a.hadm_id=icu.hadm_id
-        JOIN `{DATA_PROJECT}.mimiciv_3_1_derived.sapsii` s ON icu.stay_id=s.stay_id
+        FROM {MIMIC_SCHEMA}.admissions a
+        JOIN {MIMIC_SCHEMA}.icustays icu ON a.hadm_id=icu.hadm_id
+        JOIN {DERIVED_SCHEMA}.sapsii s ON icu.icustay_id=s.icustay_id
        WHERE s.sapsii BETWEEN 20 AND 90),
-    l_lac AS (SELECT le.hadm_id,MAX(le.valuenum) AS val FROM `{DATA_PROJECT}.mimiciv_3_1_hosp.labevents` le JOIN icu_pts ip ON le.hadm_id=ip.hadm_id WHERE le.itemid=50813 AND le.valuenum IS NOT NULL AND le.charttime BETWEEN ip.intime AND TIMESTAMP_ADD(ip.intime,INTERVAL 24 HOUR) GROUP BY le.hadm_id),
-    l_krea AS (SELECT le.hadm_id,MAX(le.valuenum) AS val FROM `{DATA_PROJECT}.mimiciv_3_1_hosp.labevents` le JOIN icu_pts ip ON le.hadm_id=ip.hadm_id WHERE le.itemid=50912 AND le.valuenum IS NOT NULL AND le.charttime BETWEEN ip.intime AND TIMESTAMP_ADD(ip.intime,INTERVAL 24 HOUR) GROUP BY le.hadm_id),
-    l_ph AS (SELECT le.hadm_id,MIN(le.valuenum) AS val FROM `{DATA_PROJECT}.mimiciv_3_1_hosp.labevents` le JOIN icu_pts ip ON le.hadm_id=ip.hadm_id WHERE le.itemid IN (50820,50831) AND le.valuenum IS NOT NULL AND le.charttime BETWEEN ip.intime AND TIMESTAMP_ADD(ip.intime,INTERVAL 24 HOUR) GROUP BY le.hadm_id),
-    l_trop AS (SELECT le.hadm_id,MAX(le.valuenum) AS val FROM `{DATA_PROJECT}.mimiciv_3_1_hosp.labevents` le JOIN icu_pts ip ON le.hadm_id=ip.hadm_id WHERE le.itemid IN (51002,51003) AND le.valuenum IS NOT NULL AND le.charttime BETWEEN ip.intime AND TIMESTAMP_ADD(ip.intime,INTERVAL 24 HOUR) GROUP BY le.hadm_id),
-    l_hb AS (SELECT le.hadm_id,MIN(le.valuenum) AS val FROM `{DATA_PROJECT}.mimiciv_3_1_hosp.labevents` le JOIN icu_pts ip ON le.hadm_id=ip.hadm_id WHERE le.itemid=51222 AND le.valuenum IS NOT NULL AND le.charttime BETWEEN ip.intime AND TIMESTAMP_ADD(ip.intime,INTERVAL 24 HOUR) GROUP BY le.hadm_id),
-    c_hr AS (SELECT ce.hadm_id,MAX(ce.valuenum) AS val FROM `{DATA_PROJECT}.mimiciv_3_1_icu.chartevents` ce JOIN icu_pts ip ON ce.hadm_id=ip.hadm_id JOIN `{DATA_PROJECT}.mimiciv_3_1_icu.icustays` icu ON ce.stay_id=icu.stay_id WHERE ce.itemid=220045 AND ce.valuenum BETWEEN 20 AND 250 AND ce.charttime BETWEEN icu.intime AND TIMESTAMP_ADD(icu.intime,INTERVAL 24 HOUR) GROUP BY ce.hadm_id),
-    c_map AS (SELECT ce.hadm_id,MIN(ce.valuenum) AS val FROM `{DATA_PROJECT}.mimiciv_3_1_icu.chartevents` ce JOIN icu_pts ip ON ce.hadm_id=ip.hadm_id JOIN `{DATA_PROJECT}.mimiciv_3_1_icu.icustays` icu ON ce.stay_id=icu.stay_id WHERE ce.itemid IN (220052,220181,225312) AND ce.valuenum BETWEEN 20 AND 200 AND ce.charttime BETWEEN icu.intime AND TIMESTAMP_ADD(icu.intime,INTERVAL 24 HOUR) GROUP BY ce.hadm_id),
-    c_spo2 AS (SELECT ce.hadm_id,MIN(ce.valuenum) AS val FROM `{DATA_PROJECT}.mimiciv_3_1_icu.chartevents` ce JOIN icu_pts ip ON ce.hadm_id=ip.hadm_id JOIN `{DATA_PROJECT}.mimiciv_3_1_icu.icustays` icu ON ce.stay_id=icu.stay_id WHERE ce.itemid=220277 AND ce.valuenum BETWEEN 50 AND 100 AND ce.charttime BETWEEN icu.intime AND TIMESTAMP_ADD(icu.intime,INTERVAL 24 HOUR) GROUP BY ce.hadm_id),
-    c_temp AS (SELECT ce.hadm_id,MIN(ce.valuenum) AS val FROM `{DATA_PROJECT}.mimiciv_3_1_icu.chartevents` ce JOIN icu_pts ip ON ce.hadm_id=ip.hadm_id JOIN `{DATA_PROJECT}.mimiciv_3_1_icu.icustays` icu ON ce.stay_id=icu.stay_id WHERE ce.itemid=223762 AND ce.valuenum BETWEEN 28 AND 43 AND ce.charttime BETWEEN icu.intime AND TIMESTAMP_ADD(icu.intime,INTERVAL 24 HOUR) GROUP BY ce.hadm_id),
-    ne AS (SELECT ie.hadm_id,MAX(ie.rate) AS val FROM `{DATA_PROJECT}.mimiciv_3_1_icu.inputevents` ie JOIN icu_pts ip ON ie.hadm_id=ip.hadm_id JOIN `{DATA_PROJECT}.mimiciv_3_1_icu.icustays` icu ON ie.stay_id=icu.stay_id WHERE ie.itemid={NE_ITEMID} AND ie.rate>0 AND ie.starttime BETWEEN icu.intime AND TIMESTAMP_ADD(icu.intime,INTERVAL 24 HOUR) GROUP BY ie.hadm_id)
+    l_lac AS (SELECT le.hadm_id,MAX(le.valuenum) AS val FROM {MIMIC_SCHEMA}.labevents le JOIN icu_pts ip ON le.hadm_id=ip.hadm_id WHERE le.itemid=50813 AND le.valuenum IS NOT NULL AND le.charttime BETWEEN ip.intime AND ip.intime + INTERVAL '24 hours' GROUP BY le.hadm_id),
+    l_krea AS (SELECT le.hadm_id,MAX(le.valuenum) AS val FROM {MIMIC_SCHEMA}.labevents le JOIN icu_pts ip ON le.hadm_id=ip.hadm_id WHERE le.itemid=50912 AND le.valuenum IS NOT NULL AND le.charttime BETWEEN ip.intime AND ip.intime + INTERVAL '24 hours' GROUP BY le.hadm_id),
+    l_ph AS (SELECT le.hadm_id,MIN(le.valuenum) AS val FROM {MIMIC_SCHEMA}.labevents le JOIN icu_pts ip ON le.hadm_id=ip.hadm_id WHERE le.itemid IN (50820,50831) AND le.valuenum IS NOT NULL AND le.charttime BETWEEN ip.intime AND ip.intime + INTERVAL '24 hours' GROUP BY le.hadm_id),
+    l_trop AS (SELECT le.hadm_id,MAX(le.valuenum) AS val FROM {MIMIC_SCHEMA}.labevents le JOIN icu_pts ip ON le.hadm_id=ip.hadm_id WHERE le.itemid IN (51002,51003) AND le.valuenum IS NOT NULL AND le.charttime BETWEEN ip.intime AND ip.intime + INTERVAL '24 hours' GROUP BY le.hadm_id),
+    l_hb AS (SELECT le.hadm_id,MIN(le.valuenum) AS val FROM {MIMIC_SCHEMA}.labevents le JOIN icu_pts ip ON le.hadm_id=ip.hadm_id WHERE le.itemid=51222 AND le.valuenum IS NOT NULL AND le.charttime BETWEEN ip.intime AND ip.intime + INTERVAL '24 hours' GROUP BY le.hadm_id),
+    c_hr AS (SELECT ce.hadm_id,MAX(ce.valuenum) AS val FROM {MIMIC_SCHEMA}.chartevents ce JOIN icu_pts ip ON ce.hadm_id=ip.hadm_id JOIN {MIMIC_SCHEMA}.icustays icu ON ce.icustay_id=icu.icustay_id WHERE ce.itemid IN (211,220045) AND ce.valuenum BETWEEN 20 AND 250 AND ce.charttime BETWEEN icu.intime AND icu.intime + INTERVAL '24 hours' GROUP BY ce.hadm_id),
+    c_map AS (SELECT ce.hadm_id,MIN(ce.valuenum) AS val FROM {MIMIC_SCHEMA}.chartevents ce JOIN icu_pts ip ON ce.hadm_id=ip.hadm_id JOIN {MIMIC_SCHEMA}.icustays icu ON ce.icustay_id=icu.icustay_id WHERE ce.itemid IN (52,456,6702,220052,220181,225312) AND ce.valuenum BETWEEN 20 AND 200 AND ce.charttime BETWEEN icu.intime AND icu.intime + INTERVAL '24 hours' GROUP BY ce.hadm_id),
+    c_spo2 AS (SELECT ce.hadm_id,MIN(ce.valuenum) AS val FROM {MIMIC_SCHEMA}.chartevents ce JOIN icu_pts ip ON ce.hadm_id=ip.hadm_id JOIN {MIMIC_SCHEMA}.icustays icu ON ce.icustay_id=icu.icustay_id WHERE ce.itemid IN (646,220277) AND ce.valuenum BETWEEN 50 AND 100 AND ce.charttime BETWEEN icu.intime AND icu.intime + INTERVAL '24 hours' GROUP BY ce.hadm_id),
+    -- Temperature: pull all four MIMIC-III itemids (676/223762 nominally
+    -- Celsius, 678/223761 nominally Fahrenheit) and decide the unit from
+    -- the value itself.  Plausible body temperature in C is ~28..43 and
+    -- in F is ~82..110; the two ranges don't overlap, so a value in the
+    -- F band can be safely converted to C even if it was charted under a
+    -- "Celsius" itemid (and vice versa).  Anything outside both bands is
+    -- treated as sensor noise and dropped.
+    c_temp AS (
+        SELECT ce.hadm_id,
+               MIN(CASE
+                     WHEN ce.valuenum BETWEEN 28  AND 43  THEN ce.valuenum
+                     WHEN ce.valuenum BETWEEN 82  AND 110 THEN (ce.valuenum - 32.0) / 1.8
+                   END) AS val
+          FROM {MIMIC_SCHEMA}.chartevents ce
+          JOIN icu_pts ip ON ce.hadm_id=ip.hadm_id
+          JOIN {MIMIC_SCHEMA}.icustays icu ON ce.icustay_id=icu.icustay_id
+         WHERE ce.itemid IN (676, 223762, 678, 223761)
+           AND ce.valuenum IS NOT NULL
+           AND (ce.valuenum BETWEEN 28 AND 43 OR ce.valuenum BETWEEN 82 AND 110)
+           AND ce.charttime BETWEEN icu.intime AND icu.intime + INTERVAL '24 hours'
+         GROUP BY ce.hadm_id),
+    ne_all AS (
+        SELECT ie.hadm_id, ie.icustay_id, ie.rate, ie.starttime AS evttime
+            FROM {MIMIC_SCHEMA}.inputevents_mv ie
+            WHERE ie.itemid IN ({ne_mv}) AND ie.rate>0
+        UNION ALL
+        SELECT ie.hadm_id, ie.icustay_id, ie.rate, ie.charttime AS evttime
+            FROM {MIMIC_SCHEMA}.inputevents_cv ie
+            WHERE ie.itemid IN ({ne_cv}) AND ie.rate>0),
+    ne AS (SELECT ie.hadm_id,MAX(ie.rate) AS val FROM ne_all ie JOIN icu_pts ip ON ie.hadm_id=ip.hadm_id JOIN {MIMIC_SCHEMA}.icustays icu ON ie.icustay_id=icu.icustay_id WHERE ie.evttime BETWEEN icu.intime AND icu.intime + INTERVAL '24 hours' GROUP BY ie.hadm_id)
    SELECT ip.hadm_id,ip.died,ip.sapsii,ip.saps_prob,
        ll.val AS lactate,lk.val AS creatinine,lp.val AS ph,lt.val AS troponin,lh.val AS hemoglobin,
-        chr.val AS heart_rate,cma.val AS map_bp,csp.val AS spo2,cte.val AS temperature,ne.val AS ne_dose
+        chr_.val AS heart_rate,cma.val AS map_bp,csp.val AS spo2,cte.val AS temperature,ne.val AS ne_dose
    FROM icu_pts ip
    LEFT JOIN l_lac ll ON ip.hadm_id=ll.hadm_id LEFT JOIN l_krea lk ON ip.hadm_id=lk.hadm_id
    LEFT JOIN l_ph lp ON ip.hadm_id=lp.hadm_id LEFT JOIN l_trop lt ON ip.hadm_id=lt.hadm_id
-    LEFT JOIN l_hb lh ON ip.hadm_id=lh.hadm_id LEFT JOIN c_hr chr ON ip.hadm_id=chr.hadm_id
+    LEFT JOIN l_hb lh ON ip.hadm_id=lh.hadm_id LEFT JOIN c_hr chr_ ON ip.hadm_id=chr_.hadm_id
    LEFT JOIN c_map cma ON ip.hadm_id=cma.hadm_id LEFT JOIN c_spo2 csp ON ip.hadm_id=csp.hadm_id
    LEFT JOIN c_temp cte ON ip.hadm_id=cte.hadm_id LEFT JOIN ne ON ip.hadm_id=ne.hadm_id"""
-    rows=run_bq(sql)
+    rows=run_pg(sql)  # a parameter is None when its LEFT JOINed CTE has no row for that hadm_id
    pts=[{k:r.get(k) for k in ["hadm_id","died","sapsii","saps_prob"]+PARAM_KEYS}
         for r in rows if sum(1 for k in PARAM_KEYS if r.get(k) is not None)>=3 and r.get("died") is not None]
    print(f"  -> {len(pts)} patients"); return pts
@@ -135,10 +211,13 @@ def assign_galaxies(pts):
    hids=[p["hadm_id"] for p in pts];ps=defaultdict(set)
    for i in range(0,len(hids),10000):
        chunk=hids[i:i+10000]
-        for r in run_bq(f"SELECT hadm_id,icd_code,icd_version FROM `{DATA_PROJECT}.mimiciv_3_1_hosp.diagnoses_icd` WHERE hadm_id IN ({','.join(str(h) for h in chunk)})"):
+        # MIMIC-III v1.3 only carries ICD-9 codes (column `icd9_code`).
+        for r in run_pg(f"SELECT hadm_id,icd9_code FROM {MIMIC_SCHEMA}.diagnoses_icd WHERE hadm_id IN ({','.join(str(h) for h in chunk)})"):
+            code = r.get("icd9_code")
+            if code is None: continue
            for sk,sd in SYNDROME_ICDS.items():
-                for rc in sd.get(f"icd_{r['icd_version']}",[]):
-                    if r["icd_code"].startswith(rc): ps[r["hadm_id"]].add(sk);break
+                for rc in sd.get("icd_9",[]):
+                    if code.startswith(rc): ps[r["hadm_id"]].add(sk);break
    for p in pts:
        p["galaxy"]=None
        for g in GALAXY_PRIORITY:
@@ -147,14 +226,33 @@ def assign_galaxies(pts):
 def load_therapy_hadmids(tkey):
    t=THERAPIES[tkey]
    if tkey=="ne_high":
-        return set(r["hadm_id"] for r in run_bq(f"SELECT DISTINCT ie.hadm_id FROM `{DATA_PROJECT}.mimiciv_3_1_icu.inputevents` ie JOIN `{DATA_PROJECT}.mimiciv_3_1_icu.icustays` icu ON ie.stay_id=icu.stay_id WHERE ie.itemid={NE_ITEMID} AND ie.rate>=0.5 AND ie.starttime BETWEEN icu.intime AND TIMESTAMP_ADD(icu.intime,INTERVAL 24 HOUR)"))
+        ne_mv = ",".join(str(i) for i in NE_ITEMIDS_MV)  # ne_high: any NE rate >= 0.5 in the first 24h, from either input table
+        ne_cv = ",".join(str(i) for i in NE_ITEMIDS_CV)
+        sql = f"""
+            SELECT DISTINCT ie.hadm_id
+              FROM {MIMIC_SCHEMA}.inputevents_mv ie
+              JOIN {MIMIC_SCHEMA}.icustays icu ON ie.icustay_id=icu.icustay_id
+             WHERE ie.itemid IN ({ne_mv}) AND ie.rate>=0.5
+               AND ie.starttime BETWEEN icu.intime AND icu.intime + INTERVAL '24 hours'
+            UNION
+            SELECT DISTINCT ie.hadm_id
+              FROM {MIMIC_SCHEMA}.inputevents_cv ie
+              JOIN {MIMIC_SCHEMA}.icustays icu ON ie.icustay_id=icu.icustay_id
+             WHERE ie.itemid IN ({ne_cv}) AND ie.rate>=0.5
+               AND ie.charttime BETWEEN icu.intime AND icu.intime + INTERVAL '24 hours'
+        """
+        return set(r["hadm_id"] for r in run_pg(sql))
    clauses=[]
+    # MIMIC-III splits inputevents across MetaVision (starttime) and CareVue
+    # (charttime); both are queried per drug label and the hadm_ids are UNIONed below.
    for d in t.get("drugs_input",[]):
-        clauses.append(f"SELECT DISTINCT ie.hadm_id FROM `{DATA_PROJECT}.mimiciv_3_1_icu.inputevents` ie JOIN `{DATA_PROJECT}.mimiciv_3_1_icu.d_items` di ON ie.itemid=di.itemid JOIN `{DATA_PROJECT}.mimiciv_3_1_icu.icustays` icu ON ie.stay_id=icu.stay_id WHERE di.label LIKE '%{d}%' AND ie.starttime BETWEEN icu.intime AND TIMESTAMP_ADD(icu.intime,INTERVAL 24 HOUR)")
+        clauses.append(f"SELECT DISTINCT ie.hadm_id FROM {MIMIC_SCHEMA}.inputevents_mv ie JOIN {MIMIC_SCHEMA}.d_items di ON ie.itemid=di.itemid JOIN {MIMIC_SCHEMA}.icustays icu ON ie.icustay_id=icu.icustay_id WHERE di.label ILIKE '%{d}%' AND ie.starttime BETWEEN icu.intime AND icu.intime + INTERVAL '24 hours'")
+        clauses.append(f"SELECT DISTINCT ie.hadm_id FROM {MIMIC_SCHEMA}.inputevents_cv ie JOIN {MIMIC_SCHEMA}.d_items di ON ie.itemid=di.itemid JOIN {MIMIC_SCHEMA}.icustays icu ON ie.icustay_id=icu.icustay_id WHERE di.label ILIKE '%{d}%' AND ie.charttime BETWEEN icu.intime AND icu.intime + INTERVAL '24 hours'")
+    # MIMIC-III prescriptions has only DATE-precision `startdate` (midnight), so the lower bound is truncated to the admission day; a raw `>= intime` would drop every same-day order.
    for d in t.get("drugs_rx",[]):
-        clauses.append(f"SELECT DISTINCT p.hadm_id FROM `{DATA_PROJECT}.mimiciv_3_1_hosp.prescriptions` p JOIN `{DATA_PROJECT}.mimiciv_3_1_icu.icustays` icu ON p.hadm_id=icu.hadm_id WHERE p.drug LIKE '%{d}%' AND p.starttime BETWEEN icu.intime AND TIMESTAMP_ADD(icu.intime,INTERVAL 24 HOUR)")
+        clauses.append(f"SELECT DISTINCT p.hadm_id FROM {MIMIC_SCHEMA}.prescriptions p JOIN {MIMIC_SCHEMA}.icustays icu ON p.hadm_id=icu.hadm_id WHERE p.drug ILIKE '%{d}%' AND p.startdate BETWEEN DATE_TRUNC('day', icu.intime) AND icu.intime + INTERVAL '24 hours'")
    if not clauses: return set()
-    return set(r["hadm_id"] for r in run_bq(" UNION DISTINCT ".join(clauses)))
+    return set(r["hadm_id"] for r in run_pg(" UNION ".join(clauses)))  # plain UNION already de-duplicates in PostgreSQL

 def run_loo(test_pts,ref_pts,therapy_hids,by_gal,label):
    """Returns list of {a, p, g, hadm_id} — includes hadm_id for fair comparison."""
@@ -199,7 +297,9 @@ def main():
    print(f"  1: Calibration  2: Fair Comparison  3: Summary")
    print(f"{'='*76}\n")

-    all_pts=load_all_icu();assign_galaxies(all_pts)
+    all_pts=load_all_icu();
+    all_pts = [p for p in all_pts if p.get("saps_prob") is not None]
+    assign_galaxies(all_pts)
    by_gal=defaultdict(list)
    for p in all_pts:
        if p["galaxy"]: by_gal[p["galaxy"]].append(p)
@@ -214,10 +314,33 @@ def main():
    results = {}

    # ── Full pop LOO ───────────────────────────────────────────────
-    print(f"\n  Running full-population LOO...")
+    print(f"\n  Running s-population LOO...")
    p_full = run_loo(all_pts, all_pts, therapy_hids, by_gal, "FULL-POP")
    a_td = auc_fast(p_full)
-    print(f"  * TD full pop: AUC {a_td:.4f}  n={len(p_full):,}")
+    print(f"  * TD s-pop: AUC {a_td:.4f}  n={len(p_full):,}")
+
+    # SAPS score for comparison
+    saps_preds_all = [{"a": p["died"], "p": p["saps_prob"]}
+                      for p in all_pts if p.get("saps_prob") is not None]
+    a_td_s = auc_fast(saps_preds_all)
+    print(f"  * SAPS-II s-pop: AUC {a_td_s:.4f}  n={len(saps_preds_all):,}")
+
+    saps_preds_by_gal = {
+        gal: [{"a": p["died"], "p": p["saps_prob"]}
+              for p in gal_pts if p.get("saps_prob") is not None]
+        for gal, gal_pts in by_gal.items()
+    }
+    a_td_sg = auc_fast_gal(saps_preds_by_gal)
+    ns = str([len(gal_pts) for (gal, gal_pts) in saps_preds_by_gal.items()])
+    print(f"  * SAPS-II by gal: AUC {a_td_sg:.4f}  n={ns}")
+
+    td_preds_by_gal = defaultdict(list)
+    for pred in p_full:
+        if pred.get("g"):
+            td_preds_by_gal[pred["g"]].append({"a": pred["a"], "p": pred["p"]})
+    a_td_g = auc_fast_gal(dict(td_preds_by_gal))
+    ns = str([len(gal_pts) for (gal, gal_pts) in td_preds_by_gal.items()])
+    print(f"  * TD by gal:      AUC {a_td_g:.4f}  n={ns}")

    # ══════════════════════════════════════════════════════════════
    # 1. CALIBRATION (10 Dezile)
--- a/paper3_phase5b_icd_sepsis_prevalence.py
+++ b/paper3_phase5b_icd_sepsis_prevalence.py
@@ -0,0 +1,447 @@
+"""
+ICD-coded sepsis prevalence — inclusion vs exclusion cohort
+═════════════════════════════════════════════════════════════════════════════
+Companion analysis for paper3_phase5b_refined.py.
+
+The Phase 5b cohort SQL (`q_cohort()`) keeps an ICU stay only when ALL of:
+
+    1. sepsis3.sepsis3 = TRUE                  (Sepsis-3 derived flag)
+    2. ICU length-of-stay ≥ 24h                (H_SNAPSHOT)
+    3. sapsii IS NOT NULL AND sapsii ≥ 48      (SAPS-II Q4)
+
+This script computes the prevalence of *explicit* ICD-coded sepsis on:
+    (a) the INCLUSION cohort  — stays that satisfy all three filters,
+    (b) the EXCLUSION cohort  — every other ICU stay in MIMIC-III (fails
+        at least one of the three filters above), and
+    (c) ALL ICU STAYS         — the full mimiciii.icustays universe
+        (= inclusion ∪ exclusion).
+
+Note: stay-level totals partition cleanly (incl + excl = all), but at the
+admission and subject level a single hadm_id / subject can have ICU stays
+in both buckets, so the "all" row is computed via SQL GROUPING SETS rather
+than by summing.
+
+ICD-coded sepsis is evaluated at the hospital-admission level (a stay is
+"ICD-sepsis +" if its parent hadm_id carries any of the codes below):
+
+    - Explicit sepsis (ICD-9):   995.91, 995.92, 785.52
+       (matches paper2_festung_teil3.py SYNDROME_ICDS["sepsis"]["icd_9"])
+
+    - Angus septicemia (ICD-9):  038.*   (any 038-prefixed code)
+
+    - Any of the above (union)
+
+For each cohort × definition combination we report:
+    n positives, prevalence, Wilson 95% CI.
+The inclusion vs exclusion difference is reported with a normal-approx
+95% CI and a Pearson χ² statistic (no scipy dependency).
+
+Usage:
+  python paper3_phase5b_icd_sepsis_prevalence.py
+"""
+
+import json, math, os, sys, time
+
+# Reuse the same env-var contract as paper3_phase5b_refined.py.
+PG_DSN         = os.environ.get("MIMIC_PG_DSN", "dbname=mimic3")   # libpq DSN handed to psycopg2.connect
+MIMIC_SCHEMA   = os.environ.get("MIMIC_SCHEMA",  "mimiciii")       # schema of the stock tables; interpolated verbatim into SQL
+DERIVED_SCHEMA = os.environ.get("DERIVED_SCHEMA", MIMIC_SCHEMA)    # schema of sepsis3 / sapsii; defaults to MIMIC_SCHEMA
+
+H_SNAPSHOT  = 24      # ICU LOS threshold, hours (matches paper3 phase 5b)
+SAPSII_MIN  = 48      # SAPS-II Q4 cutoff (matches paper3 phase 5b)
+OUT_FILE    = "paper3_phase5b_icd_sepsis_prevalence.json"
+
+# Explicit sepsis codes (ICD-9, MIMIC-III stores them WITHOUT decimal point):
+#   995.91  →  '99591'   Sepsis
+#   995.92  →  '99592'   Severe sepsis
+#   785.52  →  '78552'   Septic shock
+EXPLICIT_SEPSIS_ICD9 = ("99591", "99592", "78552")
+
+# Angus-style broad septicemia bucket: any ICD-9 code starting with 038 (used as a LIKE prefix).
+SEPTICEMIA_PREFIX = "038"
+
+
+_PG_CONN = None  # module-level cache for the single shared connection
+def _pg_conn():  # returns the cached connection, (re)connecting when it is unset or reports closed
+    global _PG_CONN
+    if _PG_CONN is None or getattr(_PG_CONN, "closed", 0):
+        import psycopg2  # imported lazily, only when a connection is actually opened
+        _PG_CONN = psycopg2.connect(PG_DSN)
+        _PG_CONN.set_session(readonly=True, autocommit=True)  # read-only session, no explicit transactions
+    return _PG_CONN
+
+
+def run_pg(sql, label=""):
+    """Run `sql`, print a row-count/elapsed-time line tagged with `label`, and return the rows as dicts."""
+    from psycopg2.extras import RealDictCursor
+    conn, t0 = _pg_conn(), time.time()   # clock starts once the connection is available
+    with conn.cursor(cursor_factory=RealDictCursor) as cursor:
+        cursor.execute(sql)
+        rows = list(map(dict, cursor.fetchall())) if cursor.description else []
+    print(f"    {label:40s}  {len(rows):>8,d} rows  ({time.time()-t0:.1f}s)")
+    return rows
+
+
+# ── SQL ─────────────────────────────────────────────────────────────────────
+#
+# One pass: classify every ICU stay as "inclusion" or "exclusion" using the
+# Phase 5b filter, then left-join the two ICD code sets at the hadm_id level
+# and aggregate. This mirrors q_cohort() exactly for the inclusion bucket
+# (sepsis3 = TRUE AND LOS ≥ 24h AND sapsii ≥ 48), and treats every other
+# ICU stay in mimiciii.icustays as exclusion. GROUPING SETS adds a third
+# row (cohort = NULL → 'all') aggregated over the full ICU universe so that
+# admission- and subject-level distinct counts are correct (a single hadm_id
+# may straddle both buckets, so we cannot just sum incl + excl).
+
+def q_prevalence():  # cohort × ICD-sepsis prevalence SQL; LIKE uses a plain % because run_pg() executes without bind parameters
+    explicit = ",".join(f"'{c}'" for c in EXPLICIT_SEPSIS_ICD9)  # quoted IN-list built from the module constant
+    return f"""
+WITH icu AS (
+    SELECT icu.icustay_id,
+           icu.hadm_id,
+           icu.subject_id,
+           EXTRACT(EPOCH FROM (icu.outtime - icu.intime)) / 3600.0 AS los_h,
+           COALESCE(s3.sepsis3, FALSE) AS is_sepsis3,
+           saps.sapsii AS sapsii
+      FROM {MIMIC_SCHEMA}.icustays icu
+      LEFT JOIN {DERIVED_SCHEMA}.sepsis3 s3   ON s3.icustay_id   = icu.icustay_id
+      LEFT JOIN {DERIVED_SCHEMA}.sapsii  saps ON saps.icustay_id = icu.icustay_id
+     WHERE icu.icustay_id IS NOT NULL
+       AND icu.hadm_id    IS NOT NULL
+),
+classified AS (
+    SELECT icustay_id, hadm_id, subject_id,
+           is_sepsis3, los_h, sapsii,
+           CASE WHEN is_sepsis3 = TRUE
+                     AND los_h  >= {H_SNAPSHOT}
+                     AND sapsii IS NOT NULL
+                     AND sapsii >= {SAPSII_MIN}
+                THEN 'inclusion' ELSE 'exclusion' END AS cohort
+      FROM icu
+),
+explicit_sepsis AS (
+    SELECT DISTINCT hadm_id
+      FROM {MIMIC_SCHEMA}.diagnoses_icd
+     WHERE icd9_code IN ({explicit})
+),
+septicemia AS (
+    SELECT DISTINCT hadm_id
+      FROM {MIMIC_SCHEMA}.diagnoses_icd
+     WHERE icd9_code LIKE '{SEPTICEMIA_PREFIX}%'
+)
+SELECT
+    COALESCE(c.cohort, 'all')                   AS cohort,
+    COUNT(*)                                    AS n_stays,
+    COUNT(DISTINCT c.hadm_id)                   AS n_admissions,
+    COUNT(DISTINCT c.subject_id)                AS n_subjects,
+
+    SUM(CASE WHEN e.hadm_id IS NOT NULL
+             THEN 1 ELSE 0 END)                 AS n_stays_explicit,
+    COUNT(DISTINCT CASE WHEN e.hadm_id IS NOT NULL
+                        THEN c.hadm_id END)     AS n_adm_explicit,
+
+    SUM(CASE WHEN s.hadm_id IS NOT NULL
+             THEN 1 ELSE 0 END)                 AS n_stays_septicemia,
+    COUNT(DISTINCT CASE WHEN s.hadm_id IS NOT NULL
+                        THEN c.hadm_id END)     AS n_adm_septicemia,
+
+    SUM(CASE WHEN e.hadm_id IS NOT NULL
+              OR s.hadm_id IS NOT NULL
+             THEN 1 ELSE 0 END)                 AS n_stays_any,
+    COUNT(DISTINCT CASE WHEN e.hadm_id IS NOT NULL
+                          OR s.hadm_id IS NOT NULL
+                        THEN c.hadm_id END)     AS n_adm_any
+  FROM classified c
+  LEFT JOIN explicit_sepsis e ON e.hadm_id = c.hadm_id
+  LEFT JOIN septicemia      s ON s.hadm_id = c.hadm_id
+ GROUP BY GROUPING SETS ((c.cohort), ())
+"""
+
+
+def q_exclusion_breakdown():
+    """Build SQL counting, over ALL ICU stays (n_total), how many fail each inclusion filter; counts are
+    non-exclusive (a stay can violate >1 criterion) and a stay with NULL los_h is not counted in n_los_short."""
+    return f"""
+WITH icu AS (
+    SELECT icu.icustay_id,
+           EXTRACT(EPOCH FROM (icu.outtime - icu.intime)) / 3600.0 AS los_h,
+           COALESCE(s3.sepsis3, FALSE) AS is_sepsis3,
+           saps.sapsii AS sapsii
+      FROM {MIMIC_SCHEMA}.icustays icu
+      LEFT JOIN {DERIVED_SCHEMA}.sepsis3 s3   ON s3.icustay_id   = icu.icustay_id
+      LEFT JOIN {DERIVED_SCHEMA}.sapsii  saps ON saps.icustay_id = icu.icustay_id
+     WHERE icu.icustay_id IS NOT NULL
+       AND icu.hadm_id    IS NOT NULL
+)
+SELECT
+    COUNT(*)                                                AS n_total,
+    SUM(CASE WHEN is_sepsis3 = FALSE
+             THEN 1 ELSE 0 END)                             AS n_not_sepsis3,
+    SUM(CASE WHEN los_h < {H_SNAPSHOT}
+             THEN 1 ELSE 0 END)                             AS n_los_short,
+    SUM(CASE WHEN sapsii IS NULL
+             THEN 1 ELSE 0 END)                             AS n_sapsii_null,
+    SUM(CASE WHEN sapsii IS NOT NULL AND sapsii < {SAPSII_MIN}
+             THEN 1 ELSE 0 END)                             AS n_sapsii_below
+  FROM icu
+"""
+
+
+def q_icd_sepsis_waterfall():
+    """Build SQL for a waterfall over ICU stays whose admission carries an ICD-coded sepsis diagnosis
+    (explicit codes ∪ 038.* septicemia): each stay is bucketed by the first inclusion gate it fails, or n_pass.
+    NOTE(review): a sepsis3 stay with NULL los_h matches no bucket — confirm outtime is never NULL."""
+    explicit = ",".join(f"'{c}'" for c in EXPLICIT_SEPSIS_ICD9)
+    return f"""
+WITH icu AS (
+    SELECT icu.icustay_id, icu.hadm_id,
+           EXTRACT(EPOCH FROM (icu.outtime - icu.intime)) / 3600.0 AS los_h,
+           COALESCE(s3.sepsis3, FALSE) AS is_sepsis3,
+           saps.sapsii AS sapsii
+      FROM {MIMIC_SCHEMA}.icustays icu
+      LEFT JOIN {DERIVED_SCHEMA}.sepsis3 s3   ON s3.icustay_id   = icu.icustay_id
+      LEFT JOIN {DERIVED_SCHEMA}.sapsii  saps ON saps.icustay_id = icu.icustay_id
+     WHERE icu.icustay_id IS NOT NULL
+       AND icu.hadm_id    IS NOT NULL
+),
+icd_pos AS (
+    SELECT DISTINCT hadm_id
+      FROM {MIMIC_SCHEMA}.diagnoses_icd
+     WHERE icd9_code IN ({explicit})
+        OR icd9_code LIKE '{SEPTICEMIA_PREFIX}%'
+),
+icd_stays AS (
+    SELECT i.* FROM icu i JOIN icd_pos x ON x.hadm_id = i.hadm_id
+)
+SELECT
+    COUNT(*)                                                  AS n_total_stays,
+    COUNT(DISTINCT hadm_id)                                   AS n_total_adm,
+
+    -- Waterfall: each stay is counted exactly once, in the order of the
+    -- inclusion filter (sepsis3 → LOS → sapsii NULL → sapsii < 48 → pass).
+    SUM(CASE WHEN NOT is_sepsis3
+             THEN 1 ELSE 0 END)                               AS n_fail_sepsis3,
+    SUM(CASE WHEN is_sepsis3 AND los_h < {H_SNAPSHOT}
+             THEN 1 ELSE 0 END)                               AS n_fail_los,
+    SUM(CASE WHEN is_sepsis3 AND los_h >= {H_SNAPSHOT}
+                              AND sapsii IS NULL
+             THEN 1 ELSE 0 END)                               AS n_fail_sapsii_null,
+    SUM(CASE WHEN is_sepsis3 AND los_h >= {H_SNAPSHOT}
+                              AND sapsii IS NOT NULL
+                              AND sapsii < {SAPSII_MIN}
+             THEN 1 ELSE 0 END)                               AS n_fail_sapsii_below,
+    SUM(CASE WHEN is_sepsis3 AND los_h >= {H_SNAPSHOT}
+                              AND sapsii IS NOT NULL
+                              AND sapsii >= {SAPSII_MIN}
+             THEN 1 ELSE 0 END)                               AS n_pass
+  FROM icd_stays
+"""
+
+
+# ── Stats helpers (no scipy) ────────────────────────────────────────────────
+def wilson_ci(k, n, z=1.959963984540054):
+    """Return the Wilson score interval (lo, hi) for k/n, clipped to [0, 1]; (nan, nan) if n <= 0."""
+    if n <= 0:
+        return (float("nan"),) * 2
+    p_hat, z2 = k / n, z * z
+    scale = 1.0 + z2/n
+    mid    = (p_hat + z2/(2.0*n)) / scale
+    spread = (z * math.sqrt(p_hat*(1.0 - p_hat)/n + z2/(4.0*n*n))) / scale
+    return (max(0.0, mid - spread), min(1.0, mid + spread))
+
+
+def diff_ci(k1, n1, k2, n2, z=1.959963984540054):
+    """Return (delta, lo, hi): unpooled normal-approximation interval for k1/n1 − k2/n2; all nan if a group is empty."""
+    if n1 <= 0 or n2 <= 0:
+        return tuple(float("nan") for _ in range(3))
+    rate_a, rate_b = k1 / n1, k2 / n2
+    delta = rate_a - rate_b
+    margin = z * math.sqrt(rate_a*(1.0 - rate_a)/n1 + rate_b*(1.0 - rate_b)/n2)
+    return (delta, delta - margin, delta + margin)
+
+
+def chi2_2x2(k1, n1, k2, n2):
+    """Pearson chi-square (no continuity correction) for k1/n1 vs k2/n2 as a 2×2 table.
+    Returns (statistic, 1); the statistic is nan when both groups are empty.
+    The 5% critical value for 1 degree of freedom is 3.841."""
+    total = n1 + n2
+    if total == 0:
+        return (float("nan"), 1)
+    rows = ((k1, n1 - k1), (k2, n2 - k2))       # per cohort: (sepsis+, sepsis−)
+    col_sums = [sum(col) for col in zip(*rows)]
+    stat = 0.0
+    for row in rows:
+        for observed, col_sum in zip(row, col_sums):
+            expected = sum(row) * col_sum / total
+            if expected > 0:                    # empty margins contribute nothing
+                stat += (observed - expected) ** 2 / expected
+    return (stat, 1)
+
+
+def fmt_pct(p):    return f"{100.0*p:5.2f}%"
+def fmt_ci(lo,hi):
+    """Format the fractions `lo` and `hi` as a bracketed percentage range."""
+    lower_pct = 100.0*lo
+    upper_pct = 100.0*hi
+    return f"[{lower_pct:5.2f}, {upper_pct:5.2f}]"
+
+
+# ── Main ────────────────────────────────────────────────────────────────────
+def main():
+    """Report ICD-coded sepsis prevalence for the inclusion/exclusion cohorts.
+
+    Runs the three queries (cohort prevalence, exclusion breakdown,
+    ICD-sepsis waterfall), prints the cohort sizes, the two diagnostic
+    breakdowns and the prevalence tables per ICU stay and per admission,
+    then writes everything to OUT_FILE as JSON.  Exits with status 1 when
+    the prevalence query returns no rows.
+    """
+    print("\n" + "█"*78)
+    print("  ICD-coded sepsis prevalence — Phase 5b inclusion vs exclusion")
+    print("█"*78)
+    # NOTE(review): this echoes the full DSN, including any password= part.
+    print(f"\n  PG DSN:         {PG_DSN}")
+    print(f"  MIMIC schema:   {MIMIC_SCHEMA}")
+    print(f"  Derived schema: {DERIVED_SCHEMA}")
+    print(f"  Inclusion:      sepsis3=TRUE AND LOS≥{H_SNAPSHOT}h AND SAPS-II≥{SAPSII_MIN}")
+    print(f"  Explicit ICD-9: {', '.join(EXPLICIT_SEPSIS_ICD9)} "
+          f"(995.91 / 995.92 / 785.52)")
+    print(f"  Septicemia:     ICD-9 {SEPTICEMIA_PREFIX}.*")
+
+    print(f"\n[1] Querying MIMIC-III...")
+    rows  = run_pg(q_prevalence(),            "cohort × ICD prevalence")
+    bkdwn = run_pg(q_exclusion_breakdown(),   "exclusion breakdown")
+    wfall = run_pg(q_icd_sepsis_waterfall(),  "ICD-sepsis waterfall")
+
+    # Only the prevalence query is mandatory; the two diagnostic sections
+    # below are skipped individually when their query returned nothing.
+    if not rows:
+        print("\n[ERROR] no rows returned. Check PG_DSN / schema permissions.")
+        sys.exit(1)
+
+    # Index the prevalence rows by cohort label; a missing cohort becomes
+    # an empty dict that the loop below fills with zeros.
+    by = {r["cohort"]: r for r in rows}
+    incl = by.get("inclusion", {})
+    excl = by.get("exclusion", {})
+    allc = by.get("all",       {})
+
+    INT_KEYS = ("n_stays","n_admissions","n_subjects",
+                "n_stays_explicit","n_adm_explicit",
+                "n_stays_septicemia","n_adm_septicemia",
+                "n_stays_any","n_adm_any")
+    # Normalise every count to a plain int (NULL / missing → 0), in place.
+    for c in (incl, excl, allc):
+        for k in INT_KEYS:
+            c[k] = int(c.get(k) or 0)
+
+    COHORTS = (("inclusion", incl), ("exclusion", excl), ("all", allc))
+
+    # ── [2] Cohort sizes ──────────────────────────────────────────────────
+    print(f"\n[2] Cohort sizes")
+    print(f"    {'cohort':12s}  {'stays':>8s}  {'admissions':>11s}  {'subjects':>9s}")
+    for label, c in COHORTS:
+        print(f"    {label:12s}  {c['n_stays']:>8,d}  "
+              f"{c['n_admissions']:>11,d}  {c['n_subjects']:>9,d}")
+
+    # ── [3] Why an ICU stay was excluded ──────────────────────────────────
+    if bkdwn:
+        b = bkdwn[0]
+        print(f"\n[3] Exclusion breakdown (non-exclusive: a stay can fail >1 filter)")
+        n_total = int(b['n_total'] or 0)
+        for lbl, k in (("not Sepsis-3",          "n_not_sepsis3"),
+                       (f"ICU LOS < {H_SNAPSHOT}h",  "n_los_short"),
+                       ("SAPS-II is NULL",       "n_sapsii_null"),
+                       (f"SAPS-II < {SAPSII_MIN}",   "n_sapsii_below")):
+            n = int(b[k] or 0)
+            pct = 100.0*n/n_total if n_total else 0.0
+            print(f"    {lbl:24s}  {n:>8,d}  ({pct:5.2f}% of all ICU stays)")
+        print(f"    {'all ICU stays':24s}  {n_total:>8,d}")
+
+    # ── [3b] ICD-sepsis-positive waterfall ────────────────────────────────
+    # Diagnostic: of the ICD-coded sepsis stays (explicit ∪ 038.*), which
+    # inclusion gate eliminated them?  Mutually exclusive: each stay is
+    # counted in the FIRST gate that would reject it, walking in the
+    # inclusion-filter order (sepsis3 → LOS → SAPS-II NULL → SAPS-II<48).
+    if wfall:
+        w = wfall[0]
+        n_w = int(w["n_total_stays"] or 0)
+        n_adm = int(w["n_total_adm"]   or 0)
+        print(f"\n[3b] ICD-sepsis-positive waterfall (mutually exclusive,"
+              f" inclusion-filter order)")
+        print(f"     {'gate':28s}  {'n stays':>9s}  {'pct':>6s}  cumulative")
+        cum = 0
+        steps = (
+            ("rejected: not Sepsis-3",      "n_fail_sepsis3"),
+            (f"rejected: LOS < {H_SNAPSHOT}h",  "n_fail_los"),
+            ("rejected: SAPS-II is NULL",   "n_fail_sapsii_null"),
+            (f"rejected: SAPS-II < {SAPSII_MIN}",  "n_fail_sapsii_below"),
+            (f"PASS (= inclusion)",         "n_pass"),
+        )
+        for lbl, kn in steps:
+            n = int(w[kn] or 0)
+            cum += n
+            pct = 100.0*n/n_w if n_w else 0.0
+            print(f"     {lbl:28s}  {n:>9,d}  {pct:5.2f}%  {cum:>9,d}")
+        print(f"     {'TOTAL ICD-sepsis stays':28s}  {n_w:>9,d}  "
+              f"({n_adm:,} admissions)")
+
+    # ── [4] Prevalence per definition ─────────────────────────────────────
+    # Each entry: (display name, stay-level count key, admission-level count key).
+    DEFS = (
+        ("Explicit sepsis (995.91 / 995.92 / 785.52)",
+         "n_stays_explicit",   "n_adm_explicit"),
+        ("Angus septicemia (038.*)",
+         "n_stays_septicemia", "n_adm_septicemia"),
+        ("Any of the above (union)",
+         "n_stays_any",        "n_adm_any"),
+    )
+
+    def _table(title, denom_key_n, denom_key_k):
+        """Print the prevalence table for one denominator and return its rows.
+
+        `denom_key_n` is the cohort key used as denominator N.
+        `denom_key_k` selects the numerator: "n_stays" picks the
+        stay-level count of each definition, anything else the
+        admission-level count.  Returns a list of dicts: one per
+        definition × cohort, plus one inclusion-minus-exclusion delta row
+        per definition.
+        """
+        print(f"\n{title}")
+        print(f"    {'definition':45s}  {'cohort':10s}  "
+              f"{'n+':>7s}  {'N':>7s}  {'prev':>7s}  {'95% CI (Wilson)':>18s}")
+        out = []
+        for name, sk, ak in DEFS:
+            kkey = sk if denom_key_k == "n_stays" else ak
+            for label, c in COHORTS:
+                k_, n_ = c[kkey], c[denom_key_n]
+                p = k_/n_ if n_ else float("nan")
+                lo, hi = wilson_ci(k_, n_)
+                print(f"    {name:45s}  {label:10s}  "
+                      f"{k_:>7,d}  {n_:>7,d}  {fmt_pct(p):>7s}  "
+                      f"{fmt_ci(lo,hi):>18s}")
+                out.append({"definition": name, "cohort": label,
+                            "k": k_, "n": n_, "prevalence": p,
+                            "ci_lo": lo, "ci_hi": hi})
+
+            # Inclusion vs exclusion comparison (the "all" row is just a
+            # weighted average of the two so a Δ against it isn't meaningful).
+            k1, n1 = incl[kkey], incl[denom_key_n]
+            k2, n2 = excl[kkey], excl[denom_key_n]
+            d, dlo, dhi = diff_ci(k1, n1, k2, n2)
+            chi2, dof  = chi2_2x2(k1, n1, k2, n2)
+            # 3.841 is the 5% critical value quoted in chi2_2x2's docstring.
+            sig = "p<0.05" if chi2 > 3.841 else "n.s."
+            print(f"    {'  Δ (incl − excl)':45s}  {'':10s}  "
+                  f"{'':>7s}  {'':>7s}  {fmt_pct(d):>7s}  "
+                  f"{fmt_ci(dlo,dhi):>18s}   χ²={chi2:6.2f} ({sig})")
+            out.append({"definition": name, "cohort": "delta_incl_minus_excl",
+                        "delta": d, "ci_lo": dlo, "ci_hi": dhi,
+                        "chi2": chi2, "dof": dof})
+        return out
+
+    # Both arguments are the same key in each call: it names the
+    # denominator and, through the "n_stays" test in _table, the numerator.
+    results = {
+        "by_stays":      _table("[4] ICD-coded sepsis prevalence (denominator = ICU STAYS)",
+                                "n_stays", "n_stays"),
+        "by_admissions": _table("[5] ICD-coded sepsis prevalence (denominator = ADMISSIONS)",
+                                "n_admissions", "n_admissions"),
+    }
+
+    # ── Save ────────────────────────────────────────────────────────────
+    output = {
+        "filters": {
+            "h_snapshot_hours": H_SNAPSHOT,
+            "sapsii_min": SAPSII_MIN,
+            "explicit_icd9": list(EXPLICIT_SEPSIS_ICD9),
+            "septicemia_prefix": SEPTICEMIA_PREFIX,
+        },
+        "cohorts": {
+            "inclusion": {k: incl[k] for k in INT_KEYS},
+            "exclusion": {k: excl[k] for k in INT_KEYS},
+            "all":       {k: allc[k] for k in INT_KEYS},
+        },
+        "exclusion_breakdown":      (bkdwn[0] if bkdwn else None),
+        "icd_sepsis_waterfall":     (wfall[0] if wfall else None),
+        "results": results,
+    }
+    # default=str stringifies any value json cannot encode natively
+    # (the raw query rows are stored as returned by run_pg).
+    with open(OUT_FILE, "w") as f:
+        json.dump(output, f, indent=2, default=str)
+    print(f"\n  → Saved: {OUT_FILE}")
+    print("\n" + "█"*78 + "\n")
+
+
+if __name__ == "__main__":
+    main()
--- a/paper3_phase5b_refined.py
+++ b/paper3_phase5b_refined.py
@@ -0,0 +1,619 @@
+"""
+PAPER 3 — PHASE 5b: 5-TERM FORMULA + ORDINAL BEDSIDE SCORE
+═════════════════════════════════════════════════════════════════════════════
+Two refinements from Phase 5:
+
+  1. DROP log(1 + ne_auc). Its bootstrap CI crossed zero, and it is
+     collinear with I(ne_at_h24 > 0.08). Simpler 5-term formula.
+
+  2. BEDSIDE SCORE via ORDINAL BINNING (not coefficient rounding).
+     Each of the 5 remaining terms gets mapped to 0–4 points based on
+     clinically meaningful thresholds. SOFA-style integer score:
+
+       Lactate h24         0 if <2.5  |  2 if 2.5–4     |  4 if >4
+       Oliguria (ml/kg)    0 if ≥20   |  1 if 10–20     |  2 if <10
+       NE at h24           0 if ≤0.08 |  3 if >0.08
+       HR deviation        0 if 70–100|  1 if 60–70/100-120 | 2 if <60/>120
+       Pressor MAP/NE      0 if >3000 |  1 if 1000–3000 |  2 if <1000
+
+     Max 13 points. Bedside-ready.
+
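+Runs the Phase 5 validation steps implemented below: 5-fold CV (formula,
+ordinal score and SAPS-II baseline), bootstrap coefficient CIs and holdout
+calibration (formula), score distribution and risk bands (ordinal score).
+The multi-seed and subgroup steps named for Phase 5 are not implemented in
+this script.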
+
+Usage:
+  python paper3_phase5b_refined.py
+"""
+
+import json, os, sys, math, time, random
+from collections import defaultdict
+
+# PostgreSQL connection string (libpq DSN). Override with env var.
+# e.g. "host=localhost port=5432 dbname=mimic user=postgres password=..."
+PG_DSN         = os.environ.get("MIMIC_PG_DSN", "dbname=mimic3")
+# Schema holding the stock MIMIC-III v1.3 tables (admissions, icustays,
+# labevents, chartevents, inputevents_mv, inputevents_cv, outputevents,
+# patients, d_items, ...).
+MIMIC_SCHEMA   = os.environ.get("MIMIC_SCHEMA", "mimiciii")
+# Schema holding the locally built derived tables (sapsii, sepsis3,
+# norepinephrine_dose, weight_durations, ...); see sql/schemas.sql.
+# Defaults to the same schema as MIMIC-III itself.
+DERIVED_SCHEMA = os.environ.get("DERIVED_SCHEMA", MIMIC_SCHEMA)
+
+H_SNAPSHOT  = 24     # hours after ICU admission: min. stay length, output window, NE read-out
+H_PEAK_NE   = 12     # NOTE(review): not referenced anywhere in this script
+TRAIN_FRAC  = 0.70   # share of subjects in the training part of the calibration split
+N_SEEDS     = 10     # NOTE(review): not referenced anywhere in this script
+N_FOLDS     = 5      # number of cross-validation folds
+N_BOOTSTRAP = 1000   # number of bootstrap resamples for the coefficient CIs
+OUT_FILE    = "paper3_phase5b_refined.json"   # JSON results file written by main()
+
+LACTATE_ID  = 50813  # labevents itemid selected by q_lactate()
+# MAP: 52, 456, 6702 = CareVue; 220052, 220181, 225312 = MetaVision.
+MAP_ITEMIDS = [52, 456, 6702, 220052, 220181, 225312]
+# HR: 211 = CareVue; 220045 = MetaVision.
+HR_ITEMIDS  = [211, 220045]
+
+
+# Process-wide PostgreSQL connection, opened lazily by _pg_conn().
+_PG_CONN = None
+def _pg_conn():
+    """Return the shared connection, opening a new one when needed.
+
+    A connection is (re)opened from PG_DSN when none exists yet or the
+    cached one reports itself closed; each new session is configured
+    read-only with autocommit on.
+    """
+    global _PG_CONN
+    reusable = _PG_CONN is not None and not getattr(_PG_CONN, "closed", 0)
+    if reusable:
+        return _PG_CONN
+    import psycopg2
+    _PG_CONN = psycopg2.connect(PG_DSN)
+    _PG_CONN.set_session(readonly=True, autocommit=True)
+    return _PG_CONN
+
+
+def run_pg(sql, label=""):
+    """Execute `sql` on the shared connection and return the rows as dicts.
+
+    Prints one progress line with `label`, the row count and the elapsed
+    time.  Any exception (missing driver, connection failure, SQL error)
+    is reported as a `[PG ERROR]` line and an empty list is returned, so
+    callers cannot tell a failed query from an empty result.
+    """
+    try:
+        from psycopg2.extras import RealDictCursor
+        conn = _pg_conn()
+        started = time.time()
+        with conn.cursor(cursor_factory=RealDictCursor) as cur:
+            cur.execute(sql)
+            # Statements that produce no result set have no description.
+            if cur.description:
+                rows = [dict(rec) for rec in cur.fetchall()]
+            else:
+                rows = []
+        elapsed = time.time() - started
+        print(f"    {label:32s}  {len(rows):>8,d} rows  ({elapsed:.1f}s)")
+        return rows
+    except Exception as e:
+        # Deliberately best-effort: report and hand back an empty result.
+        print(f"[PG ERROR] {label}: {e}")
+        return []
+
+
+# ── Queries (PostgreSQL / MIMIC-III v1.3, SAPS Q4 pre-filtered) ────────────
+#
+# Notes on the port from BigQuery / MIMIC-IV:
+#   * `stay_id` (MIMIC-IV) is `icustay_id` in MIMIC-III; we alias to
+#     `stay_id` so the downstream Python is unchanged.
+#   * `mimiciv_3_1_icu.inputevents` (single table, mcg/kg/min) is split
+#     across `inputevents_mv` and `inputevents_cv` in MIMIC-III with
+#     different itemids and units.  The `norepinephrine_dose` table built
+#     by sql/build_sepsis3.sql already merges both eras and normalises
+#     rates to mcg/kg/min, so we use that instead of the raw inputs.
+#   * Weight in MIMIC-IV is read from chartevents itemids 226512/224639
+#     (MetaVision-only).  In MIMIC-III those itemids cover only the MV
+#     half of the cohort, so we use the `weight_durations` table built by
+#     sql/build_sepsis3.sql (admit + daily + neonate + echo, both eras).
+#   * `pat.anchor_age` (MIMIC-IV) → computed from `pat.dob` against
+#     `icu.intime`.  MIMIC-III shifts dob backwards by ~300 years for
+#     patients ≥89; we cap the result at 120.
+
+def q_cohort():
+    """Return the SQL that selects the study cohort.
+
+    Rows are ICU stays with sepsis3 = TRUE, an ICU length of stay of at
+    least H_SNAPSHOT hours and SAPS-II >= 48.  Columns: stay_id,
+    subject_id, intime, age (years, capped at 120), gender, sapsii, died
+    (admissions.hospital_expire_flag) and weight_kg.  weight_kg is the
+    smallest weight_durations value between 30 and 300 whose interval
+    overlaps the first 24 hours of the stay, or 75.0 when there is none.
+
+    NOTE(review): the SAPS-II cut-off 48 is a literal here and in the
+    other q_* builders of this script.
+    """
+    return f"""
+WITH weight_first AS (
+  SELECT wd.icustay_id, MIN(wd.weight) AS weight_kg
+  FROM {DERIVED_SCHEMA}.weight_durations wd
+  JOIN {MIMIC_SCHEMA}.icustays icu ON icu.icustay_id = wd.icustay_id
+  WHERE wd.weight BETWEEN 30 AND 300
+    AND wd.starttime <= icu.intime + INTERVAL '24 hours'
+    AND wd.endtime   >= icu.intime
+  GROUP BY wd.icustay_id
+)
+SELECT icu.icustay_id AS stay_id, icu.subject_id, icu.intime,
+    LEAST(120.0, EXTRACT(EPOCH FROM (icu.intime - pat.dob)) / 31556952.0) AS age,
+    pat.gender,
+    saps.sapsii, adm.hospital_expire_flag AS died,
+    COALESCE(wf.weight_kg, 75.0) AS weight_kg
+FROM {DERIVED_SCHEMA}.sepsis3 s3
+JOIN {MIMIC_SCHEMA}.icustays   icu ON icu.icustay_id = s3.icustay_id
+JOIN {MIMIC_SCHEMA}.admissions adm ON adm.hadm_id    = icu.hadm_id
+JOIN {MIMIC_SCHEMA}.patients   pat ON pat.subject_id = icu.subject_id
+LEFT JOIN {DERIVED_SCHEMA}.sapsii saps ON saps.icustay_id = icu.icustay_id
+LEFT JOIN weight_first wf             ON wf.icustay_id   = icu.icustay_id
+WHERE s3.sepsis3 = TRUE
+  AND EXTRACT(EPOCH FROM (icu.outtime - icu.intime)) / 3600.0 >= {H_SNAPSHOT}
+  AND saps.sapsii IS NOT NULL AND saps.sapsii >= 48
+"""
+
+
+def q_ne():
+    """Return the SQL that lists norepinephrine infusion intervals.
+
+    One row per norepinephrine_dose record with a positive rate that
+    starts within the first 30 hours after ICU admission, for stays with
+    sepsis3 = TRUE and SAPS-II >= 48.  Columns: stay_id, start_min and
+    end_min (minutes relative to icu.intime) and rate (vaso_rate).
+    Unlike q_cohort() there is no length-of-stay filter here.
+    """
+    return f"""
+SELECT nd.icustay_id AS stay_id,
+    EXTRACT(EPOCH FROM (nd.starttime - icu.intime)) / 60.0 AS start_min,
+    EXTRACT(EPOCH FROM (nd.endtime   - icu.intime)) / 60.0 AS end_min,
+    nd.vaso_rate AS rate
+FROM {DERIVED_SCHEMA}.norepinephrine_dose nd
+JOIN {MIMIC_SCHEMA}.icustays   icu  ON icu.icustay_id  = nd.icustay_id
+JOIN {DERIVED_SCHEMA}.sepsis3  s3   ON s3.icustay_id   = nd.icustay_id
+JOIN {DERIVED_SCHEMA}.sapsii   saps ON saps.icustay_id = nd.icustay_id
+WHERE s3.sepsis3 = TRUE AND saps.sapsii >= 48
+  AND nd.vaso_rate > 0
+  AND nd.starttime BETWEEN icu.intime AND icu.intime + INTERVAL '30 hours'
+"""
+
+
+def q_fluid_out():
+    """Return the SQL that totals charted output per stay.
+
+    Sums the positive outputevents values charted between ICU admission
+    and H_SNAPSHOT hours later, for stays with sepsis3 = TRUE and
+    SAPS-II >= 48.  Columns: stay_id, fluid_out_ml.
+
+    NOTE(review): there is no itemid filter, so every outputevents item
+    is included in the sum — confirm that this is the intended measure.
+    """
+    return f"""
+SELECT oe.icustay_id AS stay_id, SUM(oe.value) AS fluid_out_ml
+FROM {MIMIC_SCHEMA}.outputevents oe
+JOIN {MIMIC_SCHEMA}.icustays   icu  ON icu.icustay_id  = oe.icustay_id
+JOIN {DERIVED_SCHEMA}.sepsis3  s3   ON s3.icustay_id   = oe.icustay_id
+JOIN {DERIVED_SCHEMA}.sapsii   saps ON saps.icustay_id = icu.icustay_id
+WHERE s3.sepsis3 = TRUE AND saps.sapsii >= 48
+  AND oe.value > 0
+  AND oe.charttime BETWEEN icu.intime AND icu.intime + INTERVAL '{H_SNAPSHOT} hours'
+GROUP BY oe.icustay_id
+"""
+
+
+def q_vitals():
+    """Return the SQL that averages MAP and heart-rate charting around h24.
+
+    For stays with sepsis3 = TRUE and SAPS-II >= 48, averages the
+    positive chartevents values of every itemid in MAP_ITEMIDS and
+    HR_ITEMIDS charted between 20 and 28 hours after ICU admission.
+    The result has one row per (stay_id, itemid) with the mean in `val`,
+    so a stay can have several rows for the same vital sign.
+    """
+    # Comma-separated itemid list for the SQL IN (...) clause.
+    ids = ",".join(str(x) for x in MAP_ITEMIDS + HR_ITEMIDS)
+    return f"""
+SELECT ce.icustay_id AS stay_id, ce.itemid, AVG(ce.valuenum) AS val
+FROM {MIMIC_SCHEMA}.chartevents ce
+JOIN {MIMIC_SCHEMA}.icustays   icu  ON icu.icustay_id  = ce.icustay_id
+JOIN {DERIVED_SCHEMA}.sepsis3  s3   ON s3.icustay_id   = ce.icustay_id
+JOIN {DERIVED_SCHEMA}.sapsii   saps ON saps.icustay_id = icu.icustay_id
+WHERE s3.sepsis3 = TRUE AND saps.sapsii >= 48
+  AND ce.itemid IN ({ids})
+  AND ce.valuenum IS NOT NULL AND ce.valuenum > 0
+  AND ce.charttime BETWEEN icu.intime + INTERVAL '20 hours'
+                       AND icu.intime + INTERVAL '28 hours'
+GROUP BY ce.icustay_id, ce.itemid
+"""
+
+
+def q_lactate():
+    """Return the SQL that lists lactate measurements of the first 30 hours.
+
+    One row per non-NULL labevents value with itemid LACTATE_ID charted
+    between ICU admission and 30 hours later, for stays with
+    sepsis3 = TRUE and SAPS-II >= 48.  Columns: stay_id, offset_min
+    (minutes relative to icu.intime) and val.  labevents is joined to
+    icustays through hadm_id, so a measurement is attributed to every ICU
+    stay of that admission whose 30-hour window contains it.
+    """
+    return f"""
+SELECT icu.icustay_id AS stay_id,
+    EXTRACT(EPOCH FROM (le.charttime - icu.intime)) / 60.0 AS offset_min,
+    le.valuenum AS val
+FROM {MIMIC_SCHEMA}.labevents le
+JOIN {MIMIC_SCHEMA}.icustays   icu  ON icu.hadm_id     = le.hadm_id
+JOIN {DERIVED_SCHEMA}.sepsis3  s3   ON s3.icustay_id   = icu.icustay_id
+JOIN {DERIVED_SCHEMA}.sapsii   saps ON saps.icustay_id = icu.icustay_id
+WHERE s3.sepsis3 = TRUE AND saps.sapsii >= 48
+  AND le.itemid = {LACTATE_ID}
+  AND le.valuenum IS NOT NULL
+  AND le.charttime BETWEEN icu.intime AND icu.intime + INTERVAL '30 hours'
+"""
+
+
+# ── Primitives ──────────────────────────────────────────────────────────────
+def build_primitives(cohort, ne_rows, fout_rows, vital_rows, lac_rows):
+    """Derive the per-stay primitives consumed by the formula and the score.
+
+    Parameters are the cohort dict (stay_id → cohort row) and the row
+    lists returned by the NE, fluid-out, vitals and lactate queries.
+    Returns a dict stay_id → primitives with the keys
+
+      ne_at_h24           highest NE rate among intervals covering hour
+                          H_SNAPSHOT (0.0 when none covers it)
+      fluid_out_per_kg    summed output divided by weight (0 when the stay
+                          has no output row; weight defaults to 75.0)
+      lactate_h24         last lactate between hour 18 and 28, else the
+                          last lactate returned for the stay, else None
+      map_h24, hr_h24     mean of the per-itemid means, or None
+      pressor_resistance  map_h24 / (ne_at_h24 + 0.01), or None
+
+    The per-itemid vital means are combined with a plain arithmetic mean,
+    so the result does not depend on the order in which the database
+    returns the rows.
+    """
+    print(f"\n[3] Building primitives...")
+    ne_by = defaultdict(list)
+    for r in ne_rows: ne_by[r["stay_id"]].append(r)
+    fout = {r["stay_id"]: r["fluid_out_ml"] or 0 for r in fout_rows}
+
+    # Gather every per-itemid mean per stay and vital sign first; a running
+    # (cur + val)/2 would weight later rows more once there are three or
+    # more item ids for the same vital.
+    vital_vals = defaultdict(lambda: defaultdict(list))
+    for r in vital_rows:
+        if r["stay_id"] is None or r["val"] is None: continue
+        key = "map" if r["itemid"] in MAP_ITEMIDS else "hr"
+        vital_vals[r["stay_id"]][key].append(float(r["val"]))
+    vital_by = {
+        sid: {key: sum(vals) / len(vals) for key, vals in by_key.items()}
+        for sid, by_key in vital_vals.items()
+    }
+
+    lac_by = defaultdict(list)
+    for r in lac_rows: lac_by[r["stay_id"]].append(r)
+
+    prim = {}
+    for sid, c in cohort.items():
+        weight = c.get("weight_kg") or 75.0
+        events = ne_by.get(sid, [])
+        ne_h24 = 0.0
+        for ev in events:
+            sm, em, rate = ev["start_min"], ev["end_min"], ev["rate"]
+            if None in (sm, em, rate): continue
+            # The driver may hand back NUMERIC columns as Decimal, which
+            # cannot be mixed with the float arithmetic further down.
+            rate = float(rate)
+            if sm <= H_SNAPSHOT*60 <= em and rate > ne_h24: ne_h24 = rate
+
+        lacs = sorted(lac_by.get(sid, []), key=lambda x: x["offset_min"] or 0)
+        lac_h24 = None
+        if lacs:
+            # Prefer the latest value close to the snapshot hour; otherwise
+            # fall back to the latest value available for the stay.
+            near = [r for r in lacs if r["offset_min"] is not None
+                    and 18*60 <= r["offset_min"] <= 28*60]
+            lac_h24 = near[-1]["val"] if near else lacs[-1]["val"]
+
+        v = vital_by.get(sid, {})
+        map_h24 = v.get("map")
+        hr_h24  = v.get("hr")
+        # The 0.01 offset keeps the ratio finite for stays without NE.
+        pr = map_h24 / (ne_h24 + 0.01) if map_h24 is not None else None
+
+        prim[sid] = {
+            "ne_at_h24": ne_h24,
+            "fluid_out_per_kg": float(fout.get(sid, 0)) / float(weight),
+            "lactate_h24": lac_h24,
+            "map_h24": map_h24, "hr_h24": hr_h24,
+            "pressor_resistance": pr,
+        }
+    return prim
+
+
+# ── 5-term continuous formula ──────────────────────────────────────────────
+def formula_features(p):
+    """Turn one primitives dict into the five continuous model features.
+
+    Returns None when lactate, fluid output, heart rate or pressor
+    resistance is missing; a missing NE rate counts as 0.  The feature
+    order matches FEATURE_LABELS.
+    """
+    needed = ("lactate_h24", "fluid_out_per_kg", "hr_h24", "pressor_resistance")
+    values = [p.get(name) for name in needed]
+    if any(v is None for v in values):
+        return None
+    lactate, output_per_kg, heart_rate, resistance = values
+    ne_rate = p.get("ne_at_h24", 0.0) or 0.0
+
+    lactate_hinge = max(0, lactate - 2.5)
+    oliguria_hinge = max(0, 20 - output_per_kg)
+    ne_persistent = 1.0 if ne_rate > 0.08 else 0.0
+    hr_deviation = abs(heart_rate - 85) / 20
+    pressor_term = math.log(resistance + 1.0)
+    return [lactate_hinge, oliguria_hinge, ne_persistent,
+            hr_deviation, pressor_term]
+
+
+# Display labels for the model terms, in the same order as the list
+# returned by formula_features().
+FEATURE_LABELS = [
+    "max(0, lactate_h24 − 2.5)",
+    "max(0, 20 − fluid_out_per_kg)",
+    "I(ne_at_h24 > 0.08)",
+    "|hr_h24 − 85| / 20",
+    "log(pressor_resistance + 1)",
+]
+
+
+# ── Ordinal bedside score (0–13 pts) ───────────────────────────────────────
+def bedside_score(p):
+    """Compute the ordinal bedside score (0–13) for one primitives dict.
+
+    Returns `(total, breakdown)` where `breakdown` maps each of the five
+    components to its points, or `(None, None)` when lactate, fluid
+    output, heart rate or pressor resistance is missing.  A missing NE
+    rate counts as 0.
+    """
+    lactate = p.get("lactate_h24")
+    output_per_kg = p.get("fluid_out_per_kg")
+    heart_rate = p.get("hr_h24")
+    resistance = p.get("pressor_resistance")
+    if None in (lactate, output_per_kg, heart_rate, resistance):
+        return None, None
+    ne_rate = p.get("ne_at_h24", 0.0) or 0.0
+
+    # Lactate: 0 below 2.5, 2 up to and including 4.0, 4 above.
+    lactate_pts = 0 if lactate < 2.5 else (2 if lactate <= 4.0 else 4)
+
+    # Output per kg: 0 from 20 upwards, 1 from 10 upwards, 2 below 10.
+    oliguria_pts = 0 if output_per_kg >= 20 else (1 if output_per_kg >= 10 else 2)
+
+    # NE still running above 0.08 at the snapshot: 3 points.
+    ne_pts = 3 if ne_rate > 0.08 else 0
+
+    # Heart rate: 0 inside 70–100, 1 inside 60–120, 2 outside.
+    if 70 <= heart_rate <= 100:
+        hr_pts = 0
+    elif 60 <= heart_rate <= 120:
+        hr_pts = 1
+    else:
+        hr_pts = 2
+
+    # Pressor efficiency: 0 above 3000, 1 from 1000 upwards, 2 below.
+    pressor_pts = 0 if resistance > 3000 else (1 if resistance >= 1000 else 2)
+
+    breakdown = {
+        "lactate": lactate_pts, "oliguria": oliguria_pts,
+        "ne_persist": ne_pts, "hr_dev": hr_pts, "pressor_eff": pressor_pts,
+    }
+    total = lactate_pts + oliguria_pts + ne_pts + hr_pts + pressor_pts
+    return total, breakdown
+
+
+def build_matrix(ids, primitives, cohort):
+    """Assemble the model arrays for the stays in `ids`.
+
+    A stay is kept only when it has primitives, complete formula features,
+    a bedside score and a SAPS-II value.  Returns
+    `(X, y, scores, saps, valid)`: the feature matrix, the death labels
+    (0/1), the bedside scores, the SAPS-II values as floats — all numpy
+    arrays in the same row order — and the list of kept stay ids.
+    """
+    import numpy as np
+    kept = []
+    for sid in ids:
+        prim = primitives.get(sid)
+        if prim is None:
+            continue
+        feats = formula_features(prim)
+        points, _breakdown = bedside_score(prim)
+        if feats is None or points is None:
+            continue
+        stay = cohort[sid]
+        sapsii = stay.get("sapsii")
+        if sapsii is None:
+            continue
+        died = int(stay.get("died") or 0)
+        kept.append((feats, died, points, float(sapsii), sid))
+
+    X = np.array([rec[0] for rec in kept])
+    y = np.array([rec[1] for rec in kept])
+    scores = np.array([rec[2] for rec in kept])
+    saps = np.array([rec[3] for rec in kept])
+    valid = [rec[4] for rec in kept]
+    return X, y, scores, saps, valid
+
+
+# ── Main ────────────────────────────────────────────────────────────────────
+def main():
+    """Run the Phase 5b validation and write the results to OUT_FILE.
+
+    Steps: fetch the cohort and its measurements, build the primitives,
+    5-fold subject-level CV of the 5-term logistic formula (with the
+    ordinal score and SAPS-II as comparators), bootstrap CIs for the
+    coefficients, score distribution and risk bands of the ordinal score,
+    decile calibration of the formula on a 30% subject-level holdout, and
+    a headline summary.  Exits with status 1 when numpy / scikit-learn are
+    missing or when no usable stay is available.
+    """
+    try:
+        import numpy as np
+        from sklearn.linear_model import LogisticRegression
+        from sklearn.metrics import roc_auc_score, brier_score_loss
+        from sklearn.model_selection import KFold
+    except ImportError as e:
+        print(f"\nERROR: {e}")
+        print("Install: pip install scikit-learn numpy")
+        sys.exit(1)
+
+    # 5% critical value of χ² with 8 degrees of freedom (10 groups − 2).
+    HL_CRIT = 15.51
+
+    print("\n" + "█"*78)
+    print("  PAPER 3 — PHASE 5b: 5-term formula + ordinal bedside score")
+    print("█"*78)
+
+    print(f"\n[1] Fetching data...")
+    cohort_rows = run_pg(q_cohort(),    "cohort")
+    ne_rows     = run_pg(q_ne(),        "NE events")
+    fout_rows   = run_pg(q_fluid_out(), "Fluid out")
+    vital_rows  = run_pg(q_vitals(),    "Vitals h20-28")
+    lac_rows    = run_pg(q_lactate(),   "Lactate")
+
+    cohort = {r["stay_id"]: dict(r) for r in cohort_rows}
+    if not cohort:
+        # run_pg() returns [] for an empty result and after any error, so
+        # stop here instead of failing later inside numpy / scikit-learn.
+        print("\n[ERROR] no cohort rows returned. Check PG_DSN / schema permissions.")
+        sys.exit(1)
+    print(f"\n[2] Cohort: {len(cohort):,} SAPS Q4 sepsis-3")
+
+    primitives = build_primitives(cohort, ne_rows, fout_rows, vital_rows, lac_rows)
+    all_ids = [s for s in cohort if primitives.get(s)
+               and formula_features(primitives[s]) is not None]
+    print(f"    usable: {len(all_ids):,}")
+    if not all_ids:
+        print("\n[ERROR] no stay has all formula inputs; nothing to validate.")
+        sys.exit(1)
+
+    X_all, y_all, S_all, SAPS_all, _ = build_matrix(all_ids, primitives, cohort)
+    print(f"    mortality: {100*y_all.mean():.1f}%")
+    print(f"    SAPS-II:   mean={SAPS_all.mean():.1f}  "
+          f"min={SAPS_all.min():.0f}  max={SAPS_all.max():.0f}")
+
+    # ══════════════════════════════════════════════════════════════════════
+    # [4] 5-fold CV — 5-term formula
+    # ══════════════════════════════════════════════════════════════════════
+    print(f"\n[4] 5-fold CV — 5-term continuous formula")
+    # Folds are formed over subjects, so all stays of one subject land in
+    # the same fold.
+    subject_ids = sorted(set(cohort[s]["subject_id"] for s in all_ids))
+    rng = np.random.default_rng(42)
+    rng.shuffle(subject_ids)
+    subj_arr = np.array(subject_ids)
+    kf = KFold(n_splits=N_FOLDS, shuffle=False)
+
+    fold_aucs, fold_aucs_score, fold_aucs_saps, fold_coefs = [], [], [], []
+    for k, (tr_idx, te_idx) in enumerate(kf.split(subj_arr)):
+        tr_subs = set(subj_arr[tr_idx].tolist())
+        tr_ids = [s for s in all_ids if cohort[s]["subject_id"] in tr_subs]
+        te_ids = [s for s in all_ids if cohort[s]["subject_id"] not in tr_subs]
+        X_tr, y_tr, S_tr, _, _ = build_matrix(tr_ids, primitives, cohort)
+        X_te, y_te, S_te, SAPS_te, _ = build_matrix(te_ids, primitives, cohort)
+
+        # Standardise with training-fold statistics only.
+        mu = X_tr.mean(0); sd = X_tr.std(0) + 1e-9
+        lr = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
+        lr.fit((X_tr - mu) / sd, y_tr)
+        pred = lr.predict_proba((X_te - mu) / sd)[:, 1]
+        auc_f = roc_auc_score(y_te, pred)
+        auc_s = roc_auc_score(y_te, S_te)        # Ordinal score AUROC
+        auc_saps = roc_auc_score(y_te, SAPS_te)  # SAPS-II baseline AUROC
+
+        # Convert the standardised coefficients back to raw feature units.
+        raw_beta = lr.coef_[0] / sd
+        raw_int  = lr.intercept_[0] - sum(lr.coef_[0][i]*mu[i]/sd[i] for i in range(len(sd)))
+        fold_aucs.append(auc_f)
+        fold_aucs_score.append(auc_s)
+        fold_aucs_saps.append(auc_saps)
+        fold_coefs.append([raw_int] + list(raw_beta))
+        print(f"    fold {k+1}: formula AUROC={auc_f:.4f}  "
+              f"ordinal AUROC={auc_s:.4f}  SAPS-II AUROC={auc_saps:.4f}")
+
+    fa = np.array(fold_aucs); fas = np.array(fold_aucs_score)
+    fsap = np.array(fold_aucs_saps)
+    print(f"\n    5-term formula CV AUROC:  {fa.mean():.4f} ± {fa.std():.4f}  "
+          f"(range {fa.min():.4f}–{fa.max():.4f})")
+    print(f"    Ordinal score  CV AUROC:  {fas.mean():.4f} ± {fas.std():.4f}  "
+          f"(range {fas.min():.4f}–{fas.max():.4f})")
+    print(f"    SAPS-II        CV AUROC:  {fsap.mean():.4f} ± {fsap.std():.4f}  "
+          f"(range {fsap.min():.4f}–{fsap.max():.4f})")
+    print(f"    Ordinal loss:             {fa.mean()-fas.mean():+.4f}")
+    print(f"    Δ vs SAPS-II (formula):   {fa.mean()-fsap.mean():+.4f}")
+    print(f"    Δ vs SAPS-II (ordinal):   {fas.mean()-fsap.mean():+.4f}")
+
+    # SAPS-II is a fixed pre-computed score (no parameters fit here), so an
+    # in-cohort AUROC is not optimistic — report it on the full Kepler cohort
+    # for a single, directly comparable headline number.
+    saps_auc_overall = roc_auc_score(y_all, SAPS_all)
+    print(f"\n    SAPS-II AUROC on full Kepler cohort (n={len(y_all):,}): "
+          f"{saps_auc_overall:.4f}")
+
+    fc = np.array(fold_coefs)
+    print(f"\n    Coefficient stability (5 terms):")
+    names = ["intercept"] + FEATURE_LABELS
+    for i, name in enumerate(names):
+        col = fc[:, i]
+        # A "flip" is a sign change between consecutive folds.
+        flips = sum(1 for k in range(1, len(col)) if col[k]*col[k-1] < 0)
+        print(f"      {name:35s}  {col.mean():>+9.4f} ± {col.std():>7.4f}  flips={flips}")
+
+    # ══════════════════════════════════════════════════════════════════════
+    # [5] Bootstrap CIs — 5-term formula
+    # ══════════════════════════════════════════════════════════════════════
+    print(f"\n[5] Bootstrap CIs — 5-term formula ({N_BOOTSTRAP} resamples)")
+    mu_all = X_all.mean(0); sd_all = X_all.std(0) + 1e-9
+    n = len(y_all)
+    boot_coefs = []
+    rng_b = np.random.default_rng(42)
+    for b in range(N_BOOTSTRAP):
+        idx = rng_b.integers(0, n, n)
+        X_b = X_all[idx]; y_b = y_all[idx]
+        # A resample with a single outcome class cannot be fitted.
+        if len(set(y_b.tolist())) < 2: continue
+        try:
+            lr = LogisticRegression(C=1.0, max_iter=500, random_state=42)
+            lr.fit((X_b - mu_all) / sd_all, y_b)
+            raw = lr.coef_[0] / sd_all
+            intc = lr.intercept_[0] - sum(lr.coef_[0][i]*mu_all[i]/sd_all[i] for i in range(len(sd_all)))
+            boot_coefs.append([intc] + list(raw))
+        except Exception: continue
+        if (b+1) % 250 == 0: print(f"    {b+1}/{N_BOOTSTRAP}...")
+
+    bc = np.array(boot_coefs)
+    lr_full = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
+    lr_full.fit((X_all - mu_all) / sd_all, y_all)
+    raw_full = lr_full.coef_[0] / sd_all
+    intc_full = lr_full.intercept_[0] - sum(lr_full.coef_[0][i]*mu_all[i]/sd_all[i] for i in range(len(sd_all)))
+    point_all = [intc_full] + list(raw_full)
+
+    print(f"\n    {'term':35s}  {'point':>9s}  {'95% CI':>22s}")
+    ci_results = []
+    for i, name in enumerate(names):
+        col = bc[:, i]
+        lo = np.percentile(col, 2.5)
+        hi = np.percentile(col, 97.5)
+        crosses_zero = "✗" if (lo < 0 < hi) else "✓"
+        ci_str = f"({lo:+.4f}, {hi:+.4f}) {crosses_zero}"
+        print(f"    {name:35s}  {point_all[i]:>+9.4f}  {ci_str:>22s}")
+        ci_results.append({"term": name, "point": float(point_all[i]),
+                           "ci_lo": float(lo), "ci_hi": float(hi),
+                           "crosses_zero": bool(lo < 0 < hi)})
+
+    # ══════════════════════════════════════════════════════════════════════
+    # [6] Ordinal score distribution + mortality per score
+    # ══════════════════════════════════════════════════════════════════════
+    print(f"\n[6] Ordinal bedside score distribution + mortality per score")
+    print(f"    {'score':>5s}  {'n':>5s}  {'mortality':>10s}  {'cum n':>6s}")
+    score_buckets = defaultdict(lambda: {"n": 0, "died": 0})
+    for s, y in zip(S_all, y_all):
+        score_buckets[int(s)]["n"] += 1
+        score_buckets[int(s)]["died"] += int(y)
+
+    cum_n = 0
+    score_rows = []
+    for s in sorted(score_buckets.keys()):
+        b = score_buckets[s]
+        mort = 100 * b["died"] / b["n"] if b["n"] > 0 else 0
+        cum_n += b["n"]
+        bar = "█" * int(mort / 3)
+        print(f"    {s:>5d}  {b['n']:>5d}  {mort:>8.1f}%  {cum_n:>6d}  {bar}")
+        score_rows.append({"score": s, "n": b["n"], "mortality_pct": mort})
+
+    # ══════════════════════════════════════════════════════════════════════
+    # [7] Risk bands from ordinal score (clinical cut-points)
+    # ══════════════════════════════════════════════════════════════════════
+    print(f"\n[7] Suggested risk bands (clinically meaningful cutpoints)")
+    # Group into LOW / MID / HIGH by score
+    def band(s):
+        if s <= 3:  return "low"
+        if s <= 7:  return "mid"
+        return "high"
+
+    bands = defaultdict(lambda: {"n": 0, "died": 0})
+    for s, y in zip(S_all, y_all):
+        b = band(int(s))
+        bands[b]["n"] += 1
+        bands[b]["died"] += int(y)
+
+    print(f"    {'band':6s}  {'range':>7s}  {'n':>5s}  {'mort%':>7s}")
+    band_results = {}
+    for bname in ["low", "mid", "high"]:
+        b = bands[bname]
+        if b["n"] == 0: continue
+        rng_str = {"low": "0–3", "mid": "4–7", "high": "8+"}[bname]
+        mort = 100 * b["died"] / b["n"]
+        print(f"    {bname:6s}  {rng_str:>7s}  {b['n']:>5d}  {mort:>6.1f}%")
+        band_results[bname] = {"n": b["n"], "mortality_pct": mort}
+
+    # ══════════════════════════════════════════════════════════════════════
+    # [8] Calibration on holdout — both formula and ordinal
+    # ══════════════════════════════════════════════════════════════════════
+    # NOTE(review): only the formula is calibrated below; the ordinal score
+    # of the holdout (S_te) is fetched but not used.
+    print(f"\n[8] Calibration on 30% holdout (both versions)")
+    subs = list(set(cohort[s]["subject_id"] for s in all_ids))
+    random.Random(42).shuffle(subs)
+    n_tr = int(len(subs) * TRAIN_FRAC)
+    tr_subs = set(subs[:n_tr])
+    tr_ids = [s for s in all_ids if cohort[s]["subject_id"] in tr_subs]
+    te_ids = [s for s in all_ids if cohort[s]["subject_id"] not in tr_subs]
+    X_tr, y_tr, _, _, _ = build_matrix(tr_ids, primitives, cohort)
+    X_te, y_te, S_te, _, _ = build_matrix(te_ids, primitives, cohort)
+    mu = X_tr.mean(0); sd = X_tr.std(0) + 1e-9
+    lr_cal = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
+    lr_cal.fit((X_tr - mu) / sd, y_tr)
+    pred_cal = lr_cal.predict_proba((X_te - mu) / sd)[:, 1]
+
+    # Deciles of predicted risk; each contributes the observed-vs-expected
+    # terms for deaths and survivors to the Hosmer-Lemeshow statistic.
+    order = np.argsort(pred_cal)
+    deciles = np.array_split(order, 10)
+    hl_stat = 0.0
+    print(f"    Formula deciles:")
+    print(f"    {'d':>2s}  {'n':>4s}  {'pred':>7s}  {'obs':>7s}")
+    calib_bins = []
+    for d, idx in enumerate(deciles):
+        n_d = len(idx)
+        p_mean = float(pred_cal[idx].mean())
+        obs = int(y_te[idx].sum())
+        exp = float(pred_cal[idx].sum())
+        if exp > 0 and (n_d - exp) > 0:
+            hl_stat += (obs - exp)**2 / exp + ((n_d - obs) - (n_d - exp))**2 / (n_d - exp)
+        print(f"    {d+1:>2d}  {n_d:>4d}  {p_mean:>6.3f}  {obs/n_d:>6.3f}")
+        calib_bins.append({"decile": d+1, "n": n_d,
+                           "predicted": p_mean, "observed": obs/n_d})
+    print(f"\n    Hosmer-Lemeshow χ²: {hl_stat:.2f}  (critical 15.51)")
+    calibrated = hl_stat < HL_CRIT
+    if calibrated:
+        print(f"    → Well calibrated (p > 0.05)")
+    else:
+        print(f"    → Poorly calibrated (p ≤ 0.05)")
+
+    brier = brier_score_loss(y_te, pred_cal)
+    print(f"    Brier: {brier:.4f}")
+
+    # ══════════════════════════════════════════════════════════════════════
+    # [9] FINAL HEADLINE
+    # ══════════════════════════════════════════════════════════════════════
+    # The calibration verdict follows the statistic instead of being fixed.
+    hl_label = "calibrated" if calibrated else "NOT calibrated"
+    print(f"\n[9] ══════════ FINAL HEADLINE (Phase 5b) ══════════")
+    print(f"\n    Cohort: n={len(all_ids):,} sepsis-3 Q4 patients")
+    print(f"    Mortality: {100*y_all.mean():.1f}%")
+    print(f"\n    5-term formula (continuous):")
+    print(f"      5-fold CV AUROC:  {fa.mean():.4f} ± {fa.std():.4f}")
+    print(f"      Hosmer-Lemeshow:  χ² = {hl_stat:.2f} ({hl_label})")
+    print(f"      Brier score:      {brier:.4f}")
+    print(f"\n    Ordinal bedside score (0–13 pts):")
+    print(f"      5-fold CV AUROC:  {fas.mean():.4f} ± {fas.std():.4f}")
+    print(f"      AUROC loss vs continuous: {fa.mean()-fas.mean():+.4f}")
+    print(f"\n    SAPS-II baseline (same cohort):")
+    print(f"      Overall AUROC:    {saps_auc_overall:.4f}")
+    print(f"      5-fold CV AUROC:  {fsap.mean():.4f} ± {fsap.std():.4f}")
+    print(f"      Δ formula − SAPS-II: {fa.mean()-fsap.mean():+.4f}")
+    print(f"      Δ ordinal − SAPS-II: {fas.mean()-fsap.mean():+.4f}")
+    print(f"\n    Risk bands (ordinal score):")
+    for bn in ["low", "mid", "high"]:
+        br = band_results.get(bn, {})
+        if br:
+            print(f"      {bn:6s} ({'0–3' if bn=='low' else '4–7' if bn=='mid' else '8+':>4s}): "
+                  f"n={br['n']:>5d}  mortality={br['mortality_pct']:.1f}%")
+
+    # ── Save ────────────────────────────────────────────────────────────────
+    output = {
+        "cohort": {"n": len(all_ids), "mortality": float(y_all.mean())},
+        "formula_5term": {
+            "cv_auroc_mean": float(fa.mean()),
+            "cv_auroc_std": float(fa.std()),
+            "cv_auroc_range": [float(fa.min()), float(fa.max())],
+            "coefficient_cis": ci_results,
+            "calibration": {
+                "hl_chi2": float(hl_stat),
+                "brier": float(brier),
+                "deciles": calib_bins,
+            },
+        },
+        "ordinal_score": {
+            "cv_auroc_mean": float(fas.mean()),
+            "cv_auroc_std": float(fas.std()),
+            "auroc_loss_vs_continuous": float(fa.mean() - fas.mean()),
+            "score_distribution": score_rows,
+            "risk_bands": band_results,
+        },
+        "sapsii_baseline": {
+            "n": int(len(SAPS_all)),
+            "sapsii_mean": float(SAPS_all.mean()),
+            "sapsii_min": float(SAPS_all.min()),
+            "sapsii_max": float(SAPS_all.max()),
+            "auroc_overall": float(saps_auc_overall),
+            "cv_auroc_mean": float(fsap.mean()),
+            "cv_auroc_std": float(fsap.std()),
+            "cv_auroc_range": [float(fsap.min()), float(fsap.max())],
+            "delta_formula_minus_sapsii": float(fa.mean() - fsap.mean()),
+            "delta_ordinal_minus_sapsii": float(fas.mean() - fsap.mean()),
+        },
+    }
+    with open(OUT_FILE, "w") as f:
+        json.dump(output, f, indent=2, default=str)
+    print(f"\n  → Saved: {OUT_FILE}")
+    print("\n" + "█"*78 + "\n")
+
+
+if __name__ == "__main__":
+    main()
--- a/sql/schemas.sql
+++ b/sql/schemas.sql
@@ -0,0 +1,605 @@
+-- ------------------------------------------------------------------
+-- Reference CREATE TABLE schemas for every derived table produced by
+--   sql/build_sapsii.sql
+--   sql/build_sepsis3.sql
+--
+-- This file is documentation only.  The actual build scripts use
+-- `DROP TABLE IF EXISTS ...; CREATE TABLE ... AS SELECT ...`, so
+-- column types are inferred by PostgreSQL at build time from the
+-- MIMIC-III v1.3 base schema and from the expressions in the SELECT.
+-- The types below match what PostgreSQL infers when the build is run
+-- on a stock MIMIC-III v1.3 PostgreSQL restore (where for example
+-- chartevents.valuenum is DOUBLE PRECISION, outputevents.value is
+-- DOUBLE PRECISION, *.charttime is TIMESTAMP(0), etc.).
+--
+-- Use this file as:
+--   * a quick reference for column names and types of each derived
+--     table (handy for downstream consumers that need to know the
+--     output schema without grep'ing through the build SQL);
+--   * a stub for declaring empty derived tables ahead of time, to be
+--     populated later by the build (adapt to `CREATE TABLE IF NOT
+--     EXISTS`: the DROP statements below discard existing tables);
+--   * a checklist when porting these scripts to another flavor of
+--     MIMIC (e.g. MIMIC-III v1.4 or MIMIC-IV).
+-- ------------------------------------------------------------------
+
+
+-- ==================================================================
+--                            SAPS-II
+-- ==================================================================
+
+-- 1. Helper: all-time urine output (from outputevents).
+DROP TABLE IF EXISTS urine_output;  -- destructive: discards any existing urine_output table
+CREATE TABLE urine_output (  -- all-time urine output helper, keyed by ICU stay and chart time
+    icustay_id  INTEGER,
+    charttime   TIMESTAMP(0),
+    value       DOUBLE PRECISION  -- NOTE(review): presumably derived from outputevents.value; units not stated in this file, confirm in build_sapsii.sql
+);
+
+-- 2. Ventilation: classification (per charttime) and durations
+--    (per ventilation episode).
+DROP TABLE IF EXISTS ventilation_classification;  -- destructive: discards any existing ventilation_classification table
+CREATE TABLE ventilation_classification (  -- ventilation classification, one entry per charttime
+    icustay_id     INTEGER,
+    charttime      TIMESTAMP(0),
+    mechvent       INTEGER,  -- NOTE(review): this and the three INTEGER columns below look like 0/1 flags; confirm in build_sapsii.sql
+    oxygentherapy  INTEGER,
+    extubated      INTEGER,
+    selfextubated  INTEGER
+);
+
+DROP TABLE IF EXISTS ventilation_durations;  -- destructive: discards any existing ventilation_durations table
+CREATE TABLE ventilation_durations (  -- one entry per ventilation episode
+    icustay_id      INTEGER,
+    ventnum         BIGINT,  -- NOTE(review): presumably the episode number within the stay; confirm in build_sapsii.sql
+    starttime       TIMESTAMP(0),
+    endtime         TIMESTAMP(0),
+    duration_hours  NUMERIC  -- NOTE(review): presumably endtime minus starttime expressed in hours; confirm
+);
+
+-- 3. First-day pivots feeding SAPS-II.
+DROP TABLE IF EXISTS blood_gas_first_day;  -- destructive: discards any existing blood_gas_first_day table
+CREATE TABLE blood_gas_first_day (  -- first-day blood gas pivot: one DOUBLE PRECISION column per measurement
+    subject_id        INTEGER,
+    hadm_id           INTEGER,
+    icustay_id        INTEGER,
+    charttime         TIMESTAMP(0),
+    specimen          VARCHAR(200),
+    aado2             DOUBLE PRECISION,
+    baseexcess        DOUBLE PRECISION,
+    bicarbonate       DOUBLE PRECISION,
+    totalco2          DOUBLE PRECISION,
+    carboxyhemoglobin DOUBLE PRECISION,
+    chloride          DOUBLE PRECISION,
+    calcium           DOUBLE PRECISION,
+    glucose           DOUBLE PRECISION,
+    hematocrit        DOUBLE PRECISION,
+    hemoglobin        DOUBLE PRECISION,
+    intubated         DOUBLE PRECISION,  -- NOTE(review): flag-like name but typed DOUBLE PRECISION like every other pivoted value; confirm encoding
+    lactate           DOUBLE PRECISION,
+    methemoglobin     DOUBLE PRECISION,
+    o2flow            DOUBLE PRECISION,
+    fio2              DOUBLE PRECISION,
+    so2               DOUBLE PRECISION,
+    pco2              DOUBLE PRECISION,
+    peep              DOUBLE PRECISION,
+    ph                DOUBLE PRECISION,
+    po2               DOUBLE PRECISION,
+    potassium         DOUBLE PRECISION,
+    requiredo2        DOUBLE PRECISION,
+    sodium            DOUBLE PRECISION,
+    temperature       DOUBLE PRECISION,
+    tidalvolume       DOUBLE PRECISION,
+    ventilationrate   DOUBLE PRECISION,
+    ventilator        DOUBLE PRECISION
+);
+
+DROP TABLE IF EXISTS blood_gas_first_day_arterial;  -- destructive: discards any existing blood_gas_first_day_arterial table
+CREATE TABLE blood_gas_first_day_arterial (  -- first-day blood gas pivot with specimen prediction and derived oxygenation columns
+    subject_id        INTEGER,
+    hadm_id           INTEGER,
+    icustay_id        INTEGER,
+    charttime         TIMESTAMP(0),
+    specimen          VARCHAR(200),
+    specimen_pred     VARCHAR(200),  -- NOTE(review): presumably a predicted specimen type; confirm in build_sapsii.sql
+    specimen_prob     DOUBLE PRECISION,  -- NOTE(review): presumably the probability behind specimen_pred; confirm
+    so2               DOUBLE PRECISION,
+    spo2              DOUBLE PRECISION,
+    po2               DOUBLE PRECISION,
+    pco2              DOUBLE PRECISION,
+    fio2_chartevents  DOUBLE PRECISION,
+    fio2              DOUBLE PRECISION,
+    aado2             DOUBLE PRECISION,
+    aado2_calc        DOUBLE PRECISION,
+    pao2fio2          DOUBLE PRECISION,
+    ph                DOUBLE PRECISION,
+    baseexcess        DOUBLE PRECISION,
+    bicarbonate       DOUBLE PRECISION,
+    totalco2          DOUBLE PRECISION,
+    hematocrit        DOUBLE PRECISION,
+    hemoglobin        DOUBLE PRECISION,
+    carboxyhemoglobin DOUBLE PRECISION,
+    methemoglobin     DOUBLE PRECISION,
+    chloride          DOUBLE PRECISION,
+    calcium           DOUBLE PRECISION,
+    temperature       DOUBLE PRECISION,
+    potassium         DOUBLE PRECISION,
+    sodium            DOUBLE PRECISION,
+    lactate           DOUBLE PRECISION,
+    glucose           DOUBLE PRECISION,
+    intubated         DOUBLE PRECISION,
+    tidalvolume       DOUBLE PRECISION,
+    ventilationrate   DOUBLE PRECISION,
+    ventilator        DOUBLE PRECISION,
+    peep              DOUBLE PRECISION,
+    o2flow            DOUBLE PRECISION,
+    requiredo2        DOUBLE PRECISION
+);
+
+DROP TABLE IF EXISTS gcs_first_day;  -- destructive: discards any existing gcs_first_day table
+CREATE TABLE gcs_first_day (  -- first-day GCS pivot feeding SAPS-II
+    subject_id    INTEGER,
+    hadm_id       INTEGER,
+    icustay_id    INTEGER,
+    mingcs        DOUBLE PRECISION,
+    gcsmotor      DOUBLE PRECISION,
+    gcsverbal     DOUBLE PRECISION,
+    gcseyes       DOUBLE PRECISION,
+    endotrachflag INTEGER  -- NOTE(review): presumably a 0/1 endotracheal-tube flag; confirm in build_sapsii.sql
+);
+
+DROP TABLE IF EXISTS labs_first_day;  -- destructive: discards any existing labs_first_day table
+CREATE TABLE labs_first_day (  -- first-day lab pivot: every lab appears as a <lab>_min / <lab>_max column pair
+    subject_id      INTEGER,
+    hadm_id         INTEGER,
+    icustay_id      INTEGER,
+    aniongap_min    DOUBLE PRECISION,
+    aniongap_max    DOUBLE PRECISION,
+    albumin_min     DOUBLE PRECISION,
+    albumin_max     DOUBLE PRECISION,
+    bands_min       DOUBLE PRECISION,
+    bands_max       DOUBLE PRECISION,
+    bicarbonate_min DOUBLE PRECISION,
+    bicarbonate_max DOUBLE PRECISION,
+    bilirubin_min   DOUBLE PRECISION,
+    bilirubin_max   DOUBLE PRECISION,
+    creatinine_min  DOUBLE PRECISION,
+    creatinine_max  DOUBLE PRECISION,
+    chloride_min    DOUBLE PRECISION,
+    chloride_max    DOUBLE PRECISION,
+    glucose_min     DOUBLE PRECISION,
+    glucose_max     DOUBLE PRECISION,
+    hematocrit_min  DOUBLE PRECISION,
+    hematocrit_max  DOUBLE PRECISION,
+    hemoglobin_min  DOUBLE PRECISION,
+    hemoglobin_max  DOUBLE PRECISION,
+    lactate_min     DOUBLE PRECISION,
+    lactate_max     DOUBLE PRECISION,
+    platelet_min    DOUBLE PRECISION,
+    platelet_max    DOUBLE PRECISION,
+    potassium_min   DOUBLE PRECISION,
+    potassium_max   DOUBLE PRECISION,
+    ptt_min         DOUBLE PRECISION,
+    ptt_max         DOUBLE PRECISION,
+    inr_min         DOUBLE PRECISION,
+    inr_max         DOUBLE PRECISION,
+    pt_min          DOUBLE PRECISION,
+    pt_max          DOUBLE PRECISION,
+    sodium_min      DOUBLE PRECISION,
+    sodium_max      DOUBLE PRECISION,
+    bun_min         DOUBLE PRECISION,
+    bun_max         DOUBLE PRECISION,
+    wbc_min         DOUBLE PRECISION,
+    wbc_max         DOUBLE PRECISION
+);
+
+DROP TABLE IF EXISTS urine_output_first_day;  -- destructive: discards any existing urine_output_first_day table
+CREATE TABLE urine_output_first_day (  -- first-day urine output pivot feeding SAPS-II
+    subject_id  INTEGER,
+    hadm_id     INTEGER,
+    icustay_id  INTEGER,
+    urineoutput DOUBLE PRECISION  -- NOTE(review): presumably the first-day total; aggregation and units are not stated in this file, confirm
+);
+
+DROP TABLE IF EXISTS vitals_first_day;  -- destructive: discards any existing vitals_first_day table
+CREATE TABLE vitals_first_day (  -- first-day vitals pivot: every vital appears as a _min / _max / _mean column triple
+    subject_id     INTEGER,
+    hadm_id        INTEGER,
+    icustay_id     INTEGER,
+    heartrate_min  DOUBLE PRECISION,
+    heartrate_max  DOUBLE PRECISION,
+    heartrate_mean DOUBLE PRECISION,
+    sysbp_min      DOUBLE PRECISION,
+    sysbp_max      DOUBLE PRECISION,
+    sysbp_mean     DOUBLE PRECISION,
+    diasbp_min     DOUBLE PRECISION,
+    diasbp_max     DOUBLE PRECISION,
+    diasbp_mean    DOUBLE PRECISION,
+    meanbp_min     DOUBLE PRECISION,
+    meanbp_max     DOUBLE PRECISION,
+    meanbp_mean    DOUBLE PRECISION,
+    resprate_min   DOUBLE PRECISION,
+    resprate_max   DOUBLE PRECISION,
+    resprate_mean  DOUBLE PRECISION,
+    tempc_min      DOUBLE PRECISION,
+    tempc_max      DOUBLE PRECISION,
+    tempc_mean     DOUBLE PRECISION,
+    spo2_min       DOUBLE PRECISION,
+    spo2_max       DOUBLE PRECISION,
+    spo2_mean      DOUBLE PRECISION,
+    glucose_min    DOUBLE PRECISION,
+    glucose_max    DOUBLE PRECISION,
+    glucose_mean   DOUBLE PRECISION
+);
+
+-- 4. Final SAPS-II score table (one row per ICU stay).
+DROP TABLE IF EXISTS sapsii;  -- destructive: discards any existing sapsii table
+CREATE TABLE sapsii (  -- final SAPS-II result, one row per ICU stay
+    subject_id           INTEGER,
+    hadm_id              INTEGER,
+    icustay_id           INTEGER,
+    sapsii               INTEGER,  -- NOTE(review): presumably the total of the *_score columns below; confirm in build_sapsii.sql
+    sapsii_prob          DOUBLE PRECISION,  -- NOTE(review): presumably the mortality probability derived from sapsii; confirm
+    age_score            INTEGER,
+    hr_score             INTEGER,
+    sysbp_score          INTEGER,
+    temp_score           INTEGER,
+    pao2fio2_score       INTEGER,
+    uo_score             INTEGER,
+    bun_score            INTEGER,
+    wbc_score            INTEGER,
+    potassium_score      INTEGER,
+    sodium_score         INTEGER,
+    bicarbonate_score    INTEGER,
+    bilirubin_score      INTEGER,
+    gcs_score            INTEGER,
+    comorbidity_score    INTEGER,
+    admissiontype_score  INTEGER
+);
+
+
+-- ==================================================================
+--                            Sepsis-3
+-- ==================================================================
+--
+-- Sepsis-3 reuses these SAPS-II tables:
+--   urine_output, ventilation_classification, ventilation_durations
+-- (defined above).  The tables below are the ones added by
+-- build_sepsis3.sql.
+
+-- 1. Echo extraction (used to impute weight when chartevents weight
+--    is missing; also keyed by ROW_ID to the noteevents row).
+DROP TABLE IF EXISTS echo_data;  -- destructive: discards any existing echo_data table
+CREATE TABLE echo_data (  -- fields extracted from echo notes; weight is used for imputation when chartevents weight is missing
+    row_id           INTEGER,  -- ROW_ID linking back to the source noteevents row
+    subject_id       INTEGER,
+    hadm_id          INTEGER,
+    chartdate        TIMESTAMP(0),
+    charttime        TIMESTAMP(3),  -- NOTE(review): millisecond precision unlike the TIMESTAMP(0) used elsewhere; presumably the inferred type of a computed expression, confirm
+    indication       TEXT,
+    height           NUMERIC,
+    weight           NUMERIC,  -- NOTE(review): units are not stated in this file; confirm in build_sepsis3.sql
+    bsa              NUMERIC,
+    bp               TEXT,
+    bpsys            NUMERIC,
+    bpdias           NUMERIC,
+    hr               NUMERIC,
+    status           TEXT,
+    test             TEXT,
+    doppler          TEXT,
+    contrast         TEXT,
+    technicalquality TEXT
+);
+
+-- 2. Per-stay weight durations (admit + daily + neonate + echo
+--    imputed); used for mcg/kg/min vasopressor unit conversion.
+DROP TABLE IF EXISTS weight_durations;  -- destructive: discards any existing weight_durations table
+CREATE TABLE weight_durations (  -- per-stay weight intervals used for the mcg/kg/min vasopressor conversion
+    icustay_id INTEGER,
+    starttime  TIMESTAMP(0),
+    endtime    TIMESTAMP(0),
+    weight     DOUBLE PRECISION  -- NOTE(review): presumably kg, given the mcg/kg/min conversion it feeds; confirm in build_sepsis3.sql
+);
+
+-- 3. Vasopressor dose tables.  All four have the same schema; rates
+--    are merged CareVue + MetaVision and converted to mcg/kg/min.
+DROP TABLE IF EXISTS dobutamine_dose;  -- destructive: discards any existing dobutamine_dose table
+CREATE TABLE dobutamine_dose (  -- dobutamine dosing intervals per ICU stay (CareVue + MetaVision merged)
+    icustay_id  INTEGER,
+    starttime   TIMESTAMP(0),
+    endtime     TIMESTAMP(0),
+    vaso_rate   DOUBLE PRECISION,  -- rate converted to mcg/kg/min
+    vaso_amount DOUBLE PRECISION  -- NOTE(review): units are not stated in this file; confirm in build_sepsis3.sql
+);
+
+DROP TABLE IF EXISTS dopamine_dose;  -- destructive: discards any existing dopamine_dose table
+CREATE TABLE dopamine_dose (  -- dopamine dosing intervals per ICU stay (CareVue + MetaVision merged)
+    icustay_id  INTEGER,
+    starttime   TIMESTAMP(0),
+    endtime     TIMESTAMP(0),
+    vaso_rate   DOUBLE PRECISION,  -- rate converted to mcg/kg/min
+    vaso_amount DOUBLE PRECISION  -- NOTE(review): units are not stated in this file; confirm in build_sepsis3.sql
+);
+
+DROP TABLE IF EXISTS epinephrine_dose;  -- destructive: discards any existing epinephrine_dose table
+CREATE TABLE epinephrine_dose (  -- epinephrine dosing intervals per ICU stay (CareVue + MetaVision merged)
+    icustay_id  INTEGER,
+    starttime   TIMESTAMP(0),
+    endtime     TIMESTAMP(0),
+    vaso_rate   DOUBLE PRECISION,  -- rate converted to mcg/kg/min
+    vaso_amount DOUBLE PRECISION  -- NOTE(review): units are not stated in this file; confirm in build_sepsis3.sql
+);
+
+DROP TABLE IF EXISTS norepinephrine_dose;  -- destructive: discards any existing norepinephrine_dose table
+CREATE TABLE norepinephrine_dose (  -- norepinephrine dosing intervals per ICU stay (CareVue + MetaVision merged)
+    icustay_id  INTEGER,
+    starttime   TIMESTAMP(0),
+    endtime     TIMESTAMP(0),
+    vaso_rate   DOUBLE PRECISION,  -- rate converted to mcg/kg/min
+    vaso_amount DOUBLE PRECISION  -- NOTE(review): units are not stated in this file; confirm in build_sepsis3.sql
+);
+
+-- 4. All-time pivots feeding hourly SOFA.
+DROP TABLE IF EXISTS blood_gas_arterial;  -- destructive: discards any existing blood_gas_arterial table
+CREATE TABLE blood_gas_arterial (  -- all-time blood gas pivot for hourly SOFA; same columns and types as blood_gas_first_day_arterial
+    subject_id        INTEGER,
+    hadm_id           INTEGER,
+    icustay_id        INTEGER,
+    charttime         TIMESTAMP(0),
+    specimen          VARCHAR(200),
+    specimen_pred     VARCHAR(200),
+    specimen_prob     DOUBLE PRECISION,
+    so2               DOUBLE PRECISION,
+    spo2              DOUBLE PRECISION,
+    po2               DOUBLE PRECISION,
+    pco2              DOUBLE PRECISION,
+    fio2_chartevents  DOUBLE PRECISION,
+    fio2              DOUBLE PRECISION,
+    aado2             DOUBLE PRECISION,
+    aado2_calc        DOUBLE PRECISION,
+    pao2fio2          DOUBLE PRECISION,
+    ph                DOUBLE PRECISION,
+    baseexcess        DOUBLE PRECISION,
+    bicarbonate       DOUBLE PRECISION,
+    totalco2          DOUBLE PRECISION,
+    hematocrit        DOUBLE PRECISION,
+    hemoglobin        DOUBLE PRECISION,
+    carboxyhemoglobin DOUBLE PRECISION,
+    methemoglobin     DOUBLE PRECISION,
+    chloride          DOUBLE PRECISION,
+    calcium           DOUBLE PRECISION,
+    temperature       DOUBLE PRECISION,
+    potassium         DOUBLE PRECISION,
+    sodium            DOUBLE PRECISION,
+    lactate           DOUBLE PRECISION,
+    glucose           DOUBLE PRECISION,
+    intubated         DOUBLE PRECISION,
+    tidalvolume       DOUBLE PRECISION,
+    ventilationrate   DOUBLE PRECISION,
+    ventilator        DOUBLE PRECISION,
+    peep              DOUBLE PRECISION,
+    o2flow            DOUBLE PRECISION,
+    requiredo2        DOUBLE PRECISION
+);
+
+DROP TABLE IF EXISTS gcs_all;  -- destructive: discards any existing gcs_all table
+CREATE TABLE gcs_all (  -- all-time GCS readings feeding hourly SOFA
+    icustay_id    INTEGER,
+    charttime     TIMESTAMP(0),
+    gcs           DOUBLE PRECISION,  -- already has the carry-forward and ET-trach=15 rules applied
+    endotrachflag INTEGER  -- NOTE(review): presumably a 0/1 endotracheal-tube flag; confirm in build_sepsis3.sql
+);
+
+-- 5. Hourly SOFA pipeline.  Each measurement class is materialised
+--    into a narrow staging table keyed by (icustay_id, hr); these
+--    are kept (not dropped) so each stage can be inspected with
+--    EXPLAIN ANALYZE.
+
+-- 5a. Hourly grid (one row per ICU hour per stay).
+DROP TABLE IF EXISTS sofa_grid;  -- destructive: discards any existing sofa_grid table
+CREATE TABLE sofa_grid (  -- hourly grid: one row per ICU hour per stay
+    subject_id INTEGER,
+    hadm_id    INTEGER,
+    icustay_id INTEGER,
+    hr         INTEGER,  -- hour index; (icustay_id, hr) is the key shared with the sofa_* staging tables
+    starttime  TIMESTAMP(0),  -- NOTE(review): starttime/endtime presumably bound the hour; confirm in build_sepsis3.sql
+    endtime    TIMESTAMP(0)
+);
+
+-- 5b. MAP minimum within each hour.
+DROP TABLE IF EXISTS sofa_vs;  -- destructive: discards any existing sofa_vs table
+CREATE TABLE sofa_vs (  -- staging table keyed by (icustay_id, hr)
+    icustay_id INTEGER,
+    hr         INTEGER,
+    meanbp_min DOUBLE PRECISION  -- minimum MAP within the hour
+);
+
+-- 5c. GCS minimum within each hour (from gcs_all, which already has
+--     the carry-forward and ET-trach=15 rules applied).
+DROP TABLE IF EXISTS sofa_gcs;  -- destructive: discards any existing sofa_gcs table
+CREATE TABLE sofa_gcs (  -- staging table keyed by (icustay_id, hr), sourced from gcs_all
+    icustay_id INTEGER,
+    hr         INTEGER,
+    gcs_min    DOUBLE PRECISION  -- minimum GCS within the hour
+);
+
+-- 5d. Bilirubin maximum within each hour.
+DROP TABLE IF EXISTS sofa_bili;  -- destructive: discards any existing sofa_bili table
+CREATE TABLE sofa_bili (  -- staging table keyed by (icustay_id, hr)
+    icustay_id    INTEGER,
+    hr            INTEGER,
+    bilirubin_max DOUBLE PRECISION  -- maximum bilirubin within the hour
+);
+
+-- 5e. Creatinine maximum within each hour.
+DROP TABLE IF EXISTS sofa_cr;  -- destructive: discards any existing sofa_cr table
+CREATE TABLE sofa_cr (  -- staging table keyed by (icustay_id, hr)
+    icustay_id     INTEGER,
+    hr             INTEGER,
+    creatinine_max DOUBLE PRECISION  -- maximum creatinine within the hour
+);
+
+-- 5f. Platelet minimum within each hour.
+DROP TABLE IF EXISTS sofa_plt;  -- destructive: discards any existing sofa_plt table
+CREATE TABLE sofa_plt (  -- staging table keyed by (icustay_id, hr)
+    icustay_id   INTEGER,
+    hr           INTEGER,
+    platelet_min DOUBLE PRECISION  -- minimum platelet count within the hour
+);
+
+-- 5g. PaO2/FiO2: split into vent / no-vent based on whether an
+--     active ventilation episode covered the blood gas.
+DROP TABLE IF EXISTS sofa_pf;  -- destructive: discards any existing sofa_pf table
+CREATE TABLE sofa_pf (  -- staging table keyed by (icustay_id, hr); NOTE(review): per-hour aggregation is not stated in this file, confirm
+    icustay_id      INTEGER,
+    hr              INTEGER,
+    pao2fio2_novent DOUBLE PRECISION,  -- PaO2/FiO2 from blood gases not covered by an active ventilation episode
+    pao2fio2_vent   DOUBLE PRECISION  -- PaO2/FiO2 from blood gases covered by an active ventilation episode
+);
+
+-- 5h. Urine output rolling sum + count of distinct charted hours
+--     within the past 24 h.
+DROP TABLE IF EXISTS sofa_uo;  -- destructive: discards any existing sofa_uo table
+CREATE TABLE sofa_uo (  -- staging table keyed by (icustay_id, hr)
+    icustay_id INTEGER,
+    hr         INTEGER,
+    uo_24hr    DOUBLE PRECISION,  -- rolling urine output sum over the past 24 h
+    uo_tm_24hr BIGINT  -- count of distinct charted hours within the past 24 h
+);
+
+-- 5i. Vasopressor rate snapshot at the hour boundary.
+DROP TABLE IF EXISTS sofa_vaso;  -- destructive: discards any existing sofa_vaso table
+CREATE TABLE sofa_vaso (  -- staging table keyed by (icustay_id, hr): vasopressor rates sampled at the hour boundary
+    icustay_id          INTEGER,
+    hr                  INTEGER,
+    rate_epinephrine    DOUBLE PRECISION,
+    rate_norepinephrine DOUBLE PRECISION,
+    rate_dopamine       DOUBLE PRECISION,
+    rate_dobutamine     DOUBLE PRECISION
+);
+
+-- 5j. Wide assembly: grid LEFT JOINed onto every measurement table.
+DROP TABLE IF EXISTS sofa_wide;  -- destructive: discards any existing sofa_wide table
+CREATE TABLE sofa_wide (  -- sofa_grid columns followed by the measurement columns of every sofa_* staging table
+    subject_id          INTEGER,
+    hadm_id             INTEGER,
+    icustay_id          INTEGER,
+    hr                  INTEGER,
+    starttime           TIMESTAMP(0),
+    endtime             TIMESTAMP(0),
+    meanbp_min          DOUBLE PRECISION,  -- NOTE(review): measurement columns come in via LEFT JOIN, so presumably NULL for hours with no staging row; confirm
+    gcs_min             DOUBLE PRECISION,
+    bilirubin_max       DOUBLE PRECISION,
+    creatinine_max      DOUBLE PRECISION,
+    platelet_min        DOUBLE PRECISION,
+    pao2fio2_novent     DOUBLE PRECISION,
+    pao2fio2_vent       DOUBLE PRECISION,
+    uo_24hr             DOUBLE PRECISION,
+    uo_tm_24hr          BIGINT,
+    rate_epinephrine    DOUBLE PRECISION,
+    rate_norepinephrine DOUBLE PRECISION,
+    rate_dopamine       DOUBLE PRECISION,
+    rate_dobutamine     DOUBLE PRECISION
+);
+
+-- 5k. Per-hour component scores (no rolling window yet).
+DROP TABLE IF EXISTS sofa_components;  -- destructive: discards any existing sofa_components table
+CREATE TABLE sofa_components (  -- the sofa_wide columns plus six per-hour component scores (no rolling window applied yet)
+    subject_id          INTEGER,
+    hadm_id             INTEGER,
+    icustay_id          INTEGER,
+    hr                  INTEGER,
+    starttime           TIMESTAMP(0),
+    endtime             TIMESTAMP(0),
+    meanbp_min          DOUBLE PRECISION,
+    gcs_min             DOUBLE PRECISION,
+    bilirubin_max       DOUBLE PRECISION,
+    creatinine_max      DOUBLE PRECISION,
+    platelet_min        DOUBLE PRECISION,
+    pao2fio2_novent     DOUBLE PRECISION,
+    pao2fio2_vent       DOUBLE PRECISION,
+    uo_24hr             DOUBLE PRECISION,
+    uo_tm_24hr          BIGINT,
+    rate_epinephrine    DOUBLE PRECISION,
+    rate_norepinephrine DOUBLE PRECISION,
+    rate_dopamine       DOUBLE PRECISION,
+    rate_dobutamine     DOUBLE PRECISION,
+    respiration         INTEGER,  -- first of the six per-hour component scores
+    coagulation         INTEGER,
+    liver               INTEGER,
+    cardiovascular      INTEGER,
+    cns                 INTEGER,
+    renal               INTEGER
+);
+
+-- 5l. Final hourly SOFA: 24-hour rolling MAX per component, summed.
+DROP TABLE IF EXISTS sofa_hourly;  -- destructive: discards any existing sofa_hourly table
+CREATE TABLE sofa_hourly (  -- final hourly SOFA, keyed by (icustay_id, hr)
+    subject_id             INTEGER,
+    hadm_id                INTEGER,
+    icustay_id             INTEGER,
+    hr                     INTEGER,
+    starttime              TIMESTAMP(0),
+    endtime                TIMESTAMP(0),
+    respiration            INTEGER,  -- NOTE(review): the six un-suffixed columns are presumably the per-hour scores from sofa_components; confirm
+    coagulation            INTEGER,
+    liver                  INTEGER,
+    cardiovascular         INTEGER,
+    cns                    INTEGER,
+    renal                  INTEGER,
+    respiration_24hours    INTEGER,  -- each *_24hours column is the 24-hour rolling MAX of its component
+    coagulation_24hours    INTEGER,
+    liver_24hours          INTEGER,
+    cardiovascular_24hours INTEGER,
+    cns_24hours            INTEGER,
+    renal_24hours          INTEGER,
+    sofa_24hours           INTEGER  -- sum of the rolling-MAX component columns
+);
+
+-- 6. Suspicion of infection.
+DROP TABLE IF EXISTS antibiotic;  -- destructive: discards any existing antibiotic table
+CREATE TABLE antibiotic (  -- antibiotic entries used for suspicion of infection
+    subject_id INTEGER,
+    hadm_id    INTEGER,
+    icustay_id INTEGER,
+    antibiotic VARCHAR(255),
+    route      VARCHAR(120),
+    -- MIMIC-III prescriptions has DATE-precision startdate / enddate
+    -- (stored as TIMESTAMP(0) but always at 00:00:00).
+    starttime  TIMESTAMP(0),  -- NOTE(review): presumably prescriptions.startdate, hence day resolution only; confirm
+    stoptime   TIMESTAMP(0)  -- NOTE(review): presumably prescriptions.enddate, hence day resolution only; confirm
+);
+
+DROP TABLE IF EXISTS suspicion_of_infection;  -- destructive: discards any existing suspicion_of_infection table
+CREATE TABLE suspicion_of_infection (  -- NOTE(review): row granularity is not stated in this file; confirm in build_sepsis3.sql
+    subject_id               INTEGER,
+    icustay_id               INTEGER,
+    hadm_id                  INTEGER,
+    ab_id                    BIGINT,
+    antibiotic               VARCHAR(255),
+    antibiotic_time          TIMESTAMP,  -- NOTE(review): unparameterised TIMESTAMP here versus TIMESTAMP(0) elsewhere; presumably the inferred type of a computed expression, confirm
+    suspected_infection      INTEGER,  -- NOTE(review): presumably a 0/1 flag; confirm
+    suspected_infection_time TIMESTAMP,
+    culture_time             TIMESTAMP,
+    specimen                 VARCHAR(100),
+    positive_culture         INTEGER  -- NOTE(review): presumably a 0/1 flag; confirm
+);
+
+-- 7. Final Sepsis-3 onset table (one row per ICU stay).
+DROP TABLE IF EXISTS sepsis3;  -- destructive: discards any existing sepsis3 table
+CREATE TABLE sepsis3 (  -- final Sepsis-3 onset result, one row per ICU stay; note there is no hadm_id column here
+    subject_id               INTEGER,
+    icustay_id               INTEGER,
+    antibiotic_time          TIMESTAMP,
+    culture_time             TIMESTAMP,
+    suspected_infection_time TIMESTAMP,
+    sofa_time                TIMESTAMP(0),  -- NOTE(review): presumably the hour at which sofa_score was taken; confirm in build_sepsis3.sql
+    sofa_score               INTEGER,
+    respiration              INTEGER,
+    coagulation              INTEGER,
+    liver                    INTEGER,
+    cardiovascular           INTEGER,
+    cns                      INTEGER,
+    renal                    INTEGER,
+    sepsis3                  BOOLEAN  -- NOTE(review): presumably TRUE when the Sepsis-3 criteria are met; exact rule not shown in this file
+);
Author	SHA1	Message	Date
David Madl	5dc1d3baa0	add: diag	2026-05-05 19:35:11 +02:00
David Madl	df7c0350a3	feat: add SAPS-II eval to Kepler score comparison	2026-05-05 18:35:06 +02:00
David Madl	935ce9750d	chore: port to MIMIC-III using AI	2026-05-05 18:19:11 +02:00
David Madl	e84c3ba4c7	add: Kepler score	2026-05-05 18:11:50 +02:00
David Madl	1e904713bd	feat: evaluate stratified by syndrome	2026-05-05 13:55:32 +02:00
David Madl	2d03ff0a42	chore: port to MIMIC-III using AI	2026-05-05 10:59:07 +02:00
David Madl	e51cb777fd	add: sql schemas for sapsii and sepsis3 scores (as ported to MIMIC-III, with caveats)	2026-05-05 10:20:07 +02:00