Compare commits

...

7 Commits

4 changed files with 1828 additions and 34 deletions

View File

@@ -9,12 +9,25 @@ Benutzt die Full-Pop-Prädiktionen aus dem vorherigen Lauf.
Basilakis 2026 · chicxulub.ai
"""
import json, sys, math, time, random
import json, os, sys, math, time, random
from collections import defaultdict
# Legacy BigQuery settings: BQ_PROJECT is the project handed to
# bigquery.Client in run_bq(); DATA_PROJECT prefixes the table names in the
# BigQuery-dialect queries.
BQ_PROJECT = "goddard-gap"
DATA_PROJECT = "physionet-data"
# Single norepinephrine itemid interpolated into the BigQuery-dialect
# inputevents queries (same value as the MetaVision entry in NE_ITEMIDS_MV).
NE_ITEMID = 221906
# PostgreSQL connection string (libpq DSN). Override with env var.
# e.g. "host=localhost port=5432 dbname=mimic user=postgres password=..."
PG_DSN = os.environ.get("MIMIC_PG_DSN", "dbname=mimic3")
# Schema holding the stock MIMIC-III v1.3 tables (admissions, icustays,
# labevents, chartevents, inputevents_mv, inputevents_cv, prescriptions,
# diagnoses_icd, d_items, ...).
MIMIC_SCHEMA = os.environ.get("MIMIC_SCHEMA", "mimiciii")
# Schema holding the locally built derived tables (sapsii, sepsis3, ...);
# see sql/schemas.sql. Defaults to the same schema as MIMIC-III itself.
DERIVED_SCHEMA = os.environ.get("DERIVED_SCHEMA", MIMIC_SCHEMA)
# MIMIC-III stores Norepinephrine under different itemids in CareVue
# (inputevents_cv: 30047, 30120) and MetaVision (inputevents_mv: 221906).
NE_ITEMIDS_MV = [221906]
NE_ITEMIDS_CV = [30047, 30120]
# NOTE(review): SAPS_WINDOW is not referenced in the visible part of this
# file; its unit and meaning should be confirmed against the code that uses it.
SAPS_WINDOW = 10
# Feature columns produced by load_all_icu(); a patient row is kept only when
# at least three of these are non-null.
PARAM_KEYS = ["lactate","creatinine","ph","troponin","hemoglobin",
"heart_rate","map_bp","spo2","temperature","ne_dose"]
@@ -52,10 +65,24 @@ GALAXY_PRIORITY = ["sepsis","cardiogenic_shock","post_cardiac_arrest","ards",
"acute_mi","aki","liver_failure","gi_bleeding","stroke","pe","dka",
"heart_failure","pneumonia","copd","afib","post_cardiac_surgery"]
def run_bq(sql):
    """Run `sql` on BigQuery under BQ_PROJECT and return the rows as list[dict]."""
    from google.cloud import bigquery

    bq_client = bigquery.Client(project=BQ_PROJECT)
    result_rows = bq_client.query(sql).result()
    return [dict(row.items()) for row in result_rows]
_PG_CONN = None


def _pg_conn():
    """Return the module-wide psycopg2 connection, (re)opening it when needed.

    A new connection is created when none exists yet or when the cached one
    reports a truthy `closed` attribute. Every new connection is switched to
    read-only, autocommit mode before it is handed out.
    """
    global _PG_CONN
    # Fast path: a live cached connection is reused as-is.
    if _PG_CONN is not None and not getattr(_PG_CONN, "closed", 0):
        return _PG_CONN
    import psycopg2
    _PG_CONN = psycopg2.connect(PG_DSN)
    _PG_CONN.set_session(readonly=True, autocommit=True)
    return _PG_CONN
def run_pg(sql):
    """Execute one SQL statement on the shared connection.

    Returns the result set as a list of plain dicts (column name -> value),
    or an empty list when the statement produced no result set.
    """
    from psycopg2.extras import RealDictCursor

    connection = _pg_conn()
    with connection.cursor(cursor_factory=RealDictCursor) as cursor:
        cursor.execute(sql)
        # `description` is None for statements that return no rows at all.
        has_result_set = cursor.description is not None
        return [dict(row) for row in cursor.fetchall()] if has_result_set else []
def auc_fast(preds):
if not preds: return 0.5
@@ -70,6 +97,25 @@ def auc_fast(preds):
ties+=(k-j)
return (conc+0.5*ties)/(len(pos)*len(neg))
def auc_fast_gal(gal_preds):
    """Pooled within-stratum concordance (Σ_g conc_g) / (Σ_g n_pos_g·n_neg_g).

    Equivalent to a pair-weighted average of per-galaxy AUCs.

    Args:
        gal_preds: mapping stratum -> list of dicts with keys "a" (outcome,
            1 = positive, 0 = negative) and "p" (predicted score).

    Returns:
        The pooled concordance as a float. Tied positive/negative pairs count
        as half-concordant. Returns 0.5 when the mapping is empty or when no
        stratum contains both a positive and a negative.
    """
    from bisect import bisect_left, bisect_right

    if not gal_preds:
        return 0.5
    conc = ties = pairs = 0
    for preds in gal_preds.values():
        pos = [p["p"] for p in preds if p["a"] == 1]
        neg = sorted(p["p"] for p in preds if p["a"] == 0)
        # Strata with only one outcome class contribute no comparable pairs.
        if not pos or not neg:
            continue
        pairs += len(pos) * len(neg)
        for pv in pos:
            # Negatives strictly below pv are concordant pairs;
            # negatives equal to pv are ties.
            below = bisect_left(neg, pv)
            conc += below
            ties += bisect_right(neg, pv) - below
    if pairs == 0:
        return 0.5
    return (conc + 0.5 * ties) / pairs
def compute_centroid(pts):
s=defaultdict(float);c=defaultdict(int)
for p in pts:
@@ -99,33 +145,63 @@ def td(pv,centroid,weights):
def _first24h_lab_cte(name, agg, itemids):
    """Return the CTE text `name AS (...)` aggregating labevents.valuenum per hadm_id over the 24 h after icu_pts.intime."""
    ids = ",".join(str(i) for i in itemids)
    return (
        f"{name} AS (SELECT le.hadm_id,{agg}(le.valuenum) AS val"
        f" FROM {MIMIC_SCHEMA}.labevents le JOIN icu_pts ip ON le.hadm_id=ip.hadm_id"
        f" WHERE le.itemid IN ({ids}) AND le.valuenum IS NOT NULL"
        f" AND le.charttime BETWEEN ip.intime AND ip.intime + INTERVAL '24 hours'"
        f" GROUP BY le.hadm_id)"
    )


def _first24h_chart_cte(name, agg, itemids, lo, hi):
    """Return the CTE text `name AS (...)` aggregating in-range chartevents.valuenum per hadm_id over the 24 h after icustays.intime."""
    ids = ",".join(str(i) for i in itemids)
    return (
        f"{name} AS (SELECT ce.hadm_id,{agg}(ce.valuenum) AS val"
        f" FROM {MIMIC_SCHEMA}.chartevents ce JOIN icu_pts ip ON ce.hadm_id=ip.hadm_id"
        f" JOIN {MIMIC_SCHEMA}.icustays icu ON ce.icustay_id=icu.icustay_id"
        f" WHERE ce.itemid IN ({ids}) AND ce.valuenum BETWEEN {lo} AND {hi}"
        f" AND ce.charttime BETWEEN icu.intime AND icu.intime + INTERVAL '24 hours'"
        f" GROUP BY ce.hadm_id)"
    )


def load_all_icu():
    """Load the ICU cohort with first-24h features from PostgreSQL.

    The cohort is every admission/ICU-stay pair that has a SAPS-II score
    between 20 and 90 in DERIVED_SCHEMA.sapsii. For each one the query
    returns the worst first-24h value of every entry in PARAM_KEYS (MAX or
    MIN, depending on the parameter).

    Returns:
        list[dict] with the keys hadm_id, died, sapsii, saps_prob and every
        name in PARAM_KEYS. Rows are kept only when `died` is not None and at
        least three PARAM_KEYS values are not None.
    """
    print(" Loading ALL ICU patients...")
    ne_mv = ",".join(str(i) for i in NE_ITEMIDS_MV)
    ne_cv = ",".join(str(i) for i in NE_ITEMIDS_CV)

    icu_pts = f"""icu_pts AS (
SELECT DISTINCT a.hadm_id,a.hospital_expire_flag AS died,s.sapsii,icu.intime,
s.sapsii_prob AS saps_prob
FROM {MIMIC_SCHEMA}.admissions a
JOIN {MIMIC_SCHEMA}.icustays icu ON a.hadm_id=icu.hadm_id
JOIN {DERIVED_SCHEMA}.sapsii s ON icu.icustay_id=s.icustay_id
WHERE s.sapsii BETWEEN 20 AND 90)"""

    lab_ctes = [
        _first24h_lab_cte("l_lac", "MAX", [50813]),
        _first24h_lab_cte("l_krea", "MAX", [50912]),
        _first24h_lab_cte("l_ph", "MIN", [50820, 50831]),
        _first24h_lab_cte("l_trop", "MAX", [51002, 51003]),
        _first24h_lab_cte("l_hb", "MIN", [51222]),
    ]
    chart_ctes = [
        _first24h_chart_cte("c_hr", "MAX", [211, 220045], 20, 250),
        _first24h_chart_cte("c_map", "MIN", [52, 456, 6702, 220052, 220181, 225312], 20, 200),
        _first24h_chart_cte("c_spo2", "MIN", [646, 220277], 50, 100),
    ]

    # Temperature: pull all four MIMIC-III itemids (676/223762 nominally
    # Celsius, 678/223761 nominally Fahrenheit) and decide the unit from the
    # value itself. Plausible body temperature is ~28..43 in C and ~82..110
    # in F; the bands do not overlap, so a value in the F band is converted
    # to C even if charted under a "Celsius" itemid (and vice versa).
    # Anything outside both bands is treated as sensor noise and dropped.
    c_temp = f"""c_temp AS (
SELECT ce.hadm_id,
MIN(CASE
WHEN ce.valuenum BETWEEN 28 AND 43 THEN ce.valuenum
WHEN ce.valuenum BETWEEN 82 AND 110 THEN (ce.valuenum - 32.0) / 1.8
END) AS val
FROM {MIMIC_SCHEMA}.chartevents ce
JOIN icu_pts ip ON ce.hadm_id=ip.hadm_id
JOIN {MIMIC_SCHEMA}.icustays icu ON ce.icustay_id=icu.icustay_id
WHERE ce.itemid IN (676, 223762, 678, 223761)
AND ce.valuenum IS NOT NULL
AND (ce.valuenum BETWEEN 28 AND 43 OR ce.valuenum BETWEEN 82 AND 110)
AND ce.charttime BETWEEN icu.intime AND icu.intime + INTERVAL '24 hours'
GROUP BY ce.hadm_id)"""

    # Norepinephrine lives in two tables with different time columns
    # (MetaVision: starttime, CareVue: charttime); normalise both to evttime.
    ne_all = f"""ne_all AS (
SELECT ie.hadm_id, ie.icustay_id, ie.rate, ie.starttime AS evttime
FROM {MIMIC_SCHEMA}.inputevents_mv ie
WHERE ie.itemid IN ({ne_mv}) AND ie.rate>0
UNION ALL
SELECT ie.hadm_id, ie.icustay_id, ie.rate, ie.charttime AS evttime
FROM {MIMIC_SCHEMA}.inputevents_cv ie
WHERE ie.itemid IN ({ne_cv}) AND ie.rate>0)"""
    ne = (
        f"ne AS (SELECT ie.hadm_id,MAX(ie.rate) AS val FROM ne_all ie"
        f" JOIN icu_pts ip ON ie.hadm_id=ip.hadm_id"
        f" JOIN {MIMIC_SCHEMA}.icustays icu ON ie.icustay_id=icu.icustay_id"
        f" WHERE ie.evttime BETWEEN icu.intime AND icu.intime + INTERVAL '24 hours'"
        f" GROUP BY ie.hadm_id)"
    )

    # The heart-rate alias is `chr_`, not `chr`: `chr` is a built-in
    # function name in PostgreSQL.
    final_select = """SELECT ip.hadm_id,ip.died,ip.sapsii,ip.saps_prob,
ll.val AS lactate,lk.val AS creatinine,lp.val AS ph,lt.val AS troponin,lh.val AS hemoglobin,
chr_.val AS heart_rate,cma.val AS map_bp,csp.val AS spo2,cte.val AS temperature,ne.val AS ne_dose
FROM icu_pts ip
LEFT JOIN l_lac ll ON ip.hadm_id=ll.hadm_id LEFT JOIN l_krea lk ON ip.hadm_id=lk.hadm_id
LEFT JOIN l_ph lp ON ip.hadm_id=lp.hadm_id LEFT JOIN l_trop lt ON ip.hadm_id=lt.hadm_id
LEFT JOIN l_hb lh ON ip.hadm_id=lh.hadm_id LEFT JOIN c_hr chr_ ON ip.hadm_id=chr_.hadm_id
LEFT JOIN c_map cma ON ip.hadm_id=cma.hadm_id LEFT JOIN c_spo2 csp ON ip.hadm_id=csp.hadm_id
LEFT JOIN c_temp cte ON ip.hadm_id=cte.hadm_id LEFT JOIN ne ON ip.hadm_id=ne.hadm_id"""

    ctes = [icu_pts, *lab_ctes, *chart_ctes, c_temp, ne_all, ne]
    sql = "WITH " + ",\n".join(ctes) + "\n" + final_select

    rows = run_pg(sql)
    keep_keys = ["hadm_id", "died", "sapsii", "saps_prob"] + PARAM_KEYS
    pts = [
        {k: r.get(k) for k in keep_keys}
        for r in rows
        if sum(1 for k in PARAM_KEYS if r.get(k) is not None) >= 3
        and r.get("died") is not None
    ]
    print(f" -> {len(pts)} patients")
    return pts
@@ -135,10 +211,13 @@ def assign_galaxies(pts):
hids=[p["hadm_id"] for p in pts];ps=defaultdict(set)
for i in range(0,len(hids),10000):
chunk=hids[i:i+10000]
for r in run_bq(f"SELECT hadm_id,icd_code,icd_version FROM `{DATA_PROJECT}.mimiciv_3_1_hosp.diagnoses_icd` WHERE hadm_id IN ({','.join(str(h) for h in chunk)})"):
# MIMIC-III v1.3 only carries ICD-9 codes (column `icd9_code`).
for r in run_pg(f"SELECT hadm_id,icd9_code FROM {MIMIC_SCHEMA}.diagnoses_icd WHERE hadm_id IN ({','.join(str(h) for h in chunk)})"):
code = r.get("icd9_code")
if code is None: continue
for sk,sd in SYNDROME_ICDS.items():
for rc in sd.get(f"icd_{r['icd_version']}",[]):
if r["icd_code"].startswith(rc): ps[r["hadm_id"]].add(sk);break
for rc in sd.get("icd_9",[]):
if code.startswith(rc): ps[r["hadm_id"]].add(sk);break
for p in pts:
p["galaxy"]=None
for g in GALAXY_PRIORITY:
@@ -147,14 +226,33 @@ def assign_galaxies(pts):
def load_therapy_hadmids(tkey):
    """Return the set of hadm_ids that received therapy `tkey` early in the ICU stay.

    Args:
        tkey: key into THERAPIES. The special key "ne_high" selects
            norepinephrine at a rate >= 0.5 from both inputevents tables.
            Any other key is resolved through the entry's optional
            "drugs_input" (matched against d_items.label) and "drugs_rx"
            (matched against prescriptions.drug) name fragments,
            case-insensitively.

    Returns:
        set of hadm_id values; empty when the therapy lists no drugs.

    Raises:
        KeyError: if `tkey` is not in THERAPIES.
    """
    t = THERAPIES[tkey]
    if tkey == "ne_high":
        ne_mv = ",".join(str(i) for i in NE_ITEMIDS_MV)
        ne_cv = ",".join(str(i) for i in NE_ITEMIDS_CV)
        sql = f"""
SELECT DISTINCT ie.hadm_id
FROM {MIMIC_SCHEMA}.inputevents_mv ie
JOIN {MIMIC_SCHEMA}.icustays icu ON ie.icustay_id=icu.icustay_id
WHERE ie.itemid IN ({ne_mv}) AND ie.rate>=0.5
AND ie.starttime BETWEEN icu.intime AND icu.intime + INTERVAL '24 hours'
UNION
SELECT DISTINCT ie.hadm_id
FROM {MIMIC_SCHEMA}.inputevents_cv ie
JOIN {MIMIC_SCHEMA}.icustays icu ON ie.icustay_id=icu.icustay_id
WHERE ie.itemid IN ({ne_cv}) AND ie.rate>=0.5
AND ie.charttime BETWEEN icu.intime AND icu.intime + INTERVAL '24 hours'
"""
        return set(r["hadm_id"] for r in run_pg(sql))

    clauses = []
    # MIMIC-III splits inputevents across MetaVision (starttime) and CareVue
    # (charttime); we have to query both and UNION the hadm_ids.
    input_sources = (("inputevents_mv", "starttime"), ("inputevents_cv", "charttime"))
    for d in t.get("drugs_input", []):
        for table, time_col in input_sources:
            clauses.append(
                f"SELECT DISTINCT ie.hadm_id FROM {MIMIC_SCHEMA}.{table} ie"
                f" JOIN {MIMIC_SCHEMA}.d_items di ON ie.itemid=di.itemid"
                f" JOIN {MIMIC_SCHEMA}.icustays icu ON ie.icustay_id=icu.icustay_id"
                f" WHERE di.label ILIKE '%{d}%'"
                f" AND ie.{time_col} BETWEEN icu.intime AND icu.intime + INTERVAL '24 hours'"
            )
    # MIMIC-III prescriptions uses DATE-precision `startdate` (not `starttime`).
    # A date compares as midnight, so `startdate BETWEEN intime AND ...` would
    # reject every prescription dated on the ICU admission day itself.
    # Truncating the lower bound to the start of that day keeps them.
    for d in t.get("drugs_rx", []):
        clauses.append(
            f"SELECT DISTINCT p.hadm_id FROM {MIMIC_SCHEMA}.prescriptions p"
            f" JOIN {MIMIC_SCHEMA}.icustays icu ON p.hadm_id=icu.hadm_id"
            f" WHERE p.drug ILIKE '%{d}%'"
            f" AND p.startdate BETWEEN date_trunc('day', icu.intime)"
            f" AND icu.intime + INTERVAL '24 hours'"
        )
    if not clauses:
        return set()
    return set(r["hadm_id"] for r in run_pg(" UNION ".join(clauses)))
def run_loo(test_pts,ref_pts,therapy_hids,by_gal,label):
"""Returns list of {a, p, g, hadm_id} — includes hadm_id for fair comparison."""
@@ -199,7 +297,9 @@ def main():
print(f" 1: Calibration 2: Fair Comparison 3: Summary")
print(f"{'='*76}\n")
all_pts=load_all_icu();assign_galaxies(all_pts)
all_pts=load_all_icu();
all_pts = [p for p in all_pts if p.get("saps_prob") is not None]
assign_galaxies(all_pts)
by_gal=defaultdict(list)
for p in all_pts:
if p["galaxy"]: by_gal[p["galaxy"]].append(p)
@@ -214,10 +314,33 @@ def main():
results = {}
# ── Full pop LOO ───────────────────────────────────────────────
print(f"\n Running full-population LOO...")
print(f"\n Running s-population LOO...")
p_full = run_loo(all_pts, all_pts, therapy_hids, by_gal, "FULL-POP")
a_td = auc_fast(p_full)
print(f" * TD full pop: AUC {a_td:.4f} n={len(p_full):,}")
print(f" * TD s-pop: AUC {a_td:.4f} n={len(p_full):,}")
# SAPS score for comparison
saps_preds_all = [{"a": p["died"], "p": p["saps_prob"]}
for p in all_pts if p.get("saps_prob") is not None]
a_td_s = auc_fast(saps_preds_all)
print(f" * SAPS-II s-pop: AUC {a_td_s:.4f} n={len(saps_preds_all):,}")
saps_preds_by_gal = {
gal: [{"a": p["died"], "p": p["saps_prob"]}
for p in gal_pts if p.get("saps_prob") is not None]
for gal, gal_pts in by_gal.items()
}
a_td_sg = auc_fast_gal(saps_preds_by_gal)
ns = str([len(gal_pts) for (gal, gal_pts) in saps_preds_by_gal.items()])
print(f" * SAPS-II by gal: AUC {a_td_sg:.4f} n={ns}")
td_preds_by_gal = defaultdict(list)
for pred in p_full:
if pred.get("g"):
td_preds_by_gal[pred["g"]].append({"a": pred["a"], "p": pred["p"]})
a_td_g = auc_fast_gal(dict(td_preds_by_gal))
ns = str([len(gal_pts) for (gal, gal_pts) in td_preds_by_gal.items()])
print(f" * TD by gal: AUC {a_td_g:.4f} n={ns}")
# ══════════════════════════════════════════════════════════════
# 1. CALIBRATION (10 Dezile)

View File

@@ -0,0 +1,447 @@
"""
ICD-coded sepsis prevalence — inclusion vs exclusion cohort
═════════════════════════════════════════════════════════════════════════════
Companion analysis for paper3_phase5b_refined.py.
The Phase 5b cohort SQL (`q_cohort()`) keeps an ICU stay only when ALL of:
1. sepsis3.sepsis3 = TRUE (Sepsis-3 derived flag)
2. ICU length-of-stay ≥ 24h (H_SNAPSHOT)
3. sapsii IS NOT NULL AND sapsii ≥ 48 (SAPS-II Q4)
This script computes the prevalence of *explicit* ICD-coded sepsis on:
(a) the INCLUSION cohort — stays that satisfy all three filters,
(b) the EXCLUSION cohort — every other ICU stay in MIMIC-III (fails
at least one of the three filters above), and
(c) ALL ICU STAYS — the full mimiciii.icustays universe
(= inclusion ∪ exclusion).
Note: stay-level totals partition cleanly (incl + excl = all), but at the
admission and subject level a single hadm_id / subject can have ICU stays
in both buckets, so the "all" row is computed via SQL GROUPING SETS rather
than by summing.
ICD-coded sepsis is evaluated at the hospital-admission level (a stay is
"ICD-sepsis +" if its parent hadm_id carries any of the codes below):
- Explicit sepsis (ICD-9): 995.91, 995.92, 785.52
(matches paper2_festung_teil3.py SYNDROME_ICDS["sepsis"]["icd_9"])
- Angus septicemia (ICD-9): 038.* (any 038-prefixed code)
- Any of the above (union)
For each cohort × definition combination we report:
n positives, prevalence, Wilson 95% CI.
The inclusion vs exclusion difference is reported with a normal-approx
95% CI and a Pearson χ² statistic (no scipy dependency).
Usage:
python paper3_phase5b_icd_sepsis_prevalence.py
"""
import json, math, os, sys, time
# Reuse the same env-var contract as paper3_phase5b_refined.py.
# PG_DSN is the connection string passed to psycopg2.connect() in _pg_conn().
PG_DSN = os.environ.get("MIMIC_PG_DSN", "dbname=mimic3")
# Schema prefix for the icustays and diagnoses_icd tables in the queries below.
MIMIC_SCHEMA = os.environ.get("MIMIC_SCHEMA", "mimiciii")
# Schema prefix for the sepsis3 and sapsii tables; falls back to MIMIC_SCHEMA.
DERIVED_SCHEMA = os.environ.get("DERIVED_SCHEMA", MIMIC_SCHEMA)
H_SNAPSHOT = 24 # ICU LOS threshold, hours (matches paper3 phase 5b)
SAPSII_MIN = 48 # SAPS-II Q4 cutoff (matches paper3 phase 5b)
# NOTE(review): presumably the path main() writes its JSON results to; the
# write itself lies outside the visible part of the file, so confirm there.
OUT_FILE = "paper3_phase5b_icd_sepsis_prevalence.json"
# Explicit sepsis codes (ICD-9, MIMIC-III stores them WITHOUT decimal point):
# 995.91 → '99591' Sepsis
# 995.92 → '99592' Severe sepsis
# 785.52 → '78552' Septic shock
EXPLICIT_SEPSIS_ICD9 = ("99591", "99592", "78552")
# Angus-style broad septicemia bucket: any ICD-9 starting with 038.
SEPTICEMIA_PREFIX = "038"
_PG_CONN = None


def _pg_conn():
    """Hand out the cached psycopg2 connection, connecting lazily.

    Connects with PG_DSN when there is no cached connection or the cached
    one reports a truthy `closed` attribute; fresh connections are put into
    read-only autocommit mode.
    """
    global _PG_CONN
    cached_is_usable = _PG_CONN is not None and not getattr(_PG_CONN, "closed", 0)
    if not cached_is_usable:
        import psycopg2
        _PG_CONN = psycopg2.connect(PG_DSN)
        _PG_CONN.set_session(readonly=True, autocommit=True)
    return _PG_CONN
def run_pg(sql, label=""):
    """Execute `sql`, print a one-line timing summary, and return the rows.

    Args:
        sql: statement to execute on the shared connection.
        label: text shown in the progress line next to the row count and
            the elapsed seconds.

    Returns:
        list[dict] of result rows; empty when the statement has no result set.
    """
    from psycopg2.extras import RealDictCursor

    connection = _pg_conn()
    started = time.time()
    with connection.cursor(cursor_factory=RealDictCursor) as cursor:
        cursor.execute(sql)
        if cursor.description:
            rows = [dict(record) for record in cursor.fetchall()]
        else:
            rows = []
    elapsed = time.time() - started
    print(f" {label:40s} {len(rows):>8,d} rows ({elapsed:.1f}s)")
    return rows
# ── SQL ─────────────────────────────────────────────────────────────────────
#
# One pass: classify every ICU stay as "inclusion" or "exclusion" using the
# Phase 5b filter, then left-join the two ICD code sets at the hadm_id level
# and aggregate. This mirrors q_cohort() exactly for the inclusion bucket
# (sepsis3 = TRUE AND LOS ≥ 24h AND sapsii ≥ 48), and treats every other
# ICU stay in mimiciii.icustays as exclusion. GROUPING SETS adds a third
# row (cohort = NULL → 'all') aggregated over the full ICU universe so that
# admission- and subject-level distinct counts are correct (a single hadm_id
# may straddle both buckets, so we cannot just sum incl + excl).
def q_prevalence():
# Builds the SQL text for the cohort x ICD-definition prevalence counts.
# Takes no arguments; reads EXPLICIT_SEPSIS_ICD9, SEPTICEMIA_PREFIX,
# MIMIC_SCHEMA, DERIVED_SCHEMA, H_SNAPSHOT and SAPSII_MIN and returns a str.
#
# Result shape: one row per value of `cohort` ('inclusion', 'exclusion') plus
# one row from the empty grouping set, which COALESCE labels 'all'. Each row
# carries stay / admission / subject counts and, for each ICD definition
# (explicit, septicemia, any), the number of positive stays and admissions.
#
# NOTE(review): the `%%` after the septicemia prefix reaches the server
# verbatim because run_pg() calls cur.execute(sql) without parameters; two
# consecutive LIKE wildcards should match exactly like one - confirm if the
# query is ever executed with bound parameters.
#
# The explicit codes are rendered as a quoted, comma-separated SQL list.
explicit = ",".join(f"'{c}'" for c in EXPLICIT_SEPSIS_ICD9)
return f"""
WITH icu AS (
SELECT icu.icustay_id,
icu.hadm_id,
icu.subject_id,
EXTRACT(EPOCH FROM (icu.outtime - icu.intime)) / 3600.0 AS los_h,
COALESCE(s3.sepsis3, FALSE) AS is_sepsis3,
saps.sapsii AS sapsii
FROM {MIMIC_SCHEMA}.icustays icu
LEFT JOIN {DERIVED_SCHEMA}.sepsis3 s3 ON s3.icustay_id = icu.icustay_id
LEFT JOIN {DERIVED_SCHEMA}.sapsii saps ON saps.icustay_id = icu.icustay_id
WHERE icu.icustay_id IS NOT NULL
AND icu.hadm_id IS NOT NULL
),
classified AS (
SELECT icustay_id, hadm_id, subject_id,
is_sepsis3, los_h, sapsii,
CASE WHEN is_sepsis3 = TRUE
AND los_h >= {H_SNAPSHOT}
AND sapsii IS NOT NULL
AND sapsii >= {SAPSII_MIN}
THEN 'inclusion' ELSE 'exclusion' END AS cohort
FROM icu
),
explicit_sepsis AS (
SELECT DISTINCT hadm_id
FROM {MIMIC_SCHEMA}.diagnoses_icd
WHERE icd9_code IN ({explicit})
),
septicemia AS (
SELECT DISTINCT hadm_id
FROM {MIMIC_SCHEMA}.diagnoses_icd
WHERE icd9_code LIKE '{SEPTICEMIA_PREFIX}%%'
)
SELECT
COALESCE(c.cohort, 'all') AS cohort,
COUNT(*) AS n_stays,
COUNT(DISTINCT c.hadm_id) AS n_admissions,
COUNT(DISTINCT c.subject_id) AS n_subjects,
SUM(CASE WHEN e.hadm_id IS NOT NULL
THEN 1 ELSE 0 END) AS n_stays_explicit,
COUNT(DISTINCT CASE WHEN e.hadm_id IS NOT NULL
THEN c.hadm_id END) AS n_adm_explicit,
SUM(CASE WHEN s.hadm_id IS NOT NULL
THEN 1 ELSE 0 END) AS n_stays_septicemia,
COUNT(DISTINCT CASE WHEN s.hadm_id IS NOT NULL
THEN c.hadm_id END) AS n_adm_septicemia,
SUM(CASE WHEN e.hadm_id IS NOT NULL
OR s.hadm_id IS NOT NULL
THEN 1 ELSE 0 END) AS n_stays_any,
COUNT(DISTINCT CASE WHEN e.hadm_id IS NOT NULL
OR s.hadm_id IS NOT NULL
THEN c.hadm_id END) AS n_adm_any
FROM classified c
LEFT JOIN explicit_sepsis e ON e.hadm_id = c.hadm_id
LEFT JOIN septicemia s ON s.hadm_id = c.hadm_id
GROUP BY GROUPING SETS ((c.cohort), ())
"""
def q_exclusion_breakdown():
    """Return SQL counting how many ICU stays fail each individual filter.

    The counts are non-exclusive: a stay can violate more than one criterion
    and is then counted under each of them. The query yields a single row
    with n_total, n_not_sepsis3, n_los_short, n_sapsii_null and
    n_sapsii_below.

    A stay whose length of stay cannot be computed (NULL `los_h`, i.e. a
    missing intime/outtime) is counted under n_los_short. The cohort
    classifier requires `los_h >= H_SNAPSHOT`, which is not satisfied for
    NULL, so such a stay is excluded by the LOS gate and must show up here.
    """
    return f"""
    WITH icu AS (
        SELECT icu.icustay_id,
               EXTRACT(EPOCH FROM (icu.outtime - icu.intime)) / 3600.0 AS los_h,
               COALESCE(s3.sepsis3, FALSE) AS is_sepsis3,
               saps.sapsii AS sapsii
        FROM {MIMIC_SCHEMA}.icustays icu
        LEFT JOIN {DERIVED_SCHEMA}.sepsis3 s3 ON s3.icustay_id = icu.icustay_id
        LEFT JOIN {DERIVED_SCHEMA}.sapsii saps ON saps.icustay_id = icu.icustay_id
        WHERE icu.icustay_id IS NOT NULL
          AND icu.hadm_id IS NOT NULL
    )
    SELECT
        COUNT(*) AS n_total,
        SUM(CASE WHEN is_sepsis3 = FALSE
                 THEN 1 ELSE 0 END) AS n_not_sepsis3,
        SUM(CASE WHEN los_h IS NULL OR los_h < {H_SNAPSHOT}
                 THEN 1 ELSE 0 END) AS n_los_short,
        SUM(CASE WHEN sapsii IS NULL
                 THEN 1 ELSE 0 END) AS n_sapsii_null,
        SUM(CASE WHEN sapsii IS NOT NULL AND sapsii < {SAPSII_MIN}
                 THEN 1 ELSE 0 END) AS n_sapsii_below
    FROM icu
    """
def q_icd_sepsis_waterfall():
    """Return SQL for the mutually-exclusive inclusion waterfall of ICD-sepsis stays.

    Restricted to ICU stays whose admission carries an ICD-coded sepsis
    diagnosis under the union definition (explicit sepsis codes ∪ 038.*
    septicemia). Every such stay lands in exactly one bucket, walking the
    gates in inclusion-filter order:
    sepsis3 → LOS → SAPS-II NULL → SAPS-II below cutoff → pass.

    A NULL `los_h` (missing intime/outtime) is treated as failing the LOS
    gate, matching the cohort classifier, whose `los_h >= H_SNAPSHOT` test
    is not satisfied for NULL. Without this a Sepsis-3 stay with unknown
    LOS would fall through every bucket and the waterfall would not add up
    to n_total_stays.
    """
    explicit = ",".join(f"'{c}'" for c in EXPLICIT_SEPSIS_ICD9)
    return f"""
    WITH icu AS (
        SELECT icu.icustay_id, icu.hadm_id,
               EXTRACT(EPOCH FROM (icu.outtime - icu.intime)) / 3600.0 AS los_h,
               COALESCE(s3.sepsis3, FALSE) AS is_sepsis3,
               saps.sapsii AS sapsii
        FROM {MIMIC_SCHEMA}.icustays icu
        LEFT JOIN {DERIVED_SCHEMA}.sepsis3 s3 ON s3.icustay_id = icu.icustay_id
        LEFT JOIN {DERIVED_SCHEMA}.sapsii saps ON saps.icustay_id = icu.icustay_id
        WHERE icu.icustay_id IS NOT NULL
          AND icu.hadm_id IS NOT NULL
    ),
    icd_pos AS (
        SELECT DISTINCT hadm_id
        FROM {MIMIC_SCHEMA}.diagnoses_icd
        WHERE icd9_code IN ({explicit})
           OR icd9_code LIKE '{SEPTICEMIA_PREFIX}%%'
    ),
    icd_stays AS (
        SELECT i.* FROM icu i JOIN icd_pos x ON x.hadm_id = i.hadm_id
    )
    SELECT
        COUNT(*) AS n_total_stays,
        COUNT(DISTINCT hadm_id) AS n_total_adm,
        -- Waterfall: each stay is counted exactly once, in the order of the
        -- inclusion filter (sepsis3 → LOS → sapsii NULL → sapsii < {SAPSII_MIN} → pass).
        SUM(CASE WHEN NOT is_sepsis3
                 THEN 1 ELSE 0 END) AS n_fail_sepsis3,
        SUM(CASE WHEN is_sepsis3 AND (los_h IS NULL OR los_h < {H_SNAPSHOT})
                 THEN 1 ELSE 0 END) AS n_fail_los,
        SUM(CASE WHEN is_sepsis3 AND los_h >= {H_SNAPSHOT}
                  AND sapsii IS NULL
                 THEN 1 ELSE 0 END) AS n_fail_sapsii_null,
        SUM(CASE WHEN is_sepsis3 AND los_h >= {H_SNAPSHOT}
                  AND sapsii IS NOT NULL
                  AND sapsii < {SAPSII_MIN}
                 THEN 1 ELSE 0 END) AS n_fail_sapsii_below,
        SUM(CASE WHEN is_sepsis3 AND los_h >= {H_SNAPSHOT}
                  AND sapsii IS NOT NULL
                  AND sapsii >= {SAPSII_MIN}
                 THEN 1 ELSE 0 END) AS n_pass
    FROM icd_stays
    """
# ── Stats helpers (no scipy) ────────────────────────────────────────────────
def wilson_ci(k, n, z=1.959963984540054):
    """Wilson score interval for a binomial proportion k/n.

    Returns (lo, hi), clamped to [0, 1]. With the default z this is the
    95% interval. A non-positive n yields (nan, nan).
    """
    if n <= 0:
        return (float("nan"), float("nan"))
    p_hat = k / n
    z_sq = z * z
    scale = 1.0 + z_sq / n
    midpoint = (p_hat + z_sq / (2.0 * n)) / scale
    spread = (z * math.sqrt(p_hat * (1.0 - p_hat) / n + z_sq / (4.0 * n * n))) / scale
    lower = max(0.0, midpoint - spread)
    upper = min(1.0, midpoint + spread)
    return (lower, upper)
def diff_ci(k1, n1, k2, n2, z=1.959963984540054):
    """Normal-approximation CI for the difference of two proportions (p1 - p2).

    Returns (delta, lo, hi) with p1 = k1/n1 and p2 = k2/n2; with the default
    z this is the 95% interval. If either n is non-positive, all three
    values are nan.
    """
    if n1 <= 0 or n2 <= 0:
        return (float("nan"),) * 3
    p1 = k1 / n1
    p2 = k2 / n2
    std_err = math.sqrt(p1 * (1.0 - p1) / n1 + p2 * (1.0 - p2) / n2)
    delta = p1 - p2
    margin = z * std_err
    return (delta, delta - margin, delta + margin)
def chi2_2x2(k1, n1, k2, n2):
    """Pearson chi-square for the 2x2 table (sepsis +/- by cohort).

    Row 1 is (k1, n1 - k1), row 2 is (k2, n2 - k2). Returns (chi2, dof) with
    dof always 1; chi2 is nan when both cohorts are empty. Cells whose
    expected count is zero are skipped. The critical value at p=0.05 is 3.841.
    """
    total = n1 + n2
    if total == 0:
        return (float("nan"), 1)
    # Observed counts: rows = cohort (incl, excl), columns = (sepsis+, sepsis-).
    table = ((k1, n1 - k1), (k2, n2 - k2))
    row_sums = [table[0][0] + table[0][1], table[1][0] + table[1][1]]
    col_sums = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
    stat = 0.0
    for i in (0, 1):
        for j in (0, 1):
            expected = row_sums[i] * col_sums[j] / total
            if expected > 0:
                stat += (table[i][j] - expected) ** 2 / expected
    return (stat, 1)
def fmt_pct(p):
    """Format a fraction as a percentage with two decimals, min width 5, plus '%'."""
    percent = 100.0 * p
    return f"{percent:5.2f}%"
def fmt_ci(lo, hi):
    """Format an interval of two fractions as '[lo, hi]' in percent, two decimals each."""
    lo_pct = 100.0 * lo
    hi_pct = 100.0 * hi
    return f"[{lo_pct:5.2f}, {hi_pct:5.2f}]"
# ── Main ────────────────────────────────────────────────────────────────────
def main():
print("\n" + ""*78)
print(" ICD-coded sepsis prevalence — Phase 5b inclusion vs exclusion")
print(""*78)
print(f"\n PG DSN: {PG_DSN}")
print(f" MIMIC schema: {MIMIC_SCHEMA}")
print(f" Derived schema: {DERIVED_SCHEMA}")
print(f" Inclusion: sepsis3=TRUE AND LOS≥{H_SNAPSHOT}h AND SAPS-II≥{SAPSII_MIN}")
print(f" Explicit ICD-9: {', '.join(EXPLICIT_SEPSIS_ICD9)} "
f"(995.91 / 995.92 / 785.52)")
print(f" Septicemia: ICD-9 {SEPTICEMIA_PREFIX}.*")
print(f"\n[1] Querying MIMIC-III...")
rows = run_pg(q_prevalence(), "cohort × ICD prevalence")
bkdwn = run_pg(q_exclusion_breakdown(), "exclusion breakdown")
wfall = run_pg(q_icd_sepsis_waterfall(), "ICD-sepsis waterfall")
if not rows:
print("\n[ERROR] no rows returned. Check PG_DSN / schema permissions.")
sys.exit(1)
by = {r["cohort"]: r for r in rows}
incl = by.get("inclusion", {})
excl = by.get("exclusion", {})
allc = by.get("all", {})
INT_KEYS = ("n_stays","n_admissions","n_subjects",
"n_stays_explicit","n_adm_explicit",
"n_stays_septicemia","n_adm_septicemia",
"n_stays_any","n_adm_any")
for c in (incl, excl, allc):
for k in INT_KEYS:
c[k] = int(c.get(k) or 0)
COHORTS = (("inclusion", incl), ("exclusion", excl), ("all", allc))
# ── [2] Cohort sizes ──────────────────────────────────────────────────
print(f"\n[2] Cohort sizes")
print(f" {'cohort':12s} {'stays':>8s} {'admissions':>11s} {'subjects':>9s}")
for label, c in COHORTS:
print(f" {label:12s} {c['n_stays']:>8,d} "
f"{c['n_admissions']:>11,d} {c['n_subjects']:>9,d}")
# ── [3] Why an ICU stay was excluded ──────────────────────────────────
if bkdwn:
b = bkdwn[0]
print(f"\n[3] Exclusion breakdown (non-exclusive: a stay can fail >1 filter)")
n_total = int(b['n_total'] or 0)
for lbl, k in (("not Sepsis-3", "n_not_sepsis3"),
(f"ICU LOS < {H_SNAPSHOT}h", "n_los_short"),
("SAPS-II is NULL", "n_sapsii_null"),
(f"SAPS-II < {SAPSII_MIN}", "n_sapsii_below")):
n = int(b[k] or 0)
pct = 100.0*n/n_total if n_total else 0.0
print(f" {lbl:24s} {n:>8,d} ({pct:5.2f}% of all ICU stays)")
print(f" {'all ICU stays':24s} {n_total:>8,d}")
# ── [3b] ICD-sepsis-positive waterfall ────────────────────────────────
# Diagnostic: of the ICD-coded sepsis stays (explicit 038.*), which
# inclusion gate eliminated them? Mutually exclusive: each stay is
# counted in the FIRST gate that would reject it, walking in the
# inclusion-filter order (sepsis3 → LOS → SAPS-II NULL → SAPS-II<48).
if wfall:
w = wfall[0]
n_w = int(w["n_total_stays"] or 0)
n_adm = int(w["n_total_adm"] or 0)
print(f"\n[3b] ICD-sepsis-positive waterfall (mutually exclusive,"
f" inclusion-filter order)")
print(f" {'gate':28s} {'n stays':>9s} {'pct':>6s} cumulative")
cum = 0
steps = (
("rejected: not Sepsis-3", "n_fail_sepsis3"),
(f"rejected: LOS < {H_SNAPSHOT}h", "n_fail_los"),
("rejected: SAPS-II is NULL", "n_fail_sapsii_null"),
(f"rejected: SAPS-II < {SAPSII_MIN}", "n_fail_sapsii_below"),
(f"PASS (= inclusion)", "n_pass"),
)
for lbl, kn in steps:
n = int(w[kn] or 0)
cum += n
pct = 100.0*n/n_w if n_w else 0.0
print(f" {lbl:28s} {n:>9,d} {pct:5.2f}% {cum:>9,d}")
print(f" {'TOTAL ICD-sepsis stays':28s} {n_w:>9,d} "
f"({n_adm:,} admissions)")
# ── [4] Prevalence per definition ─────────────────────────────────────
DEFS = (
("Explicit sepsis (995.91 / 995.92 / 785.52)",
"n_stays_explicit", "n_adm_explicit"),
("Angus septicemia (038.*)",
"n_stays_septicemia", "n_adm_septicemia"),
("Any of the above (union)",
"n_stays_any", "n_adm_any"),
)
def _table(title, denom_key_n, denom_key_k):
"""Render the prevalence table for one denominator (stays or
admissions) and append rows to `results[bucket]`."""
print(f"\n{title}")
print(f" {'definition':45s} {'cohort':10s} "
f"{'n+':>7s} {'N':>7s} {'prev':>7s} {'95% CI (Wilson)':>18s}")
out = []
for name, sk, ak in DEFS:
kkey = sk if denom_key_k == "n_stays" else ak
for label, c in COHORTS:
k_, n_ = c[kkey], c[denom_key_n]
p = k_/n_ if n_ else float("nan")
lo, hi = wilson_ci(k_, n_)
print(f" {name:45s} {label:10s} "
f"{k_:>7,d} {n_:>7,d} {fmt_pct(p):>7s} "
f"{fmt_ci(lo,hi):>18s}")
out.append({"definition": name, "cohort": label,
"k": k_, "n": n_, "prevalence": p,
"ci_lo": lo, "ci_hi": hi})
# Inclusion vs exclusion comparison (the "all" row is just a
# weighted average of the two so a Δ against it isn't meaningful).
k1, n1 = incl[kkey], incl[denom_key_n]
k2, n2 = excl[kkey], excl[denom_key_n]
d, dlo, dhi = diff_ci(k1, n1, k2, n2)
chi2, dof = chi2_2x2(k1, n1, k2, n2)
sig = "p<0.05" if chi2 > 3.841 else "n.s."
print(f" {' Δ (incl excl)':45s} {'':10s} "
f"{'':>7s} {'':>7s} {fmt_pct(d):>7s} "
f"{fmt_ci(dlo,dhi):>18s} χ²={chi2:6.2f} ({sig})")
out.append({"definition": name, "cohort": "delta_incl_minus_excl",
"delta": d, "ci_lo": dlo, "ci_hi": dhi,
"chi2": chi2, "dof": dof})
return out
results = {
"by_stays": _table("[4] ICD-coded sepsis prevalence (denominator = ICU STAYS)",
"n_stays", "n_stays"),
"by_admissions": _table("[5] ICD-coded sepsis prevalence (denominator = ADMISSIONS)",
"n_admissions", "n_admissions"),
}
# ── Save ────────────────────────────────────────────────────────────
output = {
"filters": {
"h_snapshot_hours": H_SNAPSHOT,
"sapsii_min": SAPSII_MIN,
"explicit_icd9": list(EXPLICIT_SEPSIS_ICD9),
"septicemia_prefix": SEPTICEMIA_PREFIX,
},
"cohorts": {
"inclusion": {k: incl[k] for k in INT_KEYS},
"exclusion": {k: excl[k] for k in INT_KEYS},
"all": {k: allc[k] for k in INT_KEYS},
},
"exclusion_breakdown": (bkdwn[0] if bkdwn else None),
"icd_sepsis_waterfall": (wfall[0] if wfall else None),
"results": results,
}
with open(OUT_FILE, "w") as f:
json.dump(output, f, indent=2, default=str)
print(f"\n → Saved: {OUT_FILE}")
print("\n" + ""*78 + "\n")
if __name__ == "__main__":
main()

619
paper3_phase5b_refined.py Normal file
View File

@@ -0,0 +1,619 @@
"""
PAPER 3 — PHASE 5b: 5-TERM FORMULA + ORDINAL BEDSIDE SCORE
═════════════════════════════════════════════════════════════════════════════
Two refinements from Phase 5:
1. DROP log(1 + ne_auc). Its bootstrap CI crossed zero, and it is
collinear with I(ne_at_h24 > 0.08). Simpler 5-term formula.
2. BEDSIDE SCORE via ORDINAL BINNING (not coefficient rounding).
Each of the 5 remaining terms gets mapped to 0-4 points based on
clinically meaningful thresholds. SOFA-style integer score:
Lactate h24 0 if <2.5 | 2 if 2.5-4 | 4 if >4
Oliguria (ml/kg) 0 if ≥20 | 1 if 10-20 | 2 if <10
NE at h24 0 if ≤0.08 | 3 if >0.08
HR deviation 0 if 70-100 | 1 if 60-70/100-120 | 2 if <60/>120
Pressor MAP/NE 0 if >3000 | 1 if 1000-3000 | 2 if <1000
Max 13 points. Bedside-ready.
Repeats the full Phase 5 validation: CV, multi-seed, bootstrap, calibration,
subgroups — for BOTH the 5-term continuous formula AND the ordinal score.
Usage:
python paper3_phase5b_refined.py
"""
import json, os, sys, math, time, random
from collections import defaultdict
# PostgreSQL connection string (libpq DSN). Override with env var.
# e.g. "host=localhost port=5432 dbname=mimic user=postgres password=..."
PG_DSN = os.environ.get("MIMIC_PG_DSN", "dbname=mimic3")
# Schema holding the stock MIMIC-III v1.3 tables (admissions, icustays,
# labevents, chartevents, inputevents_mv, inputevents_cv, outputevents,
# patients, d_items, ...).
MIMIC_SCHEMA = os.environ.get("MIMIC_SCHEMA", "mimiciii")
# Schema holding the locally built derived tables (sapsii, sepsis3,
# norepinephrine_dose, weight_durations, ...); see sql/schemas.sql.
# Defaults to the same schema as MIMIC-III itself.
DERIVED_SCHEMA = os.environ.get("DERIVED_SCHEMA", MIMIC_SCHEMA)
# Snapshot hour after ICU admission. Used as the minimum ICU length of stay
# in q_cohort(), as the upper bound of the output window in q_fluid_out(),
# and (times 60) as the minute at which the NE rate is read in
# build_primitives().
H_SNAPSHOT = 24
# NOTE(review): not referenced anywhere else in this script.
H_PEAK_NE = 12
# Share of subjects assigned to the training side of the calibration holdout.
TRAIN_FRAC = 0.70
# NOTE(review): not referenced anywhere else in this script (main() runs a
# single seeded split).
N_SEEDS = 10
# Number of subject-level cross-validation folds.
N_FOLDS = 5
# Number of bootstrap resamples used for the coefficient confidence intervals.
N_BOOTSTRAP = 1000
# Path of the JSON report written at the end of main().
OUT_FILE = "paper3_phase5b_refined.json"
# labevents itemid selected by q_lactate().
LACTATE_ID = 50813
# MAP: 52, 456, 6702 = CareVue; 220052, 220181, 225312 = MetaVision.
MAP_ITEMIDS = [52, 456, 6702, 220052, 220181, 225312]
# HR: 211 = CareVue; 220045 = MetaVision.
HR_ITEMIDS = [211, 220045]
# Process-wide cached connection; created on first use by _pg_conn().
_PG_CONN = None


def _pg_conn():
    """Return the shared psycopg2 connection, opening it when needed.

    A new connection is made with ``PG_DSN`` when none is cached yet or
    when the cached one reports a truthy ``closed`` attribute. Every new
    connection is switched to read-only, autocommit mode.
    """
    global _PG_CONN
    reusable = _PG_CONN is not None and not getattr(_PG_CONN, "closed", 0)
    if reusable:
        return _PG_CONN
    import psycopg2
    _PG_CONN = psycopg2.connect(PG_DSN)
    _PG_CONN.set_session(readonly=True, autocommit=True)
    return _PG_CONN
def run_pg(sql, label=""):
    """Execute *sql* on the shared connection and return rows as dicts.

    Prints one progress line with the row count and elapsed seconds.
    Statements that produce no result set yield an empty list. Any
    exception (including a missing psycopg2) is reported on stdout as a
    ``[PG ERROR]`` line and also yields an empty list, so callers cannot
    distinguish a failed query from an empty result.
    """
    try:
        from psycopg2.extras import RealDictCursor
        connection = _pg_conn()
        started = time.time()
        with connection.cursor(cursor_factory=RealDictCursor) as cursor:
            cursor.execute(sql)
            if cursor.description:
                result = [dict(record) for record in cursor.fetchall()]
            else:
                result = []
        print(f" {label:32s} {len(result):>8,d} rows ({time.time()-started:.1f}s)")
        return result
    except Exception as err:
        print(f"[PG ERROR] {label}: {err}")
        return []
# ── Queries (PostgreSQL / MIMIC-III v1.3, SAPS Q4 pre-filtered) ────────────
#
# Notes on the port from BigQuery / MIMIC-IV:
# * `stay_id` (MIMIC-IV) is `icustay_id` in MIMIC-III; we alias to
# `stay_id` so the downstream Python is unchanged.
# * `mimiciv_3_1_icu.inputevents` (single table, mcg/kg/min) is split
# across `inputevents_mv` and `inputevents_cv` in MIMIC-III with
# different itemids and units. The `norepinephrine_dose` table built
# by sql/build_sepsis3.sql already merges both eras and normalises
# rates to mcg/kg/min, so we use that instead of the raw inputs.
# * Weight in MIMIC-IV is read from chartevents itemids 226512/224639
# (MetaVision-only). In MIMIC-III those itemids cover only the MV
# half of the cohort, so we use the `weight_durations` table built by
# sql/build_sepsis3.sql (admit + daily + neonate + echo, both eras).
# * `pat.anchor_age` (MIMIC-IV) → computed from `pat.dob` against
# `icu.intime`. MIMIC-III shifts dob backwards by ~300 years for
# patients ≥89; we cap the result at 120.
# Build the cohort query: ICU stays flagged sepsis3 = TRUE with an ICU length
# of stay of at least H_SNAPSHOT hours and a non-NULL SAPS-II >= 48 (the
# threshold is hard-coded in the SQL text).
# Output columns: stay_id (= icustay_id), subject_id, intime, age (years since
# dob, capped at 120), gender, sapsii, died (hospital_expire_flag), weight_kg.
# weight_kg is the MIN (not the earliest, despite the CTE name) of the
# weight_durations values between 30 and 300 whose interval overlaps the
# first 24 hours of the stay, and falls back to 75.0 when none exists.
# The LEFT JOIN on sapsii acts as an inner join because of the
# `saps.sapsii IS NOT NULL` predicate.
# NOTE(review): presumably one row per ICU stay, which requires sepsis3 and
# sapsii to hold a single row per icustay_id — confirm against the build SQL.
def q_cohort():
return f"""
WITH weight_first AS (
SELECT wd.icustay_id, MIN(wd.weight) AS weight_kg
FROM {DERIVED_SCHEMA}.weight_durations wd
JOIN {MIMIC_SCHEMA}.icustays icu ON icu.icustay_id = wd.icustay_id
WHERE wd.weight BETWEEN 30 AND 300
AND wd.starttime <= icu.intime + INTERVAL '24 hours'
AND wd.endtime >= icu.intime
GROUP BY wd.icustay_id
)
SELECT icu.icustay_id AS stay_id, icu.subject_id, icu.intime,
LEAST(120.0, EXTRACT(EPOCH FROM (icu.intime - pat.dob)) / 31556952.0) AS age,
pat.gender,
saps.sapsii, adm.hospital_expire_flag AS died,
COALESCE(wf.weight_kg, 75.0) AS weight_kg
FROM {DERIVED_SCHEMA}.sepsis3 s3
JOIN {MIMIC_SCHEMA}.icustays icu ON icu.icustay_id = s3.icustay_id
JOIN {MIMIC_SCHEMA}.admissions adm ON adm.hadm_id = icu.hadm_id
JOIN {MIMIC_SCHEMA}.patients pat ON pat.subject_id = icu.subject_id
LEFT JOIN {DERIVED_SCHEMA}.sapsii saps ON saps.icustay_id = icu.icustay_id
LEFT JOIN weight_first wf ON wf.icustay_id = icu.icustay_id
WHERE s3.sepsis3 = TRUE
AND EXTRACT(EPOCH FROM (icu.outtime - icu.intime)) / 3600.0 >= {H_SNAPSHOT}
AND saps.sapsii IS NOT NULL AND saps.sapsii >= 48
"""
# Build the norepinephrine query: one row per norepinephrine_dose interval
# with vaso_rate > 0 that STARTS within the first 30 hours of the ICU stay,
# for stays with sepsis3 = TRUE and SAPS-II >= 48.
# start_min / end_min are minutes relative to icu.intime; rate is vaso_rate.
# Unlike q_cohort() there is no length-of-stay filter here; build_primitives()
# only looks up stays that are present in the cohort.
# NOTE(review): an infusion that began before icu.intime is excluded by the
# BETWEEN predicate even if it is still running at the snapshot — confirm
# this is intended.
def q_ne():
return f"""
SELECT nd.icustay_id AS stay_id,
EXTRACT(EPOCH FROM (nd.starttime - icu.intime)) / 60.0 AS start_min,
EXTRACT(EPOCH FROM (nd.endtime - icu.intime)) / 60.0 AS end_min,
nd.vaso_rate AS rate
FROM {DERIVED_SCHEMA}.norepinephrine_dose nd
JOIN {MIMIC_SCHEMA}.icustays icu ON icu.icustay_id = nd.icustay_id
JOIN {DERIVED_SCHEMA}.sepsis3 s3 ON s3.icustay_id = nd.icustay_id
JOIN {DERIVED_SCHEMA}.sapsii saps ON saps.icustay_id = nd.icustay_id
WHERE s3.sepsis3 = TRUE AND saps.sapsii >= 48
AND nd.vaso_rate > 0
AND nd.starttime BETWEEN icu.intime AND icu.intime + INTERVAL '30 hours'
"""
# Build the fluid-output query: per ICU stay, the sum of all positive
# outputevents.value charted between icu.intime and icu.intime + H_SNAPSHOT
# hours, for stays with sepsis3 = TRUE and SAPS-II >= 48.
# Stays without any qualifying outputevents row produce no result row.
# NOTE(review): there is no itemid filter, so every output category is
# summed, not only urine — confirm this matches the "oliguria" term
# downstream.
def q_fluid_out():
return f"""
SELECT oe.icustay_id AS stay_id, SUM(oe.value) AS fluid_out_ml
FROM {MIMIC_SCHEMA}.outputevents oe
JOIN {MIMIC_SCHEMA}.icustays icu ON icu.icustay_id = oe.icustay_id
JOIN {DERIVED_SCHEMA}.sepsis3 s3 ON s3.icustay_id = oe.icustay_id
JOIN {DERIVED_SCHEMA}.sapsii saps ON saps.icustay_id = icu.icustay_id
WHERE s3.sepsis3 = TRUE AND saps.sapsii >= 48
AND oe.value > 0
AND oe.charttime BETWEEN icu.intime AND icu.intime + INTERVAL '{H_SNAPSHOT} hours'
GROUP BY oe.icustay_id
"""
# Build the vitals query: for every (ICU stay, itemid) pair, the average of
# the positive, non-NULL chartevents.valuenum readings charted between 20 and
# 28 hours after icu.intime (window hard-coded in the SQL), restricted to
# MAP_ITEMIDS + HR_ITEMIDS and to stays with sepsis3 = TRUE and SAPS-II >= 48.
# A stay can therefore return several rows for the same vital (one per
# itemid); the caller is responsible for merging them.
def q_vitals():
ids = ",".join(str(x) for x in MAP_ITEMIDS + HR_ITEMIDS)
return f"""
SELECT ce.icustay_id AS stay_id, ce.itemid, AVG(ce.valuenum) AS val
FROM {MIMIC_SCHEMA}.chartevents ce
JOIN {MIMIC_SCHEMA}.icustays icu ON icu.icustay_id = ce.icustay_id
JOIN {DERIVED_SCHEMA}.sepsis3 s3 ON s3.icustay_id = ce.icustay_id
JOIN {DERIVED_SCHEMA}.sapsii saps ON saps.icustay_id = icu.icustay_id
WHERE s3.sepsis3 = TRUE AND saps.sapsii >= 48
AND ce.itemid IN ({ids})
AND ce.valuenum IS NOT NULL AND ce.valuenum > 0
AND ce.charttime BETWEEN icu.intime + INTERVAL '20 hours'
AND icu.intime + INTERVAL '28 hours'
GROUP BY ce.icustay_id, ce.itemid
"""
# Build the lactate query: one row per labevents measurement with
# itemid = LACTATE_ID and a non-NULL valuenum, charted within the first
# 30 hours of an ICU stay with sepsis3 = TRUE and SAPS-II >= 48.
# labevents is matched to icustays through hadm_id, and the time window
# against icu.intime decides which stay of the admission a value belongs to.
# offset_min is minutes since icu.intime; val is the measured value.
def q_lactate():
return f"""
SELECT icu.icustay_id AS stay_id,
EXTRACT(EPOCH FROM (le.charttime - icu.intime)) / 60.0 AS offset_min,
le.valuenum AS val
FROM {MIMIC_SCHEMA}.labevents le
JOIN {MIMIC_SCHEMA}.icustays icu ON icu.hadm_id = le.hadm_id
JOIN {DERIVED_SCHEMA}.sepsis3 s3 ON s3.icustay_id = icu.icustay_id
JOIN {DERIVED_SCHEMA}.sapsii saps ON saps.icustay_id = icu.icustay_id
WHERE s3.sepsis3 = TRUE AND saps.sapsii >= 48
AND le.itemid = {LACTATE_ID}
AND le.valuenum IS NOT NULL
AND le.charttime BETWEEN icu.intime AND icu.intime + INTERVAL '30 hours'
"""
# ── Primitives ──────────────────────────────────────────────────────────────
def build_primitives(cohort, ne_rows, fout_rows, vital_rows, lac_rows):
    """Derive the per-stay inputs of the formula and the bedside score.

    Args:
        cohort: mapping stay_id -> cohort row (reads ``weight_kg``).
        ne_rows: rows with ``stay_id``, ``start_min``, ``end_min``, ``rate``.
        fout_rows: rows with ``stay_id`` and ``fluid_out_ml``.
        vital_rows: rows with ``stay_id``, ``itemid`` and ``val`` (one
            average per stay and itemid).
        lac_rows: rows with ``stay_id``, ``offset_min`` and ``val``.

    Returns:
        dict stay_id -> dict with keys ``ne_at_h24``, ``fluid_out_per_kg``,
        ``lactate_h24``, ``map_h24``, ``hr_h24`` and ``pressor_resistance``.
        ``lactate_h24``, ``map_h24``, ``hr_h24`` and ``pressor_resistance``
        are ``None`` when the underlying measurement is missing.
    """
    print(f"\n[3] Building primitives...")
    ne_by = defaultdict(list)
    for r in ne_rows:
        ne_by[r["stay_id"]].append(r)
    fout = {r["stay_id"]: r["fluid_out_ml"] or 0 for r in fout_rows}

    # Collect every per-itemid average of a vital before combining them.
    # A running pairwise (cur + val) / 2 would weight the last row by 50%
    # once three itemids are present and would depend on the (unordered)
    # SQL row order.
    vital_vals = defaultdict(lambda: defaultdict(list))
    for r in vital_rows:
        if r["stay_id"] is None or r["val"] is None:
            continue
        key = "map" if r["itemid"] in MAP_ITEMIDS else "hr"
        vital_vals[r["stay_id"]][key].append(float(r["val"]))
    # math.fsum is exactly rounded, so the mean is independent of row order.
    vital_by = {
        stay: {key: math.fsum(vals) / len(vals) for key, vals in per_key.items()}
        for stay, per_key in vital_vals.items()
    }

    lac_by = defaultdict(list)
    for r in lac_rows:
        lac_by[r["stay_id"]].append(r)

    snapshot_min = H_SNAPSHOT * 60
    prim = {}
    for sid, c in cohort.items():
        weight = c.get("weight_kg") or 75.0

        # Highest NE rate among the infusion intervals covering the snapshot.
        ne_h24 = 0.0
        for ev in ne_by.get(sid, []):
            sm, em, rate = ev["start_min"], ev["end_min"], ev["rate"]
            if None in (sm, em, rate):
                continue
            if sm <= snapshot_min <= em and rate > ne_h24:
                ne_h24 = rate

        # Latest lactate in the 18-28 h window, else the latest one fetched.
        lacs = sorted(lac_by.get(sid, []), key=lambda x: x["offset_min"] or 0)
        lac_h24 = None
        if lacs:
            near = [r for r in lacs if r["offset_min"] is not None
                    and 18*60 <= r["offset_min"] <= 28*60]
            lac_h24 = near[-1]["val"] if near else lacs[-1]["val"]

        v = vital_by.get(sid, {})
        map_h24 = v.get("map")
        hr_h24 = v.get("hr")
        # The 0.01 offset keeps the ratio finite when no NE is running.
        pr = map_h24 / (ne_h24 + 0.01) if map_h24 is not None else None

        prim[sid] = {
            "ne_at_h24": ne_h24,
            # Stays without any output row count as 0 ml, not as missing.
            "fluid_out_per_kg": fout.get(sid, 0) / weight,
            "lactate_h24": lac_h24,
            "map_h24": map_h24, "hr_h24": hr_h24,
            "pressor_resistance": pr,
        }
    return prim
# ── 5-term continuous formula ──────────────────────────────────────────────
def formula_features(p):
    """Turn one stay's primitives into the 5-term feature vector.

    Returns ``None`` when lactate, fluid output per kg, heart rate or
    pressor resistance is missing; a missing or falsy NE dose counts as 0.
    The feature order matches ``FEATURE_LABELS``.
    """
    lactate = p.get("lactate_h24")
    out_per_kg = p.get("fluid_out_per_kg")
    heart_rate = p.get("hr_h24")
    resistance = p.get("pressor_resistance")
    required = (lactate, out_per_kg, heart_rate, resistance)
    if any(value is None for value in required):
        return None

    ne_dose = p.get("ne_at_h24", 0.0) or 0.0
    if ne_dose > 0.08:
        ne_flag = 1.0
    else:
        ne_flag = 0.0

    lactate_hinge = max(0, lactate - 2.5)      # excess above 2.5
    oliguria_hinge = max(0, 20 - out_per_kg)   # shortfall below 20 ml/kg
    hr_deviation = abs(heart_rate - 85) / 20   # distance from 85, in units of 20
    pressor_term = math.log(resistance + 1.0)  # log-compressed MAP/NE ratio
    return [lactate_hinge, oliguria_hinge, ne_flag, hr_deviation, pressor_term]
# Display names of the five regression terms, in the same order as the list
# returned by formula_features(); main() prepends "intercept" when it prints
# the coefficient tables.
# NOTE(review): the first, second and fourth labels read as if a subtraction
# sign is missing (formula_features computes lac - 2.5, 20 - fout and
# |hr - 85|) — confirm the strings in the file on disk.
FEATURE_LABELS = [
"max(0, lactate_h24 2.5)",
"max(0, 20 fluid_out_per_kg)",
"I(ne_at_h24 > 0.08)",
"|hr_h24 85| / 20",
"log(pressor_resistance + 1)",
]
# ── Ordinal bedside score (013 pts) ───────────────────────────────────────
def bedside_score(p):
    """Compute the ordinal bedside score for one stay's primitives.

    Returns ``(total, breakdown)`` where ``breakdown`` maps each of the five
    components to its points, or ``(None, None)`` when lactate, fluid output
    per kg, heart rate or pressor resistance is missing.
    """
    lactate = p.get("lactate_h24")
    out_per_kg = p.get("fluid_out_per_kg")
    ne_dose = p.get("ne_at_h24", 0.0) or 0.0
    heart_rate = p.get("hr_h24")
    resistance = p.get("pressor_resistance")
    if any(v is None for v in (lactate, out_per_kg, heart_rate, resistance)):
        return None, None

    # Lactate: 0 below 2.5, 2 up to and including 4.0, 4 above.
    lactate_pts = 0 if lactate < 2.5 else (2 if lactate <= 4.0 else 4)
    # Oliguria: 0 at 20 or more, 1 from 10 up to 20, 2 below 10.
    oliguria_pts = 0 if out_per_kg >= 20 else (1 if out_per_kg >= 10 else 2)
    # NE persistence: 3 points above 0.08, otherwise none.
    ne_pts = 3 if ne_dose > 0.08 else 0
    # Heart rate: 0 inside 70..100, 1 in the adjacent 60..70 / 100..120 bands.
    if 70 <= heart_rate <= 100:
        hr_pts = 0
    elif 60 <= heart_rate < 70 or 100 < heart_rate <= 120:
        hr_pts = 1
    else:
        hr_pts = 2
    # Pressor efficiency: 0 above 3000, 1 from 1000 to 3000, 2 below 1000.
    pressor_pts = 0 if resistance > 3000 else (1 if resistance >= 1000 else 2)

    breakdown = {
        "lactate": lactate_pts,
        "oliguria": oliguria_pts,
        "ne_persist": ne_pts,
        "hr_dev": hr_pts,
        "pressor_eff": pressor_pts,
    }
    total = lactate_pts + oliguria_pts + ne_pts + hr_pts + pressor_pts
    return total, breakdown
def build_matrix(ids, primitives, cohort):
    """Assemble model inputs for the given stays.

    Stays are dropped when they have no primitives, when either
    ``formula_features`` or ``bedside_score`` reports missing inputs, or when
    the cohort row has no ``sapsii``.

    Returns:
        ``(X, y, scores, saps, valid)``: numpy arrays of feature rows, death
        labels, ordinal scores and SAPS-II values, plus the list of stay ids
        that were kept, all in the same order.
    """
    import numpy as np

    kept = []
    for stay in ids:
        prim = primitives.get(stay)
        if prim is None:
            continue
        features = formula_features(prim)
        score, _breakdown = bedside_score(prim)
        if features is None or score is None:
            continue
        saps_value = cohort[stay].get("sapsii")
        if saps_value is None:
            continue
        label = int(cohort[stay].get("died") or 0)
        kept.append((features, label, score, float(saps_value), stay))

    feature_rows = [row[0] for row in kept]
    labels = [row[1] for row in kept]
    score_col = [row[2] for row in kept]
    saps_col = [row[3] for row in kept]
    valid_ids = [row[4] for row in kept]
    return (np.array(feature_rows), np.array(labels), np.array(score_col),
            np.array(saps_col), valid_ids)
# ── Main ────────────────────────────────────────────────────────────────────
def main():
    """Run the Phase 5b analysis end to end and write the JSON report.

    Steps: fetch the five query results, build per-stay primitives, run a
    subject-grouped K-fold CV of the 5-term logistic model (with the ordinal
    score and SAPS-II as comparators), bootstrap the coefficients, tabulate
    mortality per ordinal score and per risk band, check calibration on a
    subject-level holdout, print the headline and save everything to
    ``OUT_FILE``.

    Exits with status 1 when numpy/scikit-learn are missing, when no usable
    stay is available, or when no bootstrap resample could be fitted.
    """
    try:
        import numpy as np
        from sklearn.linear_model import LogisticRegression
        from sklearn.metrics import roc_auc_score, brier_score_loss
        from sklearn.model_selection import KFold
    except ImportError as e:
        print(f"\nERROR: {e}")
        print("Install: pip install scikit-learn numpy")
        sys.exit(1)
    print("\n" + ""*78)
    print(" PAPER 3 — PHASE 5b: 5-term formula + ordinal bedside score")
    print(""*78)
    print(f"\n[1] Fetching data...")
    cohort_rows = run_pg(q_cohort(), "cohort")
    ne_rows = run_pg(q_ne(), "NE events")
    fout_rows = run_pg(q_fluid_out(), "Fluid out")
    vital_rows = run_pg(q_vitals(), "Vitals h20-28")
    lac_rows = run_pg(q_lactate(), "Lactate")
    cohort = {r["stay_id"]: dict(r) for r in cohort_rows}
    print(f"\n[2] Cohort: {len(cohort):,} SAPS Q4 sepsis-3")
    primitives = build_primitives(cohort, ne_rows, fout_rows, vital_rows, lac_rows)
    all_ids = [s for s in cohort if primitives.get(s)
               and formula_features(primitives[s]) is not None]
    print(f" usable: {len(all_ids):,}")
    # run_pg() returns [] on any database error, so an empty cohort must stop
    # the run here instead of failing later inside numpy / scikit-learn.
    if not all_ids:
        print("\nERROR: no usable stays (empty cohort or failed queries); "
              "nothing to analyse.")
        sys.exit(1)
    X_all, y_all, S_all, SAPS_all, _ = build_matrix(all_ids, primitives, cohort)
    print(f" mortality: {100*y_all.mean():.1f}%")
    print(f" SAPS-II: mean={SAPS_all.mean():.1f} "
          f"min={SAPS_all.min():.0f} max={SAPS_all.max():.0f}")
    # ══════════════════════════════════════════════════════════════════════
    # [4] 5-fold CV — 5-term formula
    # ══════════════════════════════════════════════════════════════════════
    print(f"\n[4] 5-fold CV — 5-term continuous formula")
    # Folds are drawn over subjects so that all stays of one patient land on
    # the same side of every split.
    subject_ids = sorted(set(cohort[s]["subject_id"] for s in all_ids))
    rng = np.random.default_rng(42)
    rng.shuffle(subject_ids)
    subj_arr = np.array(subject_ids)
    kf = KFold(n_splits=N_FOLDS, shuffle=False)
    fold_aucs, fold_aucs_score, fold_aucs_saps, fold_coefs = [], [], [], []
    for k, (tr_idx, te_idx) in enumerate(kf.split(subj_arr)):
        tr_subs = set(subj_arr[tr_idx].tolist())
        tr_ids = [s for s in all_ids if cohort[s]["subject_id"] in tr_subs]
        te_ids = [s for s in all_ids if cohort[s]["subject_id"] not in tr_subs]
        X_tr, y_tr, S_tr, _, _ = build_matrix(tr_ids, primitives, cohort)
        X_te, y_te, S_te, SAPS_te, _ = build_matrix(te_ids, primitives, cohort)
        # Standardise with training-fold statistics only.
        mu = X_tr.mean(0); sd = X_tr.std(0) + 1e-9
        lr = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
        lr.fit((X_tr - mu) / sd, y_tr)
        pred = lr.predict_proba((X_te - mu) / sd)[:, 1]
        auc_f = roc_auc_score(y_te, pred)
        auc_s = roc_auc_score(y_te, S_te)        # Ordinal score AUROC
        auc_saps = roc_auc_score(y_te, SAPS_te)  # SAPS-II baseline AUROC
        # Map the standardised coefficients back to the raw feature scale.
        raw_beta = lr.coef_[0] / sd
        raw_int = lr.intercept_[0] - sum(lr.coef_[0][i]*mu[i]/sd[i] for i in range(len(sd)))
        fold_aucs.append(auc_f)
        fold_aucs_score.append(auc_s)
        fold_aucs_saps.append(auc_saps)
        fold_coefs.append([raw_int] + list(raw_beta))
        print(f" fold {k+1}: formula AUROC={auc_f:.4f} "
              f"ordinal AUROC={auc_s:.4f} SAPS-II AUROC={auc_saps:.4f}")
    fa = np.array(fold_aucs); fas = np.array(fold_aucs_score)
    fsap = np.array(fold_aucs_saps)
    # The two range bounds are separated by "-" so they do not run together.
    print(f"\n 5-term formula CV AUROC: {fa.mean():.4f} ± {fa.std():.4f} "
          f"(range {fa.min():.4f}-{fa.max():.4f})")
    print(f" Ordinal score CV AUROC: {fas.mean():.4f} ± {fas.std():.4f} "
          f"(range {fas.min():.4f}-{fas.max():.4f})")
    print(f" SAPS-II CV AUROC: {fsap.mean():.4f} ± {fsap.std():.4f} "
          f"(range {fsap.min():.4f}-{fsap.max():.4f})")
    print(f" Ordinal loss: {fa.mean()-fas.mean():+.4f}")
    print(f" Δ vs SAPS-II (formula): {fa.mean()-fsap.mean():+.4f}")
    print(f" Δ vs SAPS-II (ordinal): {fas.mean()-fsap.mean():+.4f}")
    # SAPS-II is a fixed pre-computed score (no parameters fit here), so an
    # in-cohort AUROC is not optimistic — report it on the full Kepler cohort
    # for a single, directly comparable headline number.
    saps_auc_overall = roc_auc_score(y_all, SAPS_all)
    print(f"\n SAPS-II AUROC on full Kepler cohort (n={len(y_all):,}): "
          f"{saps_auc_overall:.4f}")
    fc = np.array(fold_coefs)
    print(f"\n Coefficient stability (5 terms):")
    names = ["intercept"] + FEATURE_LABELS
    for i, name in enumerate(names):
        col = fc[:, i]
        # Count sign changes between consecutive folds.
        flips = sum(1 for k in range(1, len(col)) if col[k]*col[k-1] < 0)
        print(f" {name:35s} {col.mean():>+9.4f} ± {col.std():>7.4f} flips={flips}")
    # ══════════════════════════════════════════════════════════════════════
    # [5] Bootstrap CIs — 5-term formula
    # ══════════════════════════════════════════════════════════════════════
    print(f"\n[5] Bootstrap CIs — 5-term formula ({N_BOOTSTRAP} resamples)")
    mu_all = X_all.mean(0); sd_all = X_all.std(0) + 1e-9
    n = len(y_all)
    boot_coefs = []
    n_skipped = 0
    rng_b = np.random.default_rng(42)
    for b in range(N_BOOTSTRAP):
        idx = rng_b.integers(0, n, n)
        X_b = X_all[idx]; y_b = y_all[idx]
        # A resample with a single outcome class cannot be fitted.
        if len(set(y_b.tolist())) < 2:
            n_skipped += 1
            continue
        try:
            lr = LogisticRegression(C=1.0, max_iter=500, random_state=42)
            lr.fit((X_b - mu_all) / sd_all, y_b)
            raw = lr.coef_[0] / sd_all
            intc = lr.intercept_[0] - sum(lr.coef_[0][i]*mu_all[i]/sd_all[i] for i in range(len(sd_all)))
            boot_coefs.append([intc] + list(raw))
        except Exception:
            # Best effort: drop the resample, but count it so the loss of
            # resamples is visible in the log.
            n_skipped += 1
            continue
        if (b+1) % 250 == 0: print(f" {b+1}/{N_BOOTSTRAP}...")
    if n_skipped:
        print(f" skipped {n_skipped} of {N_BOOTSTRAP} resamples "
              f"(single-class or failed fit)")
    if not boot_coefs:
        print("\nERROR: no bootstrap resample could be fitted.")
        sys.exit(1)
    bc = np.array(boot_coefs)
    lr_full = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
    lr_full.fit((X_all - mu_all) / sd_all, y_all)
    raw_full = lr_full.coef_[0] / sd_all
    intc_full = lr_full.intercept_[0] - sum(lr_full.coef_[0][i]*mu_all[i]/sd_all[i] for i in range(len(sd_all)))
    point_all = [intc_full] + list(raw_full)
    print(f"\n {'term':35s} {'point':>9s} {'95% CI':>22s}")
    ci_results = []
    for i, name in enumerate(names):
        col = bc[:, i]
        lo = np.percentile(col, 2.5)
        hi = np.percentile(col, 97.5)
        # NOTE(review): both branches are empty strings here, so the marker
        # carries no information — presumably two glyphs were intended;
        # confirm against the file on disk.
        crosses_zero = "" if (lo < 0 < hi) else ""
        ci_str = f"({lo:+.4f}, {hi:+.4f}) {crosses_zero}"
        print(f" {name:35s} {point_all[i]:>+9.4f} {ci_str:>22s}")
        ci_results.append({"term": name, "point": float(point_all[i]),
                           "ci_lo": float(lo), "ci_hi": float(hi),
                           "crosses_zero": bool(lo < 0 < hi)})
    # ══════════════════════════════════════════════════════════════════════
    # [6] Ordinal score distribution + mortality per score
    # ══════════════════════════════════════════════════════════════════════
    print(f"\n[6] Ordinal bedside score distribution + mortality per score")
    print(f" {'score':>5s} {'n':>5s} {'mortality':>10s} {'cum n':>6s}")
    score_buckets = defaultdict(lambda: {"n": 0, "died": 0})
    for s, y in zip(S_all, y_all):
        score_buckets[int(s)]["n"] += 1
        score_buckets[int(s)]["died"] += int(y)
    cum_n = 0
    score_rows = []
    for s in sorted(score_buckets.keys()):
        bucket = score_buckets[s]
        mort = 100 * bucket["died"] / bucket["n"] if bucket["n"] > 0 else 0
        cum_n += bucket["n"]
        bar = "" * int(mort / 3)
        print(f" {s:>5d} {bucket['n']:>5d} {mort:>8.1f}% {cum_n:>6d} {bar}")
        score_rows.append({"score": s, "n": bucket["n"], "mortality_pct": mort})
    # ══════════════════════════════════════════════════════════════════════
    # [7] Risk bands from ordinal score (clinical cut-points)
    # ══════════════════════════════════════════════════════════════════════
    print(f"\n[7] Suggested risk bands (clinically meaningful cutpoints)")
    # Score ranges shown next to each band; kept in one place so the table
    # here and the headline below cannot drift apart.
    band_ranges = {"low": "0-3", "mid": "4-7", "high": "8+"}

    # Group into LOW / MID / HIGH by score
    def band(s):
        if s <= 3: return "low"
        if s <= 7: return "mid"
        return "high"

    bands = defaultdict(lambda: {"n": 0, "died": 0})
    for s, y in zip(S_all, y_all):
        bname = band(int(s))
        bands[bname]["n"] += 1
        bands[bname]["died"] += int(y)
    print(f" {'band':6s} {'range':>7s} {'n':>5s} {'mort%':>7s}")
    band_results = {}
    for bname in ["low", "mid", "high"]:
        bucket = bands[bname]
        if bucket["n"] == 0: continue
        rng_str = band_ranges[bname]
        mort = 100 * bucket["died"] / bucket["n"]
        print(f" {bname:6s} {rng_str:>7s} {bucket['n']:>5d} {mort:>6.1f}%")
        band_results[bname] = {"n": bucket["n"], "mortality_pct": mort}
    # ══════════════════════════════════════════════════════════════════════
    # [8] Calibration on holdout — both formula and ordinal
    # ══════════════════════════════════════════════════════════════════════
    print(f"\n[8] Calibration on 30% holdout (both versions)")
    # Sort before shuffling: set iteration order follows the (unordered) SQL
    # row order, so the seeded shuffle alone would not make the split
    # reproducible. This matches how the CV section builds its subject list.
    subs = sorted(set(cohort[s]["subject_id"] for s in all_ids))
    random.Random(42).shuffle(subs)
    n_tr = int(len(subs) * TRAIN_FRAC)
    tr_subs = set(subs[:n_tr])
    tr_ids = [s for s in all_ids if cohort[s]["subject_id"] in tr_subs]
    te_ids = [s for s in all_ids if cohort[s]["subject_id"] not in tr_subs]
    X_tr, y_tr, _, _, _ = build_matrix(tr_ids, primitives, cohort)
    X_te, y_te, S_te, _, _ = build_matrix(te_ids, primitives, cohort)
    mu = X_tr.mean(0); sd = X_tr.std(0) + 1e-9
    lr_cal = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
    lr_cal.fit((X_tr - mu) / sd, y_tr)
    pred_cal = lr_cal.predict_proba((X_te - mu) / sd)[:, 1]
    order = np.argsort(pred_cal)
    deciles = np.array_split(order, 10)
    # 0.95 quantile of chi-square with 8 degrees of freedom (10 groups - 2).
    hl_critical = 15.51
    hl_stat = 0.0
    print(f" Formula deciles:")
    print(f" {'d':>2s} {'n':>4s} {'pred':>7s} {'obs':>7s}")
    calib_bins = []
    for d, idx in enumerate(deciles):
        n_d = len(idx)
        # array_split yields empty groups when the holdout has < 10 stays.
        if n_d == 0:
            continue
        p_mean = float(pred_cal[idx].mean())
        obs = int(y_te[idx].sum())
        exp = float(pred_cal[idx].sum())
        if exp > 0 and (n_d - exp) > 0:
            hl_stat += (obs - exp)**2 / exp + ((n_d - obs) - (n_d - exp))**2 / (n_d - exp)
        print(f" {d+1:>2d} {n_d:>4d} {p_mean:>6.3f} {obs/n_d:>6.3f}")
        calib_bins.append({"decile": d+1, "n": n_d,
                           "predicted": p_mean, "observed": obs/n_d})
    hl_ok = hl_stat < hl_critical
    print(f"\n Hosmer-Lemeshow χ²: {hl_stat:.2f} (critical 15.51)")
    if hl_ok:
        print(f" → Well calibrated (p > 0.05)")
    else:
        print(f" → Calibration rejected (p < 0.05)")
    brier = brier_score_loss(y_te, pred_cal)
    print(f" Brier: {brier:.4f}")
    # ══════════════════════════════════════════════════════════════════════
    # [9] FINAL HEADLINE
    # ══════════════════════════════════════════════════════════════════════
    print(f"\n[9] ══════════ FINAL HEADLINE (Phase 5b) ══════════")
    print(f"\n Cohort: n={len(all_ids):,} sepsis-3 Q4 patients")
    print(f" Mortality: {100*y_all.mean():.1f}%")
    print(f"\n 5-term formula (continuous):")
    print(f" 5-fold CV AUROC: {fa.mean():.4f} ± {fa.std():.4f}")
    # The verdict follows the test result instead of always claiming success.
    hl_label = "calibrated" if hl_ok else "NOT calibrated"
    print(f" Hosmer-Lemeshow: χ² = {hl_stat:.2f} ({hl_label})")
    print(f" Brier score: {brier:.4f}")
    print(f"\n Ordinal bedside score (0-13 pts):")
    print(f" 5-fold CV AUROC: {fas.mean():.4f} ± {fas.std():.4f}")
    print(f" AUROC loss vs continuous: {fa.mean()-fas.mean():+.4f}")
    print(f"\n SAPS-II baseline (same cohort):")
    print(f" Overall AUROC: {saps_auc_overall:.4f}")
    print(f" 5-fold CV AUROC: {fsap.mean():.4f} ± {fsap.std():.4f}")
    print(f" Δ formula SAPS-II: {fa.mean()-fsap.mean():+.4f}")
    print(f" Δ ordinal SAPS-II: {fas.mean()-fsap.mean():+.4f}")
    print(f"\n Risk bands (ordinal score):")
    for bn in ["low", "mid", "high"]:
        br = band_results.get(bn, {})
        if br:
            print(f" {bn:6s} ({band_ranges[bn]:>4s}): "
                  f"n={br['n']:>5d} mortality={br['mortality_pct']:.1f}%")
    # ── Save ────────────────────────────────────────────────────────────────
    output = {
        "cohort": {"n": len(all_ids), "mortality": float(y_all.mean())},
        "formula_5term": {
            "cv_auroc_mean": float(fa.mean()),
            "cv_auroc_std": float(fa.std()),
            "cv_auroc_range": [float(fa.min()), float(fa.max())],
            "coefficient_cis": ci_results,
            "calibration": {
                "hl_chi2": float(hl_stat),
                "brier": float(brier),
                "deciles": calib_bins,
            },
        },
        "ordinal_score": {
            "cv_auroc_mean": float(fas.mean()),
            "cv_auroc_std": float(fas.std()),
            "auroc_loss_vs_continuous": float(fa.mean() - fas.mean()),
            "score_distribution": score_rows,
            "risk_bands": band_results,
        },
        "sapsii_baseline": {
            "n": int(len(SAPS_all)),
            "sapsii_mean": float(SAPS_all.mean()),
            "sapsii_min": float(SAPS_all.min()),
            "sapsii_max": float(SAPS_all.max()),
            "auroc_overall": float(saps_auc_overall),
            "cv_auroc_mean": float(fsap.mean()),
            "cv_auroc_std": float(fsap.std()),
            "cv_auroc_range": [float(fsap.min()), float(fsap.max())],
            "delta_formula_minus_sapsii": float(fa.mean() - fsap.mean()),
            "delta_ordinal_minus_sapsii": float(fas.mean() - fsap.mean()),
        },
    }
    with open(OUT_FILE, "w") as f:
        json.dump(output, f, indent=2, default=str)
    print(f"\n → Saved: {OUT_FILE}")
    print("\n" + ""*78 + "\n")
if __name__ == "__main__":
main()

605
sql/schemas.sql Normal file
View File

@@ -0,0 +1,605 @@
-- ------------------------------------------------------------------
-- Reference CREATE TABLE schemas for every derived table produced by
-- sql/build_sapsii.sql
-- sql/build_sepsis3.sql
--
-- This file is documentation only. The actual build scripts use
-- `DROP TABLE IF EXISTS ...; CREATE TABLE ... AS SELECT ...`, so
-- column types are inferred by PostgreSQL at build time from the
-- MIMIC-III v1.3 base schema and from the expressions in the SELECT.
-- The types below match what PostgreSQL infers when the build is run
-- on a stock MIMIC-III v1.3 PostgreSQL restore (where for example
-- chartevents.valuenum is DOUBLE PRECISION, outputevents.value is
-- DOUBLE PRECISION, *.charttime is TIMESTAMP(0), etc.).
--
-- Use this file as:
-- * a quick reference for column names and types of each derived
-- table (handy for downstream consumers that need to know the
-- output schema without grep'ing through the build SQL);
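-- * a stub for declaring empty derived tables ahead of time (e.g.
-- in a migration that just `CREATE TABLE IF NOT EXISTS ...`s
-- them, then later runs the build to populate them). Note that
-- every CREATE TABLE below is preceded by `DROP TABLE IF EXISTS`,
-- so executing this file as-is discards any table that has
-- already been built; remove the DROP statements for that use;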
-- * a checklist when porting these scripts to another flavor of
-- MIMIC (e.g. MIMIC-III v1.4 or MIMIC-IV).
-- ------------------------------------------------------------------
-- ==================================================================
-- SAPS-II
-- ==================================================================
-- 1. Helper: all-time urine output (from outputevents).
DROP TABLE IF EXISTS urine_output;
CREATE TABLE urine_output (
icustay_id INTEGER,
charttime TIMESTAMP(0),
value DOUBLE PRECISION
);
-- 2. Ventilation: classification (per charttime) and durations
-- (per ventilation episode).
DROP TABLE IF EXISTS ventilation_classification;
CREATE TABLE ventilation_classification (
icustay_id INTEGER,
charttime TIMESTAMP(0),
mechvent INTEGER,
oxygentherapy INTEGER,
extubated INTEGER,
selfextubated INTEGER
);
DROP TABLE IF EXISTS ventilation_durations;
CREATE TABLE ventilation_durations (
icustay_id INTEGER,
ventnum BIGINT,
starttime TIMESTAMP(0),
endtime TIMESTAMP(0),
duration_hours NUMERIC
);
-- 3. First-day pivots feeding SAPS-II.
DROP TABLE IF EXISTS blood_gas_first_day;
CREATE TABLE blood_gas_first_day (
subject_id INTEGER,
hadm_id INTEGER,
icustay_id INTEGER,
charttime TIMESTAMP(0),
specimen VARCHAR(200),
aado2 DOUBLE PRECISION,
baseexcess DOUBLE PRECISION,
bicarbonate DOUBLE PRECISION,
totalco2 DOUBLE PRECISION,
carboxyhemoglobin DOUBLE PRECISION,
chloride DOUBLE PRECISION,
calcium DOUBLE PRECISION,
glucose DOUBLE PRECISION,
hematocrit DOUBLE PRECISION,
hemoglobin DOUBLE PRECISION,
intubated DOUBLE PRECISION,
lactate DOUBLE PRECISION,
methemoglobin DOUBLE PRECISION,
o2flow DOUBLE PRECISION,
fio2 DOUBLE PRECISION,
so2 DOUBLE PRECISION,
pco2 DOUBLE PRECISION,
peep DOUBLE PRECISION,
ph DOUBLE PRECISION,
po2 DOUBLE PRECISION,
potassium DOUBLE PRECISION,
requiredo2 DOUBLE PRECISION,
sodium DOUBLE PRECISION,
temperature DOUBLE PRECISION,
tidalvolume DOUBLE PRECISION,
ventilationrate DOUBLE PRECISION,
ventilator DOUBLE PRECISION
);
DROP TABLE IF EXISTS blood_gas_first_day_arterial;
CREATE TABLE blood_gas_first_day_arterial (
subject_id INTEGER,
hadm_id INTEGER,
icustay_id INTEGER,
charttime TIMESTAMP(0),
specimen VARCHAR(200),
specimen_pred VARCHAR(200),
specimen_prob DOUBLE PRECISION,
so2 DOUBLE PRECISION,
spo2 DOUBLE PRECISION,
po2 DOUBLE PRECISION,
pco2 DOUBLE PRECISION,
fio2_chartevents DOUBLE PRECISION,
fio2 DOUBLE PRECISION,
aado2 DOUBLE PRECISION,
aado2_calc DOUBLE PRECISION,
pao2fio2 DOUBLE PRECISION,
ph DOUBLE PRECISION,
baseexcess DOUBLE PRECISION,
bicarbonate DOUBLE PRECISION,
totalco2 DOUBLE PRECISION,
hematocrit DOUBLE PRECISION,
hemoglobin DOUBLE PRECISION,
carboxyhemoglobin DOUBLE PRECISION,
methemoglobin DOUBLE PRECISION,
chloride DOUBLE PRECISION,
calcium DOUBLE PRECISION,
temperature DOUBLE PRECISION,
potassium DOUBLE PRECISION,
sodium DOUBLE PRECISION,
lactate DOUBLE PRECISION,
glucose DOUBLE PRECISION,
intubated DOUBLE PRECISION,
tidalvolume DOUBLE PRECISION,
ventilationrate DOUBLE PRECISION,
ventilator DOUBLE PRECISION,
peep DOUBLE PRECISION,
o2flow DOUBLE PRECISION,
requiredo2 DOUBLE PRECISION
);
DROP TABLE IF EXISTS gcs_first_day;
CREATE TABLE gcs_first_day (
subject_id INTEGER,
hadm_id INTEGER,
icustay_id INTEGER,
mingcs DOUBLE PRECISION,
gcsmotor DOUBLE PRECISION,
gcsverbal DOUBLE PRECISION,
gcseyes DOUBLE PRECISION,
endotrachflag INTEGER
);
DROP TABLE IF EXISTS labs_first_day;
CREATE TABLE labs_first_day (
subject_id INTEGER,
hadm_id INTEGER,
icustay_id INTEGER,
aniongap_min DOUBLE PRECISION,
aniongap_max DOUBLE PRECISION,
albumin_min DOUBLE PRECISION,
albumin_max DOUBLE PRECISION,
bands_min DOUBLE PRECISION,
bands_max DOUBLE PRECISION,
bicarbonate_min DOUBLE PRECISION,
bicarbonate_max DOUBLE PRECISION,
bilirubin_min DOUBLE PRECISION,
bilirubin_max DOUBLE PRECISION,
creatinine_min DOUBLE PRECISION,
creatinine_max DOUBLE PRECISION,
chloride_min DOUBLE PRECISION,
chloride_max DOUBLE PRECISION,
glucose_min DOUBLE PRECISION,
glucose_max DOUBLE PRECISION,
hematocrit_min DOUBLE PRECISION,
hematocrit_max DOUBLE PRECISION,
hemoglobin_min DOUBLE PRECISION,
hemoglobin_max DOUBLE PRECISION,
lactate_min DOUBLE PRECISION,
lactate_max DOUBLE PRECISION,
platelet_min DOUBLE PRECISION,
platelet_max DOUBLE PRECISION,
potassium_min DOUBLE PRECISION,
potassium_max DOUBLE PRECISION,
ptt_min DOUBLE PRECISION,
ptt_max DOUBLE PRECISION,
inr_min DOUBLE PRECISION,
inr_max DOUBLE PRECISION,
pt_min DOUBLE PRECISION,
pt_max DOUBLE PRECISION,
sodium_min DOUBLE PRECISION,
sodium_max DOUBLE PRECISION,
bun_min DOUBLE PRECISION,
bun_max DOUBLE PRECISION,
wbc_min DOUBLE PRECISION,
wbc_max DOUBLE PRECISION
);
DROP TABLE IF EXISTS urine_output_first_day;
CREATE TABLE urine_output_first_day (
subject_id INTEGER,
hadm_id INTEGER,
icustay_id INTEGER,
urineoutput DOUBLE PRECISION
);
DROP TABLE IF EXISTS vitals_first_day;
CREATE TABLE vitals_first_day (
subject_id INTEGER,
hadm_id INTEGER,
icustay_id INTEGER,
heartrate_min DOUBLE PRECISION,
heartrate_max DOUBLE PRECISION,
heartrate_mean DOUBLE PRECISION,
sysbp_min DOUBLE PRECISION,
sysbp_max DOUBLE PRECISION,
sysbp_mean DOUBLE PRECISION,
diasbp_min DOUBLE PRECISION,
diasbp_max DOUBLE PRECISION,
diasbp_mean DOUBLE PRECISION,
meanbp_min DOUBLE PRECISION,
meanbp_max DOUBLE PRECISION,
meanbp_mean DOUBLE PRECISION,
resprate_min DOUBLE PRECISION,
resprate_max DOUBLE PRECISION,
resprate_mean DOUBLE PRECISION,
tempc_min DOUBLE PRECISION,
tempc_max DOUBLE PRECISION,
tempc_mean DOUBLE PRECISION,
spo2_min DOUBLE PRECISION,
spo2_max DOUBLE PRECISION,
spo2_mean DOUBLE PRECISION,
glucose_min DOUBLE PRECISION,
glucose_max DOUBLE PRECISION,
glucose_mean DOUBLE PRECISION
);
-- 4. Final SAPS-II score table (one row per ICU stay).
-- sapsii and sapsii_prob are followed by fifteen INTEGER *_score
-- columns, one per scoring component. None of them is declared
-- NOT NULL, so a component may be NULL.
-- NOTE(review): presumably sapsii is the sum of the component scores
-- and sapsii_prob the mortality probability derived from it -- confirm
-- against the build script.
DROP TABLE IF EXISTS sapsii;
CREATE TABLE sapsii (
subject_id INTEGER,
hadm_id INTEGER,
icustay_id INTEGER,
sapsii INTEGER,
sapsii_prob DOUBLE PRECISION,
age_score INTEGER,
hr_score INTEGER,
sysbp_score INTEGER,
temp_score INTEGER,
pao2fio2_score INTEGER,
uo_score INTEGER,
bun_score INTEGER,
wbc_score INTEGER,
potassium_score INTEGER,
sodium_score INTEGER,
bicarbonate_score INTEGER,
bilirubin_score INTEGER,
gcs_score INTEGER,
comorbidity_score INTEGER,
admissiontype_score INTEGER
);
-- ==================================================================
-- Sepsis-3
-- ==================================================================
--
-- Sepsis-3 reuses these SAPS-II tables:
-- urine_output, ventilation_classification, ventilation_durations
-- (defined above). The tables below are the ones added by
-- build_sepsis3.sql.
-- 1. Echo extraction (used to impute weight when chartevents weight
-- is missing). row_id ties each record back to its noteevents row.
-- chartdate and charttime carry different precisions (seconds vs
-- milliseconds), and the numeric measurements use NUMERIC rather than
-- the DOUBLE PRECISION used by the neighbouring derived tables.
-- NOTE(review): bp looks like the raw text reading with bpsys / bpdias
-- as its parsed parts -- confirm against the extraction query.
DROP TABLE IF EXISTS echo_data;
CREATE TABLE echo_data (
row_id INTEGER,
subject_id INTEGER,
hadm_id INTEGER,
chartdate TIMESTAMP(0),
charttime TIMESTAMP(3),
indication TEXT,
height NUMERIC,
weight NUMERIC,
bsa NUMERIC,
bp TEXT,
bpsys NUMERIC,
bpdias NUMERIC,
hr NUMERIC,
status TEXT,
test TEXT,
doppler TEXT,
contrast TEXT,
technicalquality TEXT
);
-- 2. Per-stay weight durations (admit + daily + neonate + echo
-- imputed); used for mcg/kg/min vasopressor unit conversion.
-- Each row gives one weight value for an icustay_id over the span
-- starttime..endtime.
-- NOTE(review): the weight unit (presumably kg, given the mcg/kg/min
-- use) and whether endtime is inclusive are not fixed by this DDL --
-- confirm against the build script.
DROP TABLE IF EXISTS weight_durations;
CREATE TABLE weight_durations (
icustay_id INTEGER,
starttime TIMESTAMP(0),
endtime TIMESTAMP(0),
weight DOUBLE PRECISION
);
-- 3. Vasopressor dose tables. All four have the same schema; rates
-- are merged CareVue + MetaVision and converted to mcg/kg/min.
-- dobutamine_dose: each row is a time span (starttime..endtime) for one
-- icustay_id with the vaso_rate and vaso_amount recorded for that span.
-- NOTE(review): the unit of vaso_amount is not stated here -- confirm.
DROP TABLE IF EXISTS dobutamine_dose;
CREATE TABLE dobutamine_dose (
icustay_id INTEGER,
starttime TIMESTAMP(0),
endtime TIMESTAMP(0),
vaso_rate DOUBLE PRECISION,
vaso_amount DOUBLE PRECISION
);
-- dopamine_dose: same five-column layout as the other vasopressor
-- *_dose tables (icustay_id, time span, vaso_rate, vaso_amount).
-- vaso_rate is in mcg/kg/min per the vasopressor section note.
-- No key or uniqueness constraint is declared in this statement.
DROP TABLE IF EXISTS dopamine_dose;
CREATE TABLE dopamine_dose (
icustay_id INTEGER,
starttime TIMESTAMP(0),
endtime TIMESTAMP(0),
vaso_rate DOUBLE PRECISION,
vaso_amount DOUBLE PRECISION
);
-- epinephrine_dose: epinephrine counterpart of the vasopressor *_dose
-- tables; identical columns, with vaso_rate in mcg/kg/min per the
-- vasopressor section note.
-- NOTE(review): presumably the source of sofa_vaso.rate_epinephrine --
-- confirm in the build script.
DROP TABLE IF EXISTS epinephrine_dose;
CREATE TABLE epinephrine_dose (
icustay_id INTEGER,
starttime TIMESTAMP(0),
endtime TIMESTAMP(0),
vaso_rate DOUBLE PRECISION,
vaso_amount DOUBLE PRECISION
);
-- norepinephrine_dose: norepinephrine counterpart of the vasopressor
-- *_dose tables; identical columns, with vaso_rate in mcg/kg/min per
-- the vasopressor section note.
-- NOTE(review): presumably the source of sofa_vaso.rate_norepinephrine
-- -- confirm in the build script.
DROP TABLE IF EXISTS norepinephrine_dose;
CREATE TABLE norepinephrine_dose (
icustay_id INTEGER,
starttime TIMESTAMP(0),
endtime TIMESTAMP(0),
vaso_rate DOUBLE PRECISION,
vaso_amount DOUBLE PRECISION
);
-- 4. All-time pivots feeding hourly SOFA.
-- blood_gas_arterial: a wide row of blood-gas related values at a
-- charttime, identified by subject_id / hadm_id / icustay_id.
-- specimen_pred / specimen_prob sit next to the recorded specimen.
-- NOTE(review): presumably a predicted specimen type and its
-- probability, used to select arterial samples -- confirm in the
-- build script.
-- intubated and ventilator are typed DOUBLE PRECISION here, not
-- INTEGER or BOOLEAN, so treat them as numeric when filtering.
DROP TABLE IF EXISTS blood_gas_arterial;
CREATE TABLE blood_gas_arterial (
subject_id INTEGER,
hadm_id INTEGER,
icustay_id INTEGER,
charttime TIMESTAMP(0),
specimen VARCHAR(200),
specimen_pred VARCHAR(200),
specimen_prob DOUBLE PRECISION,
so2 DOUBLE PRECISION,
spo2 DOUBLE PRECISION,
po2 DOUBLE PRECISION,
pco2 DOUBLE PRECISION,
fio2_chartevents DOUBLE PRECISION,
fio2 DOUBLE PRECISION,
aado2 DOUBLE PRECISION,
aado2_calc DOUBLE PRECISION,
pao2fio2 DOUBLE PRECISION,
ph DOUBLE PRECISION,
baseexcess DOUBLE PRECISION,
bicarbonate DOUBLE PRECISION,
totalco2 DOUBLE PRECISION,
hematocrit DOUBLE PRECISION,
hemoglobin DOUBLE PRECISION,
carboxyhemoglobin DOUBLE PRECISION,
methemoglobin DOUBLE PRECISION,
chloride DOUBLE PRECISION,
calcium DOUBLE PRECISION,
temperature DOUBLE PRECISION,
potassium DOUBLE PRECISION,
sodium DOUBLE PRECISION,
lactate DOUBLE PRECISION,
glucose DOUBLE PRECISION,
intubated DOUBLE PRECISION,
tidalvolume DOUBLE PRECISION,
ventilationrate DOUBLE PRECISION,
ventilator DOUBLE PRECISION,
peep DOUBLE PRECISION,
o2flow DOUBLE PRECISION,
requiredo2 DOUBLE PRECISION
);
-- gcs_all: a GCS value per icustay_id and charttime, plus endotrachflag.
-- The sofa_gcs note in this file states that the carry-forward and
-- ET-trach=15 rules are already applied to these rows.
-- NOTE(review): endotrachflag presumably marks rows where the ET-trach
-- rule applied -- confirm in the build script.
DROP TABLE IF EXISTS gcs_all;
CREATE TABLE gcs_all (
icustay_id INTEGER,
charttime TIMESTAMP(0),
gcs DOUBLE PRECISION,
endotrachflag INTEGER
);
-- 5. Hourly SOFA pipeline. Each measurement class is materialised
-- into a narrow staging table keyed by (icustay_id, hr); these
-- are kept (not dropped) so each stage can be inspected with
-- EXPLAIN ANALYZE.
-- 5a. Hourly grid (one row per ICU hour per stay).
-- hr is the hour index shared with the sofa_* staging tables;
-- starttime / endtime bound that hour.
-- NOTE(review): the origin of hr (e.g. 0 at ICU admission) is not
-- fixed by this DDL -- confirm in the build script.
DROP TABLE IF EXISTS sofa_grid;
CREATE TABLE sofa_grid (
subject_id INTEGER,
hadm_id INTEGER,
icustay_id INTEGER,
hr INTEGER,
starttime TIMESTAMP(0),
endtime TIMESTAMP(0)
);
-- 5b. MAP minimum within each hour.
-- Keyed by (icustay_id, hr) by convention only: this statement
-- declares no PRIMARY KEY or unique constraint.
-- NOTE(review): hours without a MAP reading presumably have no row or
-- a NULL meanbp_min -- confirm in the build script.
DROP TABLE IF EXISTS sofa_vs;
CREATE TABLE sofa_vs (
icustay_id INTEGER,
hr INTEGER,
meanbp_min DOUBLE PRECISION
);
-- 5c. GCS minimum within each hour (from gcs_all, which already has
-- the carry-forward and ET-trach=15 rules applied).
-- gcs_min is DOUBLE PRECISION, the same type as gcs_all.gcs.
-- No key constraint is declared; (icustay_id, hr) is the intended key.
DROP TABLE IF EXISTS sofa_gcs;
CREATE TABLE sofa_gcs (
icustay_id INTEGER,
hr INTEGER,
gcs_min DOUBLE PRECISION
);
-- 5d. Bilirubin maximum within each hour.
-- Staging table on the (icustay_id, hr) grid; bilirubin_max is nullable.
-- NOTE(review): the unit is not recorded in the schema (presumably
-- that of the source lab values) -- confirm in the build script.
DROP TABLE IF EXISTS sofa_bili;
CREATE TABLE sofa_bili (
icustay_id INTEGER,
hr INTEGER,
bilirubin_max DOUBLE PRECISION
);
-- 5e. Creatinine maximum within each hour.
-- Staging table on the (icustay_id, hr) grid; creatinine_max is
-- nullable and its unit is not recorded in the schema.
-- NOTE(review): presumably feeds the renal component together with
-- sofa_uo -- confirm in the build script.
DROP TABLE IF EXISTS sofa_cr;
CREATE TABLE sofa_cr (
icustay_id INTEGER,
hr INTEGER,
creatinine_max DOUBLE PRECISION
);
-- 5f. Platelet minimum within each hour.
-- Staging table on the (icustay_id, hr) grid; platelet_min is nullable.
-- NOTE(review): presumably feeds the coagulation component -- confirm
-- in the build script.
DROP TABLE IF EXISTS sofa_plt;
CREATE TABLE sofa_plt (
icustay_id INTEGER,
hr INTEGER,
platelet_min DOUBLE PRECISION
);
-- 5g. PaO2/FiO2: split into vent / no-vent based on whether an
-- active ventilation episode covered the blood gas.
-- The two value columns hold the ratio for the non-ventilated and
-- the ventilated case respectively.
-- NOTE(review): presumably each is a per-hour minimum and NULL when no
-- blood gas of that kind fell in the hour -- confirm in the build script.
DROP TABLE IF EXISTS sofa_pf;
CREATE TABLE sofa_pf (
icustay_id INTEGER,
hr INTEGER,
pao2fio2_novent DOUBLE PRECISION,
pao2fio2_vent DOUBLE PRECISION
);
-- 5h. Urine output rolling sum + count of distinct charted hours
-- within the past 24 h.
-- uo_24hr carries the sum and uo_tm_24hr the hour count; BIGINT is the
-- type PostgreSQL's COUNT() returns.
-- NOTE(review): the count presumably lets downstream scoring tell a
-- full 24 h window from a partial one -- confirm in the build script.
DROP TABLE IF EXISTS sofa_uo;
CREATE TABLE sofa_uo (
icustay_id INTEGER,
hr INTEGER,
uo_24hr DOUBLE PRECISION,
uo_tm_24hr BIGINT
);
-- 5i. Vasopressor rate snapshot at the hour boundary.
-- One rate_* column per vasopressor *_dose table in this file
-- (epinephrine, norepinephrine, dopamine, dobutamine).
-- NOTE(review): a rate is presumably NULL when that drug was not
-- running at the hour boundary -- confirm in the build script.
DROP TABLE IF EXISTS sofa_vaso;
CREATE TABLE sofa_vaso (
icustay_id INTEGER,
hr INTEGER,
rate_epinephrine DOUBLE PRECISION,
rate_norepinephrine DOUBLE PRECISION,
rate_dopamine DOUBLE PRECISION,
rate_dobutamine DOUBLE PRECISION
);
-- 5j. Wide assembly: grid LEFT JOINed onto every measurement table.
-- Columns are the six sofa_grid columns followed by the value columns
-- of the staging tables in pipeline order (sofa_vs, sofa_gcs,
-- sofa_bili, sofa_cr, sofa_plt, sofa_pf, sofa_uo, sofa_vaso).
-- Because the join is a LEFT JOIN, a measurement column is NULL for
-- any grid hour with no matching staging row.
DROP TABLE IF EXISTS sofa_wide;
CREATE TABLE sofa_wide (
subject_id INTEGER,
hadm_id INTEGER,
icustay_id INTEGER,
hr INTEGER,
starttime TIMESTAMP(0),
endtime TIMESTAMP(0),
meanbp_min DOUBLE PRECISION,
gcs_min DOUBLE PRECISION,
bilirubin_max DOUBLE PRECISION,
creatinine_max DOUBLE PRECISION,
platelet_min DOUBLE PRECISION,
pao2fio2_novent DOUBLE PRECISION,
pao2fio2_vent DOUBLE PRECISION,
uo_24hr DOUBLE PRECISION,
uo_tm_24hr BIGINT,
rate_epinephrine DOUBLE PRECISION,
rate_norepinephrine DOUBLE PRECISION,
rate_dopamine DOUBLE PRECISION,
rate_dobutamine DOUBLE PRECISION
);
-- 5k. Per-hour component scores (no rolling window yet).
-- Repeats every sofa_wide column in the same order, then appends six
-- INTEGER component scores: respiration, coagulation, liver,
-- cardiovascular, cns, renal.
-- The component columns are nullable (no NOT NULL is declared).
-- NOTE(review): a component is presumably NULL when its inputs are
-- missing for the hour -- confirm in the build script.
DROP TABLE IF EXISTS sofa_components;
CREATE TABLE sofa_components (
subject_id INTEGER,
hadm_id INTEGER,
icustay_id INTEGER,
hr INTEGER,
starttime TIMESTAMP(0),
endtime TIMESTAMP(0),
meanbp_min DOUBLE PRECISION,
gcs_min DOUBLE PRECISION,
bilirubin_max DOUBLE PRECISION,
creatinine_max DOUBLE PRECISION,
platelet_min DOUBLE PRECISION,
pao2fio2_novent DOUBLE PRECISION,
pao2fio2_vent DOUBLE PRECISION,
uo_24hr DOUBLE PRECISION,
uo_tm_24hr BIGINT,
rate_epinephrine DOUBLE PRECISION,
rate_norepinephrine DOUBLE PRECISION,
rate_dopamine DOUBLE PRECISION,
rate_dobutamine DOUBLE PRECISION,
respiration INTEGER,
coagulation INTEGER,
liver INTEGER,
cardiovascular INTEGER,
cns INTEGER,
renal INTEGER
);
-- 5l. Final hourly SOFA: 24-hour rolling MAX per component, summed.
-- Layout: the six grid columns, the six per-hour component scores,
-- their six *_24hours rolling counterparts, and sofa_24hours as the
-- total. The raw measurement columns of sofa_components are not
-- carried over into this table.
-- NOTE(review): how NULL components enter the sum (e.g. treated as 0)
-- is not visible in this DDL -- confirm in the build script.
DROP TABLE IF EXISTS sofa_hourly;
CREATE TABLE sofa_hourly (
subject_id INTEGER,
hadm_id INTEGER,
icustay_id INTEGER,
hr INTEGER,
starttime TIMESTAMP(0),
endtime TIMESTAMP(0),
respiration INTEGER,
coagulation INTEGER,
liver INTEGER,
cardiovascular INTEGER,
cns INTEGER,
renal INTEGER,
respiration_24hours INTEGER,
coagulation_24hours INTEGER,
liver_24hours INTEGER,
cardiovascular_24hours INTEGER,
cns_24hours INTEGER,
renal_24hours INTEGER,
sofa_24hours INTEGER
);
-- 6. Suspicion of infection.
-- antibiotic: an antibiotic name with its route and start / stop
-- times, identified by subject_id / hadm_id / icustay_id.
-- NOTE(review): presumably one row per antibiotic prescription --
-- confirm in the build script.
DROP TABLE IF EXISTS antibiotic;
CREATE TABLE antibiotic (
subject_id INTEGER,
hadm_id INTEGER,
icustay_id INTEGER,
antibiotic VARCHAR(255),
route VARCHAR(120),
-- MIMIC-III prescriptions has DATE-precision startdate / enddate
-- (stored as TIMESTAMP(0) but always at 00:00:00).
-- NOTE(review): starttime / stoptime therefore presumably resolve
-- only to the day -- confirm before comparing with charted times.
starttime TIMESTAMP(0),
stoptime TIMESTAMP(0)
);
-- suspicion_of_infection: an antibiotic (ab_id, antibiotic,
-- antibiotic_time) together with culture details (culture_time,
-- specimen, positive_culture) and the resulting suspected_infection
-- flag and time.
-- The id columns are ordered subject_id, icustay_id, hadm_id here,
-- unlike the subject_id, hadm_id, icustay_id order of the antibiotic
-- table; mind this with positional INSERT / SELECT *.
-- The time columns are plain TIMESTAMP (no precision modifier), unlike
-- the TIMESTAMP(0) columns of the antibiotic table.
-- NOTE(review): suspected_infection and positive_culture look like 0/1
-- flags stored as INTEGER -- confirm in the build script.
DROP TABLE IF EXISTS suspicion_of_infection;
CREATE TABLE suspicion_of_infection (
subject_id INTEGER,
icustay_id INTEGER,
hadm_id INTEGER,
ab_id BIGINT,
antibiotic VARCHAR(255),
antibiotic_time TIMESTAMP,
suspected_infection INTEGER,
suspected_infection_time TIMESTAMP,
culture_time TIMESTAMP,
specimen VARCHAR(100),
positive_culture INTEGER
);
-- 7. Final Sepsis-3 onset table (one row per ICU stay).
-- Carries subject_id and icustay_id only; there is no hadm_id column,
-- so join via icustay_id to reach the hospital admission.
-- sofa_time is TIMESTAMP(0) while the three infection-related times
-- are plain TIMESTAMP.
-- The BOOLEAN column sepsis3 shares its name with the table, so
-- qualify it (alias.sepsis3) in queries to avoid confusion with a
-- whole-row reference.
-- NOTE(review): sofa_score is presumably the sum of the six component
-- columns at sofa_time -- confirm in the build script.
DROP TABLE IF EXISTS sepsis3;
CREATE TABLE sepsis3 (
subject_id INTEGER,
icustay_id INTEGER,
antibiotic_time TIMESTAMP,
culture_time TIMESTAMP,
suspected_infection_time TIMESTAMP,
sofa_time TIMESTAMP(0),
sofa_score INTEGER,
respiration INTEGER,
coagulation INTEGER,
liver INTEGER,
cardiovascular INTEGER,
cns INTEGER,
renal INTEGER,
sepsis3 BOOLEAN
);