-- ------------------------------------------------------------------
-- Sepsis-3 sanity checks for MIMIC-III v1.3.
--
-- Usage:
--   psql -d mimic -v ON_ERROR_STOP=1 \
--        -c 'SET search_path TO mimiciii, public;' \
--        -f sql/sepsis/sanity_checks.sql
--
-- Each section prints a short result set. Compare against the
-- "EXPECTED" comment. None of these are pass/fail tests; they are
-- bounds-style checks designed to catch obvious upstream breakage
-- (an empty staging table, an off-by-one in the hourly grid, a
-- vasopressor unit-conversion error, etc.).
--
-- Error handling:
--   The script sets ON_ERROR_STOP itself (see below), which takes
--   precedence over any -v ON_ERROR_STOP=... given on the command
--   line. Section 13 depends on an optional table (sapsii), so it is
--   the only section that runs with ON_ERROR_STOP switched off.
--
-- Reference numbers come from:
--   Seymour CW et al., JAMA 2016 (the Sepsis-3 paper)
--   Johnson AEW et al., Crit Care Med 2018 ("A Comparative Analysis
--     of Sepsis Identification Methods in an Electronic Database",
--     which reproduces Sepsis-3 on MIMIC-III)
-- ------------------------------------------------------------------

\set ON_ERROR_STOP on
\timing on

\echo
\echo '=================================================================='
\echo ' 1. Row counts of every table in the pipeline'
\echo '=================================================================='
\echo "EXPECTED (MIMIC-III v1.3 full restore, all 61.5k ICU stays):"
\echo " icustays ~ 61 532"
\echo " sofa_grid ~ 6 - 8 M (60k stays * ~4d mean LOS * 24h)"
\echo " sofa_hourly same as sofa_grid"
\echo " blood_gas_arterial ~ 500 k - 1 M"
\echo " gcs_all ~ 4 - 6 M"
\echo " antibiotic ~ 500 k - 700 k prescription rows"
\echo " suspicion_of_infection same as antibiotic"
\echo " sepsis3 ~ 20 k - 30 k rows (one row per ICU stay"
\echo " that ever had any abx + qualifying SOFA)"
\echo

-- One row per pipeline table, sorted by table name.
SELECT 'icustays' AS table_name, count(*) AS n FROM icustays
UNION ALL
SELECT 'sofa_grid', count(*) FROM sofa_grid
UNION ALL
SELECT 'sofa_vs', count(*) FROM sofa_vs
UNION ALL
SELECT 'sofa_gcs', count(*) FROM sofa_gcs
UNION ALL
SELECT 'sofa_bili', count(*) FROM sofa_bili
UNION ALL
SELECT 'sofa_cr', count(*) FROM sofa_cr
UNION ALL
SELECT 'sofa_plt', count(*) FROM sofa_plt
UNION ALL
SELECT 'sofa_pf', count(*) FROM sofa_pf
UNION ALL
SELECT 'sofa_uo', count(*) FROM sofa_uo
UNION ALL
SELECT 'sofa_vaso', count(*) FROM sofa_vaso
UNION ALL
SELECT 'sofa_wide', count(*) FROM sofa_wide
UNION ALL
SELECT 'sofa_components', count(*) FROM sofa_components
UNION ALL
SELECT 'sofa_hourly', count(*) FROM sofa_hourly
UNION ALL
SELECT 'blood_gas_arterial', count(*) FROM blood_gas_arterial
UNION ALL
SELECT 'gcs_all', count(*) FROM gcs_all
UNION ALL
SELECT 'antibiotic', count(*) FROM antibiotic
UNION ALL
SELECT 'suspicion_of_infection', count(*) FROM suspicion_of_infection
UNION ALL
SELECT 'sepsis3', count(*) FROM sepsis3
ORDER BY table_name;

\echo
\echo '=================================================================='
\echo ' 2. Hourly grid integrity'
\echo '=================================================================='
\echo "EXPECTED:"
\echo " bad_hr_seq = 0 (hours per stay must be 1..N consecutive)"
\echo " bad_endtime = 0 (endtime > starttime)"
\echo " duplicate_grid = 0 (no (icustay_id, hr) duplicates)"
\echo " grid_eq_hourly = 0 (sofa_grid and sofa_hourly row counts match)"
\echo

-- per_stay: first hour, last hour and row count of each stay's grid.
-- bad_hr_seq compares min/max/count only, so it proves "1..N with no
-- gaps" only when duplicate_grid is also 0; read the two together.
-- grid_eq_hourly is a row-count DIFFERENCE (grid minus hourly), not a
-- boolean; 0 means the two tables have the same number of rows.
WITH per_stay AS (
    SELECT
        icustay_id,
        min(hr)  AS hr_min,
        max(hr)  AS hr_max,
        count(*) AS n
    FROM sofa_grid
    GROUP BY icustay_id
)
SELECT
    (SELECT count(*) FROM per_stay WHERE hr_min != 1 OR hr_max != n)
        AS bad_hr_seq,
    (SELECT count(*) FROM sofa_grid WHERE endtime <= starttime)
        AS bad_endtime,
    (SELECT count(*) - count(DISTINCT (icustay_id, hr)) FROM sofa_grid)
        AS duplicate_grid,
    (SELECT count(*) FROM sofa_grid) - (SELECT count(*) FROM sofa_hourly)
        AS grid_eq_hourly;

\echo
\echo '=================================================================='
\echo ' 3. Per-component SOFA score ranges'
\echo '=================================================================='
\echo "EXPECTED: every per-hour component score is in [0, 4] or NULL."
\echo " Any value outside that range indicates a logic bug."
\echo

-- Observed min/max of each of the six component scores.
SELECT 'respiration' AS component, min(respiration) AS min, max(respiration) AS max
FROM sofa_components
UNION ALL
SELECT 'coagulation', min(coagulation), max(coagulation)
FROM sofa_components
UNION ALL
SELECT 'liver', min(liver), max(liver)
FROM sofa_components
UNION ALL
SELECT 'cardiovascular', min(cardiovascular), max(cardiovascular)
FROM sofa_components
UNION ALL
SELECT 'cns', min(cns), max(cns)
FROM sofa_components
UNION ALL
SELECT 'renal', min(renal), max(renal)
FROM sofa_components
ORDER BY component;

\echo
\echo '=================================================================='
\echo ' 4. 24-hour rolling SOFA distribution'
\echo '=================================================================='
\echo "EXPECTED:"
\echo " total_sofa min = 0, max ~ 20-24"
\echo " median per-hour total_sofa ~ 2-4"
\echo " Distribution should be heavy-tailed; ~70-80% of hours <= 5,"
\echo " ~5-10% of hours >= 10."
\echo

-- Summary statistics of sofa_24hours over every row of sofa_hourly.
SELECT
    min(sofa_24hours)                                         AS sofa_min,
    max(sofa_24hours)                                         AS sofa_max,
    round(avg(sofa_24hours)::numeric, 2)                      AS sofa_mean,
    percentile_disc(0.50) WITHIN GROUP (ORDER BY sofa_24hours) AS sofa_p50,
    percentile_disc(0.90) WITHIN GROUP (ORDER BY sofa_24hours) AS sofa_p90,
    percentile_disc(0.99) WITHIN GROUP (ORDER BY sofa_24hours) AS sofa_p99
FROM sofa_hourly;

\echo
\echo '=================================================================='
\echo ' 5. Day-1 max SOFA per stay (compare with SAPS-II severity)'
\echo '=================================================================='
\echo "EXPECTED for adult ICU (per Singer 2016, Vincent 1996):"
\echo " median day-1 SOFA ~ 4-6"
\echo " ~60-70% of stays have day-1 SOFA >= 2 (Sepsis-3 organ-dys threshold)"
\echo

-- d1: highest sofa_24hours seen in rows with hr <= 24, one row per stay.
-- NULLIF keeps the percentages NULL (instead of relying on NULL
-- propagation) when sofa_hourly is empty, matching section 10.
WITH d1 AS (
    SELECT
        icustay_id,
        max(sofa_24hours) AS day1_sofa
    FROM sofa_hourly
    WHERE hr <= 24
    GROUP BY icustay_id
)
SELECT
    count(*)                                                AS n_stays,
    round(avg(day1_sofa)::numeric, 2)                       AS mean_d1_sofa,
    percentile_disc(0.50) WITHIN GROUP (ORDER BY day1_sofa) AS p50,
    percentile_disc(0.90) WITHIN GROUP (ORDER BY day1_sofa) AS p90,
    round(100.0 * sum(CASE WHEN day1_sofa >= 2 THEN 1 ELSE 0 END)
          / NULLIF(count(*), 0), 1)                         AS pct_ge2,
    round(100.0 * sum(CASE WHEN day1_sofa >= 6 THEN 1 ELSE 0 END)
          / NULLIF(count(*), 0), 1)                         AS pct_ge6
FROM d1;

\echo
\echo '=================================================================='
\echo ' 6. Component-input sanity (raw ranges)'
\echo '=================================================================='
\echo "EXPECTED ranges (after our valuenum filters):"
\echo " meanbp_min 30 - 200 mmHg"
\echo " gcs_min 3 - 15"
\echo " bilirubin_max 0 - 80 mg/dL"
\echo " creatinine_max 0 - 30 mg/dL (capped at 150 in pipeline)"
\echo " platelet_min 0 - 1500 K/uL"
\echo " pao2fio2_* 50 - 700"
\echo " uo_24hr 0 - 20000 mL"
\echo " rate_norepi etc. 0 - 5 mcg/kg/min (rates above ~3 are very rare)"
\echo

-- min / max / mean of each raw input column of sofa_components.
-- Everything is cast to text so columns of different numeric types can
-- share one UNION ALL result set.
SELECT
    'meanbp_min'                               AS metric,
    min(meanbp_min)::text                      AS min,
    max(meanbp_min)::text                      AS max,
    round(avg(meanbp_min)::numeric, 1)::text   AS mean
FROM sofa_components
WHERE meanbp_min IS NOT NULL
UNION ALL
SELECT
    'gcs_min',
    min(gcs_min)::text,
    max(gcs_min)::text,
    avg(gcs_min)::numeric(10,1)::text
FROM sofa_components
WHERE gcs_min IS NOT NULL
UNION ALL
SELECT
    'bilirubin_max',
    min(bilirubin_max)::text,
    max(bilirubin_max)::text,
    avg(bilirubin_max)::numeric(10,2)::text
FROM sofa_components
WHERE bilirubin_max IS NOT NULL
UNION ALL
SELECT
    'creatinine_max',
    min(creatinine_max)::text,
    max(creatinine_max)::text,
    avg(creatinine_max)::numeric(10,2)::text
FROM sofa_components
WHERE creatinine_max IS NOT NULL
UNION ALL
SELECT
    'platelet_min',
    min(platelet_min)::text,
    max(platelet_min)::text,
    avg(platelet_min)::numeric(10,1)::text
FROM sofa_components
WHERE platelet_min IS NOT NULL
UNION ALL
SELECT
    'pao2fio2_vent',
    min(pao2fio2_vent)::text,
    max(pao2fio2_vent)::text,
    avg(pao2fio2_vent)::numeric(10,1)::text
FROM sofa_components
WHERE pao2fio2_vent IS NOT NULL
UNION ALL
SELECT
    'pao2fio2_novent',
    min(pao2fio2_novent)::text,
    max(pao2fio2_novent)::text,
    avg(pao2fio2_novent)::numeric(10,1)::text
FROM sofa_components
WHERE pao2fio2_novent IS NOT NULL
UNION ALL
SELECT
    'uo_24hr',
    min(uo_24hr)::text,
    max(uo_24hr)::text,
    avg(uo_24hr)::numeric(10,1)::text
FROM sofa_components
WHERE uo_24hr IS NOT NULL
UNION ALL
SELECT
    'rate_norepinephrine',
    min(rate_norepinephrine)::text,
    max(rate_norepinephrine)::text,
    avg(rate_norepinephrine)::numeric(10,3)::text
FROM sofa_components
WHERE rate_norepinephrine IS NOT NULL
ORDER BY metric;

\echo
\echo '=================================================================='
\echo ' 7. Vasopressor coverage'
\echo '=================================================================='
\echo "EXPECTED: ~25-35% of adult ICU stays receive at least one"
\echo " vasopressor (norepi most common, then epi/dop/dob)."
\echo

-- Distinct stays appearing in sofa_vaso, overall and per drug.
-- The output is a COUNT of stays; divide by the icustays row count from
-- section 1 to compare against the percentage quoted above.
-- NOTE(review): 'any vaso' counts every stay with a row in sofa_vaso,
-- whether or not a rate column is populated -- confirm sofa_vaso only
-- holds rows for hours with an active vasopressor.
-- "group" is a reserved word, hence the quoted alias.
SELECT 'any vaso' AS "group", count(DISTINCT icustay_id) AS n_stays
FROM sofa_vaso
UNION ALL
SELECT 'norepi', count(DISTINCT icustay_id)
FROM sofa_vaso
WHERE rate_norepinephrine IS NOT NULL
UNION ALL
SELECT 'epi', count(DISTINCT icustay_id)
FROM sofa_vaso
WHERE rate_epinephrine IS NOT NULL
UNION ALL
SELECT 'dop', count(DISTINCT icustay_id)
FROM sofa_vaso
WHERE rate_dopamine IS NOT NULL
UNION ALL
SELECT 'dob', count(DISTINCT icustay_id)
FROM sofa_vaso
WHERE rate_dobutamine IS NOT NULL;

\echo
\echo '=================================================================='
\echo ' 8. Antibiotic prescriptions: top 15 drugs'
\echo '=================================================================='
\echo "EXPECTED: vancomycin, piperacillin/tazobactam (zosyn),"
\echo " ceftriaxone, levofloxacin, metronidazole near the top."
\echo

-- The antibiotic name is a tie-breaker so the LIMIT is deterministic.
SELECT
    antibiotic,
    count(*) AS n
FROM antibiotic
GROUP BY antibiotic
ORDER BY n DESC, antibiotic
LIMIT 15;

\echo
\echo '=================================================================='
\echo ' 9. Suspicion of infection: matching rate'
\echo '=================================================================='
\echo "EXPECTED:"
\echo " ~50-70% of antibiotic rows are matched to a culture"
\echo " (i.e. suspected_infection = 1). Top specimens should be:"
\echo " BLOOD CULTURE, URINE, MRSA SCREEN, SPUTUM, SWAB."
\echo

-- Share of suspicion_of_infection rows flagged suspected_infection = 1.
SELECT
    count(*)                 AS n_total,
    sum(suspected_infection) AS n_suspected,
    round(100.0 * sum(suspected_infection) / NULLIF(count(*), 0), 1)
                             AS pct_suspected
FROM suspicion_of_infection;

-- Ten most frequent specimens among the flagged rows; the specimen name
-- is a tie-breaker so the LIMIT is deterministic.
SELECT
    specimen,
    count(*) AS n
FROM suspicion_of_infection
WHERE suspected_infection = 1
GROUP BY specimen
ORDER BY n DESC, specimen
LIMIT 10;

\echo
\echo '=================================================================='
\echo '10. Sepsis-3 prevalence at the ICU-stay level'
\echo '=================================================================='
-- NOTE(review): section 1 quotes ~20-30k rows for sepsis3 while this
-- section quotes 25-35k for the same count -- confirm which is intended.
\echo "EXPECTED (Johnson 2018, MIMIC-III all-cohort):"
\echo " total stays in sepsis3 table : 25 - 35 k"
\echo " (every stay with any abx and a qualifying SOFA window)"
\echo " sepsis3 = TRUE : 18 - 24 k (~30-40% of all ICU stays)"
\echo

-- Sepsis-3 positives as a share of sepsis3 rows and of all icustays rows.
SELECT
    count(*)                                        AS n_rows,
    sum(CASE WHEN sepsis3 THEN 1 ELSE 0 END)        AS n_sepsis3,
    round(100.0 * sum(CASE WHEN sepsis3 THEN 1 ELSE 0 END)
          / NULLIF(count(*), 0), 1)                 AS pct_sepsis3_among_rows,
    round(100.0 * sum(CASE WHEN sepsis3 THEN 1 ELSE 0 END)
          / NULLIF((SELECT count(*) FROM icustays), 0), 1)
                                                    AS pct_sepsis3_of_all_icustays
FROM sepsis3;

\echo
\echo '=================================================================='
\echo '11. Sepsis-3 onset timing'
\echo '=================================================================='
\echo "EXPECTED:"
\echo " Most onsets occur early in the stay; median onset is on"
\echo " day 0-1 (~0-24h after intime). A long right tail exists"
\echo " for ICU-acquired sepsis."
\echo " sofa_time should be within [-48h, +24h] of"
\echo " suspected_infection_time by construction."
\echo

-- Hours from ICU intime to suspected_infection_time (mean, p50, p90) and
-- the extreme offsets of sofa_time relative to suspected_infection_time,
-- restricted to sepsis3 = TRUE rows.
SELECT
    round(avg(EXTRACT(EPOCH FROM (s.suspected_infection_time - ie.intime)) / 3600)::numeric, 1)
        AS mean_hours_to_onset,
    percentile_disc(0.50) WITHIN GROUP (
        ORDER BY EXTRACT(EPOCH FROM (s.suspected_infection_time - ie.intime)) / 3600
    )   AS p50_hours,
    percentile_disc(0.90) WITHIN GROUP (
        ORDER BY EXTRACT(EPOCH FROM (s.suspected_infection_time - ie.intime)) / 3600
    )   AS p90_hours,
    min(EXTRACT(EPOCH FROM (s.sofa_time - s.suspected_infection_time)) / 3600)
        AS min_sofa_offset_h,
    max(EXTRACT(EPOCH FROM (s.sofa_time - s.suspected_infection_time)) / 3600)
        AS max_sofa_offset_h
FROM sepsis3 AS s
INNER JOIN icustays AS ie
    ON ie.icustay_id = s.icustay_id
WHERE s.sepsis3 = TRUE;

\echo
\echo '=================================================================='
\echo '12. Mortality stratified by Sepsis-3 status'
\echo '=================================================================='
\echo "EXPECTED:"
\echo " In-hospital mortality among Sepsis-3 = TRUE: ~25-35%"
\echo " Among Sepsis-3 = FALSE / no row in sepsis3: ~5-10%"
\echo

-- cohort: every ICU stay labelled 'sepsis3' or 'not sepsis3'. IS TRUE
-- sends both sepsis3 = FALSE and "no row in sepsis3" (NULL after the
-- LEFT JOIN) to 'not sepsis3'.
WITH cohort AS (
    SELECT
        ie.icustay_id,
        ie.hadm_id,
        CASE WHEN s.sepsis3 IS TRUE THEN 'sepsis3' ELSE 'not sepsis3' END
            AS sepsis_status
    FROM icustays AS ie
    LEFT JOIN sepsis3 AS s
        ON s.icustay_id = ie.icustay_id
)
SELECT
    c.sepsis_status,
    count(*)                                                        AS n_stays,
    sum(CASE WHEN adm.hospital_expire_flag = 1 THEN 1 ELSE 0 END)   AS n_died,
    round(100.0 * sum(CASE WHEN adm.hospital_expire_flag = 1 THEN 1 ELSE 0 END)
          / count(*), 1)                                            AS pct_died
FROM cohort AS c
INNER JOIN admissions AS adm
    ON adm.hadm_id = c.hadm_id
GROUP BY c.sepsis_status
ORDER BY c.sepsis_status DESC;

\echo
\echo '=================================================================='
\echo '13. Sepsis-3 vs SAPS-II (cross-score validation)'
\echo '=================================================================='
\echo "EXPECTED:"
\echo " Septic patients should have higher mean SAPS-II than non-septic"
\echo " (typically by ~10-15 points)."
\echo " This sanity check requires that you have already run"
\echo " build_sapsii.sql. If sapsii does not exist, this section"
\echo " reports an error and the script continues with section 14."
\echo

-- sapsii is optional. The \set ON_ERROR_STOP on at the top of this file
-- overrides a command-line -v ON_ERROR_STOP=0, so the only way to let a
-- missing sapsii table through is to switch the variable off here and
-- restore it right after the query.
\set ON_ERROR_STOP off
SELECT
    CASE WHEN s.sepsis3 IS TRUE THEN 'sepsis3' ELSE 'not sepsis3' END
                                          AS sepsis_status,
    count(*)                              AS n,
    round(avg(sa.sapsii)::numeric, 1)     AS mean_sapsii,
    round(avg(sa.sapsii_prob)::numeric, 3) AS mean_predicted_mortality
FROM icustays AS ie
LEFT JOIN sepsis3 AS s
    ON s.icustay_id = ie.icustay_id
LEFT JOIN sapsii AS sa
    ON sa.icustay_id = ie.icustay_id
GROUP BY (s.sepsis3 IS TRUE)
ORDER BY sepsis_status DESC;
\set ON_ERROR_STOP on

\echo
\echo '=================================================================='
\echo '14. Spot-check a few stays end-to-end'
\echo '=================================================================='
\echo "Pulls 5 random Sepsis-3 = TRUE stays and shows you the trajectory"
\echo "of sofa_24hours alongside the suspected_infection_time. Eyeball:"
\echo " - sofa_24hours should be >= 2 at hours surrounding the onset"
\echo " - sofa_24hours should plausibly rise then fall over the stay"
\echo " - hour numbering should be consecutive"
\echo

-- picks: ordering by md5(icustay_id) gives an arbitrary-looking but
-- repeatable sample, so the same five stays come back on every run.
-- The outer query shows the hourly rows whose endtime lies within
-- 6 hours either side of suspected_infection_time (bounds inclusive).
WITH picks AS (
    SELECT
        icustay_id,
        suspected_infection_time
    FROM sepsis3
    WHERE sepsis3 = TRUE
    ORDER BY md5(icustay_id::text)
    LIMIT 5
)
SELECT
    p.icustay_id,
    p.suspected_infection_time,
    h.hr,
    h.endtime,
    h.respiration_24hours    AS resp,
    h.coagulation_24hours    AS coag,
    h.liver_24hours          AS liv,
    h.cardiovascular_24hours AS cardio,
    h.cns_24hours            AS cns,
    h.renal_24hours          AS ren,
    h.sofa_24hours           AS sofa
FROM picks AS p
INNER JOIN sofa_hourly AS h
    ON h.icustay_id = p.icustay_id
WHERE h.endtime BETWEEN p.suspected_infection_time - INTERVAL '6 hours'
                    AND p.suspected_infection_time + INTERVAL '6 hours'
ORDER BY p.icustay_id, h.hr;

\echo
\echo 'All sanity checks complete. Anything way off the expected ranges'
\echo 'above is worth investigating before relying on the sepsis-3 cohort.'