Files
mimic-iv-concepts-port/sql/sepsis/sanity_checks.sql

394 lines
18 KiB
MySQL
Raw Normal View History

2026-05-05 10:22:17 +02:00
-- ------------------------------------------------------------------
-- Sepsis-3 sanity checks for MIMIC-III v1.3.
--
-- Usage:
-- psql -d mimic -v ON_ERROR_STOP=1 \
-- -c 'SET search_path TO mimiciii, public;' \
-- -f sql/sepsis/sanity_checks.sql
--
-- Each section prints a short result set. Compare against the
-- "EXPECTED" comment. None of these are pass/fail tests; they are
-- bounds-style checks designed to catch obvious upstream breakage
-- (an empty staging table, an off-by-one in the hourly grid, a
-- vasopressor unit-conversion error, etc.).
--
-- Reference numbers come from:
-- Singer M et al., JAMA 2016 (the Sepsis-3 consensus definitions)
-- Seymour CW et al., JAMA 2016 (assessment of the Sepsis-3 clinical criteria)
-- Johnson AEW et al., Crit Care Med 2018 ("A Comparative Analysis
-- of Sepsis Identification Methods in an Electronic Database",
-- which reproduces Sepsis-3 on MIMIC-III)
-- ------------------------------------------------------------------
-- Session settings. This assignment is made when the script starts, so it
-- takes effect regardless of any -v ON_ERROR_STOP=... value given on the
-- psql command line.
\set ON_ERROR_STOP on
-- Print the elapsed time of every statement that follows.
\timing on
\echo
\echo '=================================================================='
\echo ' 1. Row counts of every table in the pipeline'
\echo '=================================================================='
\echo "EXPECTED (MIMIC-III v1.3 full restore, all 61.5k ICU stays):"
\echo " icustays ~ 61 532"
\echo " sofa_grid ~ 6 - 8 M (60k stays * ~4d mean LOS * 24h)"
\echo " sofa_hourly same as sofa_grid"
\echo " blood_gas_arterial ~ 500 k - 1 M"
\echo " gcs_all ~ 4 - 6 M"
\echo " antibiotic ~ 500 k - 700 k prescription rows"
\echo " suspicion_of_infection same as antibiotic"
\echo " sepsis3 ~ 20 k - 30 k rows (one row per ICU stay"
\echo " that ever had any abx + qualifying SOFA)"
\echo
-- One output row per pipeline table: its name and its row count, sorted by
-- table name. Every table is counted with a plain count(*), so a missing
-- table aborts the script here (ON_ERROR_STOP is on).
WITH row_counts (table_name, n) AS (
    SELECT 'icustays', count(*) FROM icustays
    UNION ALL
    SELECT 'sofa_grid', count(*) FROM sofa_grid
    UNION ALL
    SELECT 'sofa_vs', count(*) FROM sofa_vs
    UNION ALL
    SELECT 'sofa_gcs', count(*) FROM sofa_gcs
    UNION ALL
    SELECT 'sofa_bili', count(*) FROM sofa_bili
    UNION ALL
    SELECT 'sofa_cr', count(*) FROM sofa_cr
    UNION ALL
    SELECT 'sofa_plt', count(*) FROM sofa_plt
    UNION ALL
    SELECT 'sofa_pf', count(*) FROM sofa_pf
    UNION ALL
    SELECT 'sofa_uo', count(*) FROM sofa_uo
    UNION ALL
    SELECT 'sofa_vaso', count(*) FROM sofa_vaso
    UNION ALL
    SELECT 'sofa_wide', count(*) FROM sofa_wide
    UNION ALL
    SELECT 'sofa_components', count(*) FROM sofa_components
    UNION ALL
    SELECT 'sofa_hourly', count(*) FROM sofa_hourly
    UNION ALL
    SELECT 'blood_gas_arterial', count(*) FROM blood_gas_arterial
    UNION ALL
    SELECT 'gcs_all', count(*) FROM gcs_all
    UNION ALL
    SELECT 'antibiotic', count(*) FROM antibiotic
    UNION ALL
    SELECT 'suspicion_of_infection', count(*) FROM suspicion_of_infection
    UNION ALL
    SELECT 'sepsis3', count(*) FROM sepsis3
)
SELECT
    table_name,
    n
FROM row_counts
ORDER BY table_name;
\echo
\echo '=================================================================='
\echo ' 2. Hourly grid integrity'
\echo '=================================================================='
\echo "EXPECTED:"
\echo " bad_hr_seq = 0 (hours per stay must be 1..N consecutive)"
\echo " bad_endtime = 0 (endtime > starttime)"
\echo " duplicate_grid = 0 (no (icustay_id, hr) duplicates)"
\echo " grid_eq_hourly = 0 (sofa_grid and sofa_hourly row counts match)"
\echo
-- Returns a single row of four counters; each should be 0.
--   bad_hr_seq     stays whose first hour is not 1 or whose last hour differs
--                  from their row count (together with duplicate_grid = 0
--                  this implies the hours are exactly 1..N)
--   bad_endtime    grid rows whose endtime is not after starttime
--   duplicate_grid surplus rows sharing an (icustay_id, hr) pair
--   grid_eq_hourly sofa_grid row count minus sofa_hourly row count
WITH stay_span AS (
    -- first hour, last hour and number of grid rows for every stay
    SELECT
        icustay_id,
        min(hr)  AS first_hr,
        max(hr)  AS last_hr,
        count(*) AS n_rows
    FROM sofa_grid
    GROUP BY icustay_id
),
grid_totals AS (
    -- grid-wide counters, gathered in one pass over sofa_grid
    SELECT
        count(*)                                     AS n_rows,
        count(DISTINCT (icustay_id, hr))             AS n_distinct_keys,
        count(*) FILTER (WHERE endtime <= starttime) AS n_bad_endtime
    FROM sofa_grid
)
SELECT
    (
        SELECT count(*)
        FROM stay_span
        WHERE first_hr <> 1
           OR last_hr <> n_rows
    )                                             AS bad_hr_seq,
    g.n_bad_endtime                               AS bad_endtime,
    g.n_rows - g.n_distinct_keys                  AS duplicate_grid,
    g.n_rows - (SELECT count(*) FROM sofa_hourly) AS grid_eq_hourly
FROM grid_totals AS g;
\echo
\echo '=================================================================='
\echo ' 3. Per-component SOFA score ranges'
\echo '=================================================================='
\echo "EXPECTED: every per-hour component score is in [0, 4] or NULL."
\echo " Any value outside that range indicates a logic bug."
\echo
-- One output row per SOFA component with the smallest and largest score
-- found in sofa_components, sorted by component name. The bounds are
-- collected in a single pass and then turned into rows.
WITH component_bounds AS (
    SELECT
        min(respiration)    AS respiration_lo,
        max(respiration)    AS respiration_hi,
        min(coagulation)    AS coagulation_lo,
        max(coagulation)    AS coagulation_hi,
        min(liver)          AS liver_lo,
        max(liver)          AS liver_hi,
        min(cardiovascular) AS cardiovascular_lo,
        max(cardiovascular) AS cardiovascular_hi,
        min(cns)            AS cns_lo,
        max(cns)            AS cns_hi,
        min(renal)          AS renal_lo,
        max(renal)          AS renal_hi
    FROM sofa_components
)
SELECT
    v.component,
    v.min,
    v.max
FROM component_bounds AS b
CROSS JOIN LATERAL (
    VALUES
        ('respiration',    b.respiration_lo,    b.respiration_hi),
        ('coagulation',    b.coagulation_lo,    b.coagulation_hi),
        ('liver',          b.liver_lo,          b.liver_hi),
        ('cardiovascular', b.cardiovascular_lo, b.cardiovascular_hi),
        ('cns',            b.cns_lo,            b.cns_hi),
        ('renal',          b.renal_lo,          b.renal_hi)
) AS v (component, min, max)
ORDER BY v.component;
\echo
\echo '=================================================================='
\echo ' 4. 24-hour rolling SOFA distribution'
\echo '=================================================================='
\echo "EXPECTED:"
\echo " total_sofa min = 0, max ~ 20-24"
\echo " median per-hour total_sofa ~ 2-4"
\echo " Distribution should be heavy-tailed; ~70-80% of hours <= 5,"
\echo " ~5-10% of hours >= 10."
\echo
-- Summary statistics of sofa_24hours over every row of sofa_hourly:
-- minimum, maximum, mean (2 decimals) and the 50th / 90th / 99th
-- discrete percentiles.
SELECT
    min(h.sofa_24hours)                                            AS sofa_min,
    max(h.sofa_24hours)                                            AS sofa_max,
    round(CAST(avg(h.sofa_24hours) AS numeric), 2)                 AS sofa_mean,
    percentile_disc(0.50) WITHIN GROUP (ORDER BY h.sofa_24hours)   AS sofa_p50,
    percentile_disc(0.90) WITHIN GROUP (ORDER BY h.sofa_24hours)   AS sofa_p90,
    percentile_disc(0.99) WITHIN GROUP (ORDER BY h.sofa_24hours)   AS sofa_p99
FROM sofa_hourly AS h;
\echo
\echo '=================================================================='
\echo ' 5. Day-1 max SOFA per stay (compare with SAPS-II severity)'
\echo '=================================================================='
\echo "EXPECTED for adult ICU (per Singer 2016, Vincent 1996):"
\echo " median day-1 SOFA ~ 4-6"
\echo " ~60-70% of stays have day-1 SOFA >= 2 (Sepsis-3 organ-dys threshold)"
\echo
-- Per stay, take the highest sofa_24hours among rows with hr <= 24, then
-- summarise across stays: count, mean, median, 90th percentile and the
-- share of stays reaching 2 and 6 points.
WITH first_day AS (
    SELECT
        h.icustay_id,
        max(h.sofa_24hours) AS peak_sofa
    FROM sofa_hourly AS h
    WHERE h.hr <= 24
    GROUP BY h.icustay_id
),
flagged AS (
    -- 1/0 indicators for the two thresholds; a NULL peak counts as 0
    SELECT
        peak_sofa,
        CASE WHEN peak_sofa >= 2 THEN 1 ELSE 0 END AS is_ge2,
        CASE WHEN peak_sofa >= 6 THEN 1 ELSE 0 END AS is_ge6
    FROM first_day
)
SELECT
    count(*)                                                AS n_stays,
    round(CAST(avg(peak_sofa) AS numeric), 2)               AS mean_d1_sofa,
    percentile_disc(0.50) WITHIN GROUP (ORDER BY peak_sofa) AS p50,
    percentile_disc(0.90) WITHIN GROUP (ORDER BY peak_sofa) AS p90,
    round(100.0 * sum(is_ge2) / count(*), 1)                AS pct_ge2,
    round(100.0 * sum(is_ge6) / count(*), 1)                AS pct_ge6
FROM flagged;
\echo
\echo '=================================================================='
\echo ' 6. Component-input sanity (raw ranges)'
\echo '=================================================================='
\echo "EXPECTED ranges (after our valuenum filters):"
\echo " meanbp_min 30 - 200 mmHg"
\echo " gcs_min 3 - 15"
\echo " bilirubin_max 0 - 80 mg/dL"
\echo " creatinine_max 0 - 30 mg/dL (capped at 150 in pipeline)"
\echo " platelet_min 0 - 1500 K/uL"
\echo " pao2fio2_* 50 - 700"
\echo " uo_24hr 0 - 20000 mL"
\echo " rate_norepi etc. 0 - 5 mcg/kg/min (rates above ~3 are very rare)"
\echo
-- One output row per raw input column of sofa_components with its min, max
-- and mean rendered as text, sorted by metric name. min/max/avg skip NULL
-- inputs on their own, so all columns are summarised in one pass without
-- per-column IS NOT NULL filters.
WITH input_stats AS (
    SELECT
        min(meanbp_min)::text                           AS meanbp_lo,
        max(meanbp_min)::text                           AS meanbp_hi,
        round(avg(meanbp_min)::numeric, 1)::text        AS meanbp_avg,
        min(gcs_min)::text                              AS gcs_lo,
        max(gcs_min)::text                              AS gcs_hi,
        avg(gcs_min)::numeric(10,1)::text               AS gcs_avg,
        min(bilirubin_max)::text                        AS bili_lo,
        max(bilirubin_max)::text                        AS bili_hi,
        avg(bilirubin_max)::numeric(10,2)::text         AS bili_avg,
        min(creatinine_max)::text                       AS creat_lo,
        max(creatinine_max)::text                       AS creat_hi,
        avg(creatinine_max)::numeric(10,2)::text        AS creat_avg,
        min(platelet_min)::text                         AS plt_lo,
        max(platelet_min)::text                         AS plt_hi,
        avg(platelet_min)::numeric(10,1)::text          AS plt_avg,
        min(pao2fio2_vent)::text                        AS pf_vent_lo,
        max(pao2fio2_vent)::text                        AS pf_vent_hi,
        avg(pao2fio2_vent)::numeric(10,1)::text         AS pf_vent_avg,
        min(pao2fio2_novent)::text                      AS pf_novent_lo,
        max(pao2fio2_novent)::text                      AS pf_novent_hi,
        avg(pao2fio2_novent)::numeric(10,1)::text       AS pf_novent_avg,
        min(uo_24hr)::text                              AS uo_lo,
        max(uo_24hr)::text                              AS uo_hi,
        avg(uo_24hr)::numeric(10,1)::text               AS uo_avg,
        min(rate_norepinephrine)::text                  AS norepi_lo,
        max(rate_norepinephrine)::text                  AS norepi_hi,
        avg(rate_norepinephrine)::numeric(10,3)::text   AS norepi_avg
    FROM sofa_components
)
SELECT
    v.metric,
    v.min,
    v.max,
    v.mean
FROM input_stats AS s
CROSS JOIN LATERAL (
    VALUES
        ('meanbp_min',          s.meanbp_lo,    s.meanbp_hi,    s.meanbp_avg),
        ('gcs_min',             s.gcs_lo,       s.gcs_hi,       s.gcs_avg),
        ('bilirubin_max',       s.bili_lo,      s.bili_hi,      s.bili_avg),
        ('creatinine_max',      s.creat_lo,     s.creat_hi,     s.creat_avg),
        ('platelet_min',        s.plt_lo,       s.plt_hi,       s.plt_avg),
        ('pao2fio2_vent',       s.pf_vent_lo,   s.pf_vent_hi,   s.pf_vent_avg),
        ('pao2fio2_novent',     s.pf_novent_lo, s.pf_novent_hi, s.pf_novent_avg),
        ('uo_24hr',             s.uo_lo,        s.uo_hi,        s.uo_avg),
        ('rate_norepinephrine', s.norepi_lo,    s.norepi_hi,    s.norepi_avg)
) AS v (metric, min, max, mean)
ORDER BY v.metric;
\echo
\echo '=================================================================='
\echo ' 7. Vasopressor coverage'
\echo '=================================================================='
\echo "EXPECTED: ~25-35% of adult ICU stays receive at least one"
\echo " vasopressor (norepi most common, then epi/dop/dob)."
\echo
-- Number of distinct stays in sofa_vaso overall and per drug (a stay counts
-- for a drug when it has at least one row with a non-NULL rate for it).
-- NOTE(review): 'any vaso' counts every stay that has a row in sofa_vaso,
-- whether or not any rate on that row is non-NULL -- confirm that sofa_vaso
-- only holds rows for hours on a vasopressor.
WITH vaso_stays AS (
    SELECT
        count(DISTINCT icustay_id)                                                  AS n_any,
        count(DISTINCT icustay_id) FILTER (WHERE rate_norepinephrine IS NOT NULL)   AS n_norepi,
        count(DISTINCT icustay_id) FILTER (WHERE rate_epinephrine IS NOT NULL)      AS n_epi,
        count(DISTINCT icustay_id) FILTER (WHERE rate_dopamine IS NOT NULL)         AS n_dop,
        count(DISTINCT icustay_id) FILTER (WHERE rate_dobutamine IS NOT NULL)       AS n_dob
    FROM sofa_vaso
)
SELECT
    v."group",
    v.n_stays
FROM vaso_stays AS c
CROSS JOIN LATERAL (
    VALUES
        ('any vaso', c.n_any),
        ('norepi',   c.n_norepi),
        ('epi',      c.n_epi),
        ('dop',      c.n_dop),
        ('dob',      c.n_dob)
) AS v ("group", n_stays);
\echo
\echo '=================================================================='
\echo ' 8. Antibiotic prescriptions: top 15 drugs'
\echo '=================================================================='
\echo "EXPECTED: vancomycin, piperacillin/tazobactam (zosyn),"
\echo " ceftriaxone, levofloxacin, metronidazole near the top."
\echo
-- The 15 most frequently prescribed antibiotics with their row counts.
-- The drug name is a secondary sort key so that drugs with equal counts
-- come out in a stable order and the LIMIT cut-off is reproducible
-- between runs.
SELECT
    antibiotic,
    count(*) AS n
FROM antibiotic
GROUP BY antibiotic
ORDER BY
    n DESC,
    antibiotic
LIMIT 15;
\echo
\echo '=================================================================='
\echo ' 9. Suspicion of infection: matching rate'
\echo '=================================================================='
\echo "EXPECTED:"
\echo " ~50-70% of antibiotic rows are matched to a culture"
\echo " (i.e. suspected_infection = 1). Top specimens should be:"
\echo " BLOOD CULTURE, URINE, MRSA SCREEN, SPUTUM, SWAB."
\echo
-- Share of suspicion_of_infection rows flagged suspected_infection = 1.
-- The denominator is wrapped in NULLIF so an empty table yields NULL, the
-- same guard the Sepsis-3 prevalence query in this file uses.
SELECT
    count(*)                                                  AS n_total,
    sum(suspected_infection)                                  AS n_suspected,
    round(100.0 * sum(suspected_infection)
          / NULLIF(count(*), 0), 1)                           AS pct_suspected
FROM suspicion_of_infection;
-- The 10 most common specimens among the flagged rows. The specimen name is
-- a secondary sort key so that ties are ordered the same way on every run
-- and the LIMIT cut-off is reproducible.
SELECT
    specimen,
    count(*) AS n
FROM suspicion_of_infection
WHERE suspected_infection = 1
GROUP BY specimen
ORDER BY
    n DESC,
    specimen
LIMIT 10;
\echo
\echo '=================================================================='
\echo '10. Sepsis-3 prevalence at the ICU-stay level'
\echo '=================================================================='
\echo "EXPECTED (Johnson 2018, MIMIC-III all-cohort):"
\echo " total stays in sepsis3 table : 25 - 35 k"
\echo " (every stay with any abx and a qualifying SOFA window)"
\echo " sepsis3 = TRUE : 18 - 24 k (~30-40% of all ICU stays)"
\echo
-- Row count of the sepsis3 table, how many of those rows have the sepsis3
-- flag set, and that number as a percentage of (a) the sepsis3 rows and
-- (b) all rows in icustays. Both denominators are NULLIF-guarded, so an
-- empty table produces NULL instead of a division-by-zero error.
WITH tally AS (
    SELECT
        count(*)                                 AS n_rows,
        sum(CASE WHEN sepsis3 THEN 1 ELSE 0 END) AS n_sepsis3
    FROM sepsis3
)
SELECT
    t.n_rows,
    t.n_sepsis3,
    round(100.0 * t.n_sepsis3
          / NULLIF(t.n_rows, 0), 1)                           AS pct_sepsis3_among_rows,
    round(100.0 * t.n_sepsis3
          / NULLIF((SELECT count(*) FROM icustays), 0), 1)    AS pct_sepsis3_of_all_icustays
FROM tally AS t;
\echo
\echo '=================================================================='
\echo '11. Sepsis-3 onset timing'
\echo '=================================================================='
\echo "EXPECTED:"
\echo " Most onsets occur early in the stay; median onset is on"
\echo " day 0-1 (~0-24h after intime). A long right tail exists"
\echo " for ICU-acquired sepsis."
\echo " sofa_time should be within [-48h, +24h] of"
\echo " suspected_infection_time by construction."
\echo
-- For rows with the sepsis3 flag set: hours from ICU intime to
-- suspected_infection_time (mean, median, 90th percentile) and the extreme
-- offsets, in hours, of sofa_time relative to suspected_infection_time.
WITH onset AS (
    -- both offsets converted from intervals to hours, once per row
    SELECT
        EXTRACT(EPOCH FROM (s.suspected_infection_time - ie.intime)) / 3600
            AS hours_to_onset,
        EXTRACT(EPOCH FROM (s.sofa_time - s.suspected_infection_time)) / 3600
            AS sofa_offset_h
    FROM sepsis3 AS s
    INNER JOIN icustays AS ie
        ON ie.icustay_id = s.icustay_id
    WHERE s.sepsis3
)
SELECT
    round(CAST(avg(hours_to_onset) AS numeric), 1)               AS mean_hours_to_onset,
    percentile_disc(0.50) WITHIN GROUP (ORDER BY hours_to_onset) AS p50_hours,
    percentile_disc(0.90) WITHIN GROUP (ORDER BY hours_to_onset) AS p90_hours,
    min(sofa_offset_h)                                           AS min_sofa_offset_h,
    max(sofa_offset_h)                                           AS max_sofa_offset_h
FROM onset;
\echo
\echo '=================================================================='
\echo '12. Mortality stratified by Sepsis-3 status'
\echo '=================================================================='
\echo "EXPECTED:"
\echo " In-hospital mortality among Sepsis-3 = TRUE: ~25-35%"
\echo " Among Sepsis-3 = FALSE / no row in sepsis3: ~5-10%"
\echo
-- In-hospital mortality per ICU stay, split by Sepsis-3 status. Stays with
-- no sepsis3 row, or with a flag that is not TRUE, fall under 'not sepsis3'.
WITH stay_outcome AS (
    -- one row per ICU stay: its status label and a 1/0 death indicator
    -- taken from the admission it belongs to
    SELECT
        CASE
            WHEN s.sepsis3 IS TRUE THEN 'sepsis3'
            ELSE 'not sepsis3'
        END AS sepsis_status,
        CASE
            WHEN adm.hospital_expire_flag = 1 THEN 1
            ELSE 0
        END AS died
    FROM icustays AS ie
    LEFT JOIN sepsis3 AS s
        ON s.icustay_id = ie.icustay_id
    INNER JOIN admissions AS adm
        ON adm.hadm_id = ie.hadm_id
)
SELECT
    sepsis_status,
    count(*)                                AS n_stays,
    sum(died)                               AS n_died,
    round(100.0 * sum(died) / count(*), 1)  AS pct_died
FROM stay_outcome
GROUP BY sepsis_status
ORDER BY sepsis_status DESC;
\echo
\echo '=================================================================='
\echo '13. Sepsis-3 vs SAPS-II (cross-score validation)'
\echo '=================================================================='
\echo "EXPECTED:"
\echo " Septic patients should have higher mean SAPS-II than non-septic"
\echo " (typically by ~10-15 points)."
\echo " This sanity check requires that you have already run"
\echo " build_sapsii.sql. If sapsii does not exist, this section"
\echo " reports an error and the script continues with the next one."
\echo
-- The sapsii table is optional. The script forces ON_ERROR_STOP on at its
-- start, so passing -v ON_ERROR_STOP=0 on the command line cannot skip this
-- section. The setting is therefore relaxed for this one query and then put
-- back to whatever value was in effect before.
\set saved_on_error_stop :ON_ERROR_STOP
\set ON_ERROR_STOP off
-- Mean SAPS-II score and mean predicted mortality per Sepsis-3 status.
-- Stays without a sepsis3 row, or with a flag that is not TRUE, are counted
-- as 'not sepsis3'. Stays without a sapsii row still count towards n but do
-- not contribute to the averages (avg skips NULLs).
SELECT
    CASE
        WHEN s.sepsis3 IS TRUE THEN 'sepsis3'
        ELSE 'not sepsis3'
    END                                      AS sepsis_status,
    count(*)                                 AS n,
    round(avg(sa.sapsii)::numeric, 1)        AS mean_sapsii,
    round(avg(sa.sapsii_prob)::numeric, 3)   AS mean_predicted_mortality
FROM icustays AS ie
LEFT JOIN sepsis3 AS s
    ON s.icustay_id = ie.icustay_id
LEFT JOIN sapsii AS sa
    ON sa.icustay_id = ie.icustay_id
GROUP BY (s.sepsis3 IS TRUE)
ORDER BY sepsis_status DESC;
\set ON_ERROR_STOP :saved_on_error_stop
\echo
\echo '=================================================================='
\echo '14. Spot-check a few stays end-to-end'
\echo '=================================================================='
\echo "Pulls 5 random Sepsis-3 = TRUE stays and shows you the trajectory"
\echo "of sofa_24hours alongside the suspected_infection_time. Eyeball:"
\echo " - sofa_24hours should be >= 2 at hours surrounding the onset"
\echo " - sofa_24hours should plausibly rise then fall over the stay"
\echo " - hour numbering should be consecutive"
\echo
-- Shows the hourly SOFA rows whose endtime lies within 6 hours (inclusive)
-- either side of suspected_infection_time, for five flagged stays.
WITH sampled AS (
    -- Ordering by the md5 of the id scatters the picks across the id range
    -- while returning the same five stays on every run.
    SELECT
        s.icustay_id,
        s.suspected_infection_time
    FROM sepsis3 AS s
    WHERE s.sepsis3
    ORDER BY md5(CAST(s.icustay_id AS text))
    LIMIT 5
)
SELECT
    p.icustay_id,
    p.suspected_infection_time,
    h.hr,
    h.endtime,
    h.respiration_24hours    AS resp,
    h.coagulation_24hours    AS coag,
    h.liver_24hours          AS liv,
    h.cardiovascular_24hours AS cardio,
    h.cns_24hours            AS cns,
    h.renal_24hours          AS ren,
    h.sofa_24hours           AS sofa
FROM sampled AS p
INNER JOIN sofa_hourly AS h
    ON h.icustay_id = p.icustay_id
WHERE h.endtime >= p.suspected_infection_time - INTERVAL '6 hours'
  AND h.endtime <= p.suspected_infection_time + INTERVAL '6 hours'
ORDER BY
    p.icustay_id,
    h.hr;
\echo
\echo 'All sanity checks complete. Anything way off the expected ranges'
\echo 'above is worth investigating before relying on the sepsis-3 cohort.'