# -------------------- IMPORT LIBRARIES --------------------
%matplotlib inline
import pandas as pd
import numpy as np
import statsmodels.api as sm
from patsy import dmatrices

# -------------------- LOAD RAW DATA --------------------

# The data are read from the working directory; the original dataset is
# published at: https://datadryad.org/stash/dataset/doi:10.5061/dryad.0p2ngf1zd

# Raw export, read from the current working directory.
d = pd.read_csv("data01.csv")

# -------------------- SELECT COLUMNS --------------------
# Outcome first, then demographics, comorbidity flags and vital signs,
# using the column names exactly as they are spelled in the raw file.
_selected_columns = [
    "outcome",
    "gendera",
    "age",
    "BMI",
    "hypertensive",
    "Hyperlipemia",
    "atrialfibrillation",
    "CHD with no MI",
    "COPD",
    "diabetes",
    "deficiencyanemias",
    "depression",
    "Renal failure",
    "heart rate",
    "Systolic blood pressure",
    "Diastolic blood pressure",
    "Respiratory rate",
    "temperature",
    "SP O2",
    "Urine output",
]
data = d.loc[:, _selected_columns]

# -------------------- RENAME COLUMNS --------------------
# Give every selected column an identifier-safe, consistently capitalised
# name. "BMI", "COPD" and "Hyperlipemia" already have their final names and
# are therefore not listed (the previous lowercase "hyperlipemia" key never
# matched any column and was silently ignored).
# errors="raise" makes a mistyped source name fail loudly with a KeyError
# instead of being skipped.
data.rename(
    columns={
        "outcome": "Mortality",
        "gendera": "Gender",
        "age": "Age",
        "hypertensive": "Hypertensive",
        "atrialfibrillation": "Atrial_fibrillation",
        "CHD with no MI": "CHD_no_MI",
        "diabetes": "Diabetes",
        "deficiencyanemias": "Deficiency_anemias",
        "depression": "Depression",
        "Renal failure": "Renal_failure",
        "heart rate": "Heart_rate",
        "Systolic blood pressure": "Systolic_BP",
        "Diastolic blood pressure": "Diastolic_BP",
        "Respiratory rate": "Respiratory_rate",
        "temperature": "Temperature",
        "SP O2": "SpO2",
        "Urine output": "Urine_output",
    },
    inplace=True,
    errors="raise",
)

# -------------------- RECODE VARIABLES --------------------
# ----- Dummy variable for male
def recode(series):
    """Return 1 when the raw gender code equals 2, otherwise 0."""
    return 1 if series == 2 else 0


# Derived straight from the numeric 'Gender' codes; this has to happen
# before 'Gender' is replaced by text labels further down.
data['Male'] = data['Gender'].apply(recode)

# ----- Male x Age interaction term
data['Male*Age'] = data['Male'] * data['Age']

def recode(genders):  # ----- Gender with labels
    """Return 'Female' for gender code 1 and "Male" for any other value."""
    return 'Female' if genders == 1 else "Male"
# Replace the numeric gender codes with their text labels, element by element.
data['Gender'] = data['Gender'].map(recode)

def recode(death):  # ----- Mortality with labels
    """Translate the numeric mortality outcome into a text label.

    Parameters
    ----------
    death : scalar
        Outcome code; 0 means the patient survived.

    Returns
    -------
    str or float
        'Alive' for 0, 'Deceased' for any other non-missing value, and NaN
        when the outcome itself is missing.
    """
    # A missing outcome compares unequal to 0, so without this guard it
    # would be counted as a death; keep it missing instead.
    if pd.isna(death):
        return np.nan
    return 'Alive' if death == 0 else 'Deceased'
# Text version of the outcome, kept alongside the numeric 'Mortality' column.
data['Death'] = data['Mortality'].map(recode)

# -------------------- INSPECT THE DATA FRAME --------------------
# Print the column names, non-null counts and dtypes of the prepared frame.
data.info()
# Preview the first rows; as the last expression of the cell its value is
# what the notebook displays.
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1177 entries, 0 to 1176
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Mortality            1176 non-null   float64
 1   Gender               1177 non-null   object 
 2   Age                  1177 non-null   int64  
 3   BMI                  962 non-null    float64
 4   Hypertensive         1177 non-null   int64  
 5   Hyperlipemia         1177 non-null   int64  
 6   Atrial_fibrillation  1177 non-null   int64  
 7   CHD_no_MI            1177 non-null   int64  
 8   COPD                 1177 non-null   int64  
 9   Diabetes             1177 non-null   int64  
 10  Deficiency_anemias   1177 non-null   int64  
 11  Depression           1177 non-null   int64  
 12  Renal_failure        1177 non-null   int64  
 13  Heart_rate           1164 non-null   float64
 14  Systolic_BP          1161 non-null   float64
 15  Diastolic_BP         1161 non-null   float64
 16  Respiratory_rate     1164 non-null   float64
 17  Temperature          1158 non-null   float64
 18  SpO2                 1164 non-null   float64
 19  Urine_output         1141 non-null   float64
 20  Male                 1177 non-null   int64  
 21  Male*Age             1177 non-null   int64  
 22  Death                1177 non-null   object 
dtypes: float64(9), int64(12), object(2)
memory usage: 211.6+ KB


# -------------------- TABLE 1: DEMOGRAPHIC CHARACTERISTICS --------------------
from tableone import TableOne

# Variables summarised as counts and percentages.
t1_categorical = [
    'Gender',
    'Atrial_fibrillation',
    'CHD_no_MI',
    'COPD',
    'Diabetes',
    'Deficiency_anemias',
    'Depression',
    'Hyperlipemia',
    'Hypertensive',
    'Renal_failure',
]

# Every variable handed to TableOne, in the order the rows should appear.
t1_columns = [
    'Death',
    'Gender',
    'Age',
    'BMI',
    'Atrial_fibrillation',
    'CHD_no_MI',
    'COPD',
    'Diabetes',
    'Deficiency_anemias',
    'Depression',
    'Hyperlipemia',
    'Hypertensive',
    'Renal_failure',
    'Heart_rate',
    'Systolic_BP',
    'Diastolic_BP',
    'Respiratory_rate',
    'Temperature',
    'SpO2',
    'Urine_output',
]

# One column per outcome label, plus p-values comparing the groups.
t1_groupby = ['Death']

# NOTE: the optional TableOne arguments `nonnormal` and `rename` are not used.
t1 = TableOne(
    data,
    columns=t1_columns,
    categorical=t1_categorical,
    groupby=t1_groupby,
    pval=True,
)

print()
print("Table 1: Sample Characteristics")
t1

Table 1: Sample Characteristics


# -------------------- FIGURE 1. BAR CHART --------------------
import matplotlib.pyplot as plt
import seaborn as sns

print()
print('Figure 1: Bar Chart of Gender, Age, and Mortality')
print()

sns.set_theme(style="whitegrid")

# Age by outcome group, with one bar per gender; error bars show the
# standard deviation.
_bar_options = dict(
    kind="bar",
    x="Death",
    y="Age",
    hue="Gender",
    errorbar="sd",
    palette="dark",
    alpha=.6,
    height=6,
)
g = sns.catplot(data=data, **_bar_options)

# Cosmetic clean-up: no left spine, no x-axis label, no legend title.
g.despine(left=True)
g.set_axis_labels("", "Age")
g.legend.set_title("")

Figure 1: Bar Chart of Gender, Age, and Mortality


# -------------------- TABLE 2. LOGISTIC REGRESSION --------------------

# ---------- Demographics, comorbidities, and vitals
# Right-hand-side terms of the model. 'Male*Age' is formula syntax (not the
# 'Male*Age' data column): it contributes the Male:Age interaction on top of
# the two main effects that are also listed explicitly.
_model_terms = [
    'Male',
    'Age',
    'Male*Age',
    'BMI',
    'Atrial_fibrillation',
    'CHD_no_MI',
    'COPD',
    'Diabetes',
    'Deficiency_anemias',
    'Depression',
    'Hyperlipemia',
    'Hypertensive',
    'Renal_failure',
    'Heart_rate',
    'Systolic_BP',
    'Diastolic_BP',
    'Respiratory_rate',
    'Temperature',
    'SpO2',
    'Urine_output',
]
_model_formula = 'Mortality ~ ' + ' + '.join(_model_terms)

# Build the outcome vector and design matrix from the formula.
y, X = dmatrices(_model_formula, data=data, return_type='dataframe')

mod = sm.Logit(y, X)  # ----- Describe model
res = mod.fit()  # ----- Fit model

print()
print('Table 2: Results of Logistic Regression')
print()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.295019
         Iterations 8

Table 2: Results of Logistic Regression

                           Logit Regression Results                           
==============================================================================
Dep. Variable:              Mortality   No. Observations:                  927
Model:                          Logit   Df Residuals:                      906
Method:                           MLE   Df Model:                           20
Date:                Wed, 02 Aug 2023   Pseudo R-squ.:                  0.1948
Time:                        20:19:13   Log-Likelihood:                -273.48
converged:                       True   LL-Null:                       -339.66
Covariance Type:            nonrobust   LLR p-value:                 1.407e-18
=======================================================================================
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept              33.3027      8.450      3.941      0.000      16.741      49.864
Male                   -1.3877      1.419     -0.978      0.328      -4.170       1.394
Age                     0.0028      0.014      0.205      0.837      -0.024       0.030
Male:Age                0.0119      0.018      0.648      0.517      -0.024       0.048
BMI                     0.0041      0.015      0.272      0.785      -0.025       0.033
Atrial_fibrillation     0.1143      0.241      0.474      0.636      -0.359       0.587
CHD_no_MI               0.2494      0.405      0.616      0.538      -0.544       1.043
COPD                   -0.7732      0.474     -1.632      0.103      -1.702       0.156
Diabetes                0.2632      0.245      1.072      0.284      -0.218       0.744
Deficiency_anemias     -1.0143      0.286     -3.542      0.000      -1.575      -0.453
Depression             -0.7153      0.462     -1.550      0.121      -1.620       0.189
Hyperlipemia           -0.3282      0.247     -1.329      0.184      -0.812       0.156
Hypertensive           -0.0977      0.261     -0.375      0.708      -0.609       0.413
Renal_failure          -0.7933      0.272     -2.919      0.004      -1.326      -0.261
Heart_rate              0.0400      0.008      4.737      0.000       0.023       0.057
Systolic_BP             0.0052      0.008      0.636      0.525      -0.011       0.021
Diastolic_BP           -0.0466      0.014     -3.321      0.001      -0.074      -0.019
Respiratory_rate        0.0580      0.030      1.960      0.050    7.86e-06       0.116
Temperature            -0.8845      0.201     -4.403      0.000      -1.278      -0.491
SpO2                   -0.0430      0.047     -0.916      0.360      -0.135       0.049
Urine_output           -0.0006      0.000     -4.327      0.000      -0.001      -0.000
=======================================================================================

	Gender	Age	BMI	Hypertensive	Hyperlipemia	COPD	Diabetes	...	Heart_rate	Systolic_BP	Diastolic_BP	Respiratory_rate	Temperature	SpO2	Urine_output	Male	Male*Age	Death
0	Female	72	37.588179	0	1	0	1	...	68.837838	155.866667	68.333333	16.621622	36.714286	98.394737	2155.0	0	0	Alive
1	Male	75	NaN	0	0	1	0	...	101.370370	140.000000	65.000000	20.851852	36.682540	96.923077	1425.0	1	75	Alive
2	Male	83	26.572634	0	0	0	0	...	72.318182	135.333333	61.375000	23.640000	36.453704	95.291667	2425.0	1	83	Alive
3	Male	43	83.264629	0	0	0	0	...	94.500000	126.400000	73.200000	21.857143	36.287037	93.846154	8760.0	1	43	Alive
4	Male	75	31.824842	1	0	1	0	...	67.920000	156.560000	58.120000	21.360000	36.761905	99.280000	4455.0	1	75	Alive

		Grouped by Death
		Missing	Overall	Alive	Deceased	P-Value
n			1177	1017	160
Gender, n (%)	Female	0	559 (47.5)	478 (47.0)	81 (50.6)	0.442
Gender, n (%)	Male		618 (52.5)	539 (53.0)	79 (49.4)
Age, mean (SD)		0	74.1 (13.4)	73.7 (13.4)	76.3 (13.2)	0.023
BMI, mean (SD)		215	30.2 (9.3)	30.4 (9.2)	28.6 (9.9)	0.070
Atrial_fibrillation, n (%)	0	0	646 (54.9)	578 (56.8)	68 (42.5)	0.001
Atrial_fibrillation, n (%)	1		531 (45.1)	439 (43.2)	92 (57.5)
CHD_no_MI, n (%)	0	0	1076 (91.4)	928 (91.2)	148 (92.5)	0.709
CHD_no_MI, n (%)	1		101 (8.6)	89 (8.8)	12 (7.5)
COPD, n (%)	0	0	1088 (92.4)	935 (91.9)	153 (95.6)	0.139
COPD, n (%)	1		89 (7.6)	82 (8.1)	7 (4.4)
Diabetes, n (%)	0	0	681 (57.9)	579 (56.9)	102 (63.7)	0.124
Diabetes, n (%)	1		496 (42.1)	438 (43.1)	58 (36.2)
Deficiency_anemias, n (%)	0	0	778 (66.1)	653 (64.2)	125 (78.1)	0.001
Deficiency_anemias, n (%)	1		399 (33.9)	364 (35.8)	35 (21.9)
Depression, n (%)	0	0	1037 (88.1)	888 (87.3)	149 (93.1)	0.048
Depression, n (%)	1		140 (11.9)	129 (12.7)	11 (6.9)
Hyperlipemia, n (%)	0	0	730 (62.0)	620 (61.0)	110 (68.8)	0.072
Hyperlipemia, n (%)	1		447 (38.0)	397 (39.0)	50 (31.2)
Hypertensive, n (%)	0	0	332 (28.2)	274 (26.9)	58 (36.2)	0.019
Hypertensive, n (%)	1		845 (71.8)	743 (73.1)	102 (63.7)
Renal_failure, n (%)	0	0	747 (63.5)	625 (61.5)	122 (76.2)	<0.001
Renal_failure, n (%)	1		430 (36.5)	392 (38.5)	38 (23.8)
Heart_rate, mean (SD)		13	84.6 (16.0)	83.8 (16.0)	89.8 (15.2)	<0.001
Systolic_BP, mean (SD)		16	118.0 (17.4)	118.9 (17.3)	112.2 (16.6)	<0.001
Diastolic_BP, mean (SD)		16	59.5 (10.7)	59.9 (10.9)	57.2 (8.9)	0.001
Respiratory_rate, mean (SD)		13	20.8 (4.0)	20.6 (3.9)	22.0 (4.4)	<0.001
Temperature, mean (SD)		19	36.7 (0.6)	36.7 (0.6)	36.5 (0.7)	0.006
SpO2, mean (SD)		13	96.3 (2.3)	96.3 (2.1)	95.9 (3.1)	0.064
Urine_output, mean (SD)		36	1899.3 (1272.4)	1986.9 (1271.2)	1346.0 (1136.6)	<0.001

Predicting Mortality in Heart Failure Patients admitted to ICU

Introduction

Methods

Data Source

Measures

Data Analysis

Data Preparation

Results

Discussion

References