Hospitalized Pneumonia Analysis

Question

wmonth=0, .., 28代表從不哺乳以及哺乳1~28個月; 那sfmonth=0, … ,18代表從不哺乳以及哺乳1~28個月
有資料chldage<agepn，有肺炎的年齡比看醫生的年齡小，代表沒看醫生就先知道自己有肺炎?

1
2
3
4
5
6
library(KMsurv)
library(magrittr)
library(ggplot2)
library(gridExtra)
library(survival)
library(survminer)

Read Data

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
data(pneumon)


# wmonth==0 is not breast feeding
pneumon$z_fed <- ifelse(pneumon$wmonth==0,0,1)
pneumon$z_alcohol <- ifelse(pneumon$alcohol==0,0,1)
pneumon$z_smoke <- ifelse(pneumon$smoke==0,0,1)

# region==1 is northeast
pneumon$z_ne <- ifelse(pneumon$region==1,1,0) 
# region==2 is north central
pneumon$z_nc <- ifelse(pneumon$region==2,1,0)
# region==3 is south
# region==4 is west, z_ne=z_nc=z_south=0
pneumon$z_south <- ifelse(pneumon$region==3,1,0)

# race==1 is white
pneumon$z_white <- ifelse(pneumon$race==1,1,0)
# race==2 is black
# race==3 is other, z_white=z_black=0
pneumon$z_black <- ifelse(pneumon$race==2,1,0)

# nsibs==0 is no sibling
pneumon$z_sib_zero <- ifelse(pneumon$nsibs==0,1,0)
# nsibs==2/3/4/5/6 is more than 1 sibling, z_sib_zero=z_sib_one=0
pneumon$z_sib_one <- ifelse(pneumon$nsibs==1,1,0)
pneumon$sib_threetype <- ifelse(pneumon$nsibs==0,0,ifelse(pneumon$nsibs==1,1,2))

variable_num <- ncol(pneumon)

Histogram

看所有變數的長條圖

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20

variable_num <- ncol(pneumon)


# 將圖形存儲到列表中
plots <- lapply(1:variable_num, function(i) {
  pneumon %>%
    ggplot(aes_string(x = names(pneumon)[i])) +
    geom_bar(fill = "skyblue", color = "black", width = 0.7) +
    labs(x = paste(names(pneumon)[i]), y = "Frequency", title = paste( names(pneumon)[i])) +
    theme_minimal() +
    coord_flip()
})

# 合併圖形
grid.arrange(grobs = plots[1:4], nrow = 2, ncol = 2)
grid.arrange(grobs = plots[5:8], nrow = 2, ncol = 2)
grid.arrange(grobs = plots[9:12], nrow = 2, ncol = 2)
grid.arrange(grobs = plots[13:15], nrow = 2, ncol = 2)
rm(plots)

Covariates Percentage

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# 獲取變數總數
variable_num <- ncol(pneumon)

for (i in 1:variable_num) {
  # 變數名稱
  var_name <- names(pneumon)[i]
  
  # 計算該變數的類型頻數和百分比
  freq <- table(pneumon[, i])         # 頻數
  percentages <- prop.table(freq) * 100  # 百分比
  
  # 格式化百分比
  percentages <- sprintf("%.2f%%", percentages)
  
  # 打印結果
  cat(var_name, ":\n")
  for (j in 1:length(freq)) {
    cat("  ", names(freq)[j], ":", freq[j], "(",percentages[j],")\n")
  }
  cat("\n")  # 分隔每個變數的輸出
}

rm(freq,i,j,percentages,var_name)

Percentage Transfer to Latex

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# 自動生成合併表格的 LaTeX 代碼
cat("\\begin{longtable}{|l|l|r|r|}\n")
cat("\\caption{變數分布表} \\label{tab:variable_distribution} \\\\ \\hline\n")
cat("\\textbf{變數名稱} & \\textbf{種類} & \\textbf{數量} & \\textbf{百分比} \\\\ \\hline\n")
cat("\\endfirsthead\n")
cat("\\hline\n")
cat("\\textbf{變數名稱} & \\textbf{種類} & \\textbf{數量} & \\textbf{百分比} \\\\ \\hline\n")
cat("\\endhead\n")

# 遍歷所有變數
for (i in 1:ncol(pneumon)) {
  var_name <- names(pneumon)[i]
  freq <- table(pneumon[, i])
  percentages <- prop.table(freq) * 100
  
  for (j in seq_along(freq)) {
    cat(var_name, "&", names(freq)[j], "&", freq[j], "&", sprintf("%.2f\\%%", percentages[j]), "\\\\ \\hline\n")
  }
}

cat("\\end{longtable}\n")

rm(freq,i,j,percentages,var_name)

Special Details

subset(pneumon,hospital==1 ) chldage以12為主。但有住院治療的人 (73個)，大多chldage=1 (21個)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14

test <- lapply(1, function(i) {
  subset(pneumon,hospital==1 ) %>%
    ggplot(aes_string(x = names(pneumon)[i])) +
    geom_bar(fill = "skyblue", color = "black", width = 0.7) +
    labs(x = paste(names(pneumon)[i]), y = "Frequency", title = "As hospital=1") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5)) +  # 標題置中
    coord_flip()
})

# 合併圖形
grid.arrange(grobs = test[1] )
rm(p, plot, test, std, i, plots)

subset(pneumon,agepn==0 ) chldage以12為主。但12月內沒有看醫生的人 (81個)，大多chldage=0.5 (80個)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14

test <- lapply(1, function(i) {
  subset(pneumon,agepn==0 ) %>%
    ggplot(aes_string(x = names(pneumon)[i])) +
    geom_bar(fill = "skyblue", color = "black", width = 0.7) +
    labs(x = paste(names(pneumon)[i]), y = "Frequency",title = "As agepn=0") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5)) +  # 標題置中
    coord_flip()
})

# 合併圖形
grid.arrange(grobs = test[1])
rm(test)

High Correlated

相關係數0.54，而且有2261筆資料，agepn=chldage
如果用chldage當成變數，一定會是重要變數

1
2
3
4
5
6
7
library(DataExplorer)
plot_correlation(pneumon[,c(1, 16, 3, 4, 8, 9, 11, 17, 18, 19, 20, 21, 22, 23, 25, 24, 15)])
table( pneumon$agepn,pneumon$chldage)

# 2261筆資料，agepn=chldage
table(pneumon$agepn-pneumon$chldage) %>% 
  barplot(xlab="agepn-chldage", ylab="Frequency")

Survival Curve

We define function, plotting survival curve for different category covariates.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
process_surfit <- function(covariate, color){
  covariate <- deparse(substitute(covariate)) # 取得變數名稱字串
  
  if(color==1){
    survfit(as.formula(paste("Surv(agepn, hospital) ~", covariate)), data = pneumon) %>% 
      plot( col = c("blue"), lty = 1,
      xlab = "agepn", ylab = "Survival Probability", 
      main = "Survival Curves",
      ylim = c(0.9, 1))
  }
  else if(color==2){
    survfit(as.formula(paste("Surv(agepn, hospital) ~", covariate)), data = pneumon) %>% 
      plot( col = c("blue", "red"), lty = 1,
            xlab = "agepn", ylab = "Survival Probability", 
            main = paste("Survival Curves by", covariate),
            ylim = c(0.9, 1),
            conf.int = FALSE)
      legend("bottomright", legend = c("0","1"), 
             col = c("blue", "red"), lty = 1, 
             title = covariate)
  }
  else if(color==3){
    survfit(as.formula(paste("Surv(agepn, hospital) ~", covariate)), data = pneumon) %>% 
      plot( col = c("blue", "red", "purple"), lty = 1,
            xlab = "agepn", ylab = "Survival Probability", 
            main = paste("Survival Curves by", covariate),
            ylim = c(0.9, 1),
            conf.int = FALSE)
      legend("bottomright", legend = c("0","1","2"), 
             col = c("blue", "red", "purple"), lty = 1, 
             title = covariate)
  }
  else if(color==4){
    survfit(as.formula(paste("Surv(agepn, hospital) ~", covariate)), data = pneumon) %>% 
      plot( col = c("blue", "red", "purple", "orange"), lty = 1,
            xlab = "agepn", ylab = "Survival Probability", 
            main = paste("Survival Curves by", covariate),
            ylim = c(0.9, 1),
            conf.int = FALSE)
      legend("bottomright", legend = c("0","1","2","3"), 
             col = c("blue", "red", "purple", "orange"), lty = 1, 
             title = covariate)
  }
}

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
process_surfit(z_fed, 2)
process_surfit(z_alcohol, 2)
process_surfit(z_smoke, 2)
process_surfit(urban, 2)
process_surfit(poverty, 2)
process_surfit(bweight, 2)
process_surfit(race, 3)
process_surfit(sib_threetype, 3)
process_surfit(region, 4)

# 對連續變數以中位數切一半，劃出生存曲線
median(pneumon$chldage)
median(pneumon$mthage)
median(pneumon$education)
median(pneumon$wmonth)
median(pneumon$sfmonth)

process_surfit(chldage == 12,2)
process_surfit(mthage  >= 22,2)
process_surfit(education  >= 12, 2)
process_surfit(wmonth  == 0, 2)
process_surfit(sfmonth  == 0, 2)

Log Rank Test

$H_0:$ each group survival function are same
若p value小於 $\alpha=0.05$ ，可以拒絕 $H_0$ ，代表各組存活函數有差異
Expected/Observed 是 Expected/Observed event number，如果Expected>Observed，代表該組表現比預期的好，event(因肺炎住院)發生的少

結果 $\alpha=0.05$ 之下
有差異的:
是否哺餵母乳、是否抽菸、小孩體重是否過輕、0個手足跟1個手足以上

沒差異的:
是否喝酒、是否住城市、是否貧窮、不同種族、不同區域

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
# 定義一個函數來自動處理 survdiff
process_survdiff <- function(formula, data1, method = NULL) {
  if (is.null(method)) {
    # 使用 survdiff
    survdiff(formula, data = data1) %>% 
      print()
  } else {
    # 使用 pairwise_survdiff
    pairwise_survdiff(formula, data = data1, p.adjust.method = method) %>% 
      print()
  }
  cat("\n")
}

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
# 對於單一變數的 survdiff
for (var in c("z_fed", "z_alcohol", "z_smoke", "urban", "poverty", "bweight")) {
  process_survdiff(as.formula(paste("Surv(agepn, hospital) ~", var)), data = pneumon)
}

# 對於需要 pairwise_survdiff 的變數
for (var in c("race", "sib_threetype", "region")) {
  process_survdiff(as.formula(paste("Surv(agepn, hospital) ~", var)), data = pneumon, method = "bonferroni")
}

# 以median為切點，
for (var in c("(chldage == 12)", "(mthage  >= 22)", "(education  >= 12)", "(wmonth  == 0)", "(sfmonth  == 0)")) {
  process_survdiff(as.formula(paste("Surv(agepn, hospital) ~", var)), data = pneumon)
}

rm(var)

1
2
3
4
5
6
7
8
# Call:
# survdiff(formula = Surv(agepn, hospital) ~ z_fed, data = pneumon)
# 
#            N Observed Expected (O-E)^2/E (O-E)^2/V
# z_fed=0 2036       59     43.8      5.27      13.4
# z_fed=1 1434       14     29.2      7.90      13.4
# 
#  Chisq= 13.4  on 1 degrees of freedom, p= 3e-04 

AIC

step_forward:(從 full model 開始) z_fed + z_smoke + z_sib_zero + z_black + bweight AIC=1080.62

step_backward: z_fed + education + z_smoke + z_nc + z_black + z_sib_zero AIC=1080.63

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
# 定義空模型（僅包含截距）
null_model <- coxph(Surv(agepn, hospital) ~ 1, data = pneumon)

# 定義全模型（包含所有變數）
full_model <- coxph(Surv(agepn, hospital) ~  z_fed+ mthage + urban + poverty + bweight + education + z_alcohol + z_smoke + z_ne + z_nc + z_south + z_white + z_black + z_sib_one + z_sib_zero , data=pneumon)

# 執行前向選擇法
step_forward <- step(null_model, 
                     scope = list(lower = null_model, upper = full_model), 
                     direction = "forward", 
                     trace = TRUE)

# 查看最終模型
summary(step_forward)

step(full_model, 
                     scope = list(lower = null_model, upper = full_model), 
                     direction = "backward", 
                     trace = TRUE) %>% 
  summary()

rm(full_model,null_model,step_forward)

Proportional Hazard Model (Cox)

Use backward stepwise AIC to choose covariate and put in cox
agepn with tie, use efron partial likelihood
If we standardize education, the explanation of exp(coef) will be like increase one unit standardized education…, it’s hard to understand.

1
2
3
cox_aic <- coxph(Surv(agepn, hospital) ~z_fed + education + z_smoke + z_nc + z_black + z_sib_zero, data = pneumon,ties="efron")

summary(cox_aic)

Martingale Residuals

Martingale Residuals 對連續變數 education 作圖，loess 曲線幾乎是水平的，所以符合假設。可以放入 education 的一次項

1
2
3
4
5
6
7
8
9
plot(pneumon$education, residuals(cox_aic, type = "martingale"))

ggplot(data = pneumon, aes(x = education, y = residuals(cox_aic, type = "martingale"))) +
  geom_point() +
  geom_smooth(method = "loess", col = "blue") +
  theme_minimal() +
  labs(x = "Education", y = "Martingale Residuals", title = "Residuals vs Education")

cox.zph(cox_aic)

Check Cox Assumption

cox的假設是，所有變數的風險比應隨時間保持恆定

可以用 survival curve by group, log(-log survival curves) 兩圖來檢查，或是code cox.zph()
cox加入time dependent的變數,若是這變數係數顯著，代表hazard ratio 會隨時間變動，不符合假設

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
#定義函數，針對不同變數畫出log hazard
loghazard_check <- function(covariate, data1) {
  covariate <- deparse(substitute(covariate)) # 取得變數名稱字串

  survfit(as.formula(paste("Surv(agepn, hospital) ~", covariate)), data = data1) %>% 
      plot( col = c("blue", "red"), lty = 1,
            xlab = "agepn", ylab="log(-log S(t))", 
            main = paste("Log Hazard by", covariate),
            fun = "cloglog",
            conf.int = FALSE)
      legend("bottomright", legend = c("0","1"), 
             col = c("blue", "red"), lty = 1, 
             title = covariate)

}

使用log(-log(s(t))) 對 agepn 作圖，兩類曲線沒有相交，代表符合假設

1
2
3
4
5
loghazard_check(z_fed, pneumon)
loghazard_check(z_smoke, pneumon)
loghazard_check(z_nc, pneumon)
loghazard_check(z_black, pneumon)
loghazard_check(z_sib_zero, pneumon)

Reference

Type of censoring
KMsurv