Hospitalized Pneumonia Analysis

Question

  1. wmonth=0, .., 28代表從不哺乳以及哺乳1~28個月; 那sfmonth=0, … ,18代表從不哺乳以及哺乳1~28個月
  2. 有資料chldage<agepn,有肺炎的年齡比看醫生的年齡小,代表沒看醫生就先知道自己有肺炎?
1
2
3
4
5
6
library(KMsurv)
library(magrittr)
library(ggplot2)
library(gridExtra)
library(survival)
library(survminer)

Read Data

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
data(pneumon)


# wmonth==0 is not breast feeding
pneumon$z_fed <- ifelse(pneumon$wmonth==0,0,1)
pneumon$z_alcohol <- ifelse(pneumon$alcohol==0,0,1)
pneumon$z_smoke <- ifelse(pneumon$smoke==0,0,1)

# region==1 is northeast
pneumon$z_ne <- ifelse(pneumon$region==1,1,0) 
# region==2 is north central
pneumon$z_nc <- ifelse(pneumon$region==2,1,0)
# region==3 is south
# region==4 is west, z_ne=z_nc=z_south=0
pneumon$z_south <- ifelse(pneumon$region==3,1,0)

# race==1 is white
pneumon$z_white <- ifelse(pneumon$race==1,1,0)
# race==2 is black
# race==3 is other, z_white=z_black=0
pneumon$z_black <- ifelse(pneumon$race==2,1,0)

# nsibs==0 is no sibling
pneumon$z_sib_zero <- ifelse(pneumon$nsibs==0,1,0)
# nsibs==2/3/4/5/6 is more than 1 sibling, z_sib_zero=z_sib_one=0
pneumon$z_sib_one <- ifelse(pneumon$nsibs==1,1,0)
pneumon$sib_threetype <- ifelse(pneumon$nsibs==0,0,ifelse(pneumon$nsibs==1,1,2))

variable_num <- ncol(pneumon)

Histogram

看所有變數的長條圖

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20

variable_num <- ncol(pneumon)


# 將圖形存儲到列表中
plots <- lapply(1:variable_num, function(i) {
  pneumon %>%
    ggplot(aes_string(x = names(pneumon)[i])) +
    geom_bar(fill = "skyblue", color = "black", width = 0.7) +
    labs(x = paste(names(pneumon)[i]), y = "Frequency", title = paste( names(pneumon)[i])) +
    theme_minimal() +
    coord_flip()
})

# 合併圖形
grid.arrange(grobs = plots[1:4], nrow = 2, ncol = 2)
grid.arrange(grobs = plots[5:8], nrow = 2, ncol = 2)
grid.arrange(grobs = plots[9:12], nrow = 2, ncol = 2)
grid.arrange(grobs = plots[13:15], nrow = 2, ncol = 2)
rm(plots)

Covariates Percentage

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# 獲取變數總數
variable_num <- ncol(pneumon)

for (i in 1:variable_num) {
  # 變數名稱
  var_name <- names(pneumon)[i]
  
  # 計算該變數的類型頻數和百分比
  freq <- table(pneumon[, i])         # 頻數
  percentages <- prop.table(freq) * 100  # 百分比
  
  # 格式化百分比
  percentages <- sprintf("%.2f%%", percentages)
  
  # 打印結果
  cat(var_name, ":\n")
  for (j in 1:length(freq)) {
    cat("  ", names(freq)[j], ":", freq[j], "(",percentages[j],")\n")
  }
  cat("\n")  # 分隔每個變數的輸出
}

rm(freq,i,j,percentages,var_name)

Percentage Transfer to Latex

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# 自動生成合併表格的 LaTeX 代碼
cat("\\begin{longtable}{|l|l|r|r|}\n")
cat("\\caption{變數分布表} \\label{tab:variable_distribution} \\\\ \\hline\n")
cat("\\textbf{變數名稱} & \\textbf{種類} & \\textbf{數量} & \\textbf{百分比} \\\\ \\hline\n")
cat("\\endfirsthead\n")
cat("\\hline\n")
cat("\\textbf{變數名稱} & \\textbf{種類} & \\textbf{數量} & \\textbf{百分比} \\\\ \\hline\n")
cat("\\endhead\n")

# 遍歷所有變數
for (i in 1:ncol(pneumon)) {
  var_name <- names(pneumon)[i]
  freq <- table(pneumon[, i])
  percentages <- prop.table(freq) * 100
  
  for (j in seq_along(freq)) {
    cat(var_name, "&", names(freq)[j], "&", freq[j], "&", sprintf("%.2f\\%%", percentages[j]), "\\\\ \\hline\n")
  }
}

cat("\\end{longtable}\n")

rm(freq,i,j,percentages,var_name)

Special Details

  1. subset(pneumon,hospital==1 ) chldage以12為主。但有住院治療的人 (73個),大多chldage=1 (21個)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14

test <- lapply(1, function(i) {
  subset(pneumon,hospital==1 ) %>%
    ggplot(aes_string(x = names(pneumon)[i])) +
    geom_bar(fill = "skyblue", color = "black", width = 0.7) +
    labs(x = paste(names(pneumon)[i]), y = "Frequency", title = "As hospital=1") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5)) +  # 標題置中
    coord_flip()
})

# 合併圖形
grid.arrange(grobs = test[1] )
rm(p, plot, test, std, i, plots)
  1. subset(pneumon,agepn==0 ) chldage以12為主。但12月內沒有看醫生的人 (81個),大多chldage=0.5 (80個)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14

test <- lapply(1, function(i) {
  subset(pneumon,agepn==0 ) %>%
    ggplot(aes_string(x = names(pneumon)[i])) +
    geom_bar(fill = "skyblue", color = "black", width = 0.7) +
    labs(x = paste(names(pneumon)[i]), y = "Frequency",title = "As agepn=0") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5)) +  # 標題置中
    coord_flip()
})

# 合併圖形
grid.arrange(grobs = test[1])
rm(test)

High Correlated

  • 相關係數0.54,而且有2261筆資料,agepn=chldage
  • 如果用chldage當成變數,一定會是重要變數
1
2
3
4
5
6
7
library(DataExplorer)
plot_correlation(pneumon[,c(1, 16, 3, 4, 8, 9, 11, 17, 18, 19, 20, 21, 22, 23, 25, 24, 15)])
table( pneumon$agepn,pneumon$chldage)

# 2261筆資料,agepn=chldage
table(pneumon$agepn-pneumon$chldage) %>% 
  barplot(xlab="agepn-chldage", ylab="Frequency")

Survival Curve

We define function, plotting survival curve for different category covariates.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
process_surfit <- function(covariate, color){
  covariate <- deparse(substitute(covariate)) # 取得變數名稱字串
  
  if(color==1){
    survfit(as.formula(paste("Surv(agepn, hospital) ~", covariate)), data = pneumon) %>% 
      plot( col = c("blue"), lty = 1,
      xlab = "agepn", ylab = "Survival Probability", 
      main = "Survival Curves",
      ylim = c(0.9, 1))
  }
  else if(color==2){
    survfit(as.formula(paste("Surv(agepn, hospital) ~", covariate)), data = pneumon) %>% 
      plot( col = c("blue", "red"), lty = 1,
            xlab = "agepn", ylab = "Survival Probability", 
            main = paste("Survival Curves by", covariate),
            ylim = c(0.9, 1),
            conf.int = FALSE)
      legend("bottomright", legend = c("0","1"), 
             col = c("blue", "red"), lty = 1, 
             title = covariate)
  }
  else if(color==3){
    survfit(as.formula(paste("Surv(agepn, hospital) ~", covariate)), data = pneumon) %>% 
      plot( col = c("blue", "red", "purple"), lty = 1,
            xlab = "agepn", ylab = "Survival Probability", 
            main = paste("Survival Curves by", covariate),
            ylim = c(0.9, 1),
            conf.int = FALSE)
      legend("bottomright", legend = c("0","1","2"), 
             col = c("blue", "red", "purple"), lty = 1, 
             title = covariate)
  }
  else if(color==4){
    survfit(as.formula(paste("Surv(agepn, hospital) ~", covariate)), data = pneumon) %>% 
      plot( col = c("blue", "red", "purple", "orange"), lty = 1,
            xlab = "agepn", ylab = "Survival Probability", 
            main = paste("Survival Curves by", covariate),
            ylim = c(0.9, 1),
            conf.int = FALSE)
      legend("bottomright", legend = c("0","1","2","3"), 
             col = c("blue", "red", "purple", "orange"), lty = 1, 
             title = covariate)
  }
}
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
process_surfit(z_fed, 2)
process_surfit(z_alcohol, 2)
process_surfit(z_smoke, 2)
process_surfit(urban, 2)
process_surfit(poverty, 2)
process_surfit(bweight, 2)
process_surfit(race, 3)
process_surfit(sib_threetype, 3)
process_surfit(region, 4)

# 對連續變數以中位數切一半,劃出生存曲線
median(pneumon$chldage)
median(pneumon$mthage)
median(pneumon$education)
median(pneumon$wmonth)
median(pneumon$sfmonth)

process_surfit(chldage == 12,2)
process_surfit(mthage  >= 22,2)
process_surfit(education  >= 12, 2)
process_surfit(wmonth  == 0, 2)
process_surfit(sfmonth  == 0, 2)

Log Rank Test

  • H0:H_0: each group survival function are same
  • 若p value小於α=0.05\alpha=0.05,可以拒絕H0H_0,代表各組存活函數有差異
  • Expected/Observed 是 Expected/Observed event number,如果Expected>Observed,代表該組表現比預期的好,event(因肺炎住院)發生的少

結果 α=0.05\alpha=0.05之下
有差異的:
是否哺餵母乳、是否抽菸、小孩體重是否過輕、0個手足跟1個手足以上

沒差異的:
是否喝酒、是否住城市、是否貧窮、不同種族、不同區域

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
# 定義一個函數來自動處理 survdiff
process_survdiff <- function(formula, data1, method = NULL) {
  if (is.null(method)) {
    # 使用 survdiff
    survdiff(formula, data = data1) %>% 
      print()
  } else {
    # 使用 pairwise_survdiff
    pairwise_survdiff(formula, data = data1, p.adjust.method = method) %>% 
      print()
  }
  cat("\n")
}
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
# 對於單一變數的 survdiff
for (var in c("z_fed", "z_alcohol", "z_smoke", "urban", "poverty", "bweight")) {
  process_survdiff(as.formula(paste("Surv(agepn, hospital) ~", var)), data = pneumon)
}

# 對於需要 pairwise_survdiff 的變數
for (var in c("race", "sib_threetype", "region")) {
  process_survdiff(as.formula(paste("Surv(agepn, hospital) ~", var)), data = pneumon, method = "bonferroni")
}

# 以median為切點,
for (var in c("(chldage == 12)", "(mthage  >= 22)", "(education  >= 12)", "(wmonth  == 0)", "(sfmonth  == 0)")) {
  process_survdiff(as.formula(paste("Surv(agepn, hospital) ~", var)), data = pneumon)
}

rm(var)
1
2
3
4
5
6
7
8
# Call:
# survdiff(formula = Surv(agepn, hospital) ~ z_fed, data = pneumon)
# 
#            N Observed Expected (O-E)^2/E (O-E)^2/V
# z_fed=0 2036       59     43.8      5.27      13.4
# z_fed=1 1434       14     29.2      7.90      13.4
# 
#  Chisq= 13.4  on 1 degrees of freedom, p= 3e-04 

AIC

step_forward:(從 full model 開始) z_fed + z_smoke + z_sib_zero + z_black + bweight AIC=1080.62

step_backward: z_fed + education + z_smoke + z_nc + z_black + z_sib_zero AIC=1080.63

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
# 定義空模型(僅包含截距)
null_model <- coxph(Surv(agepn, hospital) ~ 1, data = pneumon)

# 定義全模型(包含所有變數)
full_model <- coxph(Surv(agepn, hospital) ~  z_fed+ mthage + urban + poverty + bweight + education + z_alcohol + z_smoke + z_ne + z_nc + z_south + z_white + z_black + z_sib_one + z_sib_zero , data=pneumon)

# 執行前向選擇法
step_forward <- step(null_model, 
                     scope = list(lower = null_model, upper = full_model), 
                     direction = "forward", 
                     trace = TRUE)

# 查看最終模型
summary(step_forward)

step(full_model, 
                     scope = list(lower = null_model, upper = full_model), 
                     direction = "backward", 
                     trace = TRUE) %>% 
  summary()

rm(full_model,null_model,step_forward)

Proportional Hazard Model (Cox)

  • Use backward stepwise AIC to choose covariate and put in cox
  • agepn with tie, use efron partial likelihood
  • If we standardize education, the explanation of exp(coef) will be like increase one unit standardized education…, it’s hard to understand.
1
2
3
cox_aic <- coxph(Surv(agepn, hospital) ~z_fed + education + z_smoke + z_nc + z_black + z_sib_zero, data = pneumon,ties="efron")

summary(cox_aic)

Martingale Residuals

  • Martingale Residuals 對連續變數 education 作圖,loess 曲線幾乎是水平的,所以符合假設。可以放入 education 的一次項
1
2
3
4
5
6
7
8
9
plot(pneumon$education, residuals(cox_aic, type = "martingale"))

ggplot(data = pneumon, aes(x = education, y = residuals(cox_aic, type = "martingale"))) +
  geom_point() +
  geom_smooth(method = "loess", col = "blue") +
  theme_minimal() +
  labs(x = "Education", y = "Martingale Residuals", title = "Residuals vs Education")

cox.zph(cox_aic)

Check Cox Assumption

cox的假設是,所有變數的風險比應隨時間保持恆定

  • 可以用 survival curve by group, log(-log survival curves) 兩圖來檢查,或是code cox.zph()
  • cox加入time dependent的變數,若是這變數係數顯著,代表hazard ratio 會隨時間變動,不符合假設
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
#定義函數,針對不同變數畫出log hazard
loghazard_check <- function(covariate, data1) {
  covariate <- deparse(substitute(covariate)) # 取得變數名稱字串

  survfit(as.formula(paste("Surv(agepn, hospital) ~", covariate)), data = data1) %>% 
      plot( col = c("blue", "red"), lty = 1,
            xlab = "agepn", ylab="log(-log S(t))", 
            main = paste("Log Hazard by", covariate),
            fun = "cloglog",
            conf.int = FALSE)
      legend("bottomright", legend = c("0","1"), 
             col = c("blue", "red"), lty = 1, 
             title = covariate)

}

使用log(-log(s(t))) 對 agepn 作圖,兩類曲線沒有相交,代表符合假設

1
2
3
4
5
loghazard_check(z_fed, pneumon)
loghazard_check(z_smoke, pneumon)
loghazard_check(z_nc, pneumon)
loghazard_check(z_black, pneumon)
loghazard_check(z_sib_zero, pneumon)

Reference

Type of censoring
KMsurv

Licensed under CC BY-NC-SA 4.0
使用 Hugo 建立
主題 StackJimmy 設計