# Boston Housing

An analysis of the Boston Housing dataset.

Goal: determine whether "clean air" had an influence on house prices.

  • The values of X1 and X9 are clearly much more concentrated around 0, so it makes sense to consider transformations of the original data.
  • Plotting X̃1 = log(X1) reveals that two subgroups with different mean values might exist.
  • Most of the variables are asymmetric, with a higher density on the left-hand side. Taking logs reduces this asymmetry, because a log transformation moves lower values further apart while shrinking the distance between larger values.
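A quick arithmetic check of the distance claim: a unit gap near zero is stretched by the log, while the same gap at 100 almost disappears,

$$\log 2 - \log 1 \approx 0.693, \qquad \log 101 - \log 100 \approx 0.00995.$$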

EDA

  • X1: per-capita crime rate
  • X2: proportion of residential land zoned for large lots (upscale housing)
  • X3: proportion of non-retail business acres
  • X4: Charles River dummy (1 if the tract bounds the river, 0 otherwise)
  • X5: nitric oxide concentration
  • X6: average number of rooms per dwelling
  • X7: proportion of owner-occupied units built before 1940
  • X8: weighted distance to five employment centers
  • X9: index of accessibility to radial highways
  • X10: full-value property tax rate per $10,000
  • X11: teacher–pupil ratio
  • X12: $1000(B − 0.63)^2 I(B < 0.63)$, where B is the proportion of African Americans
  • X13: proportion of lower-status population
  • X14: median value of owner-occupied homes (in $1000s)
rm(list = ls())

library(data.table)  # fread
library(magrittr)    # %>% pipe
library(ggplot2)
library(caret)       # createDataPartition, train
library(rpart)       # prune

house <- fread("bostonh.dat")

# Drop rows with NAs and any NULL columns
house <- na.omit(house)
house <- Filter(Negate(is.null), house)

colnames(house) <- c("crime", "large_lots", "nonretail_business", "river", "nitric_oxides", "room", 
                     "owner_occupied_previous", "employment_center_distance", "highways_accessibility",
                     "tax", "teacher_pupil", "African_American", "lower_status", "owner_occupied_value")

# price_median: 2 if the house value is above the median, otherwise 1
house$price_median <- as.numeric(house$owner_occupied_value > median(house$owner_occupied_value)) + 1

house_trans <- house
# Variable transformations
house_trans$crime <- log(house_trans$crime)
house_trans$large_lots <- house_trans$large_lots / 10
house_trans$nonretail_business <- log(house_trans$nonretail_business)
# X4 (river) is left unchanged
house_trans$nitric_oxides <- log(house_trans$nitric_oxides)
house_trans$room <- log(house_trans$room)
house_trans$owner_occupied_previous <- house_trans$owner_occupied_previous^2.5 / 1000
house_trans$employment_center_distance <- log(house_trans$employment_center_distance)
house_trans$highways_accessibility <- log(house_trans$highways_accessibility)
house_trans$tax <- log(house_trans$tax)
house_trans$teacher_pupil <- exp(0.4 * house_trans$teacher_pupil) / 1000
house_trans$African_American <- house_trans$African_American / 100
house_trans$lower_status <- sqrt(house_trans$lower_status)
house_trans$owner_occupied_value <- log(house_trans$owner_occupied_value)

house %>% 
  scale() %>% 
  boxplot()

house_trans %>% 
  scale() %>% 
  boxplot()

  
# Pairs plots; blue = owner_occupied_value > median
# Where the proportion of large residential lots is high, the crime rate is almost 0
pairs(house[, c(1,2)], col = c("red", "blue")[house$price_median], pch = 1)
# Expensive houses sit in areas with a high large_lots proportion
pairs(house[, c(14,2)], pch = 1)
# The higher the proportion of non-retail business acres, the lower the price; non-retail business may generate noise and other pollution
pairs(house[, c(14,3)], pch = 1)
# Higher nitric oxide concentration goes with lower prices. NO comes mainly from traffic and industry, so noise from traffic and industrial areas may be the real driver of the low prices
pairs(house[, c(14,5)], pch = 1)
# More rooms on average, higher price; the average room count is probably tied to total floor area
pairs(house[, c(14,6)], pch = 1)
# In the Boston area, newer houses lie farther from the employment centers
pairs(house[, c(7,8)], pch = 1)
# A low teacher-to-pupil ratio goes with low prices
pairs(house[, c(14,11)], pch = 1)
  • The two subgroups of X1 may correspond to houses at two different price levels.

  • X3 and X14 are negatively correlated; non-retail business sometimes produces annoying noise and other pollution. The KDE plot clearly shows two peaks.

  • There are more expensive houses than cheap ones along the Charles River. Looking at the raw data, the tracts with X4 equal to 1 clearly lie close to one another, which means the Charles River does not flow through many different districts. That the more expensive districts lie near the Charles River may therefore be pure coincidence.

  • The author's aim is to ask whether pollution affects house prices, so we consider whether X5 can serve as an explanatory variable for the price X14. But nitric oxide emissions usually come with noise pollution, so it may be the noise that people are actually fleeing.

  • The values of X12 are almost all around 390. Since B cannot exceed 0.63, only values of B close to zero can produce such values, so the higher X12 is, the lower the actual proportion of African Americans. Among observations 405 to 470, quite a few have X12 far below 390, which means the proportion of African Americans in those districts is above zero. In the scatterplot of log(X12) we can observe two clusters of points: one with X12 close to 390, and another with X12 between 3 and 100.
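The last point follows from inverting the definition of X12: for $B < 0.63$,

$$X_{12} = 1000\,(B - 0.63)^2 \;\Longrightarrow\; B = 0.63 - \sqrt{X_{12}/1000},$$

so $X_{12} \approx 390$ gives $B \approx 0.63 - \sqrt{0.39} \approx 0.006$, a proportion essentially at zero.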

for (i in 1:13) {
  print(
    house_trans %>% 
      ggplot(aes(x = .data[[names(house_trans)[i]]],
                 # price_median: 1 = at/below median, 2 = above
                 color = factor(price_median, labels = c("Lower", "Over")),
                 fill = factor(price_median, labels = c("Lower", "Over"))
                 )) +
      geom_density(kernel = "gaussian", adjust = 0.5, alpha = 0.3) + # adjust controls the bandwidth
      labs(title = "Density Plot of house_trans", 
           x = names(house_trans)[i], 
           y = "Density") +
      theme_minimal()
  )
}

geom_density is the same thing as a KDE (kernel density estimate).

## geom_density
print(
    house_trans %>% 
      ggplot(aes(x = .data[[names(house_trans)[1]]],
                 # price_median: 1 = at/below median, 2 = above
                 color = factor(price_median, labels = c("Lower", "Over")),
                 fill = factor(price_median, labels = c("Lower", "Over"))
                 )) +
      geom_density(kernel = "gaussian", adjust = 0.5, alpha = 0.3) + # adjust controls the bandwidth
      labs(title = "Density Plot of house_trans", 
           x = names(house_trans)[1], 
           y = "Density") +
      theme_minimal() +
      xlim(-5, 5)
  )


## KDE
# Convert price_median to a factor (1 = at/below median, 2 = above)
group <- factor(house_trans$price_median, labels = c("Lower", "Over"))

# Estimate the density of each group separately
dens1 <- density(house_trans[[1]][group == "Over"], kernel = "gaussian", adjust = 0.5)
dens2 <- density(house_trans[[1]][group == "Lower"], kernel = "gaussian", adjust = 0.5)

# Plot the first group's density curve
plot(dens1, col = "blue", lwd = 2, 
     main = "Density by Price Median", 
     xlab = "House Feature", 
     ylab = "Density",
     xlim = c(-5, 5), 
     ylim = range(c(dens1$y, dens2$y)))

# Add the second group's curve
lines(dens2, col = "red", lwd = 2)

# Add a legend
legend("topright", legend = c("Over", "Lower"),
       col = c("blue", "red"), lwd = 2)

PCA

# drop the response (14) and the derived price_median (15)
house_pca = prcomp(house[,-c(14,15)], center = TRUE, scale. = TRUE)
summary(house_pca)

# plot Proportion of Variance for each PC
plot(house_pca)
pairs(house_pca$x[, 1:4], col = house$price_median)

## house_trans
house_trans_pca = prcomp(house_trans[,-c(14,15)], center = TRUE, scale. = TRUE)
summary(house_trans_pca)

# plot Proportion of Variance for each PC
plot(house_trans_pca)
pairs(house_trans_pca$x[, 1:4], col = house_trans$price_median)

Reg(house)

# Drop price_median: it is derived from the response and would leak it into the predictors
house <- house[,-15]
house_trans <- house_trans[,-15]

piece <- 5
# A list to store the models
reg_model <- list()

# acc_house stores each fold's MSE and r^2
acc_house <- list()
y_colname <- "owner_occupied_value"
  
for (i in 1:piece) {
  # sampling: hold out roughly 1/piece of the data as the test set
  set.seed(i)
  trainIndex = createDataPartition(house[[y_colname]], p = 1/piece, list = FALSE)
  train_data = house[-trainIndex,] 
  test_data = house[trainIndex,]
  
  # build the model name
  model_name <- paste("reg", i, sep = "_")  

  # as.formula turns the string into a formula, response ~ predictors; works for lm, glm
  model <- y_colname %>% 
    paste("~ .") %>% 
    as.formula() %>% 
    lm(data = train_data)

  # store the model in the list
  reg_model[[model_name]] <- model  
  
  # predict on the held-out fold
  predictions <- predict(model, newdata = test_data)

  acc_house[[i]] <- list(
    # test-set MSE
    mse = mean((test_data[[y_colname]] - predictions)^2),
    # note: this is the training R^2, not the test R^2
    r_squared = summary(model)$r.squared
  )
}

result

for (i in 1:5) {
  print(acc_house[[i]]$mse)
}

for (i in 1:5) {
  print(acc_house[[i]]$r_squared)
}

Reg(house_trans)

# A list to store the models
reg_trans_model <- list()

# acc_house_trans stores each fold's MSE and r^2
acc_house_trans <- list()

for (i in 1:piece) {
  # sampling
  set.seed(i)
  trainIndex = createDataPartition(house_trans[[y_colname]], p = 1/piece, list = FALSE)
  train_data = house_trans[-trainIndex,] 
  test_data = house_trans[trainIndex,]
  
  # build the model name
  model_name <- paste("reg", i, sep = "_")  

  # as.formula turns the string into a formula, response ~ predictors; works for lm, glm
  model <- y_colname %>% 
    paste("~ .") %>% 
    as.formula() %>% 
    lm(data = train_data)

  # store the model in the list
  reg_trans_model[[model_name]] <- model  

  # predict on the held-out fold
  predictions <- predict(model, newdata = test_data)

  acc_house_trans[[i]] <- list(
    # back-transform from the log scale so the MSE is comparable to the untransformed model
    mse = mean((exp(test_data[[y_colname]]) - exp(predictions))^2),
    r_squared = summary(model)$r.squared
  )
}

result

for (i in 1:5) {
  print(acc_house_trans[[i]]$mse)
}

for (i in 1:5) {
  print(acc_house_trans[[i]]$r_squared)
}

Compare

  • Residuals vs. fitted values: if the assumption that epsilon is normal holds, the residuals should also be normal. The residuals should look random and should not drift as the fitted values grow; a healthy plot is an unstructured cloud of points with no funnel or U shape.
  • QQ plot: quantiles of the residuals vs. quantiles of the standard normal. If the normality assumption for epsilon holds, the residuals should be normal too, so the points should fall on the 45-degree line rather than tracing an S shape.
  • sqrt(|standardized residual|) vs. fitted value (scale–location plot): under constant error variance the points should again scatter randomly, with no funnel or U shape as the fitted value grows.
  • Residuals vs. leverage: checks for influential points. A point with both a large residual and high leverage may be influential. Cook's distance is one index; a value > 0.5 is a common rule of thumb for flagging an influential point.
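Cook's distance combines the two quantities from the last plot: with residual $e_i$, leverage $h_{ii}$, $p$ model parameters, and error-variance estimate $\hat\sigma^2$,

$$D_i = \frac{e_i^2}{p\,\hat\sigma^2} \cdot \frac{h_{ii}}{(1 - h_{ii})^2},$$

which is large exactly when a point has both a large residual and high leverage.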
par(mfrow = c(2, 2))

# The diagnostics suggest a plain lm is not a good fit
plot(reg_model$reg_1)
plot(reg_trans_model$reg_1)

CART (house)

piece <- 5
cart_model <- list()
y_colname <- "owner_occupied_value"
  
for (i in 1:piece) {
  # sampling
  set.seed(i)
  trainIndex = createDataPartition(house[[y_colname]], p = 1/piece, list = FALSE)
  train_data = house[-trainIndex,] 
  test_data = house[trainIndex,]
  
  # build the model name
  model_name <- paste("cart", i, sep = "_")  

  # fit a CART model via caret, tuning cp by cross-validation
  model <- train(as.formula(paste(y_colname, "~ .")),
                data = train_data, 
                method = "rpart",  # rpart (CART)
                trControl = trainControl(method = "cv", number = 10),  # 10-fold cross-validation
                tuneLength = 10)  # size of the cp tuning grid
  
  # prune by the complexity parameter
  model = prune(model$finalModel, cp = 0.01)

  # store the model in the list
  cart_model[[model_name]] <- model  
  
  # predict on the held-out fold
  predictions <- predict(model, newdata = test_data)

  acc_house[[i+piece]] <- list(
    mse = mean((test_data[[y_colname]] - predictions)^2),
    # rpart has no summary()$r.squared, so use the test-set R^2 instead
    r_squared = 1 - sum((test_data[[y_colname]] - predictions)^2) /
                    sum((test_data[[y_colname]] - mean(test_data[[y_colname]]))^2)
  )
}

result

r_avg=0
reg_avg=0
cart_avg=0

for (i in 1:piece) {
  reg_avg <- reg_avg+acc_house[[i]]$mse
  r_avg <- r_avg + acc_house[[i]]$r_squared
} 
cat("average reg MSE:", reg_avg/piece, "\n")
cat("average r squared:", r_avg/piece, "\n")

for (i in (piece+1):(2*piece)) {
  cart_avg <- cart_avg+acc_house[[i]]$mse
}
cat("average cart MSE:", cart_avg/piece, "\n")

CART (house_trans)

piece <- 5
cart_model <- list()
y_colname <- "owner_occupied_value"
  
for (i in 1:piece) {
  # sampling
  set.seed(i)
  trainIndex = createDataPartition(house_trans[[y_colname]], p = 1/piece, list = FALSE)
  train_data = house_trans[-trainIndex,] 
  test_data = house_trans[trainIndex,]
  
  # build the model name
  model_name <- paste("cart", i, sep = "_")  

  # fit a CART model via caret, tuning cp by cross-validation
  model <- train(as.formula(paste(y_colname, "~ .")),
                data = train_data, 
                method = "rpart",  # rpart (CART)
                trControl = trainControl(method = "cv", number = 10),  # 10-fold cross-validation
                tuneLength = 10)  # size of the cp tuning grid
  
  # prune by the complexity parameter
  model = prune(model$finalModel, cp = 0.01)

  # store the model in the list
  cart_model[[model_name]] <- model  
  
  # predict on the held-out fold
  predictions <- predict(model, newdata = test_data)

  acc_house_trans[[i+piece]] <- list(
    # back-transform from the log scale so the MSE is comparable
    mse = mean((exp(test_data[[y_colname]]) - exp(predictions))^2),
    # rpart has no summary()$r.squared, so use the test-set R^2 instead
    r_squared = 1 - sum((test_data[[y_colname]] - predictions)^2) /
                    sum((test_data[[y_colname]] - mean(test_data[[y_colname]]))^2)
  )
}

result

r_avg=0
reg_avg=0
cart_avg=0

for (i in 1:piece) {
  reg_avg <- reg_avg+acc_house_trans[[i]]$mse
  r_avg <- r_avg + acc_house_trans[[i]]$r_squared
} 
cat("average reg MSE:", reg_avg/piece, "\n")
cat("average r squared:", r_avg/piece, "\n")

for (i in (piece+1):(2*piece)) {
  cart_avg <- cart_avg+acc_house_trans[[i]]$mse
}
cat("average cart MSE:", cart_avg/piece, "\n")

Next time

SVM


library(e1071)  # svm

piece = 5
acc = c()
# kfcv: a custom k-fold split helper (assumed to return a list of test-index vectors)
kf = kfcv(data = bank, seed = 0, piece)
bank$離職 = as.factor(bank$離職)
# A list to store the models
svm_models <- list()

for (i in 1:piece) {
  train_data = bank[-kf[[i]],] # training data
  test_data = bank[kf[[i]],] # test data
  # oversample the positive (attrition) class by duplicating it once
  train_data <- rbind(train_data, train_data[which(train_data$離職 == 1),]) 
  
  model_name <- paste("svm_reg", i, sep = "_")
  model <- svm(離職 ~ .,
               data = train_data,
               kernel = "radial",  
               probability = TRUE)
  # store the model in the list
  svm_models[[model_name]] <- model
  
  predicted_probabilities <- predict(svm_models[[i]], newdata = test_data, probability = TRUE)
  predicted_probabilities <- attr(predicted_probabilities, "probabilities")[, 2]
  
  ##### Confusion Matrix #####
  conf_matrix <- confusionMatrix(predict(svm_models[[i]], newdata = test_data),
                                 test_data$離職)
  
  acc[i] <- round(conf_matrix$byClass["Balanced Accuracy"], 4) 
}

Random Forest



piece = 5
acc = c()
# kfcv: a custom k-fold split helper (assumed to return a list of test-index vectors)
kf = kfcv(data = bank, seed = 0, piece)
bank$離職 = as.factor(bank$離職)
# A list to store the models
rf_models <- list()

for (i in 1:piece) {
  train_data = bank[-kf[[i]],] # training data
  test_data = bank[kf[[i]],] # test data
  # oversample the positive (attrition) class by duplicating it once
  train_data <- rbind(train_data, train_data[which(train_data$離職 == 1),]) 
  
  model_name <- paste("rf_reg", i, sep = "_")
  model <- train(離職 ~ .,
                 data = train_data, 
                 method = "rf")
  # store the model in the list
  rf_models[[model_name]] <- model
  
  # predict on the test data
  predicted_probabilities = predict(rf_models[[i]], newdata = test_data, type = "prob")[, "1"]
  predicted_classes = predict(rf_models[[i]], newdata = test_data, type = "raw")
  
  ##### Confusion Matrix #####  
  conf_matrix <- confusionMatrix(as.factor(predicted_classes),
                                 as.factor(test_data$離職))
  acc[i] <- round(conf_matrix$byClass["Balanced Accuracy"], 4)
}

