1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# Load the housing data from "bostonh.dat". The global workspace is
# deliberately not cleared here; run the script in a fresh R session when a
# clean environment is needed.
house <- fread("bostonh.dat")
# Drop every row that contains a missing value. No separate NULL filter is
# needed because a column of a data frame can never be NULL.
house <- na.omit(house)
colnames(house) <- c(
  "crime", "large_lots", "nonretail_business", "river", "nitric_oxides",
  "room", "owner_occupied_previous", "employment_center_distance",
  "highways_accessibility", "tax", "teacher_pupil", "African_American",
  "lower_status", "owner_occupied_value"
)
# price_median: 2 when owner_occupied_value is above its median, otherwise 1.
house$price_median <- as.numeric(
  house$owner_occupied_value > median(house$owner_occupied_value)
) + 1
# Variable transformations; the results are stored in house_trans.
house_trans <- house
house_trans$crime <- log(house_trans$crime)
house_trans$large_lots <- house_trans$large_lots / 10
house_trans$nonretail_business <- log(house_trans$nonretail_business)
# Column 4 (river) is left unchanged.
house_trans$nitric_oxides <- log(house_trans$nitric_oxides)
house_trans$room <- log(house_trans$room)
house_trans$owner_occupied_previous <-
  house_trans$owner_occupied_previous^2.5 / 1000
house_trans$employment_center_distance <-
  log(house_trans$employment_center_distance)
house_trans$highways_accessibility <- log(house_trans$highways_accessibility)
house_trans$tax <- log(house_trans$tax)
# exp(0.4 * x) / 1000: the exponential is taken first, then the division.
house_trans$teacher_pupil <- exp(0.4 * house_trans$teacher_pupil) / 1000
house_trans$African_American <- house_trans$African_American / 100
house_trans$lower_status <- sqrt(house_trans$lower_status)
house_trans$owner_occupied_value <- log(house_trans$owner_occupied_value)
# Boxplots of the standardised columns: first the raw data, then the
# transformed data.
boxplot(scale(house))
boxplot(scale(house_trans))
# Pairwise scatter plots, each preceded by the author's reading of it.
# In the first plot blue marks rows whose owner_occupied_value is above the
# median (price_median == 2) and red marks the remaining rows.

# Where the share of large-lot residential land is high, the crime rate is
# almost 0.
pairs(house[, c(1, 2)], col = c("red", "blue")[house$price_median], pch = 1)
# Expensive houses are located in areas where large_lots is high.
pairs(house[, c(14, 2)], pch = 1)
# The higher the share of non-retail business land, the lower the house value;
# possibly because non-retail business causes noise and other pollution.
pairs(house[, c(14, 3)], pch = 1)
# The higher the nitric oxide concentration, the lower the house value. Nitric
# oxide mainly comes from vehicles and industrial emissions, so the low values
# may mainly be due to noise from traffic and industrial areas.
pairs(house[, c(14, 5)], pch = 1)
# The higher the average number of rooms, the higher the house value; the
# average number of rooms is probably related to the total floor area.
pairs(house[, c(14, 6)], pch = 1)
# In the Boston area, newer houses are farther from the employment centers.
pairs(house[, c(7, 8)], pch = 1)
# Where the teacher-per-pupil ratio is low, house values are low.
pairs(house[, c(14, 11)], pch = 1)