introduction
Data from kaggle. This data was taken from an actual production line near Detroit, Michigan. The goal is to predict certain properties of the line’s output from the various input data. The line is a high-speed, continuous manufacturing process with parallel and series stages. Raw data dim is $14088 \times 116$. Use 41 variable to predict one of 15 measurement.
Variable:\
Including
- time and the production line humidity, temperature
- five machine data like amperage, feeder parameter, RPM(Revolutions Per Minute), pressure, etc.
- combiner temperature
What’s more, setpoints is the goal. We want to estimate y1 as close as setpoints
data feature:\
- Each setpoints only 2 kind of value, zero or not zero.
2. While 15 setpoint is 0, 15 measurement is 0.
3. 27 pairs of x has absolute correlation greater than 0.7. Further more, cor(x15,x16) and cor(x17,x18) both equal to 1.
4. lots outliers.
Question:
We reduce the problem to one measurement and setpoint. Predicting y1, one of the 15 measurements and it close setpont is better.
package
1
2
3
4
5
6
| library(data.table)
library(dplyr)
library(ggplot2) # plot
library(ggcorrplot) # correlation plot
library(patchwork) # combine ggplot2 into a window
library(caret)
|
function
- cor_variable
Among lots variable, how to choose those which has high correlation? We build a function cor_variable() for working.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
| cor_variable <- function(data, threshold){
library(dplyr)
# --- (1) limit parameter class ---
if (!is.data.frame(data)) stop("`data` must be a data.frame.")
if (!is.numeric(threshold) || length(threshold) != 1) stop("`threshold` must be a single numeric value.")
# --- (2) compute correlation ---
# a is the location of |correlation| > threshold
a= cor(data) %>%
abs() %>%
{
.[lower.tri(., diag = TRUE)] <- 0 # lower triangle, diag set 0
which(. > threshold, arr.ind = TRUE)
}
# set |correlation| > threshold variable name, cor
a <- as.data.frame(a)
a$v3 <- colnames(data)[a[,1] ]
a$v4 <- colnames(data)[a[,2] ]
colnames(a) <- c("order1","order2","variable1","variable2")
a$order1 <- as.numeric(a$order1)
a$order2 <- as.numeric(a$order2)
rownames(a) <- NULL
# compute cor
a$correlation <- mapply(
function(i, j) {
cor(data[[i]], data[[j]])
},
a$order1,
a$order2)
# order by cor
a <- a %>%
arrange(desc(correlation))
return(a[3:5])
}
|
1
2
3
4
5
6
7
8
9
10
11
12
| outlier <- function(col){
if (is.numeric(col)) {
q1 <- quantile(col, 0.25, na.rm = TRUE)
q3 <- quantile(col, 0.75, na.rm = TRUE)
iqr <- q3 - q1
lower <- q1 - 1.5 * iqr
upper <- q3 + 1.5 * iqr
which(col < lower | col > upper)
} else {
NULL
}
}
|
clean data
- 15 setpoint is 0 at same time
- while 15 setpoint is 0, 15 measurement is 0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
| df <- fread("continuous_factory_process.csv")
setpoint <- df[,44, with = FALSE] %>%
as.data.frame()
# make a setpoint data
names(setpoint) <- c(paste0("setpoint_", 1))
# choose time, x1~x41, y1 only
df <- df[,1:43]
# variable name transform matrix
new_name <- c("time", paste0("x", 1:41), paste0("y", 1))
name_ref <- matrix(c(colnames(df),new_name),ncol=2,nrow=43)
names(df) <- new_name
head(df)
|
name_ref
EDA
lots outlier
1
2
3
| # x, y have lots outliers
boxplot(df$y1, main="y1")
boxplot(df$x24, main="x24")
|
density plot
EDA for deleting outliers y1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
i = 43
# histogram
p <- ggplot(df, aes(x = df[[i]])) +
geom_bar(color = "black", fill = "lightblue", width=0.01) +
xlim( quantile(df[[i]], 0.25) - IQR(df[[i]]) * 1.5,
quantile(df[[i]], 0.75) + IQR(df[[i]]) * 1.5) +
labs(
title = paste("hitogram -", names(df)[i]),
x = names(df)[i],
y = "Density"
) +
theme_minimal()
print(p)
|
EDA for deleting outliers x1~x41
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
for (i in 2:10) {
p <- ggplot(df, aes_string(x = names(df)[i])) +
xlim( quantile(df[[i]], 0.25) - IQR(df[[i]]) * 1.5,
quantile(df[[i]], 0.75) + IQR(df[[i]]) * 1.5) +
geom_density(alpha = 0.5, adjust=0.3) +
labs(
title = paste("Density Plot -", names(df)[i]),
x = names(df)[i],
y = "Density"
) +
theme_minimal()
print(p)
}
|
correlation
Which x has the high correlation to y1?
Among all Variable, x37 has the higher absolute correlation 0.11
1
2
3
4
5
6
7
8
9
10
11
12
13
| # y and x correlation > 0.7
i = 43
# use fread, so df is data.table rather than data.frame
cor_variable( as.data.frame(df[, c(i, 2:42), with = FALSE]), 0.1) %>%
filter(variable1 == names(df)[i] ) %>% # select x which correlated to y
select(variable2, correlation) %>% # print those x and correlation
print()
# y1, x2~x41 correlation
ggcorrplot(cor(df[,-1]), type = c("lower"), tl.cex = 5)
|
PCA
8-th PC cumulative var achieve 0.7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
| # 做 PCA(標準化)
# sum prcomp$rotation[,1] * x[1:41] = prcomp$x[,1] ; sum prcomp$x[,1] * PC[1:41] = sample 1
pca <- prcomp(df[,2:42], scale. = TRUE)
((pca$sdev)^2/sum((pca$sdev)^2)) %>%
cumsum() %>%
plot()
ggplot(as.data.frame(pca$rotation),
aes(y = as.data.frame(pca$rotation) %>%
rownames(),
x = PC1)) +
geom_col(fill = "lightblue") +
labs(title = "PC1",
y = "",
x = "") +
theme_minimal() +
theme(axis.text.y = element_text(size = 6))
|
split data
1
2
3
4
5
6
| set.seed(1234)
trainIndex = createDataPartition(df[, y1], p = 0.7, list = FALSE)
train = df[trainIndex, ]
test = df[-trainIndex, ]
pca <- prcomp(train[,2:42], scale. = TRUE)
|
Modeling
1
2
3
4
5
6
7
8
9
10
11
12
13
14
| # Using 8 first PC to SLR
y_name <- "y1"
train_pca <- as.data.frame(pca$x[, 1:8])
train_pca$y <- train[[y_name]]
lm(y ~ ., data = train_pca) %>%
summary() %>%
print()
cat(rep("\n", 4))
# Using 6 correlated variable to linear model
simple_lm <- lm(y1 ~ x37+x25+x30+x28+x1, data=train)
summary(simple_lm)
|
transfer to categorical
divide measurement by median
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
| # greater than median=1
df$y1_median <- ifelse(df$y1 > median(df$y1), 1,0)
for (i in 2:42) {
print(
df %>%
ggplot(aes(x = df[[i]], color = factor(y1_median, labels = c("over median", "lower median")))) +
geom_density(alpha = 0.5, adjust = 0.3) + # 使用透明度讓不同的密度曲線區分開
labs(title = "Density Plot",
x = names(df)[i],
y = "Density",
fill = "type") + # 修改圖例標題
theme_minimal()+
guides(color = guide_legend(title = NULL)) # 拿掉圖例標題
)
}
|
Reference
Type of censoring
KMsurv