Detroit Production Line

introduction

Data from kaggle. This data was taken from an actual production line near Detroit, Michigan. The goal is to predict certain properties of the line’s output from the various input data. The line is a high-speed, continuous manufacturing process with parallel and series stages. Raw data dim is $14088 \times 116$. Use 41 variable to predict one of 15 measurement.

Variable:\ Including

time and the production line humidity, temperature
five machine data like amperage, feeder parameter, RPM(Revolutions Per Minute), pressure, etc.
combiner temperature

What’s more, setpoints is the goal. We want to estimate y1 as close as setpoints

data feature:\

Each setpoints only 2 kind of value, zero or not zero.

2. While 15 setpoint is 0, 15 measurement is 0.

3. 27 pairs of x has absolute correlation greater than 0.7. Further more, cor(x15,x16) and cor(x17,x18) both equal to 1.

4. lots outliers.

Question:
We reduce the problem to one measurement and setpoint. Predicting y1, one of the 15 measurements and it close setpont is better.

package

1
2
3
4
5
6
library(data.table)
library(dplyr)
library(ggplot2)     # plot
library(ggcorrplot)  # correlation plot
library(patchwork)   # combine ggplot2 into a window 
library(caret)

function

cor_variable Among lots variable, how to choose those which has high correlation? We build a function cor_variable() for working.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
cor_variable <-  function(data, threshold){
  library(dplyr)
  
  # --- (1) limit parameter class ---
  if (!is.data.frame(data)) stop("`data` must be a data.frame.")
  if (!is.numeric(threshold) || length(threshold) != 1) stop("`threshold` must be a single numeric value.")
  
  
  # --- (2) compute correlation ---
  
  # a is the location of |correlation| > threshold 
  a= cor(data) %>% 
    abs() %>%
    { 
    .[lower.tri(., diag = TRUE)] <- 0  # lower triangle, diag set 0
    which(. > threshold, arr.ind = TRUE)
  }
  
  
  # set |correlation| > threshold variable name, cor
  a <- as.data.frame(a)
  a$v3 <- colnames(data)[a[,1] ] 
  a$v4 <- colnames(data)[a[,2] ] 
  colnames(a) <- c("order1","order2","variable1","variable2")
  a$order1 <- as.numeric(a$order1)
  a$order2 <- as.numeric(a$order2)
  rownames(a) <- NULL
  
  # compute cor
  a$correlation <- mapply(
    function(i, j) {
      cor(data[[i]], data[[j]])
      },
    a$order1,
    a$order2)
  
  # order by cor
  a <- a %>% 
    arrange(desc(correlation))  
    
  
  return(a[3:5])
}

outlier

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
outlier <- function(col){
  if (is.numeric(col)) {
    q1 <- quantile(col, 0.25, na.rm = TRUE)
    q3 <- quantile(col, 0.75, na.rm = TRUE)
    iqr <- q3 - q1
    lower <- q1 - 1.5 * iqr
    upper <- q3 + 1.5 * iqr
    which(col < lower | col > upper)
  } else {
    NULL
  }
} 

clean data

15 setpoint is 0 at same time
while 15 setpoint is 0, 15 measurement is 0

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
df <- fread("continuous_factory_process.csv")

setpoint <- df[,44, with = FALSE] %>% 
  as.data.frame()

# make a setpoint data
names(setpoint) <- c(paste0("setpoint_", 1))

# choose time, x1~x41, y1 only
df <- df[,1:43]

# variable name transform matrix
new_name <- c("time", paste0("x", 1:41), paste0("y", 1))
name_ref <- matrix(c(colnames(df),new_name),ncol=2,nrow=43)
names(df) <- new_name
head(df)

name_ref

1
name_ref

EDA

lots outlier

1
2
3
# x, y have lots outliers
boxplot(df$y1, main="y1")
boxplot(df$x24, main="x24")

density plot

EDA for deleting outliers y1

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17

i = 43

# histogram
p <- ggplot(df, aes(x = df[[i]])) +
  geom_bar(color = "black", fill = "lightblue", width=0.01) +
  xlim( quantile(df[[i]], 0.25) - IQR(df[[i]]) * 1.5, 
        quantile(df[[i]], 0.75) + IQR(df[[i]]) * 1.5) +
  labs(
    title = paste("hitogram -", names(df)[i]),
    x = names(df)[i],
    y = "Density"
  ) +
  theme_minimal()
  

print(p)

EDA for deleting outliers x1~x41

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15

for (i in 2:10) {
  p <- ggplot(df, aes_string(x = names(df)[i])) +
    xlim( quantile(df[[i]], 0.25) - IQR(df[[i]]) * 1.5, 
          quantile(df[[i]], 0.75) + IQR(df[[i]]) * 1.5) +
    geom_density(alpha = 0.5, adjust=0.3) +
    labs(
      title = paste("Density Plot -", names(df)[i]),
      x = names(df)[i],
      y = "Density"
    ) +
    theme_minimal() 
  
  print(p)
}

correlation

Which x has the high correlation to y1?

Among all Variable, x37 has the higher absolute correlation 0.11

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
# y and x correlation > 0.7 

i = 43

# use fread, so df is data.table rather than data.frame
cor_variable( as.data.frame(df[, c(i, 2:42), with = FALSE]), 0.1) %>%    
  filter(variable1 == names(df)[i] ) %>%    # select x which correlated to y
  select(variable2, correlation) %>%        # print those x and correlation
  print()


# y1, x2~x41 correlation
ggcorrplot(cor(df[,-1]), type = c("lower"),  tl.cex = 5)  

PCA

8-th PC cumulative var achieve 0.7

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
# 做 PCA（標準化)
# sum prcomp$rotation[,1] * x[1:41] = prcomp$x[,1] ;  sum prcomp$x[,1] * PC[1:41] = sample 1
pca <- prcomp(df[,2:42], scale. = TRUE)

((pca$sdev)^2/sum((pca$sdev)^2)) %>% 
  cumsum() %>% 
  plot()


ggplot(as.data.frame(pca$rotation),
       aes(y = as.data.frame(pca$rotation) %>% 
             rownames(),
           x = PC1)) +
  geom_col(fill = "lightblue") +
  labs(title = "PC1",
       y = "",
       x = "") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 6))

split data

1
2
3
4
5
6
set.seed(1234)
trainIndex = createDataPartition(df[, y1], p = 0.7, list = FALSE)
train = df[trainIndex, ]
test = df[-trainIndex, ]

pca <- prcomp(train[,2:42], scale. = TRUE)

Modeling

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
# Using 8 first PC to SLR
y_name <- "y1"
train_pca <- as.data.frame(pca$x[, 1:8])  
train_pca$y <- train[[y_name]]

lm(y ~ ., data = train_pca) %>% 
    summary() %>% 
    print()

cat(rep("\n", 4))	

# Using 6 correlated variable to linear model
simple_lm <- lm(y1 ~ x37+x25+x30+x28+x1, data=train)
summary(simple_lm)

transfer to categorical

median

divide measurement by median

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
# greater than median=1

df$y1_median <- ifelse(df$y1 > median(df$y1), 1,0)


for (i in 2:42) {
  print(
    df %>% 
      ggplot(aes(x = df[[i]], color = factor(y1_median, labels = c("over median", "lower median")))) +
      geom_density(alpha = 0.5,  adjust = 0.3) +  # 使用透明度讓不同的密度曲線區分開
      labs(title = "Density Plot", 
           x = names(df)[i], 
           y = "Density", 
           fill = "type") +  # 修改圖例標題
      theme_minimal()+
  guides(color = guide_legend(title = NULL))  # 拿掉圖例標題
  )
}

Reference

Type of censoring
KMsurv