Useful Code in R

Introduction

記錄一些近期常用的 code

快捷鍵

ctrl+d: 刪除當前行上的所有內容
ctrl+k: 刪除游標後當前行上的所有內容
shift+alt+方向鍵下: 複製這行並貼上

字串

整理字串 str_extract

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
# Load stringr, which provides the str_*-prefixed string helpers
library(stringr)

# Digits that follow a colon: "1:123" -> "123"
str_extract(string = "1:123", pattern = "(?<=\\:)\\d+")

# Every "<digits>-<digits>" range: "1-7:10-12" -> "1-7", "10-12"
str_extract_all(string = "1-7:10-12", pattern = "\\d+-\\d+")

# Everything in front of the colon: "1:123" -> "1"
str_extract(string = "1:123", pattern = ".*(?=:)")

# "<digits>:<digits>" directly followed by a hyphen: "chr22:123-124" -> "22:123"
str_extract(string = "chr22:123-124", pattern = "(\\d+:\\d+)(?=\\-)")

# Keep the first two underscore-separated fields:
# "ILMN_1343291_1" -> "ILMN_1343291"
gsub(pattern = "^([^_]+_[^_]+).*", replacement = "\\1", x = "ILMN_1343291_1")

data.table 功能

元素重複

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
# Load data.table, which provides data.table() and the .N / .SD / .I symbols
library(data.table)

# Example table: 9 rows, 3 per value of x (y is recycled to 9 rows)
DT <- data.table(
  x = rep(c("b", "a", "c"), each = 3),
  v = c(1, 1, 1, 2, 2, 1, 1, 2, 2),
  y = c(1, 3, 6),
  a = 1:9,
  b = 9:1
)
X <- data.table(x = c("c", "b"), v = 8:7, foo = c(4, 2))

# Which occurrence inside each group the examples below pick (change as needed)
nth <- 2L

# Number of rows for each value of x
DT[, .N, by = x]

# The nth row of each x group
DT[, .SD[nth], by = x]

# Row count (column N) plus the per-group sum of every non-grouping column;
# for this DT, .SD holds v, y, a and b
DT[, c(.N, lapply(.SD, sum)), by = x]

# Row index within DT of the nth row of each x group
DT[, .I[nth], by = x]

多欄位分組、合併

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# Example tables: DT has 9 rows (3 per value of x); X holds the rows to join in
DT <- data.table(
  x = rep(c("b", "a", "c"), each = 3),
  v = c(1, 1, 1, 2, 2, 1, 1, 2, 2),
  y = c(1, 3, 6),
  a = as.character(1:9),
  b = 9:1
)
X <- data.table(
  x = c("c", "c", "c", "b"),
  v = 1:4,
  foo = c("6", "24", "9", "92")
)

# In data.table, .() is shorthand for list(). by = .(x, v) puts rows sharing
# the same x and v into one group; each group collapses to a single row whose
# `new` value is that group's y values joined with ", ".
aa <- DT[, .(new = paste(y, collapse = ", ")), by = .(x, v)]

# Join DT onto X by x and v; columns without a match are filled with NA
ab <- DT[X, on = .(x, v)]

# Update DT in place: join X onto DT by x and v and rewrite column a for the
# matched rows only; unmatched rows keep their original a
DT[X, on = .(x, v), a := paste0(a, ", ", i.foo)]

# Compare values within each group. Every x value occurs 3 times, so grp
# numbers consecutive blocks of 3 rows; within each grp, flag rows whose v
# equals the first v of that grp, then keep only the flagged rows.
DT[, grp := (seq_len(.N) - 1) %/% 3]
DT[, keep := v == v[1], by = grp]
DT_1 <- DT[keep == TRUE]

統計相關

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
# $F_{1,23,0.95}$: the 0.95 quantile of the F distribution with df1 = 1 and
# df2 = 23, i.e. the value that has probability 0.95 to its left
qf(0.95, df1 = 1, df2 = 23)

# pt() gives the area to the LEFT of t by default, so 1 - pt() is the area to
# the RIGHT of t = -0.78 for a t distribution with df = 14
1 - pt(-0.78, df = 14)

# The same right-tail area, requested directly
pt(-0.78, df = 14, lower.tail = FALSE)

# t value whose left-tail area equals 0.78 (df = 14)
qt(0.78, df = 14)

畫圖

1
2
3
4
5
6
7
# Add a legend to a plot.
# legend() draws on the current plot, so one must exist first; the two lines
# below are placeholder data -- replace them with the real series.
plot(1:10, type = "l", lwd = 1, col = "blue")
lines(10:1, lwd = 1, col = "red")

# col and legend are matched by position: blue -> "name1", red -> "name2"
legend(
  "topright",
  lwd = 1,
  col = c("blue", "red"),
  legend = c("name1", "name2")
)

其他

R markdown file 加上浮動目錄

1
2
3
4
5
6
7
8
9
---
# YAML header of the .Rmd file; it must sit at the very top of the document,
# enclosed by the two `---` lines.
title: "Midterm"
output:
  html_document:
    toc: true          # add a table of contents
    toc_depth: 3       # include headings down to level 3
    toc_float: true    # keep the table of contents floating beside the text
  pdf_document:
    toc: true
    toc_depth: 3
---

確認變數

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
# Example table; mixing numbers, "" and NA in c() makes column v character
DT <- data.table(
  x = rep(c("b", "a", "c"), each = 3),
  v = c(1, 1, "", 2, 2, NA, 1, 2, 2),
  y = c(1, 3, 6),
  a = as.character(1:9),
  b = 9:1
)

# ---- Checks on a single column ----
k <- DT$v
# Number of distinct values, NA excluded
uniqueN(k, na.rm = TRUE)
# Number of rows that are NA
sum(is.na(k))
# Number of rows that are the empty string; k == "" is NA for NA entries,
# so na.rm = TRUE is needed to leave them out of the count
sum(k == "", na.rm = TRUE)


# ---- Checks on every column ----
k <- DT
# Number of NA rows in each column
colSums(is.na(k))
# Number of empty-string rows in each column; without na.rm = TRUE any column
# that contains an NA (here v) would report NA instead of a count
colSums(k == "", na.rm = TRUE)
## Reference

R 語言字串處理 [筆記]
.N