library(randomForest)#RF模型
library(tidyverse) #数据处理
library(dplyr) #数据处理
library(skimr) #数据概况
library(DataExplorer)#数据探索
library(ggplot2) #绘图
library(GGally)#数据探索
library(caret) # 调参和计算模型评价参数
library(pROC) #绘制ROC曲线
library(data.table) #读取数据
library(ggpubr) #绘图
library(ggprism) #绘图
library(rms) #绘图
library(vip)#绘图
# 读取fa文件
nas <- readFASTA("D:/Botrbrau1/Botrbrau1/Coccomyxa_subellipsoidea_filter.fasta")
# 使用gsub移除序列中的*符号
las_clean <- lapply(las, function(seq) {
seq <- gsub("\\\\*", "", seq)
return(seq)
})
#氨基酸类型健全性检查并删除非标准序列:
csu_clean <- csu_clean [(sapply(csu_clean , protcheck))]
ext <- function (x, props = c("Hydrophobicity", "Hydrophilicity"),
lambda = 4, w = 0.05, customprops = NULL)
{
if (protcheck(x) == FALSE) {
stop("x has unrecognized amino acid type")
}
if (nchar(x) <= lambda) {
stop("Length of the protein sequence must be greater than \\"lambda\\"")
}
AAidx <- read.csv(system.file("sysdata/AAidx.csv", package = "protr"),
header = TRUE)
tmp <- data.frame(AccNo = c("Hydrophobicity", "Hydrophilicity",
"SideChainMass"), A = c(0.62, -0.5, 15), R = c(-2.53,
3, 101), N = c(-0.78, 0.2, 58), D = c(-0.9, 3, 59),
C = c(0.29, -1, 47), E = c(-0.74, 3, 73), Q = c(-0.85,
0.2, 72), G = c(0.48, 0, 1), H = c(-0.4, -0.5, 82),
I = c(1.38, -1.8, 57), L = c(1.06, -1.8, 57), K = c(-1.5,
3, 73), M = c(0.64, -1.3, 75), F = c(1.19, -2.5,
91), P = c(0.12, 0, 42), S = c(-0.18, 0.3, 31),
T = c(-0.05, -0.4, 45), W = c(0.81, -3.4, 130), Y = c(0.26,
-2.3, 107), V = c(1.08, -1.5, 43))
AAidx <- rbind(AAidx, tmp)
if (!is.null(customprops))
AAidx <- rbind(AAidx, customprops)
aaidx <- AAidx[, -1]
row.names(aaidx) <- AAidx[, 1]
n <- length(props)
H0 <- as.matrix(aaidx[props, ])
H <- matrix(ncol = 20, nrow = n)
for (i in 1:n) {
H[i, ] <- (H0[i, ] - mean(H0[i, ]))/(sqrt(sum((H0[i,
] - mean(H0[i, ]))^2)/20))
}
AADict <- c("A", "R", "N", "D", "C", "E", "Q", "G", "H",
"I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V")
dimnames(H) <- list(props, AADict)
Theta <- vector("list", lambda)
for (i in 1:lambda) Theta[[i]] <- vector("list", n)
xSplitted <- strsplit(x, split = "")[[1]]
N <- length(xSplitted)
for (i in 1:lambda) {
for (j in 1:n) {
for (k in 1:(N - i)) {
Theta[[i]][[j]][k] <- H[props[j], xSplitted[k]] *
H[props[j], xSplitted[k + i]]
}
}
}
tau <- sapply(unlist(Theta, recursive = FALSE), mean)
fc <- summary(factor(xSplitted, levels = AADict), maxsum = 21)
Pc1 <- fc/(1 + (w * sum(tau)))
names(Pc1) <- paste("Pc1.", names(Pc1), sep = "")
Pc2 <- (w * tau)/(1 + (w * sum(tau)))
names(Pc2) <- paste("Pc2", as.vector(outer(props, 1:lambda,
paste, sep = ".")), sep = ".")
Pc <- c(Pc1, Pc2)
Pc
}
x1 <- t(sapply(las_clean, ext))
x2 <- t(sapply(nas_clean, ext))
x <- rbind(x1, x2)
labels <- as.factor(c(rep(0, length(las_clean)), rep(1, length(nas_clean))))
set.seed(1001)
# Split training and test set
tr.idx <- c(
sample(1:nrow(x1), round(nrow(x1) * 0.75)),
sample(nrow(x1) + 1:nrow(x2), round(nrow(x2) * 0.75))
)
te.idx <- setdiff(1:nrow(x), tr.idx)
x.tr <- x[tr.idx, ]
x.te <- x[te.idx, ]
y.tr <- labels[tr.idx]
y.te <- labels[te.idx]
rf.fit <- randomForest(train_data,labels2, cv.fold = 5)
library("randomForest")
rf.fit <- randomForest(x.tr, y.tr, cv.fold = 5)
rf.fit
# Predict on test set
rf.pred <- predict(rf.fit, newdata = x.te, type = "prob")[, 1]
# Plot ROC curve
library("pROC")
roc_curve <- roc(test_data_group, rf.pred)
plot.roc(y.te, rf.pred, grid = TRUE, print.auc = TRUE)