library(randomForest)#RF模型
library(tidyverse) #数据处理
library(dplyr) #数据处理
library(skimr) #数据概况
library(DataExplorer)#数据探索
library(ggplot2) #绘图
library(GGally)#数据探索
library(caret) # 调参和计算模型评价参数
library(pROC) #绘制ROC曲线
library(data.table) #读取数据
library(ggpubr) #绘图
library(ggprism) #绘图
library(rms) #绘图
library(vip)#绘图
# 读取fa文件
nas <- readFASTA("D:/Botrbrau1/Botrbrau1/Coccomyxa_subellipsoidea_filter.fasta")

# 使用gsub移除序列中的*符号
las_clean <- lapply(las, function(seq) {
  
  seq <- gsub("\\\\*", "", seq)
  return(seq)
})
#氨基酸类型健全性检查并删除非标准序列:
csu_clean  <- csu_clean [(sapply(csu_clean , protcheck))]

ext <- function (x, props = c("Hydrophobicity", "Hydrophilicity"), 
          lambda = 4, w = 0.05, customprops = NULL) 
{
  if (protcheck(x) == FALSE) {
    stop("x has unrecognized amino acid type")
  }
  if (nchar(x) <= lambda) {
    stop("Length of the protein sequence must be greater than \\"lambda\\"")
  }
  AAidx <- read.csv(system.file("sysdata/AAidx.csv", package = "protr"), 
                    header = TRUE)
  tmp <- data.frame(AccNo = c("Hydrophobicity", "Hydrophilicity", 
                              "SideChainMass"), A = c(0.62, -0.5, 15), R = c(-2.53, 
                                                                             3, 101), N = c(-0.78, 0.2, 58), D = c(-0.9, 3, 59), 
                    C = c(0.29, -1, 47), E = c(-0.74, 3, 73), Q = c(-0.85, 
                                                                    0.2, 72), G = c(0.48, 0, 1), H = c(-0.4, -0.5, 82), 
                    I = c(1.38, -1.8, 57), L = c(1.06, -1.8, 57), K = c(-1.5, 
                                                                        3, 73), M = c(0.64, -1.3, 75), F = c(1.19, -2.5, 
                                                                                                             91), P = c(0.12, 0, 42), S = c(-0.18, 0.3, 31), 
                    T = c(-0.05, -0.4, 45), W = c(0.81, -3.4, 130), Y = c(0.26, 
                                                                          -2.3, 107), V = c(1.08, -1.5, 43))
  AAidx <- rbind(AAidx, tmp)
  if (!is.null(customprops)) 
    AAidx <- rbind(AAidx, customprops)
  aaidx <- AAidx[, -1]
  row.names(aaidx) <- AAidx[, 1]
  n <- length(props)
  H0 <- as.matrix(aaidx[props, ])
  H <- matrix(ncol = 20, nrow = n)
  for (i in 1:n) {
    H[i, ] <- (H0[i, ] - mean(H0[i, ]))/(sqrt(sum((H0[i, 
    ] - mean(H0[i, ]))^2)/20))
  }
  AADict <- c("A", "R", "N", "D", "C", "E", "Q", "G", "H", 
              "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V")
  dimnames(H) <- list(props, AADict)
  Theta <- vector("list", lambda)
  for (i in 1:lambda) Theta[[i]] <- vector("list", n)
  xSplitted <- strsplit(x, split = "")[[1]]
  N <- length(xSplitted)
  for (i in 1:lambda) {
    for (j in 1:n) {
      for (k in 1:(N - i)) {
        Theta[[i]][[j]][k] <- H[props[j], xSplitted[k]] * 
          H[props[j], xSplitted[k + i]]
      }
    }
  }
  tau <- sapply(unlist(Theta, recursive = FALSE), mean)
  fc <- summary(factor(xSplitted, levels = AADict), maxsum = 21)
  Pc1 <- fc/(1 + (w * sum(tau)))
  names(Pc1) <- paste("Pc1.", names(Pc1), sep = "")
  Pc2 <- (w * tau)/(1 + (w * sum(tau)))
  names(Pc2) <- paste("Pc2", as.vector(outer(props, 1:lambda, 
                                             paste, sep = ".")), sep = ".")
  Pc <- c(Pc1, Pc2)
  Pc
}

x1 <- t(sapply(las_clean, ext))
x2 <- t(sapply(nas_clean, ext))
x <- rbind(x1, x2)
labels <- as.factor(c(rep(0, length(las_clean)), rep(1, length(nas_clean))))

set.seed(1001)

# Split training and test set
tr.idx <- c(
  sample(1:nrow(x1), round(nrow(x1) * 0.75)),
  sample(nrow(x1) + 1:nrow(x2), round(nrow(x2) * 0.75))
)
te.idx <- setdiff(1:nrow(x), tr.idx)

x.tr <- x[tr.idx, ]
x.te <- x[te.idx, ]
y.tr <- labels[tr.idx]
y.te <- labels[te.idx]

rf.fit <- randomForest(train_data,labels2, cv.fold = 5)
library("randomForest")
  rf.fit <- randomForest(x.tr, y.tr, cv.fold = 5)
rf.fit

# Predict on test set
rf.pred <- predict(rf.fit, newdata = x.te, type = "prob")[, 1]

# Plot ROC curve
library("pROC")
roc_curve <- roc(test_data_group, rf.pred)

plot.roc(y.te, rf.pred, grid = TRUE, print.auc = TRUE)