devtools::install_github("Tong-Chen/ImageGP")
# 安装好之后,之前教程的library(YSX)都改为library(ImageGP)
library("ImageGP")
#-----------------------------数据格式和读入数据--------------------------------------
# 每个基因表达值是内部比较,只要是样品之间标准化的数据即可,其它什么转换都关系不大
# 机器学习时,字符串还是默认为因子类型的好
expr_mat2 <- read.table("clipboard",header = T,check.names = F,fill=TRUE,row.names = 1, stringsAsFactors =T) ##导入复制的数据
# 处理异常的基因名字
rownames(expr_mat) <- make.names(rownames(expr_mat))
metadata <- read.table("clipboard",header = T,check.names = F,row.names = 1,fill=TRUE, stringsAsFactors =T) ##导入复制的数据
dim(expr_mat)
#-----------------------------样品筛选和排序--------------------------------------
expr_mat <- t(expr_mat)
expr_mat_sampleL <- rownames(expr_mat)
metadata_sampleL <- rownames(metadata)
common_sampleL <- intersect(expr_mat_sampleL, metadata_sampleL)
# 保证表达表样品与METAdata样品顺序和数目完全一致
expr_mat <- expr_mat[common_sampleL,,drop=F]
metadata <- metadata[common_sampleL,,drop=F]
#-----------------------------判断是分类还是回归--------------------------------------
# R4.0之后默认读入的不是factor,需要做一个转换
# devtools::install_github("Tong-Chen/ImageGP")
library(ImageGP)
# 此处的class根据需要修改,为metadata的列名
group = "class"
# 如果group对应的列为数字,转换为数值型 - 做回归
# 如果group对应的列为分组,转换为因子型 - 做分类
if(numCheck(metadata[[group]])){
if (!is.numeric(metadata[[group]])) {
metadata[[group]] <- mixedToFloat(metadata[[group]])
}
} else{
metadata[[group]] <- as.factor(metadata[[group]])
}
#-----------------------------数据清洗--------------------------------------
variance <- apply(as, 2, var)
as_clean <- as[, variance > 0.01]
as_clean <- as[!duplicated(as), ]
as_clean <- scale(as_clean)
#-----------------------------随机森林一般分析--------------------------------------
library(randomForest)
set.seed(304)
# 直接使用默认参数
rf <- randomForest(expr_mat, metadata[[group]])
#-----------------------------随机森林标准操作流程 (适用于其他机器学习模型)--------------------------------------
#拆分训练集和测试集
library(caret)
train_index <- createDataPartition(metadata[[group]], p=0.75, list=F)
train_data <- expr_mat[train_index,]
train_data_group <- metadata[[group]][train_index]
test_data <- expr_mat[-train_index,]
test_data_group <- metadata[[group]][-train_index]
dim(train_data)
dim(test_data)
#-----------------------------Boruta特征选择鉴定关键分类变量--------------------------------------
# install.packages("Boruta")
library(Boruta)
set.seed(1)
boruta <- Boruta(x=train_data, y=train_data_group, pValue=0.01, mcAdj=T,
maxRuns=300)
table(boruta$finalDecision)
#Tentative Confirmed Rejected
#771 0 32380
#-----------------------------定义一个函数提取每个变量对应的重要性值--------------------------------------
library(dplyr)
boruta.imp <- function(x){
imp <- reshape2::melt(x$ImpHistory, na.rm=T)[,-1]
colnames(imp) <- c("Variable","Importance")
imp <- imp[is.finite(imp$Importance),]
variableGrp <- data.frame(Variable=names(x$finalDecision),
finalDecision=x$finalDecision)
showGrp <- data.frame(Variable=c("shadowMax", "shadowMean", "shadowMin"),
finalDecision=c("shadowMax", "shadowMean", "shadowMin"))
variableGrp <- rbind(variableGrp, showGrp)
boruta.variable.imp <- merge(imp, variableGrp, all.x=T)
sortedVariable <- boruta.variable.imp %>% group_by(Variable) %>%
summarise(median=median(Importance)) %>% arrange(median)
sortedVariable <- as.vector(sortedVariable$Variable)
boruta.variable.imp$Variable <- factor(boruta.variable.imp$Variable, levels=sortedVariable)
invisible(boruta.variable.imp)
}
boruta.variable.imp <- boruta.imp(boruta)
head(boruta.variable.imp)
##只绘制Confirmed变量。
sp_boxplot(boruta.variable.imp, melted=T, xvariable = "Variable", yvariable = "Importance",
legend_variable = "finalDecision",
legend_variable_order = c("shadowMax", "shadowMean", "shadowMin", "Confirmed"),
xtics_angle = 90)
##提取重要的变量和可能重要的变量
boruta.finalVarsWithTentative <- data.frame(Item=getSelectedAttributes(boruta, withTentative = T),
Type="Boruta_with_tentative")
##看下这些变量的值的分布
caret::featurePlot(train_data[,boruta.finalVarsWithTentative$Item], train_data_group, plot="box")
#-----------------------------交叉验证选择参数并拟合模型--------------------------------------
#定义一个函数生成一些列用来测试的mtry (一系列不大于总变量数的数值)。
generateTestVariableSet <- function(num_toal_variable){
max_power <- ceiling(log10(num_toal_variable))
tmp_subset <- c(unlist(sapply(1:max_power, function(x) (1:10)^x, simplify = F)), ceiling(max_power/3))
#return(tmp_subset)
base::unique(sort(tmp_subset[tmp_subset<num_toal_variable]))
}
#选择关键特征变量相关的数据
# 提取训练集的特征变量子集
boruta_train_data <- train_data[, boruta.finalVarsWithTentative$Item]
boruta_mtry <- generateTestVariableSet(ncol(boruta_train_data))
#使用 Caret 进行调参和建模
# Create model with default parameters
trControl <- trainControl(method="repeatedcv", number=10, repeats=5,seeds = set.seed(123))
set.seed(123)
# `根据经验或感觉设置一些待查询的参数和参数值`
tuneGrid <- expand.grid(mtry=boruta_mtry)
borutaConfirmed_rf_default <- train(x=boruta_train_data, y=train_data_group, method="rf",
tuneGrid = tuneGrid,
metric="Kappa" ,
trControl=trControl)
borutaConfirmed_rf_default
#可视化不同参数的准确性分布
plot(borutaConfirmed_rf_default)
##可视化Top20重要的变量
dotPlot(varImp(borutaConfirmed_rf_default))
library(ggplot2)
ggplot(Top20_sorted, aes(x = importance, y = X)) +
geom_point(color = "blue", size = 3) + # 设置点的颜色和大小
labs(title = "Rank of importance of genes", x = "Improtance", y = "Genes") +
theme_minimal() + # 使用白色背景
theme(panel.border = element_rect(color = "black", fill = NA), # 添加外围的框
panel.background = element_blank(),# 移除默认灰色背景
axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 10,face = "bold"))
#----------------------绘制top20表达值差异------------------------------------
# 检查train_data中的列名是否存在于rownames(top20)
common_columns <- intersect(rownames(top20), colnames(train_data))
# 设置更小的边距
par(mfrow = c(5, 4), mar = c(2, 2, 2, 1)) # c(bottom, left, top, right)
# 循环绘制每列的数据
for (i in 1:length(common_columns)) {
column_name <- common_columns[i]
data <- list(train_data[1:18, column_name], train_data[19:30, column_name])
boxplot(data,
main = column_name,
xlab = "Groups",
ylab = "Values",
col = c("#f9766d", "#00bfc4"), # A 组红色,B 组蓝色
names = c("Normal", "Drought")) # 自定义组名
}
# 恢复默认的单图布局
par(mfrow = c(1, 1))
##小提琴图
# 加载必要的包
library(ggplot2)
library(reshape2)
# 重新整理数据框以适应 ggplot2
melted_data <- melt(train_data[, rownames(top20)])
# 添加组信息
melted_data$Group <- rep(c("A", "B"), each = 18, times = ncol(train_data[, rownames(top20)]) / 2)
# 绘制小提琴图
ggplot(melted_data, aes(x = Group, y = value, fill = Group)) +
geom_violin(trim = FALSE) +
facet_wrap(~ Var2, scales = "free_y", ncol = 5) + # 使用Var2作为faceting变量
scale_fill_manual(values = c("red", "blue")) + # 设置颜色
scale_x_discrete(labels = c("A" = "Control", "B" = "Treatment")) + # 修改组名
labs(title = "Violin Plots for Selected Features", x = "Groups", y = "Values") +
theme_minimal() + # 使用简洁主题
theme(
strip.text = element_text(size = 8), # 调整facet标签的字体大小
axis.text.x = element_text(size = 14) # 调整X轴字体大小
)
#-----------------------------提取最终选择的模型,并绘制 ROC 曲线评估模型--------------------------------------
borutaConfirmed_rf_default_finalmodel <- borutaConfirmed_rf_default$finalModel
#-----------------------------单基因ROC曲线-----------------------
hub <- expr_mat2["Pp3c20_12760",]
hub <- as.data.frame(t(hub))
hub$group <- as.array(colnames(expr_mat2))
View(hub)
roc <- roc(response = hub$group,
predictor = hub[, 1],
levels = c('normal', 'Drought'))
response <- hub$group
gene_names <- colnames(train_data)[-1] # 获取所有基因名
# 创建一个列表来存储 AUC 值
auc_results <- list()
# 循环计算每个基因的 AUC 值
for (gene in gene_names) {
predictor <- train_data[[gene]] # 获取当前基因的表达值
roc_obj <- roc(response, predictor, levels = c('control', 'disease'))
auc_results[[gene]] <- auc(roc_obj) # 计算并存储 AUC 值
}
# 将 AUC 结果转换为数据框
auc_df <- data.frame(Gene = names(auc_results), AUC = unlist(auc_results))
print(auc_df)
# 加载所需包
library(rsample)
# 假设 data 是你的RNA-seq表达矩阵,target 是目标变量
set.seed(123)
split <- initial_split(data, prop = 0.85)
train_data <- training(split)
test_data <- testing(split)
# 加载所需包
library(caret)
library(ranger)
# 设置交叉验证
train_control <- trainControl(method = "cv", number = 10)
# 定义参数网格
tune_grid <- expand.grid(
mtry = c(2, 3, 4), # 你可以根据具体情况调整这个范围
splitrule = "gini",
min.node.size = c(1, 5, 10)
)
# RF分类器1:以处理条件为目标变量
rf_model1 <- train(
treatment_condition ~ .,
data = train_data,
method = "ranger",
trControl = train_control,
tuneGrid = tune_grid
)
# RF分类器2:以基因型为目标变量
rf_model2 <- train(
genotype ~ .,
data = train_data,
method = "ranger",
trControl = train_control,
tuneGrid = tune_grid
)