下载数据库
cd ~/genomic/dbCAN2
wget -c <https://bcb.unl.edu/dbCAN2/download/CAZyDB.07262023.fa> && diamond makedb --in CAZyDB.07262023.fa -d CAZy
wget <https://bcb.unl.edu/dbCAN2/download/dbCAN-HMMdb-V12.txt> && mv dbCAN-HMMdb-V12.txt dbCAN.txt && hmmpress dbCAN.txt
wget <https://bcb.unl.edu/dbCAN2/download/Databases/V12/tcdb.fa> && diamond makedb --in tcdb.fa -d tcdb
wget <https://bcb.unl.edu/dbCAN2/download/Databases/V12/tf-1.hmm> && hmmpress tf-1.hmm
wget <https://bcb.unl.edu/dbCAN2/download/Databases/V12/tf-2.hmm> && hmmpress tf-2.hmm
wget <https://bcb.unl.edu/dbCAN2/download/Databases/V12/stp.hmm> && hmmpress stp.hmm
运行
run_dbcan.py input.faa protein --out_dir output --db_dir /pub/biodatabase/dbCAN2
数据处理
my_dir <- "D:/dbcan/out"
file_vector <- list.files(path = my_dir, full.names = TRUE)
xudd <- function(b) {
filename <- paste0(b)
as <- read.delim(filename)
as <- as[,1:2]
df_split <- data.frame(do.call('rbind', strsplit(as.character(as$CAZy.ID), '|', fixed = TRUE)))
tidy1 <- df_split[,1:2]
gt_count <- sum(grepl(".*GT.*", tidy1$X2))
pl_count <- sum(grepl(".*PL.*", tidy1$X2))
ce_count <- sum(grepl(".*CE.*", tidy1$X2))
gh_count <- sum(grepl(".*GH.*", tidy1$X2))
aa_count <- sum(grepl(".*AA.*", tidy1$X2))
cbm_count <- sum(grepl(".*CBM.*", tidy1$X2))
demo <- data.frame(
GT = gt_count,
PL = pl_count,
CE = ce_count,
GH = gh_count,
AA = aa_count,
CBM = cbm_count
)
write.table(demo,filename)
}
for (i in file_vector) { xudd(i)}
14 1
(((Pca,(Hpa,Hco)),(Hps,Hru)'B(1.36,1.65)'),(lh,((Sco,(Rbu,Led))'B(1.18,2.18)',((Psu,Gju),((Hsu,Sru),Pco)))));
# 获取文件夹下所有.csv文件的文件路径
file_paths <- list.files(my_dir, pattern = "\\\\.out$", full.names = TRUE)
# 读取并合并所有表格
combined_data <- bind_rows(lapply(file_paths, read_csv))
combined_data <- data.frame(do.call('rbind', strsplit(as.character(combined_data$`GT PL CE GH AA CBM`), ' ', fixed = TRUE)))
combined_data <- combined_data[,-1]
names(combined_data) <- c("GT","PL" ,"CE", "GH", "AA", "CBM")
rownames(combined_data) <- file_vector
write.csv(combined_data,"D:/diamondout/combined.csv")
# 分列
df <- separate(combined_data,b,into = c("AS","GT","PL" ,"CE", "GH", "AA", "CBM"),sep = " ")
data <- data.frame(
X = c(1, 2, 3, 4, 5),
Y = c(5, 4, 3, 2, 1),
Type = c("A", "A", "B", "B", "A")
)
ggplot(data, aes(x = X, y = Y, color = Type)) +
geom_point(size = 3) +
labs(title = "Scatter Plot with Two Types", x = "X-axis", y = "Y-axis") +
theme_minimal()
##细分
my_dir <- "D:/dbcan/out"
file_vector <- list.files(path = my_dir, full.names = TRUE)
xudd <- function(b) {
filename <- paste0(b)
as <- read.delim(filename)
as <- as[,1:2]
df_split <- data.frame(do.call('rbind', strsplit(as.character(as$CAZy.ID), '|', fixed = TRUE)))
tidy1<- df_split[,1:2]
tidy1[,2]<-gsub("_.*$","",tidy1[,2])
##GT
gt <- tidy1[grep("^GT", tidy1$X2), ]
gt_stat <- as.data.frame( table(gt$X2))
gt_stat$as <- paste(rep(b,nrow(gt_stat)))
gt_file <- paste0(b,"gt")
write.csv(gt_stat,gt_file)
##PL
pl <- tidy1[grep("^PL", tidy1$X2), ]
pl_stat <- as.data.frame( table(pl$X2))
pl_stat$as <- paste(rep(b,nrow(pl_stat)))
pl_file <- paste0(b,"pl")
write.csv(pl_stat,pl_file)
##CE
CE <- tidy1[grep("^CE", tidy1$X2), ]
CE_stat <- as.data.frame( table(CE$X2))
CE_stat$as <- paste(rep(b,nrow(CE_stat)))
CE_file <- paste0(b,"CE")
write.csv(CE_stat,CE_file)
##GH
GH <- tidy1[grep("^GH", tidy1$X2), ]
GH_stat <- as.data.frame( table(GH$X2))
GH_stat$as <- paste(rep(b,nrow(GH_stat)))
GH_file <- paste0(b,"GH")
write.csv(GH_stat,GH_file)
##AA
AA <- tidy1[grep("^AA", tidy1$X2), ]
AA_stat <- as.data.frame( table(AA$X2))
AA_stat$as <- paste(rep(b,nrow(AA_stat)))
AA_file <- paste0(b,"AA")
write.csv(AA_stat,AA_file)
##CBM
CBM <- tidy1[grep("^CBM", tidy1$X2), ]
CBM_stat <- as.data.frame( table(CBM$X2))
CBM_stat$as <- paste(rep(b,nrow(CBM_stat)))
CBM_file <- paste0(b,"CBM")
write.csv(CBM_stat,CBM_file)
}
for (i in file_vector) { xudd(i)}
##AA
file_paths <- list.files(my_dir, pattern = "\\\\AA$", full.names = TRUE)
combined_data_AA<- bind_rows(lapply(file_paths, read_csv))
combined_data_AA<-spread(combined_data,Var1,Freq)
combined_data_AA <- combined_data %>% group_by(as) %>%
summarise(across(starts_with("AA"), sum, na.rm = TRUE))
##CBM
file_paths <- list.files(my_dir, pattern = "\\\\CBM$", full.names = TRUE)
combined_data_CBM<- bind_rows(lapply(file_paths, read_csv))
combined_data_CBM<-spread(combined_data_CBM,Var1,Freq)
combined_data_CBM <- combined_data_CBM %>% group_by(as) %>%
summarise(across(starts_with("CBM"), sum, na.rm = TRUE))
##GH
file_paths <- list.files(my_dir, pattern = "\\\\GH$", full.names = TRUE)
combined_data_GH<- bind_rows(lapply(file_paths, read_csv))
combined_data_GH<-spread(combined_data_GH,Var1,Freq)
combined_data_GH <- combined_data_GH %>% group_by(as) %>%
summarise(across(starts_with("GH"), sum, na.rm = TRUE))
##CE
file_paths <- list.files(my_dir, pattern = "\\\\CE$", full.names = TRUE)
combined_data_CE<- bind_rows(lapply(file_paths, read_csv))
combined_data_CE<-spread(combined_data_CE,Var1,Freq)
combined_data_CE <- combined_data_CE %>% group_by(as) %>%
summarise(across(starts_with("CE"), sum, na.rm = TRUE))
##PL
file_paths <- list.files(my_dir, pattern = "\\\\pl$", full.names = TRUE)
combined_data_PL<- bind_rows(lapply(file_paths, read_csv))
combined_data_PL<-spread(combined_data_PL,Var1,Freq)
combined_data_PL <- combined_data_PL %>% group_by(as) %>%
summarise(across(starts_with("PL"), sum, na.rm = TRUE))
##GT
file_paths <- list.files(my_dir, pattern = "\\\\gt$", full.names = TRUE)
combined_data_GT<- bind_rows(lapply(file_paths, read_csv))
combined_data_GT<-spread(combined_data_GT,Var1,Freq)
combined_data_GT <- combined_data_GT %>% group_by(as) %>%
summarise(across(starts_with("GT"), sum, na.rm = TRUE))
write.csv(combined_data_AA,"D:/combined_data_AA.csv")
rownames(combined_data_GH) <- combined_data_PL$as
combined_data_PL <- combined_data_PL[,-1]
rownames(combined_data_PL) <- rownames(combined_data_GH)
file_paths <- list.files(my_dir, pattern = "\\\\AA$", full.names = TRUE)
combined_data <- bind_rows(lapply(file_paths, read_csv))
combined_data %>% group_by(Var1) %>%
summarise(sum_col3 = sum(Freq))
##热图
row_annotation <- data.frame(
Category = c(rep("symbiosis", 6), rep("parasitism", 5), rep("putrefaction", 3))
)
rownames(row_annotation) <- rownames(combined_data_AA[-12,])
p_gt <- pheatmap(combined_data_GT[-12,],
cluster_cols = F,
cluster_rows = F,
cellwidth = 15,
cellheight = 15,
display_numbers = TRUE,
number_format = "%.0f",
annotation_row = row_annotation, ##注释列
annotation_colors = list(Category = c("symbiosis" ="white" , "parasitism" = "#7baf57", "putrefaction" = "#448e12")),
legend = FALSE,
color = colorRampPalette(colors = c("white","#7baf57","#448e12"))(100))
##获取单独家族成员
xudd <- function(b) {
filename <- paste0(b)
as <- read.delim(filename)
as <- as %>% mutate(family = str_extract(`CAZy.ID`, "(?<=\\\\|)[A-Z]+[0-9]+"))
demo <- as.data.frame(table(as$family))
demo <- demo %>%column_to_rownames(var = 'Var1')
demo <- t(demo)
write.csv(demo,filename)
}
for (i in file_vector) { xudd(i)}