下载数据库

cd ~/genomic/dbCAN2
wget -c <https://bcb.unl.edu/dbCAN2/download/CAZyDB.07262023.fa>  && diamond makedb --in CAZyDB.07262023.fa -d CAZy 
wget <https://bcb.unl.edu/dbCAN2/download/dbCAN-HMMdb-V12.txt> && mv dbCAN-HMMdb-V12.txt dbCAN.txt  && hmmpress dbCAN.txt 
wget <https://bcb.unl.edu/dbCAN2/download/Databases/V12/tcdb.fa>  && diamond makedb --in tcdb.fa -d tcdb 
wget <https://bcb.unl.edu/dbCAN2/download/Databases/V12/tf-1.hmm> && hmmpress tf-1.hmm 
wget <https://bcb.unl.edu/dbCAN2/download/Databases/V12/tf-2.hmm> && hmmpress tf-2.hmm 
wget <https://bcb.unl.edu/dbCAN2/download/Databases/V12/stp.hmm> && hmmpress stp.hmm

运行

run_dbcan.py input.faa protein --out_dir  output --db_dir /pub/biodatabase/dbCAN2

数据处理

my_dir <- "D:/dbcan/out"
file_vector <- list.files(path = my_dir, full.names = TRUE)
xudd <- function(b) {
filename <- paste0(b)
as <- read.delim(filename)
as <- as[,1:2]
df_split <- data.frame(do.call('rbind', strsplit(as.character(as$CAZy.ID), '|', fixed = TRUE)))
tidy1 <- df_split[,1:2]
gt_count <- sum(grepl(".*GT.*", tidy1$X2))
pl_count <- sum(grepl(".*PL.*", tidy1$X2))
ce_count <- sum(grepl(".*CE.*", tidy1$X2))
gh_count <- sum(grepl(".*GH.*", tidy1$X2))
aa_count <- sum(grepl(".*AA.*", tidy1$X2))
cbm_count <- sum(grepl(".*CBM.*", tidy1$X2))
demo <- data.frame(
  GT = gt_count,
  PL = pl_count,
  CE = ce_count,
  GH = gh_count,
  AA = aa_count,
  CBM = cbm_count
)
write.table(demo,filename)
}
for (i in file_vector) { xudd(i)}
14	1
(((Pca,(Hpa,Hco)),(Hps,Hru)'B(1.36,1.65)'),(lh,((Sco,(Rbu,Led))'B(1.18,2.18)',((Psu,Gju),((Hsu,Sru),Pco)))));

# 获取文件夹下所有.csv文件的文件路径
file_paths <- list.files(my_dir, pattern = "\\\\.out$", full.names = TRUE)

# 读取并合并所有表格
combined_data <- bind_rows(lapply(file_paths, read_csv))
combined_data <- data.frame(do.call('rbind', strsplit(as.character(combined_data$`GT PL CE GH AA CBM`), ' ', fixed = TRUE)))
combined_data <- combined_data[,-1]
names(combined_data) <- c("GT","PL" ,"CE", "GH", "AA", "CBM")
rownames(combined_data) <- file_vector
write.csv(combined_data,"D:/diamondout/combined.csv")

# 分列
df <- separate(combined_data,b,into = c("AS","GT","PL" ,"CE", "GH", "AA", "CBM"),sep = " ")

data <- data.frame(
  X = c(1, 2, 3, 4, 5),
  Y = c(5, 4, 3, 2, 1),
  Type = c("A", "A", "B", "B", "A")
)
ggplot(data, aes(x = X, y = Y, color = Type)) +
  geom_point(size = 3) +
  labs(title = "Scatter Plot with Two Types", x = "X-axis", y = "Y-axis") +
  theme_minimal()

##细分
my_dir <- "D:/dbcan/out"
file_vector <- list.files(path = my_dir, full.names = TRUE)
xudd <- function(b) {
  filename <- paste0(b)
  as <- read.delim(filename)
  as <- as[,1:2]
  df_split <- data.frame(do.call('rbind', strsplit(as.character(as$CAZy.ID), '|', fixed = TRUE)))
  tidy1<- df_split[,1:2]
  tidy1[,2]<-gsub("_.*$","",tidy1[,2])
##GT
gt <- tidy1[grep("^GT", tidy1$X2), ]
gt_stat <- as.data.frame( table(gt$X2))
gt_stat$as <- paste(rep(b,nrow(gt_stat)))
gt_file <- paste0(b,"gt")
write.csv(gt_stat,gt_file)
##PL
pl <- tidy1[grep("^PL", tidy1$X2), ]
pl_stat <- as.data.frame( table(pl$X2))
pl_stat$as <- paste(rep(b,nrow(pl_stat)))
pl_file <- paste0(b,"pl")
write.csv(pl_stat,pl_file)
##CE
CE <- tidy1[grep("^CE", tidy1$X2), ]
CE_stat <- as.data.frame( table(CE$X2))
CE_stat$as <- paste(rep(b,nrow(CE_stat)))
CE_file <- paste0(b,"CE")
write.csv(CE_stat,CE_file)
##GH
GH <- tidy1[grep("^GH", tidy1$X2), ]
GH_stat <- as.data.frame( table(GH$X2))
GH_stat$as <- paste(rep(b,nrow(GH_stat)))
GH_file <- paste0(b,"GH")
write.csv(GH_stat,GH_file)
##AA
AA <- tidy1[grep("^AA", tidy1$X2), ]
AA_stat <- as.data.frame( table(AA$X2))
AA_stat$as <- paste(rep(b,nrow(AA_stat)))
AA_file <- paste0(b,"AA")
write.csv(AA_stat,AA_file)
##CBM
CBM <- tidy1[grep("^CBM", tidy1$X2), ]
CBM_stat <- as.data.frame( table(CBM$X2))
CBM_stat$as <- paste(rep(b,nrow(CBM_stat)))
CBM_file <- paste0(b,"CBM")
write.csv(CBM_stat,CBM_file)
}
for (i in file_vector) { xudd(i)}
##AA
file_paths <- list.files(my_dir, pattern = "\\\\AA$", full.names = TRUE)
combined_data_AA<- bind_rows(lapply(file_paths, read_csv))
combined_data_AA<-spread(combined_data,Var1,Freq)
combined_data_AA <- combined_data %>% group_by(as) %>%
       summarise(across(starts_with("AA"), sum, na.rm = TRUE))
##CBM
file_paths <- list.files(my_dir, pattern = "\\\\CBM$", full.names = TRUE)
combined_data_CBM<- bind_rows(lapply(file_paths, read_csv))
combined_data_CBM<-spread(combined_data_CBM,Var1,Freq)
combined_data_CBM <- combined_data_CBM %>% group_by(as) %>%
  summarise(across(starts_with("CBM"), sum, na.rm = TRUE))
##GH
file_paths <- list.files(my_dir, pattern = "\\\\GH$", full.names = TRUE)
combined_data_GH<- bind_rows(lapply(file_paths, read_csv))
combined_data_GH<-spread(combined_data_GH,Var1,Freq)
combined_data_GH <- combined_data_GH %>% group_by(as) %>%
  summarise(across(starts_with("GH"), sum, na.rm = TRUE))
##CE
file_paths <- list.files(my_dir, pattern = "\\\\CE$", full.names = TRUE)
combined_data_CE<- bind_rows(lapply(file_paths, read_csv))
combined_data_CE<-spread(combined_data_CE,Var1,Freq)
combined_data_CE <- combined_data_CE %>% group_by(as) %>%
  summarise(across(starts_with("CE"), sum, na.rm = TRUE))
##PL
file_paths <- list.files(my_dir, pattern = "\\\\pl$", full.names = TRUE)
combined_data_PL<- bind_rows(lapply(file_paths, read_csv))
combined_data_PL<-spread(combined_data_PL,Var1,Freq)
combined_data_PL <- combined_data_PL %>% group_by(as) %>%
  summarise(across(starts_with("PL"), sum, na.rm = TRUE))
##GT
file_paths <- list.files(my_dir, pattern = "\\\\gt$", full.names = TRUE)
combined_data_GT<- bind_rows(lapply(file_paths, read_csv))
combined_data_GT<-spread(combined_data_GT,Var1,Freq)
combined_data_GT <- combined_data_GT %>% group_by(as) %>%
  summarise(across(starts_with("GT"), sum, na.rm = TRUE))
write.csv(combined_data_AA,"D:/combined_data_AA.csv")

rownames(combined_data_GH) <- combined_data_PL$as
combined_data_PL <- combined_data_PL[,-1]
rownames(combined_data_PL) <- rownames(combined_data_GH)

file_paths <- list.files(my_dir, pattern = "\\\\AA$", full.names = TRUE)
combined_data <- bind_rows(lapply(file_paths, read_csv))
combined_data %>% group_by(Var1) %>%
  summarise(sum_col3 = sum(Freq))
##热图
row_annotation <- data.frame(
  Category = c(rep("symbiosis", 6), rep("parasitism", 5), rep("putrefaction", 3))
)
rownames(row_annotation) <- rownames(combined_data_AA[-12,])
p_gt <- pheatmap(combined_data_GT[-12,],
                 cluster_cols = F,
                 cluster_rows = F,
                 cellwidth = 15,
                 cellheight = 15,
                 display_numbers = TRUE,
                 number_format = "%.0f",
                 annotation_row = row_annotation, ##注释列
                 annotation_colors = list(Category = c("symbiosis" ="white" , "parasitism" = "#7baf57", "putrefaction" = "#448e12")),
                 legend = FALSE,
                 color = colorRampPalette(colors = c("white","#7baf57","#448e12"))(100))

##获取单独家族成员
xudd <- function(b) {
    filename <- paste0(b)
    as <- read.delim(filename)
    as <- as %>% mutate(family = str_extract(`CAZy.ID`, "(?<=\\\\|)[A-Z]+[0-9]+"))
    demo <- as.data.frame(table(as$family))
    
    demo <- demo %>%column_to_rownames(var = 'Var1')
    demo <- t(demo)
    write.csv(demo,filename)
}
 for (i in file_vector) { xudd(i)}