% #过滤掉一些未知的cis dplyr::filter(cis_element != "Unnamed__2")%>% dplyr::filter(cis_element != "Unnamed__3")%>% dplyr::filter(cis_element != "Unnamed__4")%>% dplyr::filter(cis_element != "Unnamed__5")%>% dplyr::filter(cis_element != "Unnamed__6")%>% dplyr::filter(cis_element != "Unnamed__7")%>% dplyr::filter(cis_element != "Unnamed__8")%>% dplyr::filter(cis_element != "Unnamed__9")%>% dplyr::filter(cis_element != "Unnamed__10")%>% dplyr::filter(cis_element != "Unna"> % #过滤掉一些未知的cis dplyr::filter(cis_element != "Unnamed__2")%>% dplyr::filter(cis_element != "Unnamed__3")%>% dplyr::filter(cis_element != "Unnamed__4")%>% dplyr::filter(cis_element != "Unnamed__5")%>% dplyr::filter(cis_element != "Unnamed__6")%>% dplyr::filter(cis_element != "Unnamed__7")%>% dplyr::filter(cis_element != "Unnamed__8")%>% dplyr::filter(cis_element != "Unnamed__9")%>% dplyr::filter(cis_element != "Unnamed__10")%>% dplyr::filter(cis_element != "Unna"> % #过滤掉一些未知的cis dplyr::filter(cis_element != "Unnamed__2")%>% dplyr::filter(cis_element != "Unnamed__3")%>% dplyr::filter(cis_element != "Unnamed__4")%>% dplyr::filter(cis_element != "Unnamed__5")%>% dplyr::filter(cis_element != "Unnamed__6")%>% dplyr::filter(cis_element != "Unnamed__7")%>% dplyr::filter(cis_element != "Unnamed__8")%>% dplyr::filter(cis_element != "Unnamed__9")%>% dplyr::filter(cis_element != "Unnamed__10")%>% dplyr::filter(cis_element != "Unna">
library(tidyverse)
library(cowplot)
library(openxlsx)
library(pheatmap)
library(ggsci)

setwd('E:\\\\RNA_seq_result-徐舰航分析2\\\\cis_promoter\\\\statistic')

#======================读取和整理文件==============================
#读取顺式作用元件的分类文件
cis_annotation <- read.xlsx('./cis_promoter.xlsx') %>%
  select(cis_element, category)

#读取并过滤PlanctCARE里面的结果文件
cutin_cir <- read_xlsx('D:/nqc/nqc.xlsx') %>%
  left_join(cis_annotation, by = c('cis_element')) %>%   #与分类文件进行关联
  dplyr::filter(cis_element != 	"Unnamed__1")%>%         #过滤掉一些未知的cis
  dplyr::filter(cis_element != 	"Unnamed__2")%>%
  dplyr::filter(cis_element != 	"Unnamed__3")%>%
  dplyr::filter(cis_element != 	"Unnamed__4")%>%
  dplyr::filter(cis_element != 	"Unnamed__5")%>%
  dplyr::filter(cis_element != 	"Unnamed__6")%>%
  dplyr::filter(cis_element != 	"Unnamed__7")%>%
  dplyr::filter(cis_element != 	"Unnamed__8")%>%
  dplyr::filter(cis_element != 	"Unnamed__9")%>%
  dplyr::filter(cis_element != 	"Unnamed__10")%>%
  dplyr::filter(cis_element != 	"Unnamed__11")%>%
  dplyr::filter(cis_element != 	"Unnamed__12") %>%
  dplyr::filter(sequence != 	"motif_sequence")%>%
  dplyr::filter(cis_element != 	"TATA-box")%>%  #filter common
  dplyr::filter(cis_element != 	"CAAT-box")%>%  #filter enhancer
  dplyr::filter(cis_element != 	"GC-motif")%>%  
  dplyr::filter(cis_element != 	"TCA") %>%
  dplyr::filter(!is.na(category.y)) %>%   # 过滤掉不在分类文件里面的cis
  select(gene_id, cis_element, sequence,  category.y) 
  #distinct()

#定义如果整列都是NA就删除这列的一个函数
removeColsAllNa  <- function(x){x[, apply(x, 2, function(y) any(!is.na(y)))]}  

#对富集到进行分组统计 先统计的Plant growth and development
stat1 <- group_by(cutin_cir, gene_id, category.y, cis_element) %>%
  summarise(count = n()) %>%
  pivot_wider(names_from = cis_element, values_from = count) %>%  #长数据转化为宽数据
  filter(category.y %in% "Plant growth and development") %>%
  removeColsAllNa() %>%   #去除整列都是NA的这列
  select(-category.y)
stat1[is.na(stat1)] <- 0   #把NA的值改为0
#转化为matrix矩阵
stat1 <- stat1[,c(2:12)] %>%
  column_to_rownames(var = 'gene_id')
#Phytohormone responsive
stat2 <- group_by(cutin_cir, gene_id, category.y, cis_element) %>%
  summarise(count = n()) %>%
  pivot_wider(names_from = cis_element, values_from = count) %>%  #长数据转化为宽数据
  filter(category.y %in% "Phytohormone responsive") %>%
  removeColsAllNa() %>%   #去除整列都是NA的这列
  select(-category.y)
stat2[is.na(stat2)] <- 0   #把NA的值改为0
#Abiotic and biotic stresses
stat3 <- group_by(cutin_cir, gene_id, category.y, cis_element) %>%
  summarise(count = n()) %>%
  pivot_wider(names_from = cis_element, values_from = count) %>%  #长数据转化为宽数据
  filter(category.y %in% "Abiotic and biotic stresses") %>%
  removeColsAllNa() %>%   #去除整列都是NA的这列
  select(-category.y)
stat3[is.na(stat3)] <- 0   #把NA的值改为0

#转化为matrix矩阵
stat3 <- stat3[,c(2:12)] %>%
  column_to_rownames(var = 'gene_id')

#=========================画热图==========================
pdf('D:/Plant growth and development.pdf', width = 5, height = 7)
p <- pheatmap(stat,
         cluster_cols = F,
         cluster_rows = F,
         cellwidth = 15,
         cellheight = 15,
         display_numbers = TRUE,
         number_format = "%.0f",
         legend = FALSE,
         color = colorRampPalette(colors = c("white","#7baf57","#448e12"))(100))
print(p)
dev.off()

#======================堆叠柱形图的数据整理====================
library(cowplot)
library(ggsci)
#得到gene_id的因子,使得柱形图与热图的基因顺序一致
factor_levels <- group_by(cutin_cir, gene_id, category) %>%
  summarise(count = n()) %>%
  filter(category %in% "Abiotic and biotic stresses") %>%
  mutate(gene_id = factor(gene_id, levels = rev(gene_id)))
factor_levels <- levels(factor_levels$gene_id)

#整理绘图时的柱形图数据
stat_bar <- group_by(cutin_cir, gene_id, category) %>%
  summarise(count = n()) %>%
  mutate(cum_count = cumsum(count)) %>%
  mutate(position_x = cum_count-0.5*count) %>%
  mutate(gene_id = factor(gene_id, levels = rev(factor_levels))) %>%
  mutate(category = factor(category, levels = c('Plant growth and development', 
                                                'Phytohormone responsive',
                                                'Abiotic and biotic stresses')))

#============================绘制堆叠柱形图======================
pdf('./stat_bar.pdf', width = 5, height = 7)
p <- ggplot(data = stat_bar, aes(x = count, y = gene_id)) +
  geom_col(aes(fill = category), position = 'stack', width = 0.7, alpha = 0.6)+
  geom_text(color = 'white', 
            aes(x = position_x, y = gene_id, label = count)) +
  scale_fill_lancet() +
  scale_x_continuous(expand = c(0,0))+
  #使柱子从0的位置开始
  theme_half_open()+
  theme(legend.position = 'top',
        axis.text.y = element_blank(),
        axis.ticks=element_blank())+labs(y = NULL)
  
print(p)
dev.off()
scale_fill_l