% #过滤掉一些未知的cis dplyr::filter(cis_element != "Unnamed__2")%>% dplyr::filter(cis_element != "Unnamed__3")%>% dplyr::filter(cis_element != "Unnamed__4")%>% dplyr::filter(cis_element != "Unnamed__5")%>% dplyr::filter(cis_element != "Unnamed__6")%>% dplyr::filter(cis_element != "Unnamed__7")%>% dplyr::filter(cis_element != "Unnamed__8")%>% dplyr::filter(cis_element != "Unnamed__9")%>% dplyr::filter(cis_element != "Unnamed__10")%>% dplyr::filter(cis_element != "Unna"> % #过滤掉一些未知的cis dplyr::filter(cis_element != "Unnamed__2")%>% dplyr::filter(cis_element != "Unnamed__3")%>% dplyr::filter(cis_element != "Unnamed__4")%>% dplyr::filter(cis_element != "Unnamed__5")%>% dplyr::filter(cis_element != "Unnamed__6")%>% dplyr::filter(cis_element != "Unnamed__7")%>% dplyr::filter(cis_element != "Unnamed__8")%>% dplyr::filter(cis_element != "Unnamed__9")%>% dplyr::filter(cis_element != "Unnamed__10")%>% dplyr::filter(cis_element != "Unna"> % #过滤掉一些未知的cis dplyr::filter(cis_element != "Unnamed__2")%>% dplyr::filter(cis_element != "Unnamed__3")%>% dplyr::filter(cis_element != "Unnamed__4")%>% dplyr::filter(cis_element != "Unnamed__5")%>% dplyr::filter(cis_element != "Unnamed__6")%>% dplyr::filter(cis_element != "Unnamed__7")%>% dplyr::filter(cis_element != "Unnamed__8")%>% dplyr::filter(cis_element != "Unnamed__9")%>% dplyr::filter(cis_element != "Unnamed__10")%>% dplyr::filter(cis_element != "Unna">
library(tidyverse)
library(cowplot)
library(openxlsx)
library(pheatmap)
library(ggsci)
setwd('E:\\\\RNA_seq_result-徐舰航分析2\\\\cis_promoter\\\\statistic')
#======================读取和整理文件==============================
#读取顺式作用元件的分类文件
cis_annotation <- read.xlsx('./cis_promoter.xlsx') %>%
select(cis_element, category)
#读取并过滤PlanctCARE里面的结果文件
cutin_cir <- read_xlsx('D:/nqc/nqc.xlsx') %>%
left_join(cis_annotation, by = c('cis_element')) %>% #与分类文件进行关联
dplyr::filter(cis_element != "Unnamed__1")%>% #过滤掉一些未知的cis
dplyr::filter(cis_element != "Unnamed__2")%>%
dplyr::filter(cis_element != "Unnamed__3")%>%
dplyr::filter(cis_element != "Unnamed__4")%>%
dplyr::filter(cis_element != "Unnamed__5")%>%
dplyr::filter(cis_element != "Unnamed__6")%>%
dplyr::filter(cis_element != "Unnamed__7")%>%
dplyr::filter(cis_element != "Unnamed__8")%>%
dplyr::filter(cis_element != "Unnamed__9")%>%
dplyr::filter(cis_element != "Unnamed__10")%>%
dplyr::filter(cis_element != "Unnamed__11")%>%
dplyr::filter(cis_element != "Unnamed__12") %>%
dplyr::filter(sequence != "motif_sequence")%>%
dplyr::filter(cis_element != "TATA-box")%>% #filter common
dplyr::filter(cis_element != "CAAT-box")%>% #filter enhancer
dplyr::filter(cis_element != "GC-motif")%>%
dplyr::filter(cis_element != "TCA") %>%
dplyr::filter(!is.na(category.y)) %>% # 过滤掉不在分类文件里面的cis
select(gene_id, cis_element, sequence, category.y)
#distinct()
#定义如果整列都是NA就删除这列的一个函数
removeColsAllNa <- function(x){x[, apply(x, 2, function(y) any(!is.na(y)))]}
#对富集到进行分组统计 先统计的Plant growth and development
stat1 <- group_by(cutin_cir, gene_id, category.y, cis_element) %>%
summarise(count = n()) %>%
pivot_wider(names_from = cis_element, values_from = count) %>% #长数据转化为宽数据
filter(category.y %in% "Plant growth and development") %>%
removeColsAllNa() %>% #去除整列都是NA的这列
select(-category.y)
stat1[is.na(stat1)] <- 0 #把NA的值改为0
#转化为matrix矩阵
stat1 <- stat1[,c(2:12)] %>%
column_to_rownames(var = 'gene_id')
#Phytohormone responsive
stat2 <- group_by(cutin_cir, gene_id, category.y, cis_element) %>%
summarise(count = n()) %>%
pivot_wider(names_from = cis_element, values_from = count) %>% #长数据转化为宽数据
filter(category.y %in% "Phytohormone responsive") %>%
removeColsAllNa() %>% #去除整列都是NA的这列
select(-category.y)
stat2[is.na(stat2)] <- 0 #把NA的值改为0
#Abiotic and biotic stresses
stat3 <- group_by(cutin_cir, gene_id, category.y, cis_element) %>%
summarise(count = n()) %>%
pivot_wider(names_from = cis_element, values_from = count) %>% #长数据转化为宽数据
filter(category.y %in% "Abiotic and biotic stresses") %>%
removeColsAllNa() %>% #去除整列都是NA的这列
select(-category.y)
stat3[is.na(stat3)] <- 0 #把NA的值改为0
#转化为matrix矩阵
stat3 <- stat3[,c(2:12)] %>%
column_to_rownames(var = 'gene_id')
#=========================画热图==========================
pdf('D:/Plant growth and development.pdf', width = 5, height = 7)
p <- pheatmap(stat,
cluster_cols = F,
cluster_rows = F,
cellwidth = 15,
cellheight = 15,
display_numbers = TRUE,
number_format = "%.0f",
legend = FALSE,
color = colorRampPalette(colors = c("white","#7baf57","#448e12"))(100))
print(p)
dev.off()
#======================堆叠柱形图的数据整理====================
library(cowplot)
library(ggsci)
#得到gene_id的因子,使得柱形图与热图的基因顺序一致
factor_levels <- group_by(cutin_cir, gene_id, category) %>%
summarise(count = n()) %>%
filter(category %in% "Abiotic and biotic stresses") %>%
mutate(gene_id = factor(gene_id, levels = rev(gene_id)))
factor_levels <- levels(factor_levels$gene_id)
#整理绘图时的柱形图数据
stat_bar <- group_by(cutin_cir, gene_id, category) %>%
summarise(count = n()) %>%
mutate(cum_count = cumsum(count)) %>%
mutate(position_x = cum_count-0.5*count) %>%
mutate(gene_id = factor(gene_id, levels = rev(factor_levels))) %>%
mutate(category = factor(category, levels = c('Plant growth and development',
'Phytohormone responsive',
'Abiotic and biotic stresses')))
#============================绘制堆叠柱形图======================
pdf('./stat_bar.pdf', width = 5, height = 7)
p <- ggplot(data = stat_bar, aes(x = count, y = gene_id)) +
geom_col(aes(fill = category), position = 'stack', width = 0.7, alpha = 0.6)+
geom_text(color = 'white',
aes(x = position_x, y = gene_id, label = count)) +
scale_fill_lancet() +
scale_x_continuous(expand = c(0,0))+
#使柱子从0的位置开始
theme_half_open()+
theme(legend.position = 'top',
axis.text.y = element_blank(),
axis.ticks=element_blank())+labs(y = NULL)
print(p)
dev.off()
scale_fill_l