RNA-seq | Notion

##画出颜色代码
# 一些颜色代码
colors <- c("#FDDED7", "#F5BE8F", "#C1E0DB", "#A28CC2", "#FFD700")

# 创建一个空白的绘图区域
plot(1:length(colors), rep(1, length(colors)), pch = 15, cex = 5, col = colors,
     xlab = "颜色索引", ylab = "", xaxt = 'n', yaxt = 'n', bty = 'n')

# 添加颜色标签
text(1:length(colors), rep(1, length(colors)) + 0.1, labels = colors, cex = 0.8)

##FPKM计算
total_mapped_reads <- colSums(expr_mat2)
# 假设 gene_info 中的基因顺序和 expr_mat2 的行顺序一致
gene_lengths <- gene_info  # 确保 gene_info 包含基因长度

# FPKM 计算函数
calculate_fpkm <- function(expr_mat, gene_lengths, total_mapped_reads) {
  # 矩阵每个 count 除以基因长度
  fpkm <- sweep(expr_mat, 1, gene_lengths, FUN = "/")
  
  # 然后每列除以该样本的总 mapped reads，并乘以 10^9
  fpkm <- sweep(fpkm, 2, total_mapped_reads, FUN = "/")
  
  # 最后乘以 10^9
  fpkm <- fpkm * 10^9
  
  return(fpkm)
}

# 计算 FPKM
fpkm_matrix <- calculate_fpkm(expr_mat, gene_lengths, total_mapped_reads)

# 查看结果
print(fpkm_matrix)

##复制
df <- read.table("clipboard",header = T,check.names = F,fill=TRUE,row.names = 1, stringsAsFactors =T) ##导入复制的数据
##差异基因
library(DESeq2)
count <- read.delim("D:/TRANS/count.csv",header = T,row.names = 1,sep = ",")##修改
condition <- factor(c(rep("L3",3),rep("L4",3)),levels = c("L3","L4"))##修改
countData <- count[,c(7:12)]
colData <- data.frame(row.names = colnames(countData),condition)
ddf <- DESeqDataSetFromMatrix(countData,colData,design = ~ condition)
ddf2 <- DESeq(ddf)
resultsNames(ddf2)
res <- results(ddf2)
diff_gene_deseq2 <-subset(res,padj < 0.05 & (log2FoldChange > 2 | log2FoldChange < -2))

up_deg <- subset(res,padj < 0.05 & log2FoldChange > 2)
down_deg <- subset(res,padj < 0.05 & log2FoldChange < -2)
length(rownames(diff_gene_deseq2))
length(rownames(up_deg))
length(rownames(down_deg))
write.csv(diff_gene_deseq2,"D:/AS/3-4.csv")
##差异基因柱状图
df <- data.frame(
  Kegg_Secondary = c("Transport and catabolism", "Signal transduction", "Membrane transport", "Translation", 
                     "Folding, sorting and degradation", "Replication and repair", "Transcription", 
                     "Energy metabolism", "Global and overview maps", "Metabolism of cofactors and vitamins",
                     "Amino acid metabolism", "Carbohydrate metabolism", "Metabolism of other amino acids", 
                     "Lipid metabolism", "Glycan biosynthesis and metabolism", "Nucleotide metabolism",
                     "Synthesis of other secondary metabolites", "Metabolism of terpenoids and polyketides"),
  Upregulated = c(50, 40, 100, 300, 20, 30, 70, 120, 500, 200, 150, 180, 60, 90, 80, 110, 20, 50),
  Downregulated = c(30, 20, 80, 200, 10, 20, 50, 100, 400, 150, 120, 140, 40, 70, 60, 90, 10, 40),
  Kegg_Primary = c("Cellular Processes", "Environmental Information Processing", "Environmental Information Processing", 
                   "Genetic Information Processing", "Genetic Information Processing", "Genetic Information Processing", 
                   "Genetic Information Processing", "Metabolism", "Metabolism", "Metabolism", 
                   "Metabolism", "Metabolism", "Metabolism", "Metabolism", "Metabolism", 
                   "Metabolism", "Metabolism", "Metabolism")
)

# 将数据转换为长格式
df_long <- reshape2::melt(df, id.vars = c("Kegg_Secondary", "Kegg_Primary"),
                          variable.name = "Regulation", value.name = "Gene_Count")

# 调整下调基因的数值为负数
df_long$Gene_Count[df_long$Regulation == "Downregulated"] <- -df_long$Gene_Count[df_long$Regulation == "Downregulated"]
##调整顺序
#柱状图的顺序改变可能是因为 ggplot2 默认根据数值或者字母顺序排列分类变量。
#因此，如果您希望按 Kegg_Primary 列的顺序绘制图形，您需要将 Kegg_Primary 列转换为因子（factor），并明确设置因子水平的顺序。
df_sorted$Kegg_Primary <- factor(df_sorted$Kegg_Primary, levels = unique(df_sorted$Kegg_Primary))
##最终数据形式
# Kegg_Secondary                         Kegg_Primary    Regulation Gene_Count
# 1               Membrane_transport Environmental_Information_Processing   Upregulated         31
# 2              Signal_transduction Environmental_Information_Processing   Upregulated        188
# 3               Membrane_transport Environmental_Information_Processing Downregulated        -29
# 4              Signal_transduction Environmental_Information_Processing Downregulated       -196
# 5                      Translation       Genetic_Information_Processing   Upregulated        256
# 6 Folding,_sorting_and_degradation       Genetic_Information_Processing   Upregulated        199
##绘图
ggplot(df_long, aes(x = Gene_Count, y = Kegg_Secondary, fill = Kegg_Primary)) +
  geom_bar(stat = "identity") +
  geom_vline(xintercept = 0, color = "black", linetype = "solid") +  # 添加中轴线
  scale_x_continuous(breaks = seq(-300, 300, by = 100)) +  # 自定义X轴刻度
  scale_fill_manual(values = colors) +  # 使用自定义颜色
  labs(x = "Number of Genes", y = NULL, fill = "KEGG Primary Classification") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 14, face = "bold", family = "serif"),  # 设置x轴文字字体
    axis.text.y = element_text(size = 14, face = "bold", family = "serif"),  # 设置y轴文字字体
    axis.title.x = element_text(size = 14, face = "bold", family = "serif"),  # 设置x轴标题字体
    legend.title = element_text(size = 14, family = "serif"),  # 设置图例标题字体
    legend.text = element_text(size = 12, family = "serif"),  # 设置图例文字字体
    panel.grid = element_blank(),  # 移除背景网格线
    panel.border = element_blank(),  # 移除所有边框
    axis.line.x = element_line(color = "black", size = 1),  # 添加X轴线
    axis.line.y = element_line(color = "black", size = 1),  # 添加Y轴线
    axis.ticks.x = element_line(color = "black", size = 1),  # 添加X轴刻度线
    axis.ticks.y = element_line(color = "black", size = 1),  # 添加Y轴刻度线
    axis.ticks.length = unit(-0.2, "cm")  # 设置刻度线向内，负值表示向内，长度为0.2cm
  )
##补集
# 找到A中不存在于B中的基因ID
unique_A_ids <- setdiff(A_ids, B_ids)

##基因结构域绘图
cd_search_data <- read.delim("D:/bhlh/ma基因组/hitdata (1).txt")
protein_length <- 1000 ##看最大的蛋白是多大就设置多大
##分组
cd_search_data <- cd_search_data %>%
  mutate(Group = case_when(
    grepl("*ACT*", Short.name) ~ "Group 1",  # 包含 "ACT" 的为第1组
    grepl("*bHLH*", Short.name) ~ "Group 2",  # 包含 "bHLH" 的为第2组
    TRUE ~ "Group 3"  # 其他的为第3组
  ))

# 为每个分组指定颜色
colors <- c("Group 1" = "red", "Group 2" = "blue", "Group 3" = "green")

##绘图
p <- ggplot(cd_search_data, aes(x = From, xend = To, y = Query, yend = Query, color = Group)) +
  geom_segment(size = 4) +  # 用条形表示结构域
  scale_color_manual(values = colors) +  # 为分组指定颜色
  scale_x_continuous(limits = c(0, protein_length)) +  # 设置蛋白质序列的长度
  labs(x = "Amino Acid Position", y = "Gene Name") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 10),
    axis.title = element_text(size = 12),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    panel.grid = element_blank()  # 去掉背景网格线
  ) +
  geom_hline(aes(yintercept = Query), linetype = "solid", color = "black", size = 0.5) +  # 每个基因加一条线
  ggtitle("Protein Domains Visualization with Gene Names")
ggsave("D:/protein_domains_visualization.pdf", plot = p, width = 12, height = 26)

##进化树美化

library(ggtree)
library(dplyr)
##读取进化树
tree <- read.tree("D:/Q7HpXhj6Kj5zAcPlm0LxJg_newick.txt")
# 将树转换为数据框
tree_data <- ggtree(tree, branch.length = "none", layout = "circular")$data

# 找到所有以 "At" 开头的叶子节点
at_tips <- tree_data[grepl("^At", tree_data$label), ]

# 找到所有以 "Ma" 开头的叶子节点
ma_tips <- tree_data[grepl("^Ma", tree_data$label), ]

# 绘制进化树并添加绿色和红色的圆形
ggtree(tree, branch.length = "none", layout = "circular") +
  # 为所有以 "At" 开头的叶子节点添加绿色的圆形
  geom_point(data = at_tips, aes(x = x, y = y), color = "green", size = 4) +
  
  # 为所有以 "Ma" 开头的叶子节点添加红色的圆形
  geom_point(data = ma_tips, aes(x = x, y = y), color = "red", size = 4) +
  
  # 仅为叶节点（基因）添加标签
  geom_tiplab() +
  
  # 设置标题
  labs(title = "Circular Phylogenetic Tree with Custom Gene Points") +
  
  geom_tiplab(size=4)+geom_nodelab(hjust=-0.05,size=4)