# Per-parent exon and intron counts from a GFF3 annotation.
#
# Files written to the current directory:
#   merged.txt  "<parent_id>\t<exon_count>\t<intron_count>", sorted by ID
#   exon.txt    "<parent_id> has <n> exons"
#   intron.txt  "<parent_id> has <n> introns"   (introns = exons - 1)
#
# NOTE(review): counts are keyed by the exon's Parent attribute. In GFF3 that
# is usually a transcript/mRNA ID rather than a gene ID -- confirm which one
# this annotation uses before reading the numbers as per-gene counts.

#######################################
# Build the exon/intron count table for one GFF3 file.
# Arguments: $1 - path to the GFF3 file
# Outputs:   "<parent_id>\t<exons>\t<introns>" lines on stdout, sorted by
#            parent ID in byte order
# Returns:   1 if the file is not readable, otherwise the status of sort
#######################################
exon_intron_table() {
  local gff=$1

  if [[ ! -r "$gff" ]]; then
    printf 'error: cannot read GFF file: %s\n' "$gff" >&2
    return 1
  fi

  awk -F'\t' -v OFS='\t' '
    # Only exon features contribute to the counts.
    $3 == "exon" {
      attrs = $9
      sub(/\r$/, "", attrs)            # tolerate CRLF line endings
      n = split(attrs, attr, ";")
      for (i = 1; i <= n; i++) {
        item = attr[i]
        sub(/^[ \t]+/, "", item)       # tolerate "; " between attributes
        if (item ~ /^Parent=/) {
          sub(/^Parent=/, "", item)
          # GFF3 allows several comma-separated parents on one feature;
          # the exon is counted once for each of them.
          m = split(item, parent, ",")
          for (j = 1; j <= m; j++) {
            if (parent[j] != "") {
              exons[parent[j]]++
            }
          }
          break
        }
      }
    }
    END {
      # Every recorded parent has at least one exon, so exons - 1 is never
      # negative and needs no clamping.
      for (id in exons) {
        print id, exons[id], exons[id] - 1
      }
    }
  ' "$gff" | LC_ALL=C sort -t $'\t' -k1,1
}

#######################################
# Write merged.txt, exon.txt and intron.txt for a GFF3 file.
# Arguments: $1 - path to the GFF3 file (default: GWHBJUL00000000.gff)
# Outputs:   the three files described at the top, in the current directory
# Returns:   0 on success, 1 if the input is unreadable or a step fails
#######################################
summarize_gff_exons() {
  local gff=${1:-GWHBJUL00000000.gff}

  # Check before redirecting so a missing input never truncates old results.
  if [[ ! -r "$gff" ]]; then
    printf 'error: cannot read GFF file: %s\n' "$gff" >&2
    return 1
  fi

  # The GFF is parsed once; both per-feature reports are derived from the
  # merged table, so the three files always agree with each other.
  exon_intron_table "$gff" > merged.txt || return 1
  awk -F'\t' '{ print $1 " has " $2 " exons" }' merged.txt > exon.txt \
    || return 1
  awk -F'\t' '{ print $1 " has " $3 " introns" }' merged.txt > intron.txt \
    || return 1
}

summarize_gff_exons GWHBJUL00000000.gff