叶绿体基因组组装

getorganelle

conda install -c bioconda getorganelle
svn co <https://github.com/Kinggerm/GetOrganelleDB/trunk/0.0.1> ##下载数据库
get_organelle_config.py -a alge_pt --use-local ./0.0.1 构建数据库
##根据组装结果来
get_organelle_from_assembly.py -g ../lh_green.hic.p_utg.gfa \\
	-o plastome_output --no-slim -F alge_pt
##根据二代测序结果
get_organelle_from_reads.py -1 Arabidopsis_simulated.1.fq.gz -2 Arabidopsis_simulated.2.fq.gz -t 1 -o Arabidopsis_simulated.plastome -F embplant_pt -R 10

blastn -db nt -query sample1.fasta -outfmt 5 -num_alignments 5 -max_hsps 1 -**out** blast-output.xml

oatk

conda install -c bioconda oatk

oatk -k 1001 -c 200 -t 20 --nhmmscan /bin/nhmmscan -m moss_pltd.fam -p moss_pltd.fam -o moss green.fastq

##-m -p分别为线粒体 叶绿体 seed序列 green.fastq为hifi数据(非组装)

去除线粒体基因组

与线粒体库进行blast

blastn -query AT-MYB.FA -out RefSeq_plant_blastp.xml -db ~/super/mito/mito -evalue 0.5 -max_hsps 10000 -outfmt 6 -num_alignments 20000

提取genomic_ext.fasta(比对上的基因组序列)& extracted_sequences.fasta(库里比对上的序列)
bwa index genomic_ext.fasta(比对上的基因组序列)
bwa mem -t 20 genomic_ext.fasta extracted_sequences.fasta(库里比对上的序列) | samtools sort -@ 20 > bwa_aln.bam
samtools index -@ 20 bwa_aln.bam
samtools coverage bwa_aln.bam > coverage.txt
samtools coverage -m -A bwa_aln.bam > hist.txt

选出Histo max bin大于50的序列并删除

去除线粒体基因组—hifi数据

minimap2 -a  /public1/home/t0s000552/green/result/organelle.fa  ../green.fastq  > new.sam
##organelle.fa 细胞器序列  green.fastq 为hifi数据 
samtools view -F 4 new.sam | awk '{print $1}' > chloroplast_read_ids.txt
awk -v threshold=80 '  ##筛选序列,阈值为80
BEGIN {FS="\\t"; OFS="\\t"}
!/^@/ {
    read_length = length($10)
    alignment_length = 0
    for (i = 1; i <= NF; i++) {
        if ($i ~ /^MD:Z:/) {
            alignment_length += length($10) - gsub("[^0-9]", "", $i)
        }
        if ($i ~ /^[0-9]+[MIDNSHP=X]/) {
            match($i, /([0-9]+)([MIDNSHP=X])/)
            if (substr($i, RSTART + RLENGTH - 1, 1) == "M") {
                alignment_length += substr($i, RSTART, RLENGTH - 1)
            }
        }
    }
    match_rate = (alignment_length / read_length) * 100
    if (match_rate > threshold) {
        print $1
    }
}' new.sam > chloroplast_high_match_reads.txt
seqkit grep -v -f chloroplast_high_match_reads.txt ../green.fastq > green_no_chloroplast.fastq

去除细胞器基因组—hic数据

bwa index organelle.fa ##给细胞器序列建立索引
bwa mem organelle.fa ../xz1_1.fq.gz ../xz1_2.fq.gz > hic_chloroplast.sam ##比对测序序列,这里为hic数据
samtools view hic_chloroplast.sam | awk -v threshold=95 '  ##筛选数据,阈值为95
{
    read_length = length($10)
    alignment_length = 0
    cigar = $6
    match_length = 0
    split_cigar = 0
    gsub(/[0-9]+[I]/, "", cigar)
    for (i = 1; i <= length(cigar); i++) {
        if (substr(cigar, i, 1) ~ /[MDN=X]/) {
            split_cigar = substr(cigar, 1, i)
            gsub(/[A-Za-z]/, "", split_cigar)
            match_length += split_cigar
            split_cigar = ""
        }
    }
    match_rate = (match_length / read_length) * 100
    if (match_rate >= threshold) {
        print $1
    }
}' | sort | uniq > chloroplast_high_match_reads.txt
##筛选序列
seqkit grep -v -f chloroplast_high_match_reads.txt ../xz1_1.fq.gz -o xz1_1_no_chloroplast.fq.gz
seqkit grep -v -f chloroplast_high_match_reads.txt ../xz1_2.fq.gz -o xz1_2_no_chloroplast.fq.gz

绘制GC含量和测序深度(GC-Depth)分布图

#为基因组建立索引
bwa index /home/liuli/geneomic/nextpolish/pilon_result3/pilon_result/pilon.fasta
samtools faidx /home/liuli/geneomic/nextpolish/pilon_result3/pilon_result/pilon.fasta
 
#使用 bwa 将测序 reads 比对至基因组,并通过 samtools 获得 bam 文件
bwa mem -t 10 -M /home/liuli/geneomic/nextpolish/pilon_result3/pilon_result/pilon.fasta ~/geneomic/illumin/D2117372A_1.fq ~/geneomic/illumin/D2117372A_2.fq | samtools view -@ 20 -bS > lh.bam
 
#继续使用 samtools 为 bam 文件排序
samtools sort -@ 20 lh.bam > lh.sort.bam

#使用 bedtools 在基因组序列中划分滑窗,这里设置以 2000bp 为一滑窗
bedtools makewindows -g /home/liuli/geneomic/nextpolish/pilon_result3/pilon_result/pilon.fasta.fai -w 2000 -s 2000 > windows.bed
 
#使用 bedtools 统计每段滑窗的测序深度
bedtools coverage -mean -a windows.bed -b lh.sort.bam > depth.txt

“depth.txt”共四列信息。第一列,基因组中每条序列的名称(染色体,scaffold,或contig的ID);第二列,滑窗起始位点(需要+1);第三列,滑窗结束位点;第四列,该段滑窗的平均测序深度。

Untitled