conda install -c bioconda getorganelle
svn co <https://github.com/Kinggerm/GetOrganelleDB/trunk/0.0.1> ##下载数据库
get_organelle_config.py -a alge_pt --use-local ./0.0.1 构建数据库
##根据组装结果来
get_organelle_from_assembly.py -g ../lh_green.hic.p_utg.gfa \\
-o plastome_output --no-slim -F alge_pt
##根据二代测序结果
get_organelle_from_reads.py -1 Arabidopsis_simulated.1.fq.gz -2 Arabidopsis_simulated.2.fq.gz -t 1 -o Arabidopsis_simulated.plastome -F embplant_pt -R 10
blastn -db nt -query sample1.fasta -outfmt 5 -num_alignments 5 -max_hsps 1 -**out** blast-output.xml
conda install -c bioconda oatk
oatk -k 1001 -c 200 -t 20 --nhmmscan /bin/nhmmscan -m moss_pltd.fam -p moss_pltd.fam -o moss green.fastq
##-m -p分别为线粒体 叶绿体 seed序列 green.fastq为hifi数据(非组装)
与线粒体库进行blast
blastn -query AT-MYB.FA -out RefSeq_plant_blastp.xml -db ~/super/mito/mito -evalue 0.5 -max_hsps 10000 -outfmt 6 -num_alignments 20000
提取genomic_ext.fasta(比对上的基因组序列)& extracted_sequences.fasta(库里比对上的序列)
bwa index genomic_ext.fasta(比对上的基因组序列)
bwa mem -t 20 genomic_ext.fasta extracted_sequences.fasta(库里比对上的序列) | samtools sort -@ 20 > bwa_aln.bam
samtools index -@ 20 bwa_aln.bam
samtools coverage bwa_aln.bam > coverage.txt
samtools coverage -m -A bwa_aln.bam > hist.txt
选出Histo max bin大于50的序列并删除
minimap2 -a /public1/home/t0s000552/green/result/organelle.fa ../green.fastq > new.sam
##organelle.fa 细胞器序列 green.fastq 为hifi数据
samtools view -F 4 new.sam | awk '{print $1}' > chloroplast_read_ids.txt
awk -v threshold=80 ' ##筛选序列,阈值为80
BEGIN {FS="\\t"; OFS="\\t"}
!/^@/ {
read_length = length($10)
alignment_length = 0
for (i = 1; i <= NF; i++) {
if ($i ~ /^MD:Z:/) {
alignment_length += length($10) - gsub("[^0-9]", "", $i)
}
if ($i ~ /^[0-9]+[MIDNSHP=X]/) {
match($i, /([0-9]+)([MIDNSHP=X])/)
if (substr($i, RSTART + RLENGTH - 1, 1) == "M") {
alignment_length += substr($i, RSTART, RLENGTH - 1)
}
}
}
match_rate = (alignment_length / read_length) * 100
if (match_rate > threshold) {
print $1
}
}' new.sam > chloroplast_high_match_reads.txt
seqkit grep -v -f chloroplast_high_match_reads.txt ../green.fastq > green_no_chloroplast.fastq
bwa index organelle.fa ##给细胞器序列建立索引
bwa mem organelle.fa ../xz1_1.fq.gz ../xz1_2.fq.gz > hic_chloroplast.sam ##比对测序序列,这里为hic数据
samtools view hic_chloroplast.sam | awk -v threshold=95 ' ##筛选数据,阈值为95
{
read_length = length($10)
alignment_length = 0
cigar = $6
match_length = 0
split_cigar = 0
gsub(/[0-9]+[I]/, "", cigar)
for (i = 1; i <= length(cigar); i++) {
if (substr(cigar, i, 1) ~ /[MDN=X]/) {
split_cigar = substr(cigar, 1, i)
gsub(/[A-Za-z]/, "", split_cigar)
match_length += split_cigar
split_cigar = ""
}
}
match_rate = (match_length / read_length) * 100
if (match_rate >= threshold) {
print $1
}
}' | sort | uniq > chloroplast_high_match_reads.txt
##筛选序列
seqkit grep -v -f chloroplast_high_match_reads.txt ../xz1_1.fq.gz -o xz1_1_no_chloroplast.fq.gz
seqkit grep -v -f chloroplast_high_match_reads.txt ../xz1_2.fq.gz -o xz1_2_no_chloroplast.fq.gz
#为基因组建立索引
bwa index /home/liuli/geneomic/nextpolish/pilon_result3/pilon_result/pilon.fasta
samtools faidx /home/liuli/geneomic/nextpolish/pilon_result3/pilon_result/pilon.fasta
#使用 bwa 将测序 reads 比对至基因组,并通过 samtools 获得 bam 文件
bwa mem -t 10 -M /home/liuli/geneomic/nextpolish/pilon_result3/pilon_result/pilon.fasta ~/geneomic/illumin/D2117372A_1.fq ~/geneomic/illumin/D2117372A_2.fq | samtools view -@ 20 -bS > lh.bam
#继续使用 samtools 为 bam 文件排序
samtools sort -@ 20 lh.bam > lh.sort.bam
#使用 bedtools 在基因组序列中划分滑窗,这里设置以 2000bp 为一滑窗
bedtools makewindows -g /home/liuli/geneomic/nextpolish/pilon_result3/pilon_result/pilon.fasta.fai -w 2000 -s 2000 > windows.bed
#使用 bedtools 统计每段滑窗的测序深度
bedtools coverage -mean -a windows.bed -b lh.sort.bam > depth.txt
“depth.txt”共四列信息。第一列,基因组中每条序列的名称(染色体,scaffold,或contig的ID);第二列,滑窗起始位点(需要+1);第三列,滑窗结束位点;第四列,该段滑窗的平均测序深度。
