mkdir snpEff cd snpEff ###进入snpEff目录下 mkdir data ###新建data目录 cd data ####进入data目录下 mkdir genomes ####新建genomes目录 mkdir ecoli ###新建ecoli目录
cd genomes wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz gzip -d GCF_000005845.2_ASM584v2_genomic.fna.gz mv GCF_000005845.2_ASM584v2_genomic.fna ecoli.fa
cd ../ecoli wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.gff.gz gzip -d GCF_000005845.2_ASM584v2_genomic.gff.gz mv GCF_000005845.2_ASM584v2_genomic.gff genes.gff cd ../../ #回到snpEff目录下
echo"ecoli.genome:ecoli" >> snpEff.config
Mu protocol for Drosophila
tree data
data
├── Dme
│ └── genes.gtf
└── genomes
└── Dme.fa
cd data/Dme # sudo install gffread gffread -g ../genomes/Dme.fa -x cds.fa genes.gtf # Using TransDecoder to translate the cds file into protein sequence TransDecoder.LongOrfs -t cds.fa TransDecoder.Predict -t cds.fa
There is an interesting conflicate between two fiels I download. The Protine file is starded by Flybase ID. The cds file is started by Gene Name. But The Database was using Transcript id That rediculase.
mkdir dir BioSeq
for i in $(grep ">" protein.fa| awk '{print $1}'| sed 's/>//'); do Name=$(grep -w $i genes.gtf| tr ";""\n" |grep transcript_id|head -n 1| awk '{print $2}'| sed 's/"//g') sed -i 's/>'$i'/>'$Name'/' protein.fa done
for i in $(grep ">" cds.fa| awk '{print $1}'| sed 's/>//'); do Name=$(grep -w $i genes.gtf| tr ";""\n" |grep transcript_id|head -n 1| awk '{print $2}'| sed 's/"//g') sed -i 's/>'$i'/>'$Name'/' cds.fa done
from Bio import SeqIO Seq1='cds.fa' Name_l = [] for seq_record in SeqIO.parse(Seq1, "fasta"): Anot = [] try: # for we can find the flybase id in gtf file Anot = [i for i in GTF_list if seq_record.idin i][0].split(";") except: # try it in the parent id Pat_list = [i for i in seq_record.description.split(";") if"parent="in i][0].replace(" ",'').replace("parent=", "").split(",") for P_id in Pat_list: try: Anot = [i for i in GTF_list if P_id in i][0].split(";") except: 1 == 1 Tra_ID = [i for i in Anot if"transcript_id"in i][0].split(" ")[-1].replace('"', "") Name_l += [Tra_ID]
Test for samplifed Genom
sed -i "/^EGFP/d" genes.gtf sed -i "/^GAL4/d" genes.gtf sed -i "/^mCD8GFP/d" genes.gtf
# Generate the CDS gffread -g ../genomes/Dme.fa -x cds.fa genes.gtf
from Bio import SeqIO from Bio.Seq import translate
Seq1='cds.fa' Name_l = [] for seq_record in SeqIO.parse(Seq1, "fasta"): seq_record.seq = translate(seq_record.seq)[:-1] Name = seq_record.description.split("FlyBase:")[-1].split(",")[0].split(";")[0] Name_l += [Name] SeqIO.write(seq_record, "BioSeq/"+Name+".fa", "fasta")
It works with lots of stop codon in protein fasta file.
For Real Genome
conda create -n snpeff python=3.7 # simplify the gtf sed -i "/^EGFP/d" genes.gtf sed -i "/^GAL4/d" genes.gtf sed -i "/^mCD8GFP/d" genes.gtf snpEff build -gtf22 Dme