##RNA to DNA back_transcribe(my_string) --> { 'GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG' }
##DNA to protein translate(my_string) --> { 'AVMGRWKGGRAAG*' }
2. Sequences
2.1 reading FASTA file
from Bio import SeqIO

# Path of the FASTA file to read.
Seq1 = 'GSE44995_Reference_assembled_isotig_seq.fna'

# SeqIO.parse yields one record per FASTA entry; print each record's
# identifier, the repr of its sequence, and the sequence length.
for seq_record in SeqIO.parse(Seq1, "fasta"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record.seq))
## print by fasta format
# Collect every record shorter than 100 residues into one FASTA-formatted
# string (">id" line followed by the sequence line).
Seq10 = ''
for seq_record in SeqIO.parse(Seq1, "fasta"):
    if len(seq_record.seq) < 100:
        Seq10 += ">" + str(seq_record.id) + "\n"
        Seq10 += str(seq_record.seq) + "\n"
        # NOTE(review): the original layout was lost; the prints are assumed
        # to report only the records that pass the length filter -- confirm.
        print(seq_record.id)
        print(repr(seq_record.seq))
        print(len(seq_record.seq))
2.2 Treating a Seq like a string
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
# NOTE: Bio.Alphabet was removed in Biopython 1.78; on newer releases drop
# this import and the alphabet argument below.

my_seq = Seq("GATCG", IUPAC.unambiguous_dna)

# A Seq can be iterated like a string: print each position and its letter.
for index, letter in enumerate(my_seq):
    print("%i %s" % (index, letter))
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqUtils import GC
# NOTE: Bio.Alphabet was removed in Biopython 1.78; on newer releases drop
# that import and the alphabet argument below.

my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)

# GC content of the sequence, as a percentage (15 of the 32 letters are G or C).
GC(my_seq)
## Result
## 46.875
2.4 Slicing a sequence
# Slice out positions 4 up to (not including) 12.
my_seq[4:12]

##### the first, second and third codon positions of this DNA sequence: #####
# A step of 3 starting at offset 0, 1 or 2 picks every codon's first,
# second or third base respectively.
my_seq[0::3]  # Seq('GCTGTAGTAAG', IUPACUnambiguousDNA())
my_seq[1::3]  # Seq('AGGCATGCATC', IUPACUnambiguousDNA())
my_seq[2::3]  # Seq('TAGCTAAGAC', IUPACUnambiguousDNA())
2.5 Reverse string
# A slice with step -1 walks the sequence backwards, so this rebinds my_seq
# to its reversed form (reversal only -- no complementing).
my_seq = my_seq[::-1]
2.6 Changing letter case
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
# NOTE: Bio.Alphabet was removed in Biopython 1.78; on newer releases drop
# this import and the alphabet argument below.

dna_seq = Seq("acgtACGT", generic_dna)

# upper()/lower() return new Seq objects; dna_seq itself is not modified.
dna_seq.upper()  # --> Seq('ACGTACGT', DNAAlphabet())
dna_seq.lower()  # --> Seq('acgtacgt', DNAAlphabet())
3 Bio-information
3.1 Reverse Complement
my_seq.reverse_complement()

# complement() swaps each base for its partner (A<->T, C<->G) in place;
# reverse_complement() additionally reverses the result.
Seq("ACGTCGTAGCTAC").complement()  # standard example => output: Seq('TGCAGCATCGATG')
Seq("ACGTCGTAGCTAC").reverse_complement()  # output => Seq('GTAGCTACGACGT')
3.2 Translation
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
# NOTE: Bio.Alphabet was removed in Biopython 1.78; on newer releases drop
# this import and the alphabet arguments below.

# translate() accepts both RNA and DNA codons.
Seq("UUU", IUPAC.unambiguous_rna).translate()  # for RNA
Seq("TTT", IUPAC.unambiguous_dna).translate()  # for DNA
  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I   | ACT T   | AAT N   | AGT S   | T
A | ATC I   | ACC T   | AAC N   | AGC S   | C
A | ATA I   | ACA T   | AAA K   | AGA R   | A
A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V   | GCG A   | GAG E   | GGG G   | G
--+---------+---------+---------+---------+--
4. Alignment
1. Echo a test file
## run in bash
# Write a two-record protein FASTA file; each header line (">id") must be
# followed by its sequence on a separate line.
echo ">TRINITY_DN106095_c2_g1_i2
MSRIMKVFLFLAVMVCISEAQLHAQCLCPRVRSRISSMTDIREVQIYEATIFCDRMEIVVTNDSGLRYCLNPKLKAVQKLLTAMKPKTSTTARPTVHSSSTGSTNTARM
>TRINITY_DN92154_c0_g1_i1
DIHVRRRTLTRSKTLGRSTNVNKMKLCILLMLGTLLVLVYGMPPISRDYNTHCRCLQVESRIIPPNSLKSIKLVPEGPHCPDMEVIAGLSNGEKVCLNPRSSWVKKLVNFVLEKQQGGALPKNQGQ" > test.fa
2. Align
## run in python
from Bio import pairwise2
from Bio.Seq import Seq
from Bio.pairwise2 import format_alignment
from Bio.SubsMat import MatrixInfo
# Pretty-print every alignment from both result lists.
# NOTE(review): `alignments` and `test_alignments` are not assigned anywhere
# in this excerpt; they are presumably produced by pairwise2 calls that were
# lost -- confirm before running.
for alignment in alignments:
    print(format_alignment(*alignment))

for alignment in test_alignments:
    print(format_alignment(*alignment))
SingleLetterAlphabet() alignment with 6 rows and 65 columns
MQNTPAERLPAIIEKAKSKHDINVWLLDRQGRDLLEQRVPAKVA...EGP B7RZ31_9GAMM/59-123
AKQRGIAGLEEWLHRLDHSEAIPIFLIDEAGKDLLEREVPADIT...KKP A0A0C3NPG9_9PROT/58-119
ARRHGQEYFQQWLERQPKKVKEQVFAVDQFGRELLGRPLPEDMA...KKP A0A143HL37_9GAMM/57-121
TRRHGPESFRFWLERQPVEARDRIYAIDRSGAEILDRPIPRGMA...NKP A0A0X3UC67_9GAMM/57-121
AINRNTQQLTQDLRAMPNWSLRFVYIVDRNNQDLLKRPLPPGIM...NRK B3PFT7_CELJU/62-126
AVNATEREFTERIRTLPHWARRNVFVLDSQGFEIFDRELPSPVA...NRT K4KEM7_SIMAS/61-125
Quick Alignment
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

# Global alignment of two short sequences; print only the first alignment
# returned.
alignments = pairwise2.align.globalxx("ACCGT", "ACG")
print(format_alignment(*alignments[0]))
def make_pdb(self, pdb_path, chain_letters, overwrite=False, struct=None):
    """Create a new PDB file containing only the specified chains.

    Returns the path to the created file.

    :param pdb_path: full path to the crystal structure
    :param chain_letters: iterable of chain characters (case insensitive)
    :param overwrite: write over the output file if it exists
    :param struct: already-parsed structure to write; when None the
        structure is parsed from ``pdb_path`` with ``self.parser``
    """
    chain_letters = [chain.upper() for chain in chain_letters]

    # NOTE(review): out_path, out_name, pdb_id, pdb_fn and plural are used
    # below but never assigned in this excerpt -- the lines that derive them
    # (presumably from pdb_path) appear to have been lost. Restore them
    # before running; as written this raises NameError.

    # Skip PDB generation if the file already exists
    if (not overwrite) and (os.path.isfile(out_path)):
        print("Chain%s %s of '%s' already extracted to '%s'." %
              (plural, ", ".join(chain_letters), pdb_id, out_name))
        return out_path

    print("Extracting chain%s %s from %s..." %
          (plural, ", ".join(chain_letters), pdb_fn))

    # Get structure, write new file with only given chains
    if struct is None:
        struct = self.parser.get_structure(pdb_id, pdb_path)
    self.writer.set_structure(struct)
    self.writer.save(out_path, select=SelectChains(chain_letters))

    return out_path
class SelectChains(PDB.Select):
    """Only accept the specified chains when saving."""

    def __init__(self, chain_letters):
        # Container of chain ids to keep; membership is tested with `in`.
        self.chain_letters = chain_letters

    def accept_chain(self, chain):
        """Return True when the chain's id is one of the requested letters."""
        return chain.get_id() in self.chain_letters
if __name__ == "__main__":
    # Parses PDB id's desired chains, and creates new PDB structures.
    import sys

    if len(sys.argv) != 2:
        print("Usage: $ python %s 'pdb.txt'" % __file__)
        sys.exit()

    # NOTE(review): pdb_textfn, pdbList and splitter are used below but never
    # assigned in this excerpt -- the setup lines appear to have been lost
    # (pdb_textfn is presumably sys.argv[1]). Restore them before running.

    with open(pdb_textfn) as pdb_textfile:
        for line in pdb_textfile:
            # Each line holds a 4-character PDB id followed directly by the
            # chain letter; skip blank/short lines instead of failing on
            # line[4].
            if len(line.rstrip()) < 5:
                continue
            pdb_id = line[:4].lower()
            chain = line[4]
            pdb_fn = pdbList.retrieve_pdb_file(pdb_id)
            splitter.make_pdb(pdb_fn, chain)
Another example:
This one works fine for me
import sys

from Bio.PDB import Select, PDBIO
from Bio.PDB.PDBParser import PDBParser