Biopython Introduction for Newbies

Biopython Introduction for Newbies

Biopython

1. Quick Start

from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate

## echo a seq
my_string = "GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG"

##Revers complement
reverse_complement(my_string)

## { 'CTAACCAGCAGCACGACCACCCTTCCAACGACCCATAACAGC' }

##DNA to RNA
transcribe(my_string)

##--> { 'GCUGUUAUGGGUCGUUGGAAGGGUGGUCGUGCUGCUGGUUAG' }

##RNA to DNA
back_transcribe(my_string) --> { 'GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG' }

##DNA to protine
translate(my_string) --> { 'AVMGRWKGGRAAG*' }

2. Sequences

2.1 reading FASTA file

from Bio import SeqIO
Seq1='GSE44995_Reference_assembled_isotig_seq.fna'
for seq_record in SeqIO.parse(Seq1, "fasta"):
print(seq_record.id)
print(repr(seq_record.seq))
print(len(seq_record.seq))

## print by fasta formate
Seq10=''
for seq_record in SeqIO.parse(Seq1, "fasta"):
if len(seq_record.seq) < 100:
Seq10 += ">"+str(seq_record.id)+"\n"
Seq10 += str(seq_record.seq)+"\n"
print(seq_record.id)
print(repr(seq_record.seq))
print(len(seq_record.seq))

2.2 Seq run as string

from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq("GATCG", IUPAC.unambiguous_dna)
for index, letter in enumerate(my_seq):
print("%i %s" % (index, letter))

'''
print result
0 G
1 A
2 T
3 C
4 G
'''

print(my_seq[0]) #/ print(my_seq[-1])
Seq("AAAA").count("AA")
len(my_seq)

## convert to str
my_seq = str(my_seq)

### 2.3 GC caculate
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqUtils import GC
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)
GC(my_seq)

##Result
##46.875

2.4 Slicing a sequence


my_seq[4:12]

#####
the first, second and third codon positions of this DNA sequence:
#####
my_seq[0::3] # Seq('GCTGTAGTAAG', IUPACUnambiguousDNA())
my_seq[1::3] # Seq('AGGCATGCATC', IUPACUnambiguousDNA())
my_seq[2::3] # Seq('TAGCTAAGAC', IUPACUnambiguousDNA())

2.5 revers string


my_seq = my_seq[::-1]

2.6 Changing Font case


from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
dna_seq = Seq("acgtACGT", generic_dna)
dna_seq.upper() --> {Seq('ACGTACGT', DNAAlphabet())}
dna_seq.lower() --> {Seq('acgtacgt', DNAAlphabet())}

3 Bio-information

3.1 Revers Complement


my_seq.reverse_complement()

Seq("ACGTCGTAGCTAC").complement() # standard example => output: Seq('TGCAGCATCGATG')
Seq("ACGTCGTAGCTAC").reverse_complement() # output => Seq('GTAGCTACGACGT')

3.2 Translation

from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

Seq("UUU", IUPAC.unambiguous_rna).translate() # for RNA
Seq("TTT", IUPAC.unambiguous_dna).translate() # for DNA

3.3 Translation Tables

from Bio.Data import CodonTable

standard_table = CodonTable.unambiguous_dna_by_name['Standard']
mito_table = CodonTable.unambiguous_dna_by_name['Vertebrate Mitochondrial']

##same as:

standard_table = CodonTable.unambiguous_dna_by_id[1]
mito_table = CodonTable.unambiguous_dna_by_id[2]

mito_table.stop_codons #--> { ['TAA', 'TAG', 'AGA', 'AGG'] }
mito_table.start_codons # --> { ['ATT', 'ATC', 'ATA', 'ATG', 'GTG'] }
mito_table.forward_table["ACG"] #--> { 'T' }

print(standard_table)
  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I   | ACT T   | AAT N   | AGT S   | T
A | ATC I   | ACC T   | AAC N   | AGC S   | C
A | ATA I   | ACA T   | AAA K   | AGA R   | A
A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V   | GCG A   | GAG E   | GGG G   | G
--+---------+---------+---------+---------+--

4. Alignment

1. Echo a test file

## run in bash
echo ">TRINITY_DN106095_c2_g1_i2
MSRIMKVFLFLAVMVCISEAQLHAQCLCPRVRSRISSMTDIREVQIYEATIFCDRMEIVVTNDSGLRYCLNPKLKAVQKLLTAMKPKTSTTARPTVHSSSTGSTNTARM
>TRINITY_DN92154_c0_g1_i1
DIHVRRRTLTRSKTLGRSTNVNKMKLCILLMLGTLLVLVYGMPPISRDYNTHCRCLQVESRIIPPNSLKSIKLVPEGPHCPDMEVIAGLSNGEKVCLNPRSSWVKKLVNFVLEKQQGGALPKNQGQ" > test.fa

2. Align

## run in python
from Bio import pairwise2
from Bio.Seq import Seq
from Bio.pairwise2 import format_alignment
from Bio.SubsMat import MatrixInfo

seq1 = Seq("MSRIMKVFLFLAVMVCISEAQLHAQCLCPRVRSRISSMTDIREVQIYEATIFCDRMEIVVTNDSGLRYCLNPKLKAVQKLLTAMKPKTSTTARPTVHSSSTGSTNTARM")
seq2 = Seq("DIHVRRRTLTRSKTLGRSTNVNKMKLCILLMLGTLLVLVYGMPPISRDYNTHCRCLQVESRIIPPNSLKSIKLVPEGPHCPDMEVIAGLSNGEKVCLNPRSSWVKKLVNFVLEKQQGGALPKNQGQ")
alignments = pairwise2.align.globalxx(seq1, seq2)
test_alignments = pairwise2.align.localds(seq1, seq2, MatrixInfo.blosum62, -10, -1)

for alignment in alignments:
print(format_alignment(*alignment))

for alignment in test_alignments:
print(format_alignment(*alignment))
SingleLetterAlphabet() alignment with 6 rows and 65 columns
MQNTPAERLPAIIEKAKSKHDINVWLLDRQGRDLLEQRVPAKVA...EGP B7RZ31_9GAMM/59-123
AKQRGIAGLEEWLHRLDHSEAIPIFLIDEAGKDLLEREVPADIT...KKP A0A0C3NPG9_9PROT/58-119
ARRHGQEYFQQWLERQPKKVKEQVFAVDQFGRELLGRPLPEDMA...KKP A0A143HL37_9GAMM/57-121
TRRHGPESFRFWLERQPVEARDRIYAIDRSGAEILDRPIPRGMA...NKP A0A0X3UC67_9GAMM/57-121
AINRNTQQLTQDLRAMPNWSLRFVYIVDRNNQDLLKRPLPPGIM...NRK B3PFT7_CELJU/62-126
AVNATEREFTERIRTLPHWARRNVFVLDSQGFEIFDRELPSPVA...NRT K4KEM7_SIMAS/61-125

Quick Alignment

from Bio import pairwise2
from Bio.pairwise2 import format_alignment
alignments = pairwise2.align.globalxx("ACCGT", "ACG")
print(format_alignment(*alignments[0]))
ACCGT
| ||
A-CG-
  Score=3

PDB file

Split the chain from PDF file

Cite: David Cain

import os
from Bio import PDB

class ChainSplitter:
def __init__(self, out_dir=None):
""" Create parsing and writing objects, specify output directory. """
self.parser = PDB.PDBParser()
self.writer = PDB.PDBIO()
if out_dir is None:
out_dir = os.path.join(os.getcwd(), "chain_PDBs")
self.out_dir = out_dir

def make_pdb(self, pdb_path, chain_letters, overwrite=False, struct=None):
""" Create a new PDB file containing only the specified chains.

Returns the path to the created file.

:param pdb_path: full path to the crystal structure
:param chain_letters: iterable of chain characters (case insensitive)
:param overwrite: write over the output file if it exists
"""
chain_letters = [chain.upper() for chain in chain_letters]

# Input/output files
(pdb_dir, pdb_fn) = os.path.split(pdb_path)
pdb_id = pdb_fn[3:7]
out_name = "pdb%s_%s.ent" % (pdb_id, "".join(chain_letters))
out_path = os.path.join(self.out_dir, out_name)
print ("OUT PATH:",out_path)
plural = "s" if (len(chain_letters) > 1) else "" # for printing

# Skip PDB generation if the file already exists
if (not overwrite) and (os.path.isfile(out_path)):
print("Chain%s %s of '%s' already extracted to '%s'." %
(plural, ", ".join(chain_letters), pdb_id, out_name))
return out_path

print("Extracting chain%s %s from %s..." % (plural,
", ".join(chain_letters), pdb_fn))

# Get structure, write new file with only given chains
if struct is None:
struct = self.parser.get_structure(pdb_id, pdb_path)
self.writer.set_structure(struct)
self.writer.save(out_path, select=SelectChains(chain_letters))

return out_path

class SelectChains(PDB.Select):
""" Only accept the specified chains when saving. """
def __init__(self, chain_letters):
self.chain_letters = chain_letters

def accept_chain(self, chain):
return (chain.get_id() in self.chain_letters)

if __name__ == "__main__":
""" Parses PDB id's desired chains, and creates new PDB structures. """
import sys
if not len(sys.argv) == 2:
print( "Usage: $ python %s 'pdb.txt'" % __file__)
sys.exit()

pdb_textfn = sys.argv[1]

pdbList = PDB.PDBList()
splitter = ChainSplitter("") # Change me.

with open(pdb_textfn) as pdb_textfile:
for line in pdb_textfile:
pdb_id = line[:4].lower()
chain = line[4]
pdb_fn = pdbList.retrieve_pdb_file(pdb_id)
splitter.make_pdb(pdb_fn, chain)

Another example:

This one works fine for me

Carlos Rodrigues

from Bio.PDB import Select, PDBIO
from Bio.PDB.PDBParser import PDBParser
import sys

pdb_file = sys.argv[1]

class ChainSelect(Select):
def __init__(self, chain):
self.chain = chain
def accept_chain(self, chain):
if chain.get_id() == self.chain:
return 1
else:
return 0

chains = ['A','B','C']
p = PDBParser(PERMISSIVE=1)
structure = p.get_structure(pdb_file, pdb_file)

for chain in chains:
pdb_chain_file = 'pdb_file_chain_{}.pdb'.format(chain)
io_w_no_h = PDBIO()
io_w_no_h.set_structure(structure)
io_w_no_h.save('{}'.format(pdb_chain_file), ChainSelect(chain))

Extract fasta from PDB file

import sys
from Bio import SeqIO

PDBFile = sys.argv[1]
with open(PDBFile, 'r') as pdb_file:
for record in SeqIO.parse(pdb_file, 'pdb-atom'):
print('>' + record.id)
print(record.seq)
Author

Karobben

Posted on

2020-01-22

Updated on

2024-01-11

Licensed under

Comments