Source code for pydna.seq

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
A subclass of Biopython Bio.Seq.Seq

Has a number of extra methods and uses
the :class:`pydna._pretty_str.pretty_str` class instead of str for a
nicer output in the IPython shell.
"""

import re
from typing import List, Optional, Tuple, Union

from Bio.SeqUtils.ProtParam import ProteinAnalysis
from seguid import lsseguid
from Bio.SeqUtils import seq3
from Bio.SeqUtils import gc_fraction
from Bio.Seq import Seq as _Seq

from pydna.codon import rare_codons
from pydna.codon import start as _start
from pydna.codon import stop as _stop
from pydna.codon import n_end as _n_end
from pydna.utils import rc
from pydna._pretty import PrettyTable


class Seq(_Seq):
    """Nucleotide sequence built on :class:`Bio.Seq.Seq`.

    On top of the Biopython class this adds translation into
    :class:`ProteinSeq`, codon-usage helpers (``cai``, ``rarecodons``,
    ``startcodon``, ``stopcodon``, ``express``), two ORF finders
    (``orfs``, ``orfs2``) and a SEGUID checksum.
    """

    def translate(
        self,
        table: Union[str, int] = "Standard",
        stop_symbol: str = "*",
        to_stop: bool = False,
        cds: bool = False,
        gap: str = "-",
    ) -> "ProteinSeq":
        """
        Translate into protein.

        The table argument is the name of a codon table (string). These
        names can be for example "Standard" or "Alternative Yeast Nuclear"
        for the yeast CUG clade where the CUG codon is translated as serine
        instead of the standard leucine.

        Over forty translation tables are available from the BioPython
        Bio.Data.CodonTable module. Look at the keys of the dictionary
        ``CodonTable.ambiguous_generic_by_name``.
        These are based on tables in this file provided by NCBI:

        https://ftp.ncbi.nlm.nih.gov/entrez/misc/data/gc.prt

        Standard table

          |  T      |  C      |  A      |  G      |
        --+---------+---------+---------+---------+--
        T | TTT F   | TCT S   | TAT Y   | TGT C   | T
        T | TTC F   | TCC S   | TAC Y   | TGC C   | C
        T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
        T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
        --+---------+---------+---------+---------+--
        C | CTT L   | CCT P   | CAT H   | CGT R   | T
        C | CTC L   | CCC P   | CAC H   | CGC R   | C
        C | CTA L   | CCA P   | CAA Q   | CGA R   | A
        C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
        --+---------+---------+---------+---------+--
        A | ATT I   | ACT T   | AAT N   | AGT S   | T
        A | ATC I   | ACC T   | AAC N   | AGC S   | C
        A | ATA I   | ACA T   | AAA K   | AGA R   | A
        A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
        --+---------+---------+---------+---------+--
        G | GTT V   | GCT A   | GAT D   | GGT G   | T
        G | GTC V   | GCC A   | GAC D   | GGC G   | C
        G | GTA V   | GCA A   | GAA E   | GGA G   | A
        G | GTG V   | GCG A   | GAG E   | GGG G   | G
        --+---------+---------+---------+---------+--

        Parameters
        ----------
        table : str or int, optional
            The default is "Standard". Can be a table id integer, see here
            for table numbering
            https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
        stop_symbol : str, optional
            The default is "*". Single character string to indicate
            translation stop.
        to_stop : bool, optional
            The default is False. True means that translation terminates at
            the first in frame stop codon. False translates to the end.
        cds : bool, optional
            The default is False. If True, checks that the sequence starts
            with a valid alternative start codon, that the sequence length
            is a multiple of three, and that there is a single in frame
            stop codon at the end. If these tests fail, an exception is
            raised.
        gap : str, optional
            The default is "-".

        Returns
        -------
        ProteinSeq
            A :class:`ProteinSeq` (a Biopython Seq subclass) holding the
            translated amino acid code.
        """
        # TODO: is this method needed?
        # Translate through a plain Bio.Seq.Seq built from the same data and
        # re-wrap the result. ``table`` must be forwarded explicitly;
        # otherwise the Standard table is always used.
        protein = _Seq(self._data).translate(
            table=table,
            stop_symbol=stop_symbol,
            to_stop=to_stop,
            cds=cds,
            gap=gap,
        )
        return ProteinSeq(protein._data)

    def transcribe(self) -> "Seq":
        """
        Transcribe a DNA sequence into RNA and return the RNA sequence as a
        new Seq object.
        """
        return Seq(_Seq(self._data).transcribe()._data)

    def gc(self) -> float:
        """Return GC content (``Bio.SeqUtils.gc_fraction``) rounded to three decimals."""
        return round(gc_fraction(self._data.upper().decode("ASCII")), 3)

    def cai(self, organism: str = "sce") -> float:
        """Return the value of :func:`pydna.utils.cai` for the uppercased sequence.

        Parameters
        ----------
        organism : str, optional
            Passed through as the ``organism`` keyword. The default is "sce".
        """
        from pydna.utils import cai as _cai

        return _cai(self._data.upper().decode("ASCII"), organism=organism)

    def rarecodons(self, organism: str = "sce") -> List[slice]:
        """Return slices for the in-frame codons found in ``rare_codons[organism]``.

        The sequence is uppercased and read in frame from position 0;
        trailing bases that do not fill a whole codon are ignored.
        """
        rare = rare_codons[organism]
        s = self._data.upper().decode("ASCII")
        return [
            slice(start, start + 3, 1)
            for start in range(0, len(s) // 3 * 3, 3)
            if s[start : start + 3] in rare
        ]

    def startcodon(self, organism: str = "sce") -> Optional[float]:
        """Look up the first three bases in the ``organism`` start codon table.

        Returns ``None`` when the triplet is not present in the table.
        """
        return _start[organism].get(self._data.upper().decode("ASCII")[:3])

    def stopcodon(self, organism: str = "sce") -> Optional[float]:
        """Look up the last three bases in the ``organism`` stop codon table.

        Returns ``None`` when the triplet is not present in the table.
        """
        return _stop[organism].get(self._data.upper().decode("ASCII")[-3:])

    def express(self, organism: str = "sce") -> PrettyTable:
        """Return a one-row table summarising the sequence as a coding sequence.

        Columns: first and last codon ("cds"), length in codons ("len"),
        ``cai``, ``gc``, start codon value ("sta"), stop codon value
        ("stp"), the ``n_end`` table entry for the second codon's amino
        acid ("n-end"), one count column per codon in
        ``rare_codons[organism]``, and the fraction of rare codons ("rare").

        Parameters
        ----------
        organism : str, optional
            Key used for every organism-specific table. The default is "sce".
        """
        rare = rare_codons[organism]
        s = self._data.upper().decode("ASCII")
        codons = [s[i : i + 3] for i in range(0, len(s) // 3 * 3, 3)]
        counts = [codons.count(codon) for codon in rare]

        row = [
            f"{s[:3]}...{s[-3:]}",
            len(self) / 3,
            self.cai(organism),
            self.gc(),
            # Use the requested organism rather than the "sce" default.
            self.startcodon(organism),
            self.stopcodon(organism),
            _n_end[organism].get(seq3(self[3:6].translate())),
            *counts,
            # Guard against sequences shorter than one codon.
            round(sum(counts) / len(codons), 3) if codons else 0.0,
        ]

        x = PrettyTable(
            ["cds", "len", "cai", "gc", "sta", "stp", "n-end"] + rare + ["rare"]
        )
        x.add_row(row)
        return x

    def orfs2(self, minsize: int = 30) -> List[_Seq]:
        """Return open reading frames found by regex, longest first.

        A match is ``ATG``, then at least ``minsize`` arbitrary triplets
        (matched lazily), then one of ``TAG``, ``TAA`` or ``TGA``; matching
        is case-insensitive. The pattern does not exclude stop codons among
        the first ``minsize`` triplets. The search restarts one position
        after each match start, so overlapping matches are reported.

        Parameters
        ----------
        minsize : int, optional
            Minimum number of triplets between ATG and the stop codon.
            The default is 30.
        """
        orf = re.compile(
            f"ATG(?:...){{{minsize},}}?(?:TAG|TAA|TGA)", flags=re.IGNORECASE
        )
        s = self._data.decode("ASCII")
        matches: List[slice] = []
        start = 0
        while True:
            match = orf.search(s, pos=start)
            if match is None:
                break
            matches.append(slice(match.start(), match.end()))
            start = match.start() + 1
        return sorted((self[sl] for sl in matches), key=len, reverse=True)

    def orfs(self, minsize: int = 100) -> List[Tuple[int, int]]:
        """Return ``(x, y)`` pairs from :func:`pydna.utils.three_frame_orfs`.

        The frame element of each result is dropped; ``minsize`` is passed
        on as the ``limit`` keyword.
        """
        from pydna.utils import three_frame_orfs

        dna = self._data.decode("ASCII")
        return [(x, y) for _frame, x, y in three_frame_orfs(dna, limit=minsize)]

    def seguid(self) -> str:
        """Url safe SEGUID [#]_ for the sequence.

        This checksum is the same as seguid but with base64.urlsafe
        encoding instead of the normal base64. This means that
        the characters + and / are replaced with - and _ so that
        the checksum can be part of a URL.

        Examples
        --------
        >>> from pydna.seq import Seq
        >>> a = Seq("aa")
        >>> a.seguid()
        'lsseguid=gBw0Jp907Tg_yX3jNgS4qQWttjU'

        References
        ----------
        .. [#] http://wiki.christophchamp.com/index.php/SEGUID
        """
        return lsseguid(
            self._data.decode("ascii").upper(), alphabet="{DNA-extended},AU"
        )

    def reverse_complement(self):
        """Return the reverse complement as a new object of the same class."""
        return self.__class__(rc(self._data))

    rc = reverse_complement
class ProteinSeq(_Seq):
    """Protein sequence built on :class:`Bio.Seq.Seq`.

    Nucleotide-only operations inherited from the Biopython class are
    disabled (they raise ``NotImplementedError``), and a protein SEGUID
    checksum plus a few ``ProteinAnalysis`` shortcuts are added.
    """

    # The disabled methods accept and ignore any arguments so that calls
    # written for the Bio.Seq.Seq signatures (e.g. ``translate(table=1)``,
    # ``complement(inplace=True)``) raise NotImplementedError rather than an
    # unrelated TypeError about unexpected arguments.

    def translate(self, *args, **kwargs):
        """Not defined for protein; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not defined for protein.")

    def complement(self, *args, **kwargs):
        """Not defined for protein; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not defined for protein.")

    def complement_rna(self, *args, **kwargs):
        """Not defined for protein; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not defined for protein.")

    def reverse_complement(self, *args, **kwargs):
        """Not defined for protein; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not defined for protein.")

    rc = reverse_complement

    def reverse_complement_rna(self, *args, **kwargs):
        """Not defined for protein; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not defined for protein.")

    def transcribe(self, *args, **kwargs):
        """Not defined for protein; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not defined for protein.")

    def back_transcribe(self, *args, **kwargs):
        """Not defined for protein; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not defined for protein.")

    def seguid(self) -> str:
        """Url safe SEGUID [#]_ for the sequence.

        This checksum is the same as seguid but with base64.urlsafe
        encoding instead of the normal base64. This means that
        the characters + and / are replaced with - and _ so that
        the checksum can be part of a URL.

        Examples
        --------
        >>> from pydna.seq import ProteinSeq
        >>> a = ProteinSeq("aa")
        >>> a.seguid()
        'lsseguid=gBw0Jp907Tg_yX3jNgS4qQWttjU'

        References
        ----------
        .. [#] http://wiki.christophchamp.com/index.php/SEGUID
        """
        return lsseguid(
            self._data.decode("utf8").upper(), alphabet="{protein-extended}"
        )

    def __getitem__(self, key):
        """Index or slice like the parent class, re-typing the result when possible.

        The result's class is set to this object's class. Results whose
        class cannot be reassigned (the assignment raises ``TypeError``)
        are returned unchanged.
        """
        result = super().__getitem__(key)
        try:
            result.__class__ = self.__class__
        except TypeError:
            # Deliberate best effort: keep whatever the parent returned.
            pass
        return result

    def _pa(self) -> ProteinAnalysis:
        """Return a ``ProteinAnalysis`` object for the ASCII-decoded sequence."""
        return ProteinAnalysis(self._data.decode("ascii"))

    def molecular_weight(self) -> float:
        """Return ``ProteinAnalysis.molecular_weight()`` for the sequence."""
        return self._pa().molecular_weight()

    def pI(self) -> float:
        """Return the isoelectric point from ``ProteinAnalysis.isoelectric_point()``."""
        return self._pa().isoelectric_point()

    def instability_index(self) -> float:
        """
        Instability index according to Guruprasad et al.

        Value above 40 means the protein has a short half life.

        Guruprasad K., Reddy B.V.B., Pandit M.W. Protein Engineering
        4:155-161(1990).
        """
        return self._pa().instability_index()