Source code for pydna.seq

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
A subclass of Biopython Bio.Seq.Seq

Has a number of extra methods and uses
the :class:`pydna._pretty_str.pretty_str` class instread of str for a
nicer output in the IPython shell.
"""

from Bio.SeqUtils.ProtParam import ProteinAnalysis
from pydna.codon import rare_codons
from pydna.codon import start as _start
from pydna.codon import stop as _stop
from pydna.codon import n_end as _n_end
from seguid import lsseguid
from pydna.utils import rc

from Bio.SeqUtils import seq3
from Bio.SeqUtils import gc_fraction
import re
from Bio.Seq import Seq as _Seq
from pydna._pretty import PrettyTable

from typing import List, Optional, Tuple



[docs]
class Seq(_Seq):
    """docstring."""

    # @property
    # def full_sequence(self):
    #     return self

    # def translate(
    #     self,
    #     *args,
    #     stop_symbol: str = "*",
    #     to_stop: bool = False,
    #     cds: bool = False,
    #     gap: str = "-",
    #     **kwargs,
    # ) -> "ProteinSeq":
    #     """Translate.."""
    #     p = super().translate(
    #         *args, stop_symbol=stop_symbol, to_stop=to_stop, cds=cds, gap=gap, **kwargs
    #     )
    #     return ProteinSeq(p._data)


[docs]
    def translate(
        self,
        table: [str, int] = "Standard",
        stop_symbol: [str] = "*",
        to_stop: bool = False,
        cds: bool = False,
        gap: str = "-",
    ) -> _Seq:

        # TODO: is this method needed?
        """
        Translate into protein.

        The table argument is the name of a codon table (string). These names
        can be for example "Standard" or "Alternative Yeast Nuclear" for the
        yeast CUG clade where the CUG codon is translated as serine instead
        of the standard leucine.

        Over forty translation tables are available from the BioPython
        Bio.Data.CodonTable module. Look at the keys of the dictionary
        ´CodonTable.ambiguous_generic_by_name´.
        These are based on tables in this file provided by NCBI:

        https://ftp.ncbi.nlm.nih.gov/entrez/misc/data/gc.prt

        Standard table

          |  T      |  C      |  A      |  G      |
        --+---------+---------+---------+---------+--
        T | TTT F   | TCT S   | TAT Y   | TGT C   | T
        T | TTC F   | TCC S   | TAC Y   | TGC C   | C
        T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
        T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
        --+---------+---------+---------+---------+--
        C | CTT L   | CCT P   | CAT H   | CGT R   | T
        C | CTC L   | CCC P   | CAC H   | CGC R   | C
        C | CTA L   | CCA P   | CAA Q   | CGA R   | A
        C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
        --+---------+---------+---------+---------+--
        A | ATT I   | ACT T   | AAT N   | AGT S   | T
        A | ATC I   | ACC T   | AAC N   | AGC S   | C
        A | ATA I   | ACA T   | AAA K   | AGA R   | A
        A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
        --+---------+---------+---------+---------+--
        G | GTT V   | GCT A   | GAT D   | GGT G   | T
        G | GTC V   | GCC A   | GAC D   | GGC G   | C
        G | GTA V   | GCA A   | GAA E   | GGA G   | A
        G | GTG V   | GCG A   | GAG E   | GGG G   | G
        --+---------+---------+---------+---------+--


        Parameters
        ----------
        table : [str, int], optional
            The default is "Standard". Can be a table id integer, see here for table
            numbering https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
        stop_symbol : [str], optional
            The default is "*". Single character string to indicate translation stop.
        to_stop : bool, optional
            The default is False. True means that translation terminates at the first
              in frame stop codon. False translates to the end.
        cds : bool, optional
            The default is False. If True, checks that the sequence starts with a
            valid alternative start codon sequence length is a multiple of three, and
            that there is a single in frame stop codon at the end. If these tests fail,
            an exception is raised.
        gap : str, optional
            The default is "-".

        Returns
        -------
        Bio.Seq.Seq
            A Biopython Seq object with the translated amino acid code.

        """

        p = _Seq(self._data).translate(
            stop_symbol=stop_symbol, to_stop=to_stop, cds=cds, gap=gap
        )
        return ProteinSeq(p._data)



[docs]
    def transcribe(self) -> _Seq:
        """
        Transcribe a DNA sequence into RNA and return the RNA sequence
        as a new Seq object.

        """
        return Seq(_Seq(self._data).transcribe()._data)



[docs]
    def gc(self) -> float:
        """Return GC content."""
        return round(gc_fraction(self._data.upper().decode("ASCII")), 3)



[docs]
    def cai(self, organism: str = "sce") -> float:
        """docstring."""
        from pydna.utils import cai as _cai

        return _cai(self._data.upper().decode("ASCII"), organism=organism)



[docs]
    def rarecodons(self, organism: str = "sce") -> List[slice]:
        """docstring."""
        rare = rare_codons[organism]
        s = self._data.upper().decode("ASCII")
        slices: List[slice] = []
        for i in range(0, len(self) // 3):
            x, y = i * 3, i * 3 + 3
            trip = s[x:y]
            if trip in rare:
                slices.append(slice(x, y, 1))
        return slices



[docs]
    def startcodon(self, organism: str = "sce") -> Optional[float]:
        """docstring."""
        return _start[organism].get(self._data.upper().decode("ASCII")[:3])



[docs]
    def stopcodon(self, organism: str = "sce") -> Optional[float]:
        """docstring."""
        return _stop[organism].get(self._data.upper().decode("ASCII")[-3:])



[docs]
    def express(self, organism: str = "sce") -> PrettyTable:
        """docstring."""
        x = PrettyTable(
            ["cds", "len", "cai", "gc", "sta", "stp", "n-end"]
            + rare_codons[organism]
            + ["rare"]
        )
        val = []

        val.append(
            f"{self._data.upper().decode('ASCII')[:3]}..."
            f"{self._data.upper().decode('ASCII')[-3:]}"
        )
        val.append(len(self) / 3)
        val.append(self.cai(organism))
        val.append(self.gc())
        val.append(self.startcodon())
        val.append(self.stopcodon())
        val.append(
            _n_end[organism].get(seq3(self[3:6].translate())),
        )
        s = self._data.upper().decode("ASCII")
        trps = [s[i * 3 : i * 3 + 3] for i in range(0, len(s) // 3)]
        tot = 0
        for cdn in rare_codons[organism]:
            cnt = trps.count(cdn)
            tot += cnt
            val.append(cnt)
        val.append(round(tot / len(trps), 3))
        x.add_row(val)
        return x



[docs]
    def orfs2(self, minsize: int = 30) -> List[str]:
        """docstring."""
        orf = re.compile(
            f"ATG(?:...){{{minsize},}}?(?:TAG|TAA|TGA)", flags=re.IGNORECASE
        )
        start = 0
        matches: List[slice] = []
        s = self._data.decode("ASCII")

        while True:
            match = orf.search(s, pos=start)
            if match:
                matches.append(slice(match.start(), match.end()))
                start = match.start() + 1
            else:
                break
        return sorted([self[sl] for sl in matches], key=len, reverse=True)



[docs]
    def orfs(self, minsize: int = 100) -> List[Tuple[int, int]]:
        dna = self._data.decode("ASCII")
        from pydna.utils import three_frame_orfs

        return [(x, y) for frame, x, y in three_frame_orfs(dna, limit=minsize)]



[docs]
    def seguid(self) -> str:
        """Url safe SEGUID [#]_ for the sequence.

        This checksum is the same as seguid but with base64.urlsafe
        encoding instead of the normal base64. This means that
        the characters + and / are replaced with - and _ so that
        the checksum can be part of a URL.

        Examples
        --------
        >>> from pydna.seq import Seq
        >>> a = Seq("aa")
        >>> a.seguid()
        'lsseguid=gBw0Jp907Tg_yX3jNgS4qQWttjU'

        References
        ----------
        .. [#] http://wiki.christophchamp.com/index.php/SEGUID
        """
        return lsseguid(
            self._data.decode("ascii").upper(), alphabet="{DNA-extended},AU"
        )


    # def __getitem__(self, key):
    #     result = super().__getitem__(key)
    #     try:
    #         result.__class__ = self.__class__
    #     except TypeError:
    #         pass
    #     return result


[docs]
    def reverse_complement(self):
        return self.__class__(rc(self._data))


    rc = reverse_complement




[docs]
class ProteinSeq(_Seq):
    """docstring."""


[docs]
    def translate(self):
        raise NotImplementedError("Not defined for protein.")



[docs]
    def complement(self):
        raise NotImplementedError("Not defined for protein.")



[docs]
    def complement_rna(self):
        raise NotImplementedError("Not defined for protein.")



[docs]
    def reverse_complement(self):
        raise NotImplementedError("Not defined for protein.")


    rc = reverse_complement


[docs]
    def reverse_complement_rna(self):
        raise NotImplementedError("Not defined for protein.")



[docs]
    def transcribe(self):
        raise NotImplementedError("Not defined for protein.")



[docs]
    def back_transcribe(self):
        raise NotImplementedError("Not defined for protein.")



[docs]
    def seguid(self) -> str:
        """Url safe SEGUID [#]_ for the sequence.

        This checksum is the same as seguid but with base64.urlsafe
        encoding instead of the normal base64. This means that
        the characters + and / are replaced with - and _ so that
        the checksum can be part of a URL.

        Examples
        --------
        >>> from pydna.seq import ProteinSeq
        >>> a = ProteinSeq("aa")
        >>> a.seguid()
        'lsseguid=gBw0Jp907Tg_yX3jNgS4qQWttjU'

        References
        ----------
        .. [#] http://wiki.christophchamp.com/index.php/SEGUID
        """
        return lsseguid(
            self._data.decode("utf8").upper(), alphabet="{protein-extended}"
        )


    def __getitem__(self, key):
        result = super().__getitem__(key)
        try:
            result.__class__ = self.__class__
        except TypeError:
            pass
        return result

    def _pa(self) -> ProteinAnalysis:
        # breakpoint()
        return ProteinAnalysis(self._data.decode("ascii"))


[docs]
    def molecular_weight(self) -> float:
        return self._pa().molecular_weight()



[docs]
    def pI(self) -> float:
        return self._pa().isoelectric_point()



[docs]
    def instability_index(self) -> float:
        """
        Instability index according to Guruprasad et al.

        Value above 40 means the protein is has a short half life.

        Guruprasad K., Reddy B.V.B., Pandit M.W. Protein Engineering 4:155-161(1990).
        """
        return self._pa().instability_index()