Source code for pydna.dseq

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Provides the Dseq class for handling double stranded DNA sequences.

Dseq is a subclass of :class:`Bio.Seq.Seq`. The Dseq class
is mostly useful as a part of the :class:`pydna.dseqrecord.Dseqrecord` class
which can hold more meta data.

The Dseq class support the notion of circular and linear DNA topology.
"""

import itertools
import re
import copy
import sys
import math
import inspect
from typing import List, Tuple, Union

from Bio.Restriction import RestrictionBatch
from Bio.Restriction import CommOnly

from seguid import ldseguid
from seguid import cdseguid

from pydna.seq import Seq
from Bio.Seq import _SeqAbstractBaseClass
from Bio.Data.IUPACData import unambiguous_dna_weights
from Bio.Data.IUPACData import unambiguous_rna_weights
from Bio.Data.IUPACData import atom_weights
from pydna._pretty import pretty_str
from pydna.utils import rc
from pydna.utils import flatten
from pydna.utils import cuts_overlap

from pydna.alphabet import basepair_dict
from pydna.alphabet import dscode_to_watson_table
from pydna.alphabet import dscode_to_crick_table
from pydna.alphabet import regex_ds_melt_factory
from pydna.alphabet import regex_ss_melt_factory
from pydna.alphabet import dscode_to_full_sequence_table
from pydna.alphabet import dscode_to_watson_tail_table
from pydna.alphabet import dscode_to_crick_tail_table
from pydna.alphabet import complement_table_for_dscode
from pydna.alphabet import letters_not_in_dscode
from pydna.alphabet import get_parts
from pydna.alphabet import representation_tuple
from pydna.alphabet import dsbreaks

from pydna.common_sub_strings import common_sub_strings
from pydna.types import DseqType, EnzymesType, CutSiteType


# Sequences larger than this gets a truncated representation.
length_limit_for_repr = 30
placeholder = letters_not_in_dscode[-1]


[docs] class CircularBytes(bytes): """ A circular bytes sequence: indexing and slicing wrap around index 0. """ def __new__(cls, value: bytes | bytearray | memoryview): return super().__new__(cls, bytes(value)) def __getitem__(self, key): n = len(self) if n == 0: if isinstance(key, slice): return self.__class__(b"") raise IndexError("CircularBytes index out of range (empty bytes)") if isinstance(key, int): return super().__getitem__(key % n) if isinstance(key, slice): start, stop, step = key.start, key.stop, key.step step = 1 if step is None else step if step == 0: raise ValueError("slice step cannot be zero") if step > 0: start = 0 if start is None else start stop = n if stop is None else stop while stop <= start: stop += n rng = range(start, stop, step) else: start = (n - 1) if start is None else start stop = -1 if stop is None else stop while stop >= start: stop -= n rng = range(start, stop, step) limit = n if step % n == 0 else n * 2 out = bytearray() count = 0 for i in rng: out.append(super().__getitem__(i % n)) count += 1 if count > limit: break return self.__class__(bytes(out)) return super().__getitem__(key)
[docs] def cutaround(self, start: int, length: int) -> bytes: """ Return a circular slice of given length starting at index `start`. Can exceed len(self), wrapping around as needed. Examples -------- s = CircularBytes(b"ABCDE") assert s.cutaround(3, 7) == b"DEABCDE" assert s.cutaround(-1, 4) == b"EABC" """ n = len(self) if n == 0 or length <= 0: return self.__class__(b"") start %= n out = bytearray() for i in range(length): out.append(self[(start + i) % n]) return self.__class__(bytes(out))
[docs] def find( self, sub: bytes | bytearray | memoryview | str, start: int = 0, end: int | None = None, ) -> int: """ Find a subsequence in the circular sequence, possibly wrapping across the origin. Returns -1 if not found. """ n = len(self) if n == 0: return -1 end = n if end is None else min(end, n) doubled = self + self try: sub = sub.encode("ascii") except AttributeError: pass pos = doubled.find(bytes(sub), start, n + len(sub) - 1) if pos == -1 or pos >= n: return -1 return pos
[docs] class Dseq(Seq): """Dseq describes a double stranded DNA fragment, linear or circular. Dseq can be initiated in two ways, using two strings, each representing the Watson (upper, sense) strand, the Crick (lower, antisense) strand and an optional value describing the stagger betwen the strands on the left side (ovhg). Alternatively, a single string represenation using dsIUPAC codes can be used. If a single string is used, the letters of that string are interpreted as base pairs rather than single bases. For example "A" would indicate the basepair "A/T". An expanded IUPAC code is used where the letters PEXI have been assigned to GATC on the Watson strand with no paring base on the Crick strand G/"", A/"", T/"" and C/"". The letters QFZJ have been assigned the opposite base pairs with an empty Watson strand ""/G, ""/A, ""/T, and ""/C. :: PEXIGATCQFZJ would indicate the linear double-stranded fragment: GATCGATC CTAGCTAG Parameters ---------- watson : str a string representing the Watson (sense) DNA strand or a basepair represenation. crick : str, optional a string representing the Crick (antisense) DNA strand. ovhg : int, optional A positive or negative number to describe the stagger between the Watson and Crick strands. see below for a detailed explanation. circular : bool, optional True indicates that sequence is circular, False that it is linear. Examples -------- Dseq is a subclass of the Biopython Bio.Seq.Seq class. The constructor can accept two strings representing the Watson (sense) and Crick(antisense) DNA strands. These are interpreted as single stranded DNA. There is a check for complementarity between the strands. If the DNA molecule is staggered on the left side, an integer ovhg (overhang) must be given, describing the stagger between the Watson and Crick strand in the 5' end of the fragment. Additionally, the optional boolean parameter circular can be given to indicate if the DNA molecule is circular. The most common usage of the Dseq class is probably not to use it directly, but to create it as part of a Dseqrecord object (see :class:`pydna.dseqrecord.Dseqrecord`). This works in the same way as for the relationship between the :class:`Bio.Seq.Seq` and :class:`Bio.SeqRecord.SeqRecord` classes in Biopython. There are multiple ways of creating a Dseq object directly listed below, but you can also use the function Dseq.from_full_sequence_and_overhangs() to create a Dseq: Two arguments (string, string), no overhang provided: >>> from pydna.dseq import Dseq >>> Dseq("gggaaat","ttt") Dseq(-7) gggaaat ttt If Watson and Crick are given, but not ovhg, an attempt will be made to find the best annealing between the strands. There are important limitations to this. If there are several ways to anneal the strands, this will fail. For long fragments it is quite slow. Three arguments (string, string, ovhg=int): The ovhg parameter is an integer describing the length of the Crick strand overhang on the left side (the 5' end of Watson strand). The ovhg parameter controls the stagger at the five prime end:: dsDNA overhang nnn... 2 nnnnn... nnnn... 1 nnnnn... nnnnn... 0 nnnnn... nnnnn... -1 nnnn... nnnnn... -2 nnn... Example of creating Dseq objects with different amounts of stagger: >>> Dseq(watson="att", crick="acata", ovhg=-2) Dseq(-7) att ataca >>> Dseq(watson="ata",crick="acata",ovhg=-1) Dseq(-6) ata ataca >>> Dseq(watson="taa",crick="actta",ovhg=0) Dseq(-5) taa attca >>> Dseq(watson="aag",crick="actta",ovhg=1) Dseq(-5) aag attca >>> Dseq(watson="agt",crick="actta",ovhg=2) Dseq(-5) agt attca If the ovhg parameter is specified a Crick strand also needs to be supplied, or an exception is raised. >>> Dseq(watson="agt", ovhg=2) Traceback (most recent call last): ... ValueError: ovhg (overhang) defined without a crick strand. The shape or topology of the fragment is set by the circular parameter, True or False (default). >>> Dseq("aaa", "ttt", ovhg = 0) # A linear sequence by default Dseq(-3) aaa ttt >>> Dseq("aaa", "ttt", ovhg = 0, circular = False) # A linear sequence if circular is False Dseq(-3) aaa ttt >>> Dseq("aaa", "ttt", ovhg = 0, circular = True) # A circular sequence Dseq(o3) aaa ttt >>> Dseq("aaa", "ttt", ovhg=1, circular = False) Dseq(-4) aaa ttt >>> Dseq("aaa","ttt",ovhg=-1) Dseq(-4) aaa ttt >>> Dseq("aaa", "ttt", circular = True , ovhg=0) Dseq(o3) aaa ttt >>> a=Dseq("tttcccc","aaacccc") >>> a Dseq(-11) tttcccc ccccaaa >>> a.ovhg 4 >>> b=Dseq("ccccttt","ccccaaa") >>> b Dseq(-11) ccccttt aaacccc >>> b.ovhg -4 >>> dsIUPAC [#]_ is an nn extension to the IUPAC alphabet used to describe ss regions: :: aaaGATC GATCccc ad-hoc representations CTAGttt gggCTAG QFZJaaaPEXI PEXIcccQFZJ dsIUPAC Coercing to string >>> str(a) 'ggggtttcccc' A Dseq object can be longer that either the watson or crick strands. :: <-- length --> GATCCTTT AAAGCCTAG <-- length --> GATCCTTT AAAGCCCTA The slicing of a linear Dseq object works mostly as it does for a string. >>> s="ggatcc" >>> s[2:3] 'a' >>> s[2:4] 'at' >>> s[2:4:-1] '' >>> s[::2] 'gac' >>> from pydna.dseq import Dseq >>> d=Dseq(s, circular=False) >>> d[2:3] Dseq(-1) a t >>> d[2:4] Dseq(-2) at ta >>> d[2:4:-1] Dseq(-0) <BLANKLINE> <BLANKLINE> >>> d[::2] Dseq(-3) gac ctg The slicing of a circular Dseq object has a slightly different meaning. >>> s="ggAtCc" >>> d=Dseq(s, circular=True) >>> d Dseq(o6) ggAtCc ccTaGg >>> d[4:3] Dseq(-5) CcggA GgccT The slice [X:X] produces an empty slice for a string, while this will return the linearized sequence starting at X: >>> s="ggatcc" >>> d=Dseq(s, circular=True) >>> d Dseq(o6) ggatcc cctagg >>> d[3:3] Dseq(-6) tccgga aggcct >>> See Also -------- pydna.dseqrecord.Dseqrecord """ def __init__( self, watson: Union[str, bytes], crick: Union[str, bytes, None] = None, ovhg=None, circular=False, pos=0, ): if isinstance(watson, (bytes, bytearray)): # watson is decoded to a string if needed. watson = watson.decode("ascii") if isinstance(crick, (bytes, bytearray)): # crick is decoded to a string if needed. crick = crick.decode("ascii") if crick is None: if ovhg is not None: raise ValueError("ovhg (overhang) defined without a crick strand.") """ Giving only the watson string implies inferring the Crick complementary strand from the Watson sequence. The watson string can contain dscode letters wich will be interpreted as outlined in the pydna.alphabet module. The _data property must be a byte string for compatibility with Biopython Bio.Seq.Seq """ data = watson self._data = data.encode("ascii") else: """ Crick strand given, ovhg is optional. An important consequence is that the watson and crick strands are interpreted as single stranded DNA that is supposed to anneal. If ovhg was not given, we try to guess the value below. This will fail if there are two or more ways to anneal with equal length of the double stranded part. """ if ovhg is None: # ovhg not given, try to guess from sequences limit = int(math.log(len(watson)) / math.log(4)) olaps = common_sub_strings( str(watson).lower(), str(rc(crick).lower()), limit, ) """No overlaps found, strands do not anneal""" if len(olaps) == 0: raise ValueError( "Could not anneal the two strands." f" looked for annealing with at least {limit} basepairs" " Please provide and overhang value (ovhg parameter)" ) """ We extract the positions and length of the first (longest) overlap, since common_sub_strings sorts the overlaps by length, longest first. """ (pos_watson, pos_crick, longest_olap_length), *rest = olaps """ We see if there is another overlap of the same length This means that annealing is ambigous. User should provide and ovhg value. """ if any( olap_length >= longest_olap_length for _, _, olap_length in rest ): raise ValueError( "More than one way of annealing the" " strands. Please provide ovhg value" ) ovhg = pos_crick - pos_watson """ Pad both strands on left side ovhg spaces a negative number gives no padding, """ sense = ovhg * " " + watson antisense = -ovhg * " " + crick[::-1] max_len = max(len(sense), len(antisense)) """pad both strands on right side to same size.""" sense = sense.ljust(max_len) antisense = antisense.ljust(max_len) """both strands padded so that bsepairs align""" assert len(sense) == len(antisense) data = [] for w, c in zip(sense, antisense): try: data.append(basepair_dict[w, c]) except KeyError as err: print(f"Base mismatch in representation {err}") raise ValueError(f"Base mismatch in representation: {err}") data = "".join(data).strip() self._data = data.encode("ascii") self.circular = circular self.pos = pos if circular: data += data[0:1] dsb = dsbreaks(data) if dsb: msg = "".join(dsb) raise ValueError( f"Molecule is internally split in {len(dsb)} location(s):\n\n{msg}".strip() )
[docs] @classmethod def quick(cls, data: bytes, *args, circular=False, pos=0, **kwargs): """Fastest way to instantiate an object of the Dseq class. No checks of parameters are made. Does not call Bio.Seq.Seq.__init__() which has lots of time consuming checks. """ obj = cls.__new__(cls) obj.circular = circular obj.pos = pos obj._data = data return obj
[docs] @classmethod def from_representation(cls, dsdna: str, *args, **kwargs): obj = cls.__new__(cls) obj.circular = False obj.pos = 0 clean = inspect.cleandoc("\n" + dsdna) watson, crick = [ ln for ln in clean.splitlines() if ln.strip() and not ln.strip().startswith("Dseq(") ] ovhgw = len(watson) - len(watson.lstrip()) ovhgc = -(len(crick) - len(crick.lstrip())) ovhg = ovhgw or ovhgc watson = watson.strip() crick = crick.strip()[::-1] return Dseq(watson, crick, ovhg)
[docs] @classmethod def from_full_sequence_and_overhangs( cls, full_sequence: str, crick_ovhg: int, watson_ovhg: int ): """Create a linear Dseq object from a full sequence and the 3' overhangs of each strand. The order of the parameters is like this because the 3' overhang of the crick strand is the one on the left side of the sequence. Parameters ---------- full_sequence: str The full sequence of the Dseq object. crick_ovhg: int The overhang of the crick strand in the 3' end. Equivalent to Dseq.ovhg. watson_ovhg: int The overhang of the watson strand in the 5' end. Returns ------- Dseq A Dseq object. Examples -------- >>> Dseq.from_full_sequence_and_overhangs('AAAAAA', crick_ovhg=2, watson_ovhg=2) Dseq(-6) AAAA TTTT >>> Dseq.from_full_sequence_and_overhangs('AAAAAA', crick_ovhg=-2, watson_ovhg=2) Dseq(-6) AAAAAA TT >>> Dseq.from_full_sequence_and_overhangs('AAAAAA', crick_ovhg=2, watson_ovhg=-2) Dseq(-6) AA TTTTTT >>> Dseq.from_full_sequence_and_overhangs('AAAAAA', crick_ovhg=-2, watson_ovhg=-2) Dseq(-6) AAAA TTTT """ full_sequence_rev = str(Dseq(full_sequence).reverse_complement()) watson = full_sequence crick = full_sequence_rev # If necessary, we trim the left side if crick_ovhg < 0: crick = crick[:crick_ovhg] elif crick_ovhg > 0: watson = watson[crick_ovhg:] # If necessary, we trim the right side if watson_ovhg < 0: watson = watson[:watson_ovhg] elif watson_ovhg > 0: crick = crick[watson_ovhg:] return Dseq(watson, crick=crick, ovhg=crick_ovhg)
@property def watson(self) -> str: """ The watson (upper) strand of the double stranded fragment 5'-3'. Returns ------- TYPE DESCRIPTION. """ return self._data.decode("ascii").translate(dscode_to_watson_table).strip() @property def crick(self) -> str: """ The crick (lower) strand of the double stranded fragment 5'-3'. Returns ------- TYPE DESCRIPTION. """ return self._data.decode("ascii").translate(dscode_to_crick_table).strip()[::-1] @property def left_ovhg(self) -> int: """ The 5' overhang of the lower strand compared the the upper. See module docstring for more information. Returns ------- TYPE DESCRIPTION. """ parts = self.get_parts() if parts.single_watson or parts.single_crick: return None return -len(parts.sticky_left5) or len(parts.sticky_left3) ovhg = left_ovhg @property def right_ovhg(self) -> int: """Overhang at the right side (end).""" parts = self.get_parts() if parts.single_watson or parts.single_crick: return None return -len(parts.sticky_right5) or len(parts.sticky_right3) watson_ovhg = right_ovhg def __str__(self) -> str: """ A string representation of the sequence. The returned string is the watson strand of a blunt version of the sequence. >>> ds = Dseq.from_representation( ... ''' ... GAATTC ... TAA ... ''') >>> str(ds) 'GAATTC' >>> ds = Dseq.from_representation( ... ''' ... ATT ... CTTAAG ... ''') >>> str(ds) 'GAATTC' Returns ------- str A string representation of the sequence. """ return bytes(self).decode("ascii") to_blunt_string = __str__ # alias of __str__ # TODO: consider removing def __bytes__(self) -> bytes: return self._data.translate(dscode_to_full_sequence_table)
[docs] def mw(self) -> float: """The molecular weight of the DNA/RNA molecule in g/mol. The molecular weight data in Biopython Bio.Data.IUPACData is used. The DNA is assumed to have a 5'-phosphate as many DNA fragments from restriction digestion do: :: P - G-A-T-T-A-C-A - OH | | | | | | | OH - C-T-A-A-T-G-T - P The molecular weights listed in the unambiguous_dna_weights dictionary refers to free monophosphate nucleotides. One water molecule is removed for every phopshodiester bond formed between nucleotides. For linear molecules, the weight of one water molecule is added to account for the terminal hydroxyl group and a hydrogen on the 5' terminal phosphate group. :: P - G---A---T - OH P - C---A - OH | | | | | OH - C---T---A---A---T---G---T - P If the DNA is discontinuous, the internal 5'- end is assumed to have a phosphate and the 3'- a hydroxyl group: Examples -------- >>> from pydna.dseq import Dseq >>> ds_lin_obj = Dseq("GATTACA") >>> ds_lin_obj Dseq(-7) GATTACA CTAATGT >>> round(ds_lin_obj.mw(), 1) 4359.8 >>> ds_circ_obj = Dseq("GATTACA", circular = True) >>> round(ds_circ_obj.mw(), 1) 4323.8 >>> ssobj = Dseq("PEXXEIE") >>> ssobj Dseq(-7) GATTACA ||||||| >>> round(ssobj.mw(), 1) 2184.4 >>> ds_lin_obj2 = Dseq("GATZFCA") >>> ds_lin_obj2 Dseq(-7) GAT CA CTAATGT >>> round(ds_lin_obj2.mw(), 1) 3724.4 """ h2o = atom_weights["H"] * 2 + atom_weights["O"] mwd = unambiguous_rna_weights | unambiguous_dna_weights | {" ": 0} watsn_weight = sum(mwd[nt] - h2o for nt in self.watson.upper()) crick_weight = sum(mwd[nt] - h2o for nt in self.crick.upper()) watsn_weight += h2o * len(re.findall(r" +", self.watson)) crick_weight += h2o * len(re.findall(r" +", self.crick)) if watsn_weight and not self.circular: watsn_weight += h2o if crick_weight and not self.circular: crick_weight += h2o return watsn_weight + crick_weight
[docs] def find( self, sub: Union[_SeqAbstractBaseClass, str, bytes], start=0, end=sys.maxsize ) -> int: """This method behaves like the python string method of the same name. Returns an integer, the index of the first occurrence of substring argument sub in the (sub)sequence given by [start:end]. Returns -1 if the subsequence is NOT found. The search is case sensitive. Parameters ---------- sub : string or Seq object a string or another Seq object to look for. start : int, optional slice start. end : int, optional slice end. Examples -------- >>> from pydna.dseq import Dseq >>> seq = Dseq("agtaagt") >>> seq Dseq(-7) agtaagt tcattca >>> seq.find("taa") 2 >>> seq = Dseq(watson="agta",crick="actta",ovhg=-2) >>> seq Dseq(-7) agta attca >>> seq.find("taa") -1 >>> seq = Dseq(watson="agta",crick="actta",ovhg=-2) >>> seq Dseq(-7) agta attca >>> seq.find("ta") 2 """ if self.circular: result = CircularBytes(self._data).find(sub, start, end) else: result = super().find(sub, start, end) return result
def __contains__(self, sub: [str, bytes]) -> bool: return self.find(sub) != -1 def __getitem__(self, sl: [slice, int]) -> DseqType: if isinstance(sl, int): sl = slice(sl, sl + 1, 1) sl = slice(sl.start, sl.stop, sl.step) if self.circular: cb = CircularBytes(self._data) return self.quick(cb[sl]) return super().__getitem__(sl) def __eq__(self, other: DseqType) -> bool: """Compare to another Dseq object OR an object that implements watson, crick and ovhg properties. This comparison is case insensitive. """ try: same = ( other.watson.lower() == self.watson.lower() and other.crick.lower() == self.crick.lower() and other.ovhg == self.ovhg and self.circular == other.circular ) # Also test for alphabet ? except AttributeError: same = False return same def __repr__(self, lim: int = length_limit_for_repr) -> pretty_str: header = f"{self.__class__.__name__}({({False: '-', True: 'o'}[self.circular])}{len(self)})" w, c = representation_tuple( self._data.decode("ascii"), length_limit_for_repr=length_limit_for_repr ) w = w or "|" * len(c) c = c or "|" * len(w) return pretty_str(header + "\n" + w + "\n" + c)
[docs] def reverse_complement(self) -> "Dseq": """Dseq object where watson and crick have switched places. This represents the same double stranded sequence. Examples -------- >>> from pydna.dseq import Dseq >>> a=Dseq("catcgatc") >>> a Dseq(-8) catcgatc gtagctag >>> b=a.reverse_complement() >>> b Dseq(-8) gatcgatg ctagctac >>> """ return Dseq.quick(rc(self._data), circular=self.circular)
rc = reverse_complement # alias for reverse_complement
[docs] def shifted(self: DseqType, shift: int) -> DseqType: """ Shifted copy of a circular Dseq object. >>> ds = Dseq("TAAG", circular = True) >>> ds.shifted(1) # First bp moved to right side: Dseq(o4) AAGT TTCA >>> ds.shifted(-1) # Last bp moved to left side: Dseq(o4) GTAA CATT """ if not self.circular: raise TypeError("DNA is not circular.") shift = shift % len(self) if not shift: return copy.deepcopy(self) else: return (self[shift:] + self[:shift]).looped()
[docs] def looped(self: DseqType) -> DseqType: """Circularized Dseq object. This can only be done if the two ends are compatible, otherwise a TypeError is raised. Examples -------- >>> from pydna.dseq import Dseq >>> a=Dseq("catcgatc") >>> a Dseq(-8) catcgatc gtagctag >>> a.looped() Dseq(o8) catcgatc gtagctag >>> b = Dseq("iatcgatj") >>> b Dseq(-8) catcgat tagctag >>> b.looped() Dseq(o7) catcgat gtagcta >>> c = Dseq("jatcgati") >>> c Dseq(-8) atcgatc gtagcta >>> c.looped() Dseq(o7) catcgat gtagcta >>> d = Dseq("ietcgazj") >>> d Dseq(-8) catcga agctag >>> d.looped() Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/usr/local/lib/python2.7/dist-packages/pydna/dsdna.py", line 357, in looped if type5 == type3 and str(sticky5) == str(rc(sticky3)): TypeError: DNA cannot be circularized. 5' and 3' sticky ends not compatible! >>> """ if self.circular: return copy.deepcopy(self) type5, sticky5 = self.five_prime_end() type3, sticky3 = self.three_prime_end() err = TypeError( "DNA cannot be circularized.\n" "5' and 3' sticky ends not compatible!" ) if type5 != type3: raise err try: # Test if sticky ends are compatible self + self except TypeError: raise err new = self.cast_to_ds_left()[: len(self) - len(sticky3)] new.circular = True return new
[docs] def five_prime_end(self) -> Tuple[str, str]: """Returns a 2-tuple of trings describing the structure of the 5' end of the DNA fragment. The tuple contains (type , sticky) where type is eiter "5'" or "3'". sticky is always in lower case and contains the sequence of the protruding end in 5'-3' direction. See examples below: Examples -------- >>> from pydna.dseq import Dseq >>> a = Dseq("aa", "tttg", ovhg=2) >>> a Dseq(-4) aa gttt >>> a.five_prime_end() ("3'", 'tg') >>> a = Dseq("caaa", "tt", ovhg=-2) >>> a Dseq(-4) caaa tt >>> a.five_prime_end() ("5'", 'ca') >>> a = Dseq("aa", "tt") >>> a Dseq(-2) aa tt >>> a.five_prime_end() ('blunt', '') See also -------- pydna.dseq.Dseq.three_prime_end """ # See docstring for function pydna.utils.get_parts for details # on what is contained in parts. parts = self.get_parts() sticky5 = parts.sticky_left5.translate(dscode_to_watson_table) sticky3 = parts.sticky_left3.translate(dscode_to_crick_table)[::-1] single_watson = parts.single_watson.translate(dscode_to_watson_table) single_crick = parts.single_crick.translate(dscode_to_crick_table)[::-1] # The walrus operator returns the value being assigned, so # we can test if it is empty or not. if sticky := single_watson: type_ = "single" elif sticky := single_crick: type_ = "single" elif sticky5 == sticky3 == "": type_, sticky = "blunt", "" elif sticky := sticky5: type_ = "5'" elif sticky := sticky3: type_ = "3'" return type_, sticky.lower()
[docs] def three_prime_end(self) -> Tuple[str, str]: """Returns a tuple describing the structure of the 5' end of the DNA fragment >>> a = Dseq("aa", "gttt", ovhg=0) >>> a Dseq(-4) aa tttg >>> a.three_prime_end() ("5'", 'gt') >>> a = Dseq("aaac", "tt", ovhg=0) >>> a Dseq(-4) aaac tt >>> a.three_prime_end() ("3'", 'ac') >>> from pydna.dseq import Dseq >>> a=Dseq("aaa", "ttt") >>> a Dseq(-3) aaa ttt >>> a.three_prime_end() ('blunt', '') See also -------- pydna.dseq.Dseq.five_prime_end """ # See docstring for function pydna.utils.get_parts for details # on what is contained in parts. parts = self.get_parts() sticky5 = parts.sticky_right5.translate(dscode_to_crick_table)[::-1] sticky3 = parts.sticky_right3.translate(dscode_to_watson_table) single_watson = parts.single_watson.translate(dscode_to_watson_table) single_crick = parts.single_crick.translate(dscode_to_crick_table)[::-1] # The walrus operator returns the value being assigned, so # we can test if it is empty or not. if sticky := single_watson: type_ = "single" elif sticky := single_crick: type_ = "single" elif sticky5 == sticky3 == "": type_, sticky = "blunt", "" elif sticky := sticky5: type_ = "5'" elif sticky := sticky3: type_ = "3'" return type_, sticky.lower()
def __add__(self: DseqType, other: [DseqType, str, bytes]) -> DseqType: """ Adding two Dseq objects together. >>> ds = Dseq("a", "t", ovhg=0) >>> ds Dseq(-1) a t >>> ds + ds Dseq(-2) aa tt >>> "g" + ds # adding a string of left side returns a Dseq Dseq(-2) ga ct >>> ds + "c" # adding a string of right side returns a Dseq Dseq(-2) ac tg Parameters ---------- other : [DseqType, str, bytes] Object to be added. Raises ------ TypeError Preventing adding to a circular sequence. Returns ------- DseqType A new Dseq object. """ if self.circular: raise TypeError("circular DNA cannot be ligated!") try: if other.circular: raise TypeError("circular DNA cannot be ligated!") except AttributeError: pass # If other evaluates to False, return a copy of self. if not other: return copy.deepcopy(self) # If self evaluates to False, return a copy of other. elif not self: return copy.deepcopy(other) # get right side end properties for self. self_type, self_tail = self.three_prime_end() try: other_type, other_tail = other.five_prime_end() except AttributeError: # if other does not have the expected properties # most likely it is a string that can be cast as # a Dseq. other_type, other_tail = "blunt", "" other = Dseq(other) err = TypeError("sticky ends not compatible!") # The sticky ends has to be of the same type # or # one or both of is "single" indicating a stranded molecule. if (self_type != other_type) and ("single" not in (self_type, other_type)): raise err # tail length has to be equal for two phosphdiester bonds to form if len(self_tail) != len(other_tail): raise err # Each basepair is checked against the pydna.alphabet basepair_dict # which contains the permitted base pairings. for w, c in zip(self_tail, other_tail[::-1]): try: basepair_dict[(w, c)] except KeyError: raise err return self.__class__( self.watson + other.watson, other.crick + self.crick, self.ovhg ) def __mul__(self: DseqType, number: int) -> DseqType: if not isinstance(number, int): raise TypeError( "TypeError: can't multiply Dseq" f" by non-int of type {type(number)}" ) return Dseq("").join(list(itertools.repeat(self, number))) def _fill_in_left(self: DseqType, nucleotides: str) -> str: stuffer = "" type, se = self.five_prime_end() if type == "5'": for n in rc(se): if n in nucleotides: stuffer += n else: break return self.crick + stuffer, self.ovhg + len(stuffer) def _fill_in_right(self: DseqType, nucleotides: str) -> str: stuffer = "" type, se = self.three_prime_end() if type == "5'": for n in rc(se): if n in nucleotides: stuffer += n else: break return self.watson + stuffer
[docs] def fill_in(self, nucleotides: Union[None, str] = None) -> DseqType: """Fill in of five prime protruding end with a DNA polymerase that has only DNA polymerase activity (such as Exo-Klenow [#]_). Exo-Klenow is a modified version of the Klenow fragment of E. coli DNA polymerase I, which has been engineered to lack both 3-5 proofreading and 5-3 exonuclease activities. and any combination of A, G, C or T. Default are all four nucleotides together. Parameters ---------- nucleotides : str Examples -------- >>> from pydna.dseq import Dseq >>> b=Dseq("caaa", "cttt") >>> b Dseq(-5) caaa tttc >>> b.fill_in() Dseq(-5) caaag gtttc >>> b.fill_in("g") Dseq(-5) caaag gtttc >>> b.fill_in("tac") Dseq(-5) caaa tttc >>> c=Dseq("aaac", "tttg") >>> c Dseq(-5) aaac gttt >>> c.fill_in() Dseq(-5) aaac gttt >>> a=Dseq("aaa", "ttt") >>> a Dseq(-3) aaa ttt >>> a.fill_in() Dseq(-3) aaa ttt References ---------- .. [#] http://en.wikipedia.org/wiki/Klenow_fragment#The_exo-_Klenow_fragment """ if nucleotides is None: nucleotides = "GATCRYWSMKHBVDN" nucleotides = set(nucleotides.lower() + nucleotides.upper()) crick, ovhg = self._fill_in_left(nucleotides) watson = self._fill_in_right(nucleotides) return Dseq(watson, crick, ovhg)
klenow = fill_in # alias
[docs] def nibble_to_blunt(self) -> DseqType: """ Simulates treatment a nuclease with both 5'-3' and 3'-5' single strand specific exonuclease activity (such as mung bean nuclease [#]_) Mung bean nuclease is a nuclease enzyme derived from mung bean sprouts that preferentially degrades single-stranded DNA and RNA into 5'-phosphate- and 3'-hydroxyl-containing nucleotides. Treatment results in blunt DNA, regardless of wheter the protruding end is 5' or 3'. :: ggatcc -> gatcc ctaggg ctagg ggatcc -> ggatc tcctag cctag >>> from pydna.dseq import Dseq >>> b=Dseq("caaa", "cttt") >>> b Dseq(-5) caaa tttc >>> b.mung() Dseq(-3) aaa ttt >>> c=Dseq("aaac", "tttg") >>> c Dseq(-5) aaac gttt >>> c.mung() Dseq(-3) aaa ttt References ---------- .. [#] http://en.wikipedia.org/wiki/Mung_bean_nuclease """ parts = self.get_parts() return self.__class__(parts.middle)
mung = nibble_to_blunt
[docs] def T4(self, nucleotides=None) -> DseqType: """ Fill in 5' protruding ends and nibble 3' protruding ends. This is done using a DNA polymerase providing 3'-5' nuclease activity such as T4 DNA polymerase. This can be done in presence of any combination of the four nucleotides A, G, C or T. T4 DNA polymerase is widely used to “polish” DNA ends because of its strong 3-5 exonuclease activity in the absence of dNTPs, it chews back 3′ overhangs to create blunt ends; in the presence of limiting dNTPs, it can fill in 5′ overhangs; and by carefully controlling reaction time, temperature, and nucleotide supply, you can generate defined recessed or blunt termini. Tuning the nucleotide set can facilitate engineering of partial sticky ends. Default are all four nucleotides together. :: aaagatc-3 aaa 3' ends are always removed. ||| ---> ||| A and T needed or the molecule will 3-ctagttt ttt degrade completely. 5-gatcaaa gatcaaaGATC 5' ends are filled in the ||| ---> ||||||||||| presence of GATC tttctag-5 CTAGtttctag 5-gatcaaa gatcaaaGAT 5' ends are partially filled in the ||| ---> ||||||||| presence of GAT to produce a 1 nt tttctag-5 TAGtttctag 5' overhang 5-gatcaaa gatcaaaGA 5' ends are partially filled in the ||| ---> ||||||| presence of GA to produce a 2 nt tttctag-5 AGtttctag 5' overhang 5-gatcaaa gatcaaaG 5' ends are partially filled in the ||| ---> ||||| presence of G to produce a 3 nt tttctag-5 Gtttctag 5' overhang Parameters ---------- nucleotides : str Examples -------- >>> from pydna.dseq import Dseq >>> a = Dseq.from_representation( ... ''' ... gatcaaa ... tttctag ... ''') >>> a Dseq(-11) gatcaaa tttctag >>> a.T4() Dseq(-11) gatcaaagatc ctagtttctag >>> a.T4("GAT") Dseq(-11) gatcaaagat tagtttctag >>> a.T4("GA") Dseq(-11) gatcaaaga agtttctag >>> a.T4("G") Dseq(-11) gatcaaag gtttctag """ if not nucleotides: nucleotides = "GATCRYWSMKHBVDN" nucleotides = set(nucleotides.lower() + nucleotides.upper()) type, se = self.five_prime_end() if type == "5'": crick, ovhg = self._fill_in_left(nucleotides) else: if type == "3'": ovhg = 0 crick = self.crick[: -len(se)] else: ovhg = 0 crick = self.crick x = len(crick) - 1 while x >= 0: if crick[x] in nucleotides: break x -= 1 ovhg = x - len(crick) + 1 + ovhg crick = crick[: x + 1] if not crick: ovhg = 0 watson = self.watson type, se = self.three_prime_end() if type == "5'": watson = self._fill_in_right(nucleotides) else: if type == "3'": watson = self.watson[: -len(se)] x = len(watson) - 1 while x >= 0: if watson[x] in nucleotides: break x -= 1 watson = watson[: x + 1] return Dseq(watson, crick, ovhg)
t4 = T4 # alias for the T4 method.
[docs] def nibble_five_prime_left(self: DseqType, n: int = 1) -> DseqType: """ 5' => 3' resection at the left side (start) of the molecule. The argument n indicate the number of nucleotides that are to be removed. The outcome of this depend on the structure of the molecule. See the two examples below: The figure below indicates a recess of length two from a blunt DNA fragment. The resulting DNA fragment has a 3' protruding single strand. :: gatc tc |||| --> || ctag ctag The figure below indicates a recess of length two from a DNA fragment with a 5' sticky end resulting in a blunt sequence. :: ttgatc gatc |||| --> |||| ctag ctag >>> from pydna.dseq import Dseq >>> ds = Dseq("gatc") >>> ds Dseq(-4) gatc ctag >>> ds.nibble_five_prime_left(2) Dseq(-4) tc ctag >>> ds.nibble_five_prime_left(3) Dseq(-4) c ctag >>> ds.nibble_five_prime_left(4) Dseq(-4) |||| ctag >>> ds = Dseq.from_representation( ... ''' ... GGgatc ... ctag ... ''') >>> ds Dseq(-6) GGgatc ctag >>> ds.nibble_five_prime_left(2) Dseq(-4) gatc ctag Parameters ---------- n : int, optional The default is 1. This is the number of nucleotides removed. Returns ------- DseqType DESCRIPTION. """ n += max(0, self.ovhg or 0) return Dseq( self._data[:n] .translate(dscode_to_crick_table) .translate(complement_table_for_dscode) .translate(dscode_to_crick_tail_table) .lstrip() + self._data[n:] )
[docs] def nibble_five_prime_right(self: DseqType, n: int = 1) -> DseqType: """ 5' => 3' resection at the right side (end) of the molecule. The argument n indicate the number of nucleotides that are to be removed. The outcome of this depend on the structure of the molecule. See the two examples below: The figure below indicates a recess of length two from a blunt DNA fragment. The resulting DNA fragment has a 3' protruding single strand. :: gatc gatc |||| --> || ctag ct The figure below indicates a recess of length two from a DNA fragment with a 5' sticky end resulting in a blunt sequence. :: gatc gatc |||| --> |||| ctagtt ctag >>> from pydna.dseq import Dseq >>> ds = Dseq("gatc") >>> ds Dseq(-4) gatc ctag >>> ds.nibble_five_prime_right(2) Dseq(-4) gatc ct >>> ds.nibble_five_prime_right(3) Dseq(-4) gatc c >>> ds.nibble_five_prime_right(4) Dseq(-4) gatc |||| >>> ds = Dseq.from_representation( ... ''' ... gatc ... ctagGG ... ''') >>> ds.nibble_five_prime_right(2) Dseq(-4) gatc ctag """ n = len(self) - n ovhg = len(self) if self.right_ovhg is None else self.right_ovhg n -= max(0, ovhg) return Dseq( self._data[:n] + self._data[n:] .translate(dscode_to_watson_table) .translate(dscode_to_watson_tail_table) .lstrip() )
exo1_front = nibble_five_prime_left # TODO: consider using the new names exo1_end = nibble_five_prime_right # TODO: consider using the new names
[docs] def nibble_three_prime_left(self: DseqType, n=1) -> DseqType: """ 3' => 5' resection at the left side (beginning) of the molecule. The argument n indicate the number of nucleotides that are to be removed. The outcome of this depend on the structure of the molecule. See the two examples below: The figure below indicates a recess of length two from a blunt DNA fragment. The resulting DNA fragment has a 5' protruding single strand. :: gatc gatc |||| --> || ctag ag The figure below indicates a recess of length two from a DNA fragment with a 3' sticky end resulting in a blunt sequence. :: gatc gatc |||| --> |||| ttctag ctag >>> from pydna.dseq import Dseq >>> ds = Dseq("gatc") >>> ds Dseq(-4) gatc ctag >>> ds.nibble_three_prime_left(2) Dseq(-4) gatc ag >>> ds.nibble_three_prime_left(3) Dseq(-4) gatc g >>> ds.nibble_three_prime_left(4) Dseq(-4) gatc |||| >>> ds = Dseq.from_representation( ... ''' ... gatc ... CCctag ... ''') >>> ds Dseq(-6) gatc CCctag >>> ds.nibble_three_prime_left(2) Dseq(-4) gatc ctag """ ovhg = len(self) if self.ovhg is None else self.ovhg n -= min(0, ovhg) return Dseq( self._data[:n] .translate(dscode_to_watson_table) .translate(dscode_to_watson_tail_table) .lstrip() + self._data[n:] )
[docs] def nibble_three_prime_right(self: DseqType, n=1) -> DseqType: """ 3' => 5' resection at the right side (end) of the molecule. The argument n indicate the number of nucleotides that are to be removed. The outcome of this depend on the structure of the molecule. See the two examples below: The figure below indicates a recess of length two from a blunt DNA fragment. The resulting DNA fragment has a 5' protruding single strand. :: gatc ga |||| --> || ctag ctag The figure below indicates a recess of length two from a DNA fragment with a 3' sticky end resulting in a blunt sequence. :: gatctt gatc |||| --> |||| ctag ctag >>> from pydna.dseq import Dseq >>> ds = Dseq("gatc") >>> ds Dseq(-4) gatc ctag >>> ds.nibble_three_prime_right(2) Dseq(-4) ga ctag >>> ds.nibble_three_prime_right(3) Dseq(-4) g ctag >>> ds.nibble_three_prime_right(4) Dseq(-4) |||| ctag >>> ds = Dseq.from_representation( ... ''' ... gatcCC ... ctag ... ''') >>> ds.nibble_three_prime_right(2) Dseq(-4) gatc ctag """ n = len(self) - n ovhg = len(self) if self.right_ovhg is None else self.right_ovhg n += min(0, ovhg) return Dseq( self._data[:n] + self._data[n:] .translate(dscode_to_crick_table) .translate(complement_table_for_dscode) .translate(dscode_to_crick_tail_table) .lstrip() )
[docs] def no_cutters( self, batch: Union[RestrictionBatch, None] = None ) -> RestrictionBatch: """Enzymes in a RestrictionBatch not cutting sequence.""" if batch is None: batch = CommOnly ana = batch.search(self) ncut = {enz: sitelist for (enz, sitelist) in ana.items() if not sitelist} return RestrictionBatch(ncut)
[docs] def unique_cutters( self, batch: Union[RestrictionBatch, None] = None ) -> RestrictionBatch: """Enzymes in a RestrictionBatch cutting sequence once.""" if batch is None: batch = CommOnly return self.n_cutters(n=1, batch=batch)
once_cutters = unique_cutters # alias for unique_cutters
[docs] def twice_cutters( self, batch: Union[RestrictionBatch, None] = None ) -> RestrictionBatch: """Enzymes in a RestrictionBatch cutting sequence twice.""" if batch is None: batch = CommOnly return self.n_cutters(n=2, batch=batch)
[docs] def n_cutters( self, n=3, batch: Union[RestrictionBatch, None] = None ) -> RestrictionBatch: """Enzymes in a RestrictionBatch cutting n times.""" if batch is None: batch = CommOnly ana = batch.search(self) ncut = {enz: sitelist for (enz, sitelist) in ana.items() if len(sitelist) == n} return RestrictionBatch(ncut)
[docs] def cutters(self, batch: Union[RestrictionBatch, None] = None) -> RestrictionBatch: """Enzymes in a RestrictionBatch cutting sequence at least once.""" if batch is None: batch = CommOnly ana = batch.search(self) ncut = {enz: sitelist for (enz, sitelist) in ana.items() if sitelist} return RestrictionBatch(ncut)
[docs] def seguid(self) -> str: """SEGUID checksum for the sequence.""" if self.circular: cs = cdseguid( self.watson.upper(), self.crick.upper(), alphabet="{DNA-extended}" ) else: """docstring.""" w = f"{self.ovhg * '-'}{self.watson}{'-' * (-self.ovhg + len(self.crick) - len(self.watson))}".upper() c = f"{'-' * (self.ovhg + len(self.watson) - len(self.crick))}{self.crick}{-self.ovhg * '-'}".upper() cs = ldseguid(w, c, alphabet="{DNA-extended},AU") return cs
[docs] def isblunt(self) -> bool: """isblunt. Return True if Dseq is linear and blunt and false if staggered or circular. Examples -------- >>> from pydna.dseq import Dseq >>> a=Dseq("gat") >>> a Dseq(-3) gat cta >>> a.isblunt() True >>> a=Dseq("gat", "atcg") >>> a Dseq(-4) gat gcta >>> a.isblunt() False >>> a=Dseq("gat", "gatc") >>> a Dseq(-4) gat ctag >>> a.isblunt() False >>> a=Dseq("gat", circular=True) >>> a Dseq(o3) gat cta >>> a.isblunt() False """ parts = self.get_parts() return not any( ( parts.sticky_right5, parts.sticky_right3, parts.sticky_left3, parts.sticky_left5, self.circular, ) )
[docs] def terminal_transferase(self, nucleotides: str = "a") -> DseqType: """ Terminal deoxynucleotidyl transferase (TdT) is a template-independent DNA polymerase that adds nucleotides to the 3′-OH ends of DNA, typically single-stranded or recessed 3′ ends. In cloning, it’s classically used to create homopolymer tails (e.g. poly-dG on a vector and poly-dC on an insert) so that fragments can anneal via complementary overhangs (“tailing” cloning). This activity ia also present in some DNA polymerases, such as Taq polymerase. This property is used in the populat T/A cloning protocol ([#]_). :: gct gcta ||| --> ||| cga acga >>> from pydna.dseq import Dseq >>> a = Dseq("aa") >>> a = Dseq("gct") >>> a Dseq(-3) gct cga >>> a.terminal_transferase() Dseq(-5) gcta acga >>> a.terminal_transferase("G") Dseq(-5) gctG Gcga Parameters ---------- nucleotides : str, optional The default is "a". Returns ------- DseqType DESCRIPTION. References ---------- .. [#] https://en.wikipedia.org/wiki/TA_cloning """ ovhg = self.ovhg if self.ovhg >= 0: ovhg += len(nucleotides) return Dseq(self.watson + nucleotides, self.crick + nucleotides, ovhg)
[docs] def user(self) -> DseqType: """ USER Enzyme treatment. USER Enzyme is a mixture of Uracil DNA glycosylase (UDG) and the DNA glycosylase-lyase Endonuclease VIII. UDG catalyses the excision of an uracil base, forming an abasic or apyrimidinic site (AP site). Endonuclease VIII removes the AP site creating a DNA gap. :: tagaagtaggUat tagaagtagg at ||||||||||||| ---> |||||||||| || atcUtcatccata atc tcatccata >>> a = Dseq("tagaagtaggUat", "atcUtcatccata"[::-1], 0) >>> a Dseq(-13) tagaagtaggUat atcutcatccAta >>> a.user() Dseq(-13) tagaagtagg at atc tcatccAta Returns ------- DseqType DNA fragment with uracile bases removed. """ return Dseq(self._data.translate(bytes.maketrans(b"UuOo", b"ZzEe")))
[docs] def cut(self: DseqType, *enzymes: EnzymesType) -> Tuple[DseqType, ...]: """Returns a list of linear Dseq fragments produced in the digestion. If there are no cuts, an empty list is returned. Parameters ---------- enzymes : enzyme object or iterable of such objects A Bio.Restriction.XXX restriction objects or iterable. Returns ------- frags : list list of Dseq objects formed by the digestion Examples -------- >>> from pydna.dseq import Dseq >>> seq=Dseq("ggatccnnngaattc") >>> seq Dseq(-15) ggatccnnngaattc cctaggnnncttaag >>> from Bio.Restriction import BamHI,EcoRI >>> type(seq.cut(BamHI)) <class 'tuple'> >>> for frag in seq.cut(BamHI): print(repr(frag)) Dseq(-5) g cctag Dseq(-14) gatccnnngaattc gnnncttaag >>> seq.cut(EcoRI, BamHI) == seq.cut(BamHI, EcoRI) True >>> a,b,c = seq.cut(EcoRI, BamHI) >>> a+b+c Dseq(-15) ggatccnnngaattc cctaggnnncttaag >>> """ cutsites = self.get_cutsites(*enzymes) cutsite_pairs = self.get_cutsite_pairs(cutsites) return tuple(self.apply_cut(*cs) for cs in cutsite_pairs)
[docs] def cutsite_is_valid(self, cutsite: CutSiteType) -> bool: """ Check is a cutsite is valid. A cutsite is a nested 2-tuple with this form: ((cut_watson, ovhg), enz), for example ((396, -4), EcoRI) The cut_watson (positive integer) is the cut position of the sequence as for example returned by the Bio.Restriction module. The ovhg (overhang, positive or negative integer or 0) has the same meaning as for restriction enzymes in the Bio.Restriction module and for pydna.dseq.Dseq objects (see docstring for this module and example below) Enzyme can be None. :: Enzyme overhang EcoRI -4 --GAATTC-- --G AATTC-- |||||| --> | | --CTTAAG-- --CTTAA G-- KpnI 4 --GGTACC-- --GGTAC C-- |||||| --> | | --CCATGG-- --C CATGG-- SmaI 0 --CCCGGG-- --CCC GGG-- |||||| --> ||| ||| --GGGCCC-- --GGG CCC-- >>> from Bio.Restriction import EcoRI, KpnI, SmaI >>> EcoRI.ovhg -4 >>> KpnI.ovhg 4 >>> SmaI.ovhg 0 Returns False if: - Cut positions fall outside the sequence (could be moved to Biopython) TODO: example - Overhang is not double stranded TODO: example - Recognition site is not double stranded or is outside the sequence TODO: example - For enzymes that cut twice, it checks that at least one possibility is valid TODO: example Parameters ---------- cutsite : CutSiteType DESCRIPTION. Returns ------- bool True if cutsite can cut the DNA fragment. """ assert cutsite is not None, "cutsite is None" enz = cutsite[1] watson, crick, ovhg = self.get_cut_parameters(cutsite, True) # The overhang is double stranded overhang_dseq = self[watson:crick] if ovhg < 0 else self[crick:watson] if overhang_dseq.ovhg != 0 or overhang_dseq.watson_ovhg != 0: return False # The recognition site is double stranded and within the sequence start_of_recognition_site = watson - enz.fst5 if start_of_recognition_site < 0: start_of_recognition_site += len(self) end_of_recognition_site = start_of_recognition_site + enz.size if self.circular: end_of_recognition_site %= len(self) recognition_site = self[start_of_recognition_site:end_of_recognition_site] if ( len(recognition_site) == 0 or recognition_site.ovhg != 0 or recognition_site.watson_ovhg != 0 ): if enz is None or enz.scd5 is None: return False else: # For enzymes that cut twice, this might be referring to the second one start_of_recognition_site = watson - enz.scd5 if start_of_recognition_site < 0: start_of_recognition_site += len(self) end_of_recognition_site = start_of_recognition_site + enz.size if self.circular: end_of_recognition_site %= len(self) recognition_site = self[ start_of_recognition_site:end_of_recognition_site ] if ( len(recognition_site) == 0 or recognition_site.ovhg != 0 or recognition_site.watson_ovhg != 0 ): return False return True
[docs] def get_cutsites(self: DseqType, *enzymes: EnzymesType) -> List[CutSiteType]: """Returns a list of cutsites, represented represented as `((cut_watson, ovhg), enz)`: - `cut_watson` is a positive integer contained in `[0,len(seq))`, where `seq` is the sequence that will be cut. It represents the position of the cut on the watson strand, using the full sequence as a reference. By "full sequence" I mean the one you would get from `str(Dseq)`. - `ovhg` is the overhang left after the cut. It has the same meaning as `ovhg` in the `Bio.Restriction` enzyme objects, or pydna's `Dseq` property. - `enz` is the enzyme object. It's not necessary to perform the cut, but can be used to keep track of which enzyme was used. Cuts are only returned if the recognition site and overhang are on the double-strand part of the sequence. Parameters ---------- enzymes : Union[RestrictionBatch,list[_AbstractCut]] Returns ------- list[tuple[tuple[int,int], _AbstractCut]] Examples -------- >>> from Bio.Restriction import EcoRI >>> from pydna.dseq import Dseq >>> seq = Dseq('AAGAATTCAAGAATTC') >>> seq.get_cutsites(EcoRI) [((3, -4), EcoRI), ((11, -4), EcoRI)] `cut_watson` is defined with respect to the "full sequence", not the watson strand: >>> dseq = Dseq.from_full_sequence_and_overhangs('aaGAATTCaa', 1, 0) >>> dseq Dseq(-10) aGAATTCaa ttCTTAAGtt >>> dseq.get_cutsites([EcoRI]) [((3, -4), EcoRI)] Cuts are only returned if the recognition site and overhang are on the double-strand part of the sequence. >>> Dseq('GAATTC').get_cutsites([EcoRI]) [((1, -4), EcoRI)] >>> Dseq.from_full_sequence_and_overhangs('GAATTC', -1, 0).get_cutsites([EcoRI]) [] """ if len(enzymes) == 1 and isinstance(enzymes[0], RestrictionBatch): # argument is probably a RestrictionBatch enzymes = [e for e in enzymes[0]] enzymes = list(dict.fromkeys(flatten(enzymes))) # remove duplicate enzymes out = list() for e in enzymes: # Positions of the cut on the watson strand. They are 1-based, so we subtract # 1 to get 0-based positions cuts_watson = [c - 1 for c in e.search(self, linear=(not self.circular))] out += [((w, e.ovhg), e) for w in cuts_watson] return sorted([cutsite for cutsite in out if self.cutsite_is_valid(cutsite)])
[docs] def left_end_position(self) -> Tuple[int, int]: """ The index in the full sequence of the watson and crick start positions. full sequence (str(self)) for all three cases is AAA :: AAA AA AAT TT TTT TTT Returns (0, 1) Returns (1, 0) Returns (0, 0) """ if self.ovhg > 0: return self.ovhg, 0 return 0, -self.ovhg
[docs] def right_end_position(self) -> Tuple[int, int]: """The index in the full sequence of the watson and crick end positions. full sequence (str(self)) for all three cases is AAA ``` AAA AA AAA TT TTT TTT Returns (3, 2) Returns (2, 3) Returns (3, 3) ``` """ if self.watson_ovhg < 0: return len(self) + self.watson_ovhg, len(self) return len(self), len(self) - self.watson_ovhg
[docs] def get_ss_meltsites(self: DseqType, length: int) -> tuple[int, int]: """ Single stranded DNA melt sites Two lists of 2-tuples of integers are returned. Each tuple (`((from, to))`) contains the start and end positions of a single stranded region, shorter or equal to `length`. In the example below, the middle 2 nt part is released from the molecule. :: tagaa ta gtatg ||||| || ||||| --> [(6,8)], [] atcttcatccatac tagaagtaggtatg ||||| || ||||| --> [], [(6,8)] atctt at catac The output of this method is used in the `melt_ss_dna` method in order to determine the start and end positions of single stranded regions. See get_ds_meltsites for melting ds sequences. Examples -------- >>> from pydna.dseq import Dseq >>> ds = Dseq("tagaaqtaqgtatg") >>> ds Dseq(-14) tagaa ta gtatg atcttcatccatac >>> cutsites = ds.get_ss_meltsites(2) >>> cutsites ([(6, 8)], []) >>> ds[6:8] Dseq(-2) ta at >>> ds = Dseq("tagaaptapgtatg") >>> ds Dseq(-14) tagaagtaggtatg atctt at catac >>> cutsites = ds.get_ss_meltsites(2) >>> cutsites ([], [(6, 8)]) """ regex = regex_ss_melt_factory(length) if self.circular: spacer = length cutfrom = self._data[-length:] + self._data + self._data[:length] else: spacer = 0 cutfrom = self._data watson_cuts = [] crick_cuts = [] for m in regex.finditer(cutfrom): if m.lastgroup == "watson": cut1 = m.start() + spacer cut2 = m.end() + spacer watson_cuts.append((cut1, cut2)) else: assert m.lastgroup == "crick" cut1 = m.start() + spacer cut2 = m.end() + spacer crick_cuts.append((cut1, cut2)) return watson_cuts, crick_cuts
[docs] def get_ds_meltsites(self: DseqType, length: int) -> List[CutSiteType]: """ Double stranded DNA melt sites DNA molecules can fall apart by melting if they have internal single stranded regions. In the example below, the molecule has two gaps on opposite sides, two nucleotides apart, which means that it hangs together by two basepairs. This molecule can melt into two separate 8 bp double stranded molecules, each with 3 nt 3' overhangs a depicted below. :: tagaagta gtatg tagaagta gtatg ||||| || ||||| --> ||||| ||||| atctt atccatac atctt atccatac A list of 2-tuples is returned. Each tuple (`((cut_watson, ovhg), None)`) contains cut position and the overhang value in the same format as returned by the get_cutsites method for restriction enzymes. Note that this function deals with melting that results in two double stranded DNA molecules. See get_ss_meltsites for melting of single stranded regions from molecules. Examples -------- >>> from pydna.dseq import Dseq >>> ds = Dseq("tagaaptaqgtatg") >>> ds Dseq(-14) tagaagta gtatg atctt atccatac >>> cutsite = ds.get_ds_meltsites(2) >>> cutsite [((8, 2), None)] """ if length < 1: return tuple() regex = regex_ds_melt_factory(length) if self.circular: spacer = length cutfrom = self._data[-length:] + self._data + self._data[:length] else: spacer = 0 cutfrom = self._data cuts = [] for m in regex.finditer(cutfrom): if m.lastgroup == "watson": cut = (m.end() - spacer, m.end() - m.start()), None else: assert m.lastgroup == "crick" cut = (m.start() - spacer, m.start() - m.end()), None cuts.append(cut) return cuts
[docs] def cast_to_ds_right(self): """ NNNN NNNNGATC |||| --> |||||||| NNNNCTAG NNNNCTAG NNNNGATC NNNNGATC |||| --> |||||||| NNNN NNNNCTAG """ p = self.get_parts() ds_stuffer = (p.sticky_right5 or p.sticky_right3).translate( dscode_to_full_sequence_table ) result = (p.sticky_left5 or p.sticky_left3) + p.middle + ds_stuffer return self.__class__(result, circular=False)
[docs] def cast_to_ds(self): """Sequencially calls cast_to_ds_left and cast_to_ds_right.""" return self.cast_to_ds_left().cast_to_ds_right()
[docs] def cast_to_ds_left(self): """ GATCNNNN GATCNNNN |||| --> |||||||| NNNN CTAGNNNN NNNN GATCNNNN |||| --> |||||||| CTAGNNNN CTAGNNNN """ p = self.get_parts() ds_stuffer = (p.sticky_left5 or p.sticky_left3).translate( dscode_to_full_sequence_table ) result = ds_stuffer + p.middle + (p.sticky_right5 or p.sticky_right3) return self.__class__(result, circular=False)
[docs] def get_cut_parameters( self, cut: Union[CutSiteType, None], is_left: bool ) -> Tuple[int, int, int]: """For a given cut expressed as ((cut_watson, ovhg), enz), returns a tuple (cut_watson, cut_crick, ovhg). - cut_watson: see get_cutsites docs - cut_crick: equivalent of cut_watson in the crick strand - ovhg: see get_cutsites docs The cut can be None if it represents the left or right end of the sequence. Then it will return the position of the watson and crick ends with respect to the "full sequence". The `is_left` parameter is only used in this case. """ if cut is not None: watson, ovhg = cut[0] crick = watson - ovhg if self.circular: crick %= len(self) return watson, crick, ovhg assert not self.circular, "Circular sequences should not have None cuts" if is_left: return *self.left_end_position(), self.ovhg # In the right end, the overhang does not matter return *self.right_end_position(), self.watson_ovhg
[docs] def melt(self, length): """ TBD Parameters ---------- length : TYPE DESCRIPTION. Returns ------- TYPE DESCRIPTION. """ if not length or length < 1: return tuple() # First we need to get rid of single stranded sequences new, strands = self.melt_ss_dna(length) cutsites = new.get_ds_meltsites(length) cutsite_pairs = self.get_cutsite_pairs(cutsites) result = tuple(new.apply_cut(*cutsite_pair) for cutsite_pair in cutsite_pairs) result = tuple([new]) if strands and not result else result return tuple(strands) + tuple(result)
[docs] def melt_ss_dna(self, length) -> tuple["Dseq", list["Dseq"]]: """ Melt to separate single stranded DNA Single stranded DNA molecules shorter or equal to `length` shed from a double stranded DNA molecule without affecting the length of the remaining molecule. In the examples below, the middle 2 nt part is released from the molecule. :: tagaa ta gtatg tagaa gtatg ta ||||| || ||||| --> ||||| ||||| + || atcttcatccatac atcttcatccatac tagaagtaggtatg tagaagtaggtatg ||||| || ||||| --> ||||| ||||| + || atctt at catac atctt catac at Examples -------- >>> from pydna.dseq import Dseq >>> ds = Dseq("tagaaqtaqgtatg") >>> ds Dseq(-14) tagaa ta gtatg atcttcatccatac >>> new, strands = ds.melt_ss_dna(2) >>> new Dseq(-14) tagaa gtatg atcttcatccatac >>> strands[0] Dseq(-2) ta || >>> ds = Dseq("tagaaptapgtatg") >>> ds Dseq(-14) tagaagtaggtatg atctt at catac >>> new, strands = ds.melt_ss_dna(2) >>> new Dseq(-14) tagaagtaggtatg atctt catac >>> strands[0] Dseq(-2) || at """ watsonnicks, cricknicks = self.get_ss_meltsites(length) new, strands = self.shed_ss_dna(watsonnicks, cricknicks) return new, strands
[docs] def shed_ss_dna( self, watson_cutpairs: list[tuple[int, int]] = None, crick_cutpairs: list[tuple[int, int]] = None, ): """ Separate parts of one of the DNA strands Examples -------- >>> from pydna.dseq import Dseq >>> ds = Dseq("tagaagtaggtatg") >>> ds Dseq(-14) tagaagtaggtatg atcttcatccatac >>> new, strands = ds.shed_ss_dna([(6, 8)],[]) >>> new Dseq(-14) tagaag ggtatg atcttcatccatac >>> strands[0] Dseq(-2) ta || >>> new, strands = ds.shed_ss_dna([],[(6, 8)]) >>> new Dseq(-14) tagaagtaggtatg atcttc ccatac >>> strands[0] Dseq(-2) || at >>> ds = Dseq("tagaagtaggtatg") >>> new, (strand1, strand2) = ds.shed_ss_dna([(6, 8), (9, 11)],[]) >>> new Dseq(-14) tagaag g atg atcttcatccatac >>> strand1 Dseq(-2) ta || >>> strand2 Dseq(-2) gt || """ watson_cutpairs = watson_cutpairs or list() crick_cutpairs = crick_cutpairs or list() strands = [] new = bytearray(self._data) for x, y in watson_cutpairs: stuffer = new[x:y] ss = Dseq.quick(new[x:y].translate(dscode_to_watson_tail_table)) new[x:y] = stuffer.translate(dscode_to_crick_tail_table) strands.append(ss) for x, y in crick_cutpairs: stuffer = new[x:y] ss = Dseq.quick(stuffer.translate(dscode_to_crick_tail_table)) new[x:y] = stuffer.translate(dscode_to_watson_tail_table) strands.append(ss) return Dseq.quick(new), strands
[docs] def apply_cut(self, left_cut: CutSiteType, right_cut: CutSiteType) -> "Dseq": """Extracts a subfragment of the sequence between two cuts. For more detail see the documentation of get_cutsite_pairs. Parameters ---------- left_cut : Union[tuple[tuple[int,int], _AbstractCut], None] right_cut: Union[tuple[tuple[int,int], _AbstractCut], None] Returns ------- Dseq Examples -------- >>> from Bio.Restriction import EcoRI >>> from pydna.dseq import Dseq >>> dseq = Dseq('aaGAATTCaaGAATTCaa') >>> cutsites = dseq.get_cutsites([EcoRI]) >>> cutsites [((3, -4), EcoRI), ((11, -4), EcoRI)] >>> p1, p2, p3 = dseq.get_cutsite_pairs(cutsites) >>> p1 (None, ((3, -4), EcoRI)) >>> dseq.apply_cut(*p1) Dseq(-7) aaG ttCTTAA >>> p2 (((3, -4), EcoRI), ((11, -4), EcoRI)) >>> dseq.apply_cut(*p2) Dseq(-12) AATTCaaG GttCTTAA >>> p3 (((11, -4), EcoRI), None) >>> dseq.apply_cut(*p3) Dseq(-7) AATTCaa Gtt >>> dseq = Dseq('TTCaaGAA', circular=True) >>> cutsites = dseq.get_cutsites([EcoRI]) >>> cutsites [((6, -4), EcoRI)] >>> pair = dseq.get_cutsite_pairs(cutsites)[0] >>> pair (((6, -4), EcoRI), ((6, -4), EcoRI)) >>> dseq.apply_cut(*pair) Dseq(-12) AATTCaaG GttCTTAA """ if cuts_overlap(left_cut, right_cut, len(self)): raise ValueError("Cuts by {} {} overlap.".format(left_cut[1], right_cut[1])) left_watson, left_crick, ovhg_left = self.get_cut_parameters(left_cut, True) right_watson, right_crick, _ = self.get_cut_parameters(right_cut, False) return Dseq( self[left_watson:right_watson]._data.translate(dscode_to_watson_table), self[left_crick:right_crick] .reverse_complement() ._data.translate(dscode_to_watson_table), ovhg=ovhg_left, )
[docs] def get_cutsite_pairs( self, cutsites: List[CutSiteType] ) -> List[Tuple[Union[None, CutSiteType], Union[None, CutSiteType]]]: """Returns pairs of cutsites that render the edges of the resulting fragments. A fragment produced by restriction is represented by a tuple of length 2 that may contain cutsites or `None`: - Two cutsites: represents the extraction of a fragment between those two cutsites, in that orientation. To represent the opening of a circular molecule with a single cutsite, we put the same cutsite twice. - `None`, cutsite: represents the extraction of a fragment between the left edge of linear sequence and the cutsite. - cutsite, `None`: represents the extraction of a fragment between the cutsite and the right edge of a linear sequence. Parameters ---------- cutsites : list[tuple[tuple[int,int], _AbstractCut]] Returns ------- list[tuple[tuple[tuple[int,int], _AbstractCut]|None],tuple[tuple[int,int], _AbstractCut]|None] Examples -------- >>> from Bio.Restriction import EcoRI >>> from pydna.dseq import Dseq >>> dseq = Dseq('aaGAATTCaaGAATTCaa') >>> cutsites = dseq.get_cutsites([EcoRI]) >>> cutsites [((3, -4), EcoRI), ((11, -4), EcoRI)] >>> dseq.get_cutsite_pairs(cutsites) [(None, ((3, -4), EcoRI)), (((3, -4), EcoRI), ((11, -4), EcoRI)), (((11, -4), EcoRI), None)] >>> dseq = Dseq('TTCaaGAA', circular=True) >>> cutsites = dseq.get_cutsites([EcoRI]) >>> cutsites [((6, -4), EcoRI)] >>> dseq.get_cutsite_pairs(cutsites) [(((6, -4), EcoRI), ((6, -4), EcoRI))] """ if len(cutsites) == 0: return [] if not self.circular: cutsites = [None, *cutsites, None] else: # Add the first cutsite at the end, for circular cuts cutsites.append(cutsites[0]) return list(zip(cutsites, cutsites[1:]))
[docs] def get_parts(self): """ Returns a DseqParts instance containing the parts (strings) of a dsDNA sequence. DseqParts instance field names: :: "sticky_left5" | | "sticky_right5" | | --- --- GGGATCC TAGGTCA ---- | "middle" "sticky_left3" | | "sticky_right3" | | --- --- ATCCAGT CCCTAGG ---- | "middle" "single_watson" (only an upper strand) | ------- ATCCAGT ||||||| "single_crick" (only a lower strand) | ------- ||||||| CCCTAGG Up to seven groups (0..6) are captured, but some are mutually exclusive which means that one of them is an empty string: 0 or 1, not both, a DNA fragment has either 5' or 3' sticky end. 2 or 5 or 6, a DNA molecule has a ds region or is single stranded. 3 or 4, not both, either 5' or 3' sticky end. Note that internal single stranded regions are not identified and will be contained in the middle part if they are present. Examples -------- >>> from pydna.dseq import Dseq >>> ds = Dseq("PPPATCFQZ") >>> ds Dseq(-9) GGGATC TAGTCA >>> parts = ds.get_parts() >>> parts DseqParts(sticky_left5='PPP', sticky_left3='', middle='ATC', sticky_right3='', sticky_right5='FQZ', single_watson='', single_crick='') >>> Dseq(parts.sticky_left5) Dseq(-3) GGG ||| >>> Dseq(parts.middle) Dseq(-3) ATC TAG >>> Dseq(parts.sticky_right5) Dseq(-3) ||| TCA Parameters ---------- datastring : str A string with dscode. Returns ------- namedtuple Seven string fields describing the DNA molecule. fragment(sticky_left5='', sticky_left3='', middle='', sticky_right3='', sticky_right5='', single_watson='', single_crick='') """ return get_parts(self._data.decode("ascii"))