#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Provides the Dseq class for handling double stranded DNA sequences.
Dseq is a subclass of :class:`Bio.Seq.Seq`. The Dseq class
is mostly useful as a part of the :class:`pydna.dseqrecord.Dseqrecord` class
which can hold more meta data.
The Dseq class support the notion of circular and linear DNA topology.
"""
import itertools
import re
import copy
import sys
import math
import inspect
from typing import List, Tuple, Union
from Bio.Restriction import RestrictionBatch
from Bio.Restriction import CommOnly
from seguid import ldseguid
from seguid import cdseguid
from pydna.seq import Seq
from Bio.Seq import _SeqAbstractBaseClass
from Bio.Data.IUPACData import unambiguous_dna_weights
from Bio.Data.IUPACData import unambiguous_rna_weights
from Bio.Data.IUPACData import atom_weights
from pydna._pretty import pretty_str
from pydna.utils import rc
from pydna.utils import flatten
from pydna.utils import cuts_overlap
from pydna.alphabet import basepair_dict
from pydna.alphabet import dscode_to_watson_table
from pydna.alphabet import dscode_to_crick_table
from pydna.alphabet import regex_ds_melt_factory
from pydna.alphabet import regex_ss_melt_factory
from pydna.alphabet import dscode_to_full_sequence_table
from pydna.alphabet import dscode_to_watson_tail_table
from pydna.alphabet import dscode_to_crick_tail_table
from pydna.alphabet import complement_table_for_dscode
from pydna.alphabet import letters_not_in_dscode
from pydna.alphabet import get_parts
from pydna.alphabet import representation_tuple
from pydna.alphabet import dsbreaks
from pydna.common_sub_strings import common_sub_strings
from pydna.types import DseqType, EnzymesType, CutSiteType
# Sequences larger than this gets a truncated representation.
length_limit_for_repr = 30
placeholder = letters_not_in_dscode[-1]
[docs]
class CircularBytes(bytes):
"""
A circular bytes sequence: indexing and slicing wrap around index 0.
"""
def __new__(cls, value: bytes | bytearray | memoryview):
return super().__new__(cls, bytes(value))
def __getitem__(self, key):
n = len(self)
if n == 0:
if isinstance(key, slice):
return self.__class__(b"")
raise IndexError("CircularBytes index out of range (empty bytes)")
if isinstance(key, int):
return super().__getitem__(key % n)
if isinstance(key, slice):
start, stop, step = key.start, key.stop, key.step
step = 1 if step is None else step
if step == 0:
raise ValueError("slice step cannot be zero")
if step > 0:
start = 0 if start is None else start
stop = n if stop is None else stop
while stop <= start:
stop += n
rng = range(start, stop, step)
else:
start = (n - 1) if start is None else start
stop = -1 if stop is None else stop
while stop >= start:
stop -= n
rng = range(start, stop, step)
limit = n if step % n == 0 else n * 2
out = bytearray()
count = 0
for i in rng:
out.append(super().__getitem__(i % n))
count += 1
if count > limit:
break
return self.__class__(bytes(out))
return super().__getitem__(key)
[docs]
def cutaround(self, start: int, length: int) -> bytes:
"""
Return a circular slice of given length starting at index `start`.
Can exceed len(self), wrapping around as needed.
Examples
--------
s = CircularBytes(b"ABCDE")
assert s.cutaround(3, 7) == b"DEABCDE"
assert s.cutaround(-1, 4) == b"EABC"
"""
n = len(self)
if n == 0 or length <= 0:
return self.__class__(b"")
start %= n
out = bytearray()
for i in range(length):
out.append(self[(start + i) % n])
return self.__class__(bytes(out))
[docs]
def find(
self,
sub: bytes | bytearray | memoryview | str,
start: int = 0,
end: int | None = None,
) -> int:
"""
Find a subsequence in the circular sequence, possibly
wrapping across the origin.
Returns -1 if not found.
"""
n = len(self)
if n == 0:
return -1
end = n if end is None else min(end, n)
doubled = self + self
try:
sub = sub.encode("ascii")
except AttributeError:
pass
pos = doubled.find(bytes(sub), start, n + len(sub) - 1)
if pos == -1 or pos >= n:
return -1
return pos
[docs]
class Dseq(Seq):
"""Dseq describes a double stranded DNA fragment, linear or circular.
Dseq can be initiated in two ways, using two strings, each representing the
Watson (upper, sense) strand, the Crick (lower, antisense) strand and an
optional value describing the stagger betwen the strands on the left side (ovhg).
Alternatively, a single string represenation using dsIUPAC codes can be used.
If a single string is used, the letters of that string are interpreted as base
pairs rather than single bases. For example "A" would indicate the basepair
"A/T". An expanded IUPAC code is used where the letters PEXI have been assigned
to GATC on the Watson strand with no paring base on the Crick strand G/"", A/"",
T/"" and C/"". The letters QFZJ have been assigned the opposite base pairs with
an empty Watson strand ""/G, ""/A, ""/T, and ""/C.
::
PEXIGATCQFZJ would indicate the linear double-stranded fragment:
GATCGATC
CTAGCTAG
Parameters
----------
watson : str
a string representing the Watson (sense) DNA strand or a basepair
represenation.
crick : str, optional
a string representing the Crick (antisense) DNA strand.
ovhg : int, optional
A positive or negative number to describe the stagger between the
Watson and Crick strands.
see below for a detailed explanation.
circular : bool, optional
True indicates that sequence is circular, False that it is linear.
Examples
--------
Dseq is a subclass of the Biopython Bio.Seq.Seq class. The constructor
can accept two strings representing the Watson (sense) and Crick(antisense)
DNA strands. These are interpreted as single stranded DNA. There is a check
for complementarity between the strands.
If the DNA molecule is staggered on the left side, an integer ovhg
(overhang) must be given, describing the stagger between the Watson and Crick strand
in the 5' end of the fragment.
Additionally, the optional boolean parameter circular can be given to indicate if the
DNA molecule is circular.
The most common usage of the Dseq class is probably not to use it directly, but to
create it as part of a Dseqrecord object (see :class:`pydna.dseqrecord.Dseqrecord`).
This works in the same way as for the relationship between the :class:`Bio.Seq.Seq` and
:class:`Bio.SeqRecord.SeqRecord` classes in Biopython.
There are multiple ways of creating a Dseq object directly listed below, but you can also
use the function Dseq.from_full_sequence_and_overhangs() to create a Dseq:
Two arguments (string, string), no overhang provided:
>>> from pydna.dseq import Dseq
>>> Dseq("gggaaat","ttt")
Dseq(-7)
gggaaat
ttt
If Watson and Crick are given, but not ovhg, an attempt will be made to find the best annealing
between the strands. There are important limitations to this. If there are several ways to
anneal the strands, this will fail. For long fragments it is quite slow.
Three arguments (string, string, ovhg=int):
The ovhg parameter is an integer describing the length of the Crick strand overhang on the
left side (the 5' end of Watson strand).
The ovhg parameter controls the stagger at the five prime end::
dsDNA overhang
nnn... 2
nnnnn...
nnnn... 1
nnnnn...
nnnnn... 0
nnnnn...
nnnnn... -1
nnnn...
nnnnn... -2
nnn...
Example of creating Dseq objects with different amounts of stagger:
>>> Dseq(watson="att", crick="acata", ovhg=-2)
Dseq(-7)
att
ataca
>>> Dseq(watson="ata",crick="acata",ovhg=-1)
Dseq(-6)
ata
ataca
>>> Dseq(watson="taa",crick="actta",ovhg=0)
Dseq(-5)
taa
attca
>>> Dseq(watson="aag",crick="actta",ovhg=1)
Dseq(-5)
aag
attca
>>> Dseq(watson="agt",crick="actta",ovhg=2)
Dseq(-5)
agt
attca
If the ovhg parameter is specified a Crick strand also needs to be supplied, or
an exception is raised.
>>> Dseq(watson="agt", ovhg=2)
Traceback (most recent call last):
...
ValueError: ovhg (overhang) defined without a crick strand.
The shape or topology of the fragment is set by the circular parameter, True or False (default).
>>> Dseq("aaa", "ttt", ovhg = 0) # A linear sequence by default
Dseq(-3)
aaa
ttt
>>> Dseq("aaa", "ttt", ovhg = 0, circular = False) # A linear sequence if circular is False
Dseq(-3)
aaa
ttt
>>> Dseq("aaa", "ttt", ovhg = 0, circular = True) # A circular sequence
Dseq(o3)
aaa
ttt
>>> Dseq("aaa", "ttt", ovhg=1, circular = False)
Dseq(-4)
aaa
ttt
>>> Dseq("aaa","ttt",ovhg=-1)
Dseq(-4)
aaa
ttt
>>> Dseq("aaa", "ttt", circular = True , ovhg=0)
Dseq(o3)
aaa
ttt
>>> a=Dseq("tttcccc","aaacccc")
>>> a
Dseq(-11)
tttcccc
ccccaaa
>>> a.ovhg
4
>>> b=Dseq("ccccttt","ccccaaa")
>>> b
Dseq(-11)
ccccttt
aaacccc
>>> b.ovhg
-4
>>>
dsIUPAC [#]_ is an nn extension to the IUPAC alphabet used to describe ss regions:
::
aaaGATC GATCccc ad-hoc representations
CTAGttt gggCTAG
QFZJaaaPEXI PEXIcccQFZJ dsIUPAC
Coercing to string
>>> str(a)
'ggggtttcccc'
A Dseq object can be longer that either the watson or crick strands.
::
<-- length -->
GATCCTTT
AAAGCCTAG
<-- length -->
GATCCTTT
AAAGCCCTA
The slicing of a linear Dseq object works mostly as it does for a string.
>>> s="ggatcc"
>>> s[2:3]
'a'
>>> s[2:4]
'at'
>>> s[2:4:-1]
''
>>> s[::2]
'gac'
>>> from pydna.dseq import Dseq
>>> d=Dseq(s, circular=False)
>>> d[2:3]
Dseq(-1)
a
t
>>> d[2:4]
Dseq(-2)
at
ta
>>> d[2:4:-1]
Dseq(-0)
<BLANKLINE>
<BLANKLINE>
>>> d[::2]
Dseq(-3)
gac
ctg
The slicing of a circular Dseq object has a slightly different meaning.
>>> s="ggAtCc"
>>> d=Dseq(s, circular=True)
>>> d
Dseq(o6)
ggAtCc
ccTaGg
>>> d[4:3]
Dseq(-5)
CcggA
GgccT
The slice [X:X] produces an empty slice for a string, while this
will return the linearized sequence starting at X:
>>> s="ggatcc"
>>> d=Dseq(s, circular=True)
>>> d
Dseq(o6)
ggatcc
cctagg
>>> d[3:3]
Dseq(-6)
tccgga
aggcct
>>>
See Also
--------
pydna.dseqrecord.Dseqrecord
"""
def __init__(
self,
watson: Union[str, bytes],
crick: Union[str, bytes, None] = None,
ovhg=None,
circular=False,
pos=0,
):
if isinstance(watson, (bytes, bytearray)):
# watson is decoded to a string if needed.
watson = watson.decode("ascii")
if isinstance(crick, (bytes, bytearray)):
# crick is decoded to a string if needed.
crick = crick.decode("ascii")
if crick is None:
if ovhg is not None:
raise ValueError("ovhg (overhang) defined without a crick strand.")
"""
Giving only the watson string implies inferring the Crick complementary strand
from the Watson sequence. The watson string can contain dscode letters wich will
be interpreted as outlined in the pydna.alphabet module.
The _data property must be a byte string for compatibility with
Biopython Bio.Seq.Seq
"""
data = watson
self._data = data.encode("ascii")
else:
"""
Crick strand given, ovhg is optional. An important consequence is that the
watson and crick strands are interpreted as single stranded DNA that is
supposed to anneal.
If ovhg was not given, we try to guess the value below. This will fail
if there are two or more ways to anneal with equal length of the double
stranded part.
"""
if ovhg is None: # ovhg not given, try to guess from sequences
limit = int(math.log(len(watson)) / math.log(4))
olaps = common_sub_strings(
str(watson).lower(),
str(rc(crick).lower()),
limit,
)
"""No overlaps found, strands do not anneal"""
if len(olaps) == 0:
raise ValueError(
"Could not anneal the two strands."
f" looked for annealing with at least {limit} basepairs"
" Please provide and overhang value (ovhg parameter)"
)
"""
We extract the positions and length of the first (longest) overlap,
since common_sub_strings sorts the overlaps by length, longest first.
"""
(pos_watson, pos_crick, longest_olap_length), *rest = olaps
"""
We see if there is another overlap of the same length
This means that annealing is ambigous. User should provide
and ovhg value.
"""
if any(
olap_length >= longest_olap_length for _, _, olap_length in rest
):
raise ValueError(
"More than one way of annealing the"
" strands. Please provide ovhg value"
)
ovhg = pos_crick - pos_watson
"""
Pad both strands on left side ovhg spaces
a negative number gives no padding,
"""
sense = ovhg * " " + watson
antisense = -ovhg * " " + crick[::-1]
max_len = max(len(sense), len(antisense))
"""pad both strands on right side to same size."""
sense = sense.ljust(max_len)
antisense = antisense.ljust(max_len)
"""both strands padded so that bsepairs align"""
assert len(sense) == len(antisense)
data = []
for w, c in zip(sense, antisense):
try:
data.append(basepair_dict[w, c])
except KeyError as err:
print(f"Base mismatch in representation {err}")
raise ValueError(f"Base mismatch in representation: {err}")
data = "".join(data).strip()
self._data = data.encode("ascii")
self.circular = circular
self.pos = pos
if circular:
data += data[0:1]
dsb = dsbreaks(data)
if dsb:
msg = "".join(dsb)
raise ValueError(
f"Molecule is internally split in {len(dsb)} location(s):\n\n{msg}".strip()
)
[docs]
@classmethod
def quick(cls, data: bytes, *args, circular=False, pos=0, **kwargs):
"""Fastest way to instantiate an object of the Dseq class.
No checks of parameters are made.
Does not call Bio.Seq.Seq.__init__() which has lots of time consuming checks.
"""
obj = cls.__new__(cls)
obj.circular = circular
obj.pos = pos
obj._data = data
return obj
[docs]
@classmethod
def from_representation(cls, dsdna: str, *args, **kwargs):
obj = cls.__new__(cls)
obj.circular = False
obj.pos = 0
clean = inspect.cleandoc("\n" + dsdna)
watson, crick = [
ln
for ln in clean.splitlines()
if ln.strip() and not ln.strip().startswith("Dseq(")
]
ovhgw = len(watson) - len(watson.lstrip())
ovhgc = -(len(crick) - len(crick.lstrip()))
ovhg = ovhgw or ovhgc
watson = watson.strip()
crick = crick.strip()[::-1]
return Dseq(watson, crick, ovhg)
[docs]
@classmethod
def from_full_sequence_and_overhangs(
cls, full_sequence: str, crick_ovhg: int, watson_ovhg: int
):
"""Create a linear Dseq object from a full sequence and the 3' overhangs of each strand.
The order of the parameters is like this because the 3' overhang of the crick strand is the one
on the left side of the sequence.
Parameters
----------
full_sequence: str
The full sequence of the Dseq object.
crick_ovhg: int
The overhang of the crick strand in the 3' end. Equivalent to Dseq.ovhg.
watson_ovhg: int
The overhang of the watson strand in the 5' end.
Returns
-------
Dseq
A Dseq object.
Examples
--------
>>> Dseq.from_full_sequence_and_overhangs('AAAAAA', crick_ovhg=2, watson_ovhg=2)
Dseq(-6)
AAAA
TTTT
>>> Dseq.from_full_sequence_and_overhangs('AAAAAA', crick_ovhg=-2, watson_ovhg=2)
Dseq(-6)
AAAAAA
TT
>>> Dseq.from_full_sequence_and_overhangs('AAAAAA', crick_ovhg=2, watson_ovhg=-2)
Dseq(-6)
AA
TTTTTT
>>> Dseq.from_full_sequence_and_overhangs('AAAAAA', crick_ovhg=-2, watson_ovhg=-2)
Dseq(-6)
AAAA
TTTT
"""
full_sequence_rev = str(Dseq(full_sequence).reverse_complement())
watson = full_sequence
crick = full_sequence_rev
# If necessary, we trim the left side
if crick_ovhg < 0:
crick = crick[:crick_ovhg]
elif crick_ovhg > 0:
watson = watson[crick_ovhg:]
# If necessary, we trim the right side
if watson_ovhg < 0:
watson = watson[:watson_ovhg]
elif watson_ovhg > 0:
crick = crick[watson_ovhg:]
return Dseq(watson, crick=crick, ovhg=crick_ovhg)
@property
def watson(self) -> str:
"""
The watson (upper) strand of the double stranded fragment 5'-3'.
Returns
-------
TYPE
DESCRIPTION.
"""
return self._data.decode("ascii").translate(dscode_to_watson_table).strip()
@property
def crick(self) -> str:
"""
The crick (lower) strand of the double stranded fragment 5'-3'.
Returns
-------
TYPE
DESCRIPTION.
"""
return self._data.decode("ascii").translate(dscode_to_crick_table).strip()[::-1]
@property
def left_ovhg(self) -> int:
"""
The 5' overhang of the lower strand compared the the upper.
See module docstring for more information.
Returns
-------
TYPE
DESCRIPTION.
"""
parts = self.get_parts()
if parts.single_watson or parts.single_crick:
return None
return -len(parts.sticky_left5) or len(parts.sticky_left3)
ovhg = left_ovhg
@property
def right_ovhg(self) -> int:
"""Overhang at the right side (end)."""
parts = self.get_parts()
if parts.single_watson or parts.single_crick:
return None
return -len(parts.sticky_right5) or len(parts.sticky_right3)
watson_ovhg = right_ovhg
def __str__(self) -> str:
"""
A string representation of the sequence. The returned string
is the watson strand of a blunt version of the sequence.
>>> ds = Dseq.from_representation(
... '''
... GAATTC
... TAA
... ''')
>>> str(ds)
'GAATTC'
>>> ds = Dseq.from_representation(
... '''
... ATT
... CTTAAG
... ''')
>>> str(ds)
'GAATTC'
Returns
-------
str
A string representation of the sequence.
"""
return bytes(self).decode("ascii")
to_blunt_string = __str__ # alias of __str__ # TODO: consider removing
def __bytes__(self) -> bytes:
return self._data.translate(dscode_to_full_sequence_table)
[docs]
def mw(self) -> float:
"""The molecular weight of the DNA/RNA molecule in g/mol.
The molecular weight data in Biopython Bio.Data.IUPACData
is used. The DNA is assumed to have a 5'-phosphate as many
DNA fragments from restriction digestion do:
::
P - G-A-T-T-A-C-A - OH
| | | | | | |
OH - C-T-A-A-T-G-T - P
The molecular weights listed in the unambiguous_dna_weights
dictionary refers to free monophosphate nucleotides.
One water molecule is removed for every phopshodiester bond
formed between nucleotides. For linear molecules, the weight
of one water molecule is added to account for the terminal
hydroxyl group and a hydrogen on the 5' terminal phosphate
group.
::
P - G---A---T - OH P - C---A - OH
| | | | |
OH - C---T---A---A---T---G---T - P
If the DNA is discontinuous, the internal 5'- end is assumed
to have a phosphate and the 3'- a hydroxyl group:
Examples
--------
>>> from pydna.dseq import Dseq
>>> ds_lin_obj = Dseq("GATTACA")
>>> ds_lin_obj
Dseq(-7)
GATTACA
CTAATGT
>>> round(ds_lin_obj.mw(), 1)
4359.8
>>> ds_circ_obj = Dseq("GATTACA", circular = True)
>>> round(ds_circ_obj.mw(), 1)
4323.8
>>> ssobj = Dseq("PEXXEIE")
>>> ssobj
Dseq(-7)
GATTACA
|||||||
>>> round(ssobj.mw(), 1)
2184.4
>>> ds_lin_obj2 = Dseq("GATZFCA")
>>> ds_lin_obj2
Dseq(-7)
GAT CA
CTAATGT
>>> round(ds_lin_obj2.mw(), 1)
3724.4
"""
h2o = atom_weights["H"] * 2 + atom_weights["O"]
mwd = unambiguous_rna_weights | unambiguous_dna_weights | {" ": 0}
watsn_weight = sum(mwd[nt] - h2o for nt in self.watson.upper())
crick_weight = sum(mwd[nt] - h2o for nt in self.crick.upper())
watsn_weight += h2o * len(re.findall(r" +", self.watson))
crick_weight += h2o * len(re.findall(r" +", self.crick))
if watsn_weight and not self.circular:
watsn_weight += h2o
if crick_weight and not self.circular:
crick_weight += h2o
return watsn_weight + crick_weight
[docs]
def find(
self, sub: Union[_SeqAbstractBaseClass, str, bytes], start=0, end=sys.maxsize
) -> int:
"""This method behaves like the python string method of the same name.
Returns an integer, the index of the first occurrence of substring
argument sub in the (sub)sequence given by [start:end].
Returns -1 if the subsequence is NOT found.
The search is case sensitive.
Parameters
----------
sub : string or Seq object
a string or another Seq object to look for.
start : int, optional
slice start.
end : int, optional
slice end.
Examples
--------
>>> from pydna.dseq import Dseq
>>> seq = Dseq("agtaagt")
>>> seq
Dseq(-7)
agtaagt
tcattca
>>> seq.find("taa")
2
>>> seq = Dseq(watson="agta",crick="actta",ovhg=-2)
>>> seq
Dseq(-7)
agta
attca
>>> seq.find("taa")
-1
>>> seq = Dseq(watson="agta",crick="actta",ovhg=-2)
>>> seq
Dseq(-7)
agta
attca
>>> seq.find("ta")
2
"""
if self.circular:
result = CircularBytes(self._data).find(sub, start, end)
else:
result = super().find(sub, start, end)
return result
def __contains__(self, sub: [str, bytes]) -> bool:
return self.find(sub) != -1
def __getitem__(self, sl: [slice, int]) -> DseqType:
if isinstance(sl, int):
sl = slice(sl, sl + 1, 1)
sl = slice(sl.start, sl.stop, sl.step)
if self.circular:
cb = CircularBytes(self._data)
return self.quick(cb[sl])
return super().__getitem__(sl)
def __eq__(self, other: DseqType) -> bool:
"""Compare to another Dseq object OR an object that implements
watson, crick and ovhg properties.
This comparison is case insensitive.
"""
try:
same = (
other.watson.lower() == self.watson.lower()
and other.crick.lower() == self.crick.lower()
and other.ovhg == self.ovhg
and self.circular == other.circular
)
# Also test for alphabet ?
except AttributeError:
same = False
return same
def __repr__(self, lim: int = length_limit_for_repr) -> pretty_str:
header = f"{self.__class__.__name__}({({False: '-', True: 'o'}[self.circular])}{len(self)})"
w, c = representation_tuple(
self._data.decode("ascii"), length_limit_for_repr=length_limit_for_repr
)
w = w or "|" * len(c)
c = c or "|" * len(w)
return pretty_str(header + "\n" + w + "\n" + c)
[docs]
def reverse_complement(self) -> "Dseq":
"""Dseq object where watson and crick have switched places.
This represents the same double stranded sequence.
Examples
--------
>>> from pydna.dseq import Dseq
>>> a=Dseq("catcgatc")
>>> a
Dseq(-8)
catcgatc
gtagctag
>>> b=a.reverse_complement()
>>> b
Dseq(-8)
gatcgatg
ctagctac
>>>
"""
return Dseq.quick(rc(self._data), circular=self.circular)
rc = reverse_complement # alias for reverse_complement
[docs]
def shifted(self: DseqType, shift: int) -> DseqType:
"""
Shifted copy of a circular Dseq object.
>>> ds = Dseq("TAAG", circular = True)
>>> ds.shifted(1) # First bp moved to right side:
Dseq(o4)
AAGT
TTCA
>>> ds.shifted(-1) # Last bp moved to left side:
Dseq(o4)
GTAA
CATT
"""
if not self.circular:
raise TypeError("DNA is not circular.")
shift = shift % len(self)
if not shift:
return copy.deepcopy(self)
else:
return (self[shift:] + self[:shift]).looped()
[docs]
def looped(self: DseqType) -> DseqType:
"""Circularized Dseq object.
This can only be done if the two ends are compatible,
otherwise a TypeError is raised.
Examples
--------
>>> from pydna.dseq import Dseq
>>> a=Dseq("catcgatc")
>>> a
Dseq(-8)
catcgatc
gtagctag
>>> a.looped()
Dseq(o8)
catcgatc
gtagctag
>>> b = Dseq("iatcgatj")
>>> b
Dseq(-8)
catcgat
tagctag
>>> b.looped()
Dseq(o7)
catcgat
gtagcta
>>> c = Dseq("jatcgati")
>>> c
Dseq(-8)
atcgatc
gtagcta
>>> c.looped()
Dseq(o7)
catcgat
gtagcta
>>> d = Dseq("ietcgazj")
>>> d
Dseq(-8)
catcga
agctag
>>> d.looped()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib/python2.7/dist-packages/pydna/dsdna.py", line 357, in looped
if type5 == type3 and str(sticky5) == str(rc(sticky3)):
TypeError: DNA cannot be circularized.
5' and 3' sticky ends not compatible!
>>>
"""
if self.circular:
return copy.deepcopy(self)
type5, sticky5 = self.five_prime_end()
type3, sticky3 = self.three_prime_end()
err = TypeError(
"DNA cannot be circularized.\n" "5' and 3' sticky ends not compatible!"
)
if type5 != type3:
raise err
try:
# Test if sticky ends are compatible
self + self
except TypeError:
raise err
new = self.cast_to_ds_left()[: len(self) - len(sticky3)]
new.circular = True
return new
[docs]
def five_prime_end(self) -> Tuple[str, str]:
"""Returns a 2-tuple of trings describing the structure of the 5' end of
the DNA fragment.
The tuple contains (type , sticky) where type is eiter "5'" or "3'".
sticky is always in lower case and contains the sequence of the
protruding end in 5'-3' direction.
See examples below:
Examples
--------
>>> from pydna.dseq import Dseq
>>> a = Dseq("aa", "tttg", ovhg=2)
>>> a
Dseq(-4)
aa
gttt
>>> a.five_prime_end()
("3'", 'tg')
>>> a = Dseq("caaa", "tt", ovhg=-2)
>>> a
Dseq(-4)
caaa
tt
>>> a.five_prime_end()
("5'", 'ca')
>>> a = Dseq("aa", "tt")
>>> a
Dseq(-2)
aa
tt
>>> a.five_prime_end()
('blunt', '')
See also
--------
pydna.dseq.Dseq.three_prime_end
"""
# See docstring for function pydna.utils.get_parts for details
# on what is contained in parts.
parts = self.get_parts()
sticky5 = parts.sticky_left5.translate(dscode_to_watson_table)
sticky3 = parts.sticky_left3.translate(dscode_to_crick_table)[::-1]
single_watson = parts.single_watson.translate(dscode_to_watson_table)
single_crick = parts.single_crick.translate(dscode_to_crick_table)[::-1]
# The walrus operator returns the value being assigned, so
# we can test if it is empty or not.
if sticky := single_watson:
type_ = "single"
elif sticky := single_crick:
type_ = "single"
elif sticky5 == sticky3 == "":
type_, sticky = "blunt", ""
elif sticky := sticky5:
type_ = "5'"
elif sticky := sticky3:
type_ = "3'"
return type_, sticky.lower()
[docs]
def three_prime_end(self) -> Tuple[str, str]:
"""Returns a tuple describing the structure of the 5' end of
the DNA fragment
>>> a = Dseq("aa", "gttt", ovhg=0)
>>> a
Dseq(-4)
aa
tttg
>>> a.three_prime_end()
("5'", 'gt')
>>> a = Dseq("aaac", "tt", ovhg=0)
>>> a
Dseq(-4)
aaac
tt
>>> a.three_prime_end()
("3'", 'ac')
>>> from pydna.dseq import Dseq
>>> a=Dseq("aaa", "ttt")
>>> a
Dseq(-3)
aaa
ttt
>>> a.three_prime_end()
('blunt', '')
See also
--------
pydna.dseq.Dseq.five_prime_end
"""
# See docstring for function pydna.utils.get_parts for details
# on what is contained in parts.
parts = self.get_parts()
sticky5 = parts.sticky_right5.translate(dscode_to_crick_table)[::-1]
sticky3 = parts.sticky_right3.translate(dscode_to_watson_table)
single_watson = parts.single_watson.translate(dscode_to_watson_table)
single_crick = parts.single_crick.translate(dscode_to_crick_table)[::-1]
# The walrus operator returns the value being assigned, so
# we can test if it is empty or not.
if sticky := single_watson:
type_ = "single"
elif sticky := single_crick:
type_ = "single"
elif sticky5 == sticky3 == "":
type_, sticky = "blunt", ""
elif sticky := sticky5:
type_ = "5'"
elif sticky := sticky3:
type_ = "3'"
return type_, sticky.lower()
def __add__(self: DseqType, other: [DseqType, str, bytes]) -> DseqType:
"""
Adding two Dseq objects together.
>>> ds = Dseq("a", "t", ovhg=0)
>>> ds
Dseq(-1)
a
t
>>> ds + ds
Dseq(-2)
aa
tt
>>> "g" + ds # adding a string of left side returns a Dseq
Dseq(-2)
ga
ct
>>> ds + "c" # adding a string of right side returns a Dseq
Dseq(-2)
ac
tg
Parameters
----------
other : [DseqType, str, bytes]
Object to be added.
Raises
------
TypeError
Preventing adding to a circular sequence.
Returns
-------
DseqType
A new Dseq object.
"""
if self.circular:
raise TypeError("circular DNA cannot be ligated!")
try:
if other.circular:
raise TypeError("circular DNA cannot be ligated!")
except AttributeError:
pass
# If other evaluates to False, return a copy of self.
if not other:
return copy.deepcopy(self)
# If self evaluates to False, return a copy of other.
elif not self:
return copy.deepcopy(other)
# get right side end properties for self.
self_type, self_tail = self.three_prime_end()
try:
other_type, other_tail = other.five_prime_end()
except AttributeError:
# if other does not have the expected properties
# most likely it is a string that can be cast as
# a Dseq.
other_type, other_tail = "blunt", ""
other = Dseq(other)
err = TypeError("sticky ends not compatible!")
# The sticky ends has to be of the same type
# or
# one or both of is "single" indicating a stranded molecule.
if (self_type != other_type) and ("single" not in (self_type, other_type)):
raise err
# tail length has to be equal for two phosphdiester bonds to form
if len(self_tail) != len(other_tail):
raise err
# Each basepair is checked against the pydna.alphabet basepair_dict
# which contains the permitted base pairings.
for w, c in zip(self_tail, other_tail[::-1]):
try:
basepair_dict[(w, c)]
except KeyError:
raise err
return self.__class__(
self.watson + other.watson, other.crick + self.crick, self.ovhg
)
def __mul__(self: DseqType, number: int) -> DseqType:
if not isinstance(number, int):
raise TypeError(
"TypeError: can't multiply Dseq" f" by non-int of type {type(number)}"
)
return Dseq("").join(list(itertools.repeat(self, number)))
def _fill_in_left(self: DseqType, nucleotides: str) -> str:
stuffer = ""
type, se = self.five_prime_end()
if type == "5'":
for n in rc(se):
if n in nucleotides:
stuffer += n
else:
break
return self.crick + stuffer, self.ovhg + len(stuffer)
def _fill_in_right(self: DseqType, nucleotides: str) -> str:
stuffer = ""
type, se = self.three_prime_end()
if type == "5'":
for n in rc(se):
if n in nucleotides:
stuffer += n
else:
break
return self.watson + stuffer
[docs]
def fill_in(self, nucleotides: Union[None, str] = None) -> DseqType:
"""Fill in of five prime protruding end with a DNA polymerase
that has only DNA polymerase activity (such as Exo-Klenow [#]_).
Exo-Klenow is a modified version of the Klenow fragment of E.
coli DNA polymerase I, which has been engineered to lack both
3-5 proofreading and 5-3 exonuclease activities.
and any combination of A, G, C or T. Default are all four
nucleotides together.
Parameters
----------
nucleotides : str
Examples
--------
>>> from pydna.dseq import Dseq
>>> b=Dseq("caaa", "cttt")
>>> b
Dseq(-5)
caaa
tttc
>>> b.fill_in()
Dseq(-5)
caaag
gtttc
>>> b.fill_in("g")
Dseq(-5)
caaag
gtttc
>>> b.fill_in("tac")
Dseq(-5)
caaa
tttc
>>> c=Dseq("aaac", "tttg")
>>> c
Dseq(-5)
aaac
gttt
>>> c.fill_in()
Dseq(-5)
aaac
gttt
>>> a=Dseq("aaa", "ttt")
>>> a
Dseq(-3)
aaa
ttt
>>> a.fill_in()
Dseq(-3)
aaa
ttt
References
----------
.. [#] http://en.wikipedia.org/wiki/Klenow_fragment#The_exo-_Klenow_fragment
"""
if nucleotides is None:
nucleotides = "GATCRYWSMKHBVDN"
nucleotides = set(nucleotides.lower() + nucleotides.upper())
crick, ovhg = self._fill_in_left(nucleotides)
watson = self._fill_in_right(nucleotides)
return Dseq(watson, crick, ovhg)
klenow = fill_in # alias
[docs]
def nibble_to_blunt(self) -> DseqType:
"""
Simulates treatment a nuclease with both 5'-3' and 3'-5' single
strand specific exonuclease activity (such as mung bean nuclease [#]_)
Mung bean nuclease is a nuclease enzyme derived from mung bean sprouts
that preferentially degrades single-stranded DNA and RNA into
5'-phosphate- and 3'-hydroxyl-containing nucleotides.
Treatment results in blunt DNA, regardless of wheter the protruding end
is 5' or 3'.
::
ggatcc -> gatcc
ctaggg ctagg
ggatcc -> ggatc
tcctag cctag
>>> from pydna.dseq import Dseq
>>> b=Dseq("caaa", "cttt")
>>> b
Dseq(-5)
caaa
tttc
>>> b.mung()
Dseq(-3)
aaa
ttt
>>> c=Dseq("aaac", "tttg")
>>> c
Dseq(-5)
aaac
gttt
>>> c.mung()
Dseq(-3)
aaa
ttt
References
----------
.. [#] http://en.wikipedia.org/wiki/Mung_bean_nuclease
"""
parts = self.get_parts()
return self.__class__(parts.middle)
mung = nibble_to_blunt
[docs]
def T4(self, nucleotides=None) -> DseqType:
"""
Fill in 5' protruding ends and nibble 3' protruding ends.
This is done using a DNA polymerase providing 3'-5' nuclease activity
such as T4 DNA polymerase. This can be done in presence of any
combination of the four nucleotides A, G, C or T.
T4 DNA polymerase is widely used to “polish” DNA ends because of its
strong 3-5 exonuclease activity in the absence of dNTPs, it chews
back 3′ overhangs to create blunt ends; in the presence of limiting
dNTPs, it can fill in 5′ overhangs; and by carefully controlling
reaction time, temperature, and nucleotide supply, you can generate
defined recessed or blunt termini.
Tuning the nucleotide set can facilitate engineering of partial
sticky ends. Default are all four nucleotides together.
::
aaagatc-3 aaa 3' ends are always removed.
||| ---> ||| A and T needed or the molecule will
3-ctagttt ttt degrade completely.
5-gatcaaa gatcaaaGATC 5' ends are filled in the
||| ---> ||||||||||| presence of GATC
tttctag-5 CTAGtttctag
5-gatcaaa gatcaaaGAT 5' ends are partially filled in the
||| ---> ||||||||| presence of GAT to produce a 1 nt
tttctag-5 TAGtttctag 5' overhang
5-gatcaaa gatcaaaGA 5' ends are partially filled in the
||| ---> ||||||| presence of GA to produce a 2 nt
tttctag-5 AGtttctag 5' overhang
5-gatcaaa gatcaaaG 5' ends are partially filled in the
||| ---> ||||| presence of G to produce a 3 nt
tttctag-5 Gtttctag 5' overhang
Parameters
----------
nucleotides : str
Examples
--------
>>> from pydna.dseq import Dseq
>>> a = Dseq.from_representation(
... '''
... gatcaaa
... tttctag
... ''')
>>> a
Dseq(-11)
gatcaaa
tttctag
>>> a.T4()
Dseq(-11)
gatcaaagatc
ctagtttctag
>>> a.T4("GAT")
Dseq(-11)
gatcaaagat
tagtttctag
>>> a.T4("GA")
Dseq(-11)
gatcaaaga
agtttctag
>>> a.T4("G")
Dseq(-11)
gatcaaag
gtttctag
"""
if not nucleotides:
nucleotides = "GATCRYWSMKHBVDN"
nucleotides = set(nucleotides.lower() + nucleotides.upper())
type, se = self.five_prime_end()
if type == "5'":
crick, ovhg = self._fill_in_left(nucleotides)
else:
if type == "3'":
ovhg = 0
crick = self.crick[: -len(se)]
else:
ovhg = 0
crick = self.crick
x = len(crick) - 1
while x >= 0:
if crick[x] in nucleotides:
break
x -= 1
ovhg = x - len(crick) + 1 + ovhg
crick = crick[: x + 1]
if not crick:
ovhg = 0
watson = self.watson
type, se = self.three_prime_end()
if type == "5'":
watson = self._fill_in_right(nucleotides)
else:
if type == "3'":
watson = self.watson[: -len(se)]
x = len(watson) - 1
while x >= 0:
if watson[x] in nucleotides:
break
x -= 1
watson = watson[: x + 1]
return Dseq(watson, crick, ovhg)
t4 = T4 # alias for the T4 method.
[docs]
def nibble_five_prime_left(self: DseqType, n: int = 1) -> DseqType:
"""
5' => 3' resection at the left side (start) of the molecule.
The argument n indicate the number of nucleotides that are to be
removed. The outcome of this depend on the structure of the molecule.
See the two examples below:
The figure below indicates a recess of length two from a blunt DNA
fragment. The resulting DNA fragment has a 3' protruding single strand.
::
gatc tc
|||| --> ||
ctag ctag
The figure below indicates a recess of length two from a DNA fragment
with a 5' sticky end resulting in a blunt sequence.
::
ttgatc gatc
|||| --> ||||
ctag ctag
>>> from pydna.dseq import Dseq
>>> ds = Dseq("gatc")
>>> ds
Dseq(-4)
gatc
ctag
>>> ds.nibble_five_prime_left(2)
Dseq(-4)
tc
ctag
>>> ds.nibble_five_prime_left(3)
Dseq(-4)
c
ctag
>>> ds.nibble_five_prime_left(4)
Dseq(-4)
||||
ctag
>>> ds = Dseq.from_representation(
... '''
... GGgatc
... ctag
... ''')
>>> ds
Dseq(-6)
GGgatc
ctag
>>> ds.nibble_five_prime_left(2)
Dseq(-4)
gatc
ctag
Parameters
----------
n : int, optional
The default is 1. This is the number of nucleotides removed.
Returns
-------
DseqType
DESCRIPTION.
"""
n += max(0, self.ovhg or 0)
return Dseq(
self._data[:n]
.translate(dscode_to_crick_table)
.translate(complement_table_for_dscode)
.translate(dscode_to_crick_tail_table)
.lstrip()
+ self._data[n:]
)
[docs]
def nibble_five_prime_right(self: DseqType, n: int = 1) -> DseqType:
"""
5' => 3' resection at the right side (end) of the molecule.
The argument n indicate the number of nucleotides that are to be
removed. The outcome of this depend on the structure of the molecule.
See the two examples below:
The figure below indicates a recess of length two from a blunt DNA
fragment. The resulting DNA fragment has a 3' protruding single strand.
::
gatc gatc
|||| --> ||
ctag ct
The figure below indicates a recess of length two from a DNA fragment
with a 5' sticky end resulting in a blunt sequence.
::
gatc gatc
|||| --> ||||
ctagtt ctag
>>> from pydna.dseq import Dseq
>>> ds = Dseq("gatc")
>>> ds
Dseq(-4)
gatc
ctag
>>> ds.nibble_five_prime_right(2)
Dseq(-4)
gatc
ct
>>> ds.nibble_five_prime_right(3)
Dseq(-4)
gatc
c
>>> ds.nibble_five_prime_right(4)
Dseq(-4)
gatc
||||
>>> ds = Dseq.from_representation(
... '''
... gatc
... ctagGG
... ''')
>>> ds.nibble_five_prime_right(2)
Dseq(-4)
gatc
ctag
"""
n = len(self) - n
ovhg = len(self) if self.right_ovhg is None else self.right_ovhg
n -= max(0, ovhg)
return Dseq(
self._data[:n]
+ self._data[n:]
.translate(dscode_to_watson_table)
.translate(dscode_to_watson_tail_table)
.lstrip()
)
exo1_front = nibble_five_prime_left # TODO: consider using the new names
exo1_end = nibble_five_prime_right # TODO: consider using the new names
[docs]
def nibble_three_prime_left(self: DseqType, n=1) -> DseqType:
"""
3' => 5' resection at the left side (beginning) of the molecule.
The argument n indicate the number of nucleotides that are to be
removed. The outcome of this depend on the structure of the molecule.
See the two examples below:
The figure below indicates a recess of length two from a blunt DNA
fragment. The resulting DNA fragment has a 5' protruding single strand.
::
gatc gatc
|||| --> ||
ctag ag
The figure below indicates a recess of length two from a DNA fragment
with a 3' sticky end resulting in a blunt sequence.
::
gatc gatc
|||| --> ||||
ttctag ctag
>>> from pydna.dseq import Dseq
>>> ds = Dseq("gatc")
>>> ds
Dseq(-4)
gatc
ctag
>>> ds.nibble_three_prime_left(2)
Dseq(-4)
gatc
ag
>>> ds.nibble_three_prime_left(3)
Dseq(-4)
gatc
g
>>> ds.nibble_three_prime_left(4)
Dseq(-4)
gatc
||||
>>> ds = Dseq.from_representation(
... '''
... gatc
... CCctag
... ''')
>>> ds
Dseq(-6)
gatc
CCctag
>>> ds.nibble_three_prime_left(2)
Dseq(-4)
gatc
ctag
"""
ovhg = len(self) if self.ovhg is None else self.ovhg
n -= min(0, ovhg)
return Dseq(
self._data[:n]
.translate(dscode_to_watson_table)
.translate(dscode_to_watson_tail_table)
.lstrip()
+ self._data[n:]
)
[docs]
def nibble_three_prime_right(self: DseqType, n=1) -> DseqType:
"""
3' => 5' resection at the right side (end) of the molecule.
The argument n indicate the number of nucleotides that are to be
removed. The outcome of this depend on the structure of the molecule.
See the two examples below:
The figure below indicates a recess of length two from a blunt DNA
fragment. The resulting DNA fragment has a 5' protruding single strand.
::
gatc ga
|||| --> ||
ctag ctag
The figure below indicates a recess of length two from a DNA fragment
with a 3' sticky end resulting in a blunt sequence.
::
gatctt gatc
|||| --> ||||
ctag ctag
>>> from pydna.dseq import Dseq
>>> ds = Dseq("gatc")
>>> ds
Dseq(-4)
gatc
ctag
>>> ds.nibble_three_prime_right(2)
Dseq(-4)
ga
ctag
>>> ds.nibble_three_prime_right(3)
Dseq(-4)
g
ctag
>>> ds.nibble_three_prime_right(4)
Dseq(-4)
||||
ctag
>>> ds = Dseq.from_representation(
... '''
... gatcCC
... ctag
... ''')
>>> ds.nibble_three_prime_right(2)
Dseq(-4)
gatc
ctag
"""
n = len(self) - n
ovhg = len(self) if self.right_ovhg is None else self.right_ovhg
n += min(0, ovhg)
return Dseq(
self._data[:n]
+ self._data[n:]
.translate(dscode_to_crick_table)
.translate(complement_table_for_dscode)
.translate(dscode_to_crick_tail_table)
.lstrip()
)
[docs]
def no_cutters(
self, batch: Union[RestrictionBatch, None] = None
) -> RestrictionBatch:
"""Enzymes in a RestrictionBatch not cutting sequence."""
if batch is None:
batch = CommOnly
ana = batch.search(self)
ncut = {enz: sitelist for (enz, sitelist) in ana.items() if not sitelist}
return RestrictionBatch(ncut)
[docs]
def unique_cutters(
self, batch: Union[RestrictionBatch, None] = None
) -> RestrictionBatch:
"""Enzymes in a RestrictionBatch cutting sequence once."""
if batch is None:
batch = CommOnly
return self.n_cutters(n=1, batch=batch)
once_cutters = unique_cutters # alias for unique_cutters
[docs]
def twice_cutters(
self, batch: Union[RestrictionBatch, None] = None
) -> RestrictionBatch:
"""Enzymes in a RestrictionBatch cutting sequence twice."""
if batch is None:
batch = CommOnly
return self.n_cutters(n=2, batch=batch)
[docs]
def n_cutters(
self, n=3, batch: Union[RestrictionBatch, None] = None
) -> RestrictionBatch:
"""Enzymes in a RestrictionBatch cutting n times."""
if batch is None:
batch = CommOnly
ana = batch.search(self)
ncut = {enz: sitelist for (enz, sitelist) in ana.items() if len(sitelist) == n}
return RestrictionBatch(ncut)
[docs]
def cutters(self, batch: Union[RestrictionBatch, None] = None) -> RestrictionBatch:
"""Enzymes in a RestrictionBatch cutting sequence at least once."""
if batch is None:
batch = CommOnly
ana = batch.search(self)
ncut = {enz: sitelist for (enz, sitelist) in ana.items() if sitelist}
return RestrictionBatch(ncut)
[docs]
def seguid(self) -> str:
"""SEGUID checksum for the sequence."""
if self.circular:
cs = cdseguid(
self.watson.upper(), self.crick.upper(), alphabet="{DNA-extended}"
)
else:
"""docstring."""
w = f"{self.ovhg * '-'}{self.watson}{'-' * (-self.ovhg + len(self.crick) - len(self.watson))}".upper()
c = f"{'-' * (self.ovhg + len(self.watson) - len(self.crick))}{self.crick}{-self.ovhg * '-'}".upper()
cs = ldseguid(w, c, alphabet="{DNA-extended},AU")
return cs
[docs]
def isblunt(self) -> bool:
"""isblunt.
Return True if Dseq is linear and blunt and
false if staggered or circular.
Examples
--------
>>> from pydna.dseq import Dseq
>>> a=Dseq("gat")
>>> a
Dseq(-3)
gat
cta
>>> a.isblunt()
True
>>> a=Dseq("gat", "atcg")
>>> a
Dseq(-4)
gat
gcta
>>> a.isblunt()
False
>>> a=Dseq("gat", "gatc")
>>> a
Dseq(-4)
gat
ctag
>>> a.isblunt()
False
>>> a=Dseq("gat", circular=True)
>>> a
Dseq(o3)
gat
cta
>>> a.isblunt()
False
"""
parts = self.get_parts()
return not any(
(
parts.sticky_right5,
parts.sticky_right3,
parts.sticky_left3,
parts.sticky_left5,
self.circular,
)
)
[docs]
def terminal_transferase(self, nucleotides: str = "a") -> DseqType:
"""
Terminal deoxynucleotidyl transferase (TdT) is a template-independent
DNA polymerase that adds nucleotides to the 3′-OH ends of DNA, typically
single-stranded or recessed 3′ ends. In cloning, it’s classically used
to create homopolymer tails (e.g. poly-dG on a vector and poly-dC on an insert)
so that fragments can anneal via complementary overhangs (“tailing” cloning).
This activity ia also present in some DNA polymerases, such as Taq polymerase.
This property is used in the populat T/A cloning protocol ([#]_).
::
gct gcta
||| --> |||
cga acga
>>> from pydna.dseq import Dseq
>>> a = Dseq("aa")
>>> a = Dseq("gct")
>>> a
Dseq(-3)
gct
cga
>>> a.terminal_transferase()
Dseq(-5)
gcta
acga
>>> a.terminal_transferase("G")
Dseq(-5)
gctG
Gcga
Parameters
----------
nucleotides : str, optional
The default is "a".
Returns
-------
DseqType
DESCRIPTION.
References
----------
.. [#] https://en.wikipedia.org/wiki/TA_cloning
"""
ovhg = self.ovhg
if self.ovhg >= 0:
ovhg += len(nucleotides)
return Dseq(self.watson + nucleotides, self.crick + nucleotides, ovhg)
[docs]
def user(self) -> DseqType:
"""
USER Enzyme treatment.
USER Enzyme is a mixture of Uracil DNA glycosylase (UDG) and the
DNA glycosylase-lyase Endonuclease VIII.
UDG catalyses the excision of an uracil base, forming an abasic
or apyrimidinic site (AP site). Endonuclease VIII removes the AP
site creating a DNA gap.
::
tagaagtaggUat tagaagtagg at
||||||||||||| ---> |||||||||| ||
atcUtcatccata atc tcatccata
>>> a = Dseq("tagaagtaggUat", "atcUtcatccata"[::-1], 0)
>>> a
Dseq(-13)
tagaagtaggUat
atcutcatccAta
>>> a.user()
Dseq(-13)
tagaagtagg at
atc tcatccAta
Returns
-------
DseqType
DNA fragment with uracile bases removed.
"""
return Dseq(self._data.translate(bytes.maketrans(b"UuOo", b"ZzEe")))
[docs]
def cut(self: DseqType, *enzymes: EnzymesType) -> Tuple[DseqType, ...]:
"""Returns a list of linear Dseq fragments produced in the digestion.
If there are no cuts, an empty list is returned.
Parameters
----------
enzymes : enzyme object or iterable of such objects
A Bio.Restriction.XXX restriction objects or iterable.
Returns
-------
frags : list
list of Dseq objects formed by the digestion
Examples
--------
>>> from pydna.dseq import Dseq
>>> seq=Dseq("ggatccnnngaattc")
>>> seq
Dseq(-15)
ggatccnnngaattc
cctaggnnncttaag
>>> from Bio.Restriction import BamHI,EcoRI
>>> type(seq.cut(BamHI))
<class 'tuple'>
>>> for frag in seq.cut(BamHI): print(repr(frag))
Dseq(-5)
g
cctag
Dseq(-14)
gatccnnngaattc
gnnncttaag
>>> seq.cut(EcoRI, BamHI) == seq.cut(BamHI, EcoRI)
True
>>> a,b,c = seq.cut(EcoRI, BamHI)
>>> a+b+c
Dseq(-15)
ggatccnnngaattc
cctaggnnncttaag
>>>
"""
cutsites = self.get_cutsites(*enzymes)
cutsite_pairs = self.get_cutsite_pairs(cutsites)
return tuple(self.apply_cut(*cs) for cs in cutsite_pairs)
[docs]
def cutsite_is_valid(self, cutsite: CutSiteType) -> bool:
"""
Check is a cutsite is valid.
A cutsite is a nested 2-tuple with this form:
((cut_watson, ovhg), enz), for example ((396, -4), EcoRI)
The cut_watson (positive integer) is the cut position of the sequence as for example
returned by the Bio.Restriction module.
The ovhg (overhang, positive or negative integer or 0) has the same meaning as
for restriction enzymes in the Bio.Restriction module and for
pydna.dseq.Dseq objects (see docstring for this module and example below)
Enzyme can be None.
::
Enzyme overhang
EcoRI -4 --GAATTC-- --G AATTC--
|||||| --> | |
--CTTAAG-- --CTTAA G--
KpnI 4 --GGTACC-- --GGTAC C--
|||||| --> | |
--CCATGG-- --C CATGG--
SmaI 0 --CCCGGG-- --CCC GGG--
|||||| --> ||| |||
--GGGCCC-- --GGG CCC--
>>> from Bio.Restriction import EcoRI, KpnI, SmaI
>>> EcoRI.ovhg
-4
>>> KpnI.ovhg
4
>>> SmaI.ovhg
0
Returns False if:
- Cut positions fall outside the sequence (could be moved to Biopython)
TODO: example
- Overhang is not double stranded
TODO: example
- Recognition site is not double stranded or is outside the sequence
TODO: example
- For enzymes that cut twice, it checks that at least one possibility is valid
TODO: example
Parameters
----------
cutsite : CutSiteType
DESCRIPTION.
Returns
-------
bool
True if cutsite can cut the DNA fragment.
"""
assert cutsite is not None, "cutsite is None"
enz = cutsite[1]
watson, crick, ovhg = self.get_cut_parameters(cutsite, True)
# The overhang is double stranded
overhang_dseq = self[watson:crick] if ovhg < 0 else self[crick:watson]
if overhang_dseq.ovhg != 0 or overhang_dseq.watson_ovhg != 0:
return False
# The recognition site is double stranded and within the sequence
start_of_recognition_site = watson - enz.fst5
if start_of_recognition_site < 0:
start_of_recognition_site += len(self)
end_of_recognition_site = start_of_recognition_site + enz.size
if self.circular:
end_of_recognition_site %= len(self)
recognition_site = self[start_of_recognition_site:end_of_recognition_site]
if (
len(recognition_site) == 0
or recognition_site.ovhg != 0
or recognition_site.watson_ovhg != 0
):
if enz is None or enz.scd5 is None:
return False
else:
# For enzymes that cut twice, this might be referring to the second one
start_of_recognition_site = watson - enz.scd5
if start_of_recognition_site < 0:
start_of_recognition_site += len(self)
end_of_recognition_site = start_of_recognition_site + enz.size
if self.circular:
end_of_recognition_site %= len(self)
recognition_site = self[
start_of_recognition_site:end_of_recognition_site
]
if (
len(recognition_site) == 0
or recognition_site.ovhg != 0
or recognition_site.watson_ovhg != 0
):
return False
return True
[docs]
def get_cutsites(self: DseqType, *enzymes: EnzymesType) -> List[CutSiteType]:
"""Returns a list of cutsites, represented represented as `((cut_watson, ovhg), enz)`:
- `cut_watson` is a positive integer contained in `[0,len(seq))`, where `seq` is the sequence
that will be cut. It represents the position of the cut on the watson strand, using the full
sequence as a reference. By "full sequence" I mean the one you would get from `str(Dseq)`.
- `ovhg` is the overhang left after the cut. It has the same meaning as `ovhg` in
the `Bio.Restriction` enzyme objects, or pydna's `Dseq` property.
- `enz` is the enzyme object. It's not necessary to perform the cut, but can be
used to keep track of which enzyme was used.
Cuts are only returned if the recognition site and overhang are on the double-strand
part of the sequence.
Parameters
----------
enzymes : Union[RestrictionBatch,list[_AbstractCut]]
Returns
-------
list[tuple[tuple[int,int], _AbstractCut]]
Examples
--------
>>> from Bio.Restriction import EcoRI
>>> from pydna.dseq import Dseq
>>> seq = Dseq('AAGAATTCAAGAATTC')
>>> seq.get_cutsites(EcoRI)
[((3, -4), EcoRI), ((11, -4), EcoRI)]
`cut_watson` is defined with respect to the "full sequence", not the
watson strand:
>>> dseq = Dseq.from_full_sequence_and_overhangs('aaGAATTCaa', 1, 0)
>>> dseq
Dseq(-10)
aGAATTCaa
ttCTTAAGtt
>>> dseq.get_cutsites([EcoRI])
[((3, -4), EcoRI)]
Cuts are only returned if the recognition site and overhang are on the double-strand
part of the sequence.
>>> Dseq('GAATTC').get_cutsites([EcoRI])
[((1, -4), EcoRI)]
>>> Dseq.from_full_sequence_and_overhangs('GAATTC', -1, 0).get_cutsites([EcoRI])
[]
"""
if len(enzymes) == 1 and isinstance(enzymes[0], RestrictionBatch):
# argument is probably a RestrictionBatch
enzymes = [e for e in enzymes[0]]
enzymes = list(dict.fromkeys(flatten(enzymes))) # remove duplicate enzymes
out = list()
for e in enzymes:
# Positions of the cut on the watson strand. They are 1-based, so we subtract
# 1 to get 0-based positions
cuts_watson = [c - 1 for c in e.search(self, linear=(not self.circular))]
out += [((w, e.ovhg), e) for w in cuts_watson]
return sorted([cutsite for cutsite in out if self.cutsite_is_valid(cutsite)])
[docs]
def left_end_position(self) -> Tuple[int, int]:
"""
The index in the full sequence of the watson and crick start positions.
full sequence (str(self)) for all three cases is AAA
::
AAA AA AAT
TT TTT TTT
Returns (0, 1) Returns (1, 0) Returns (0, 0)
"""
if self.ovhg > 0:
return self.ovhg, 0
return 0, -self.ovhg
[docs]
def right_end_position(self) -> Tuple[int, int]:
"""The index in the full sequence of the watson and crick end positions.
full sequence (str(self)) for all three cases is AAA
```
AAA AA AAA
TT TTT TTT
Returns (3, 2) Returns (2, 3) Returns (3, 3)
```
"""
if self.watson_ovhg < 0:
return len(self) + self.watson_ovhg, len(self)
return len(self), len(self) - self.watson_ovhg
[docs]
def get_ss_meltsites(self: DseqType, length: int) -> tuple[int, int]:
"""
Single stranded DNA melt sites
Two lists of 2-tuples of integers are returned. Each tuple
(`((from, to))`) contains the start and end positions of a single
stranded region, shorter or equal to `length`.
In the example below, the middle 2 nt part is released from the
molecule.
::
tagaa ta gtatg
||||| || ||||| --> [(6,8)], []
atcttcatccatac
tagaagtaggtatg
||||| || ||||| --> [], [(6,8)]
atctt at catac
The output of this method is used in the `melt_ss_dna` method in order
to determine the start and end positions of single stranded regions.
See get_ds_meltsites for melting ds sequences.
Examples
--------
>>> from pydna.dseq import Dseq
>>> ds = Dseq("tagaaqtaqgtatg")
>>> ds
Dseq(-14)
tagaa ta gtatg
atcttcatccatac
>>> cutsites = ds.get_ss_meltsites(2)
>>> cutsites
([(6, 8)], [])
>>> ds[6:8]
Dseq(-2)
ta
at
>>> ds = Dseq("tagaaptapgtatg")
>>> ds
Dseq(-14)
tagaagtaggtatg
atctt at catac
>>> cutsites = ds.get_ss_meltsites(2)
>>> cutsites
([], [(6, 8)])
"""
regex = regex_ss_melt_factory(length)
if self.circular:
spacer = length
cutfrom = self._data[-length:] + self._data + self._data[:length]
else:
spacer = 0
cutfrom = self._data
watson_cuts = []
crick_cuts = []
for m in regex.finditer(cutfrom):
if m.lastgroup == "watson":
cut1 = m.start() + spacer
cut2 = m.end() + spacer
watson_cuts.append((cut1, cut2))
else:
assert m.lastgroup == "crick"
cut1 = m.start() + spacer
cut2 = m.end() + spacer
crick_cuts.append((cut1, cut2))
return watson_cuts, crick_cuts
[docs]
def get_ds_meltsites(self: DseqType, length: int) -> List[CutSiteType]:
"""
Double stranded DNA melt sites
DNA molecules can fall apart by melting if they have internal single
stranded regions. In the example below, the molecule has two gaps
on opposite sides, two nucleotides apart, which means that it hangs
together by two basepairs.
This molecule can melt into two separate 8 bp double stranded
molecules, each with 3 nt 3' overhangs a depicted below.
::
tagaagta gtatg tagaagta gtatg
||||| || ||||| --> ||||| |||||
atctt atccatac atctt atccatac
A list of 2-tuples is returned. Each tuple (`((cut_watson, ovhg), None)`)
contains cut position and the overhang value in the same format as
returned by the get_cutsites method for restriction enzymes.
Note that this function deals with melting that results in two double
stranded DNA molecules.
See get_ss_meltsites for melting of single stranded regions from
molecules.
Examples
--------
>>> from pydna.dseq import Dseq
>>> ds = Dseq("tagaaptaqgtatg")
>>> ds
Dseq(-14)
tagaagta gtatg
atctt atccatac
>>> cutsite = ds.get_ds_meltsites(2)
>>> cutsite
[((8, 2), None)]
"""
if length < 1:
return tuple()
regex = regex_ds_melt_factory(length)
if self.circular:
spacer = length
cutfrom = self._data[-length:] + self._data + self._data[:length]
else:
spacer = 0
cutfrom = self._data
cuts = []
for m in regex.finditer(cutfrom):
if m.lastgroup == "watson":
cut = (m.end() - spacer, m.end() - m.start()), None
else:
assert m.lastgroup == "crick"
cut = (m.start() - spacer, m.start() - m.end()), None
cuts.append(cut)
return cuts
[docs]
def cast_to_ds_right(self):
"""
NNNN NNNNGATC
|||| --> ||||||||
NNNNCTAG NNNNCTAG
NNNNGATC NNNNGATC
|||| --> ||||||||
NNNN NNNNCTAG
"""
p = self.get_parts()
ds_stuffer = (p.sticky_right5 or p.sticky_right3).translate(
dscode_to_full_sequence_table
)
result = (p.sticky_left5 or p.sticky_left3) + p.middle + ds_stuffer
return self.__class__(result, circular=False)
[docs]
def cast_to_ds(self):
"""Sequencially calls cast_to_ds_left and cast_to_ds_right."""
return self.cast_to_ds_left().cast_to_ds_right()
[docs]
def cast_to_ds_left(self):
"""
GATCNNNN GATCNNNN
|||| --> ||||||||
NNNN CTAGNNNN
NNNN GATCNNNN
|||| --> ||||||||
CTAGNNNN CTAGNNNN
"""
p = self.get_parts()
ds_stuffer = (p.sticky_left5 or p.sticky_left3).translate(
dscode_to_full_sequence_table
)
result = ds_stuffer + p.middle + (p.sticky_right5 or p.sticky_right3)
return self.__class__(result, circular=False)
[docs]
def get_cut_parameters(
self, cut: Union[CutSiteType, None], is_left: bool
) -> Tuple[int, int, int]:
"""For a given cut expressed as ((cut_watson, ovhg), enz), returns
a tuple (cut_watson, cut_crick, ovhg).
- cut_watson: see get_cutsites docs
- cut_crick: equivalent of cut_watson in the crick strand
- ovhg: see get_cutsites docs
The cut can be None if it represents the left or right end of the sequence.
Then it will return the position of the watson and crick ends with respect
to the "full sequence". The `is_left` parameter is only used in this case.
"""
if cut is not None:
watson, ovhg = cut[0]
crick = watson - ovhg
if self.circular:
crick %= len(self)
return watson, crick, ovhg
assert not self.circular, "Circular sequences should not have None cuts"
if is_left:
return *self.left_end_position(), self.ovhg
# In the right end, the overhang does not matter
return *self.right_end_position(), self.watson_ovhg
[docs]
def melt(self, length):
"""
TBD
Parameters
----------
length : TYPE
DESCRIPTION.
Returns
-------
TYPE
DESCRIPTION.
"""
if not length or length < 1:
return tuple()
# First we need to get rid of single stranded sequences
new, strands = self.melt_ss_dna(length)
cutsites = new.get_ds_meltsites(length)
cutsite_pairs = self.get_cutsite_pairs(cutsites)
result = tuple(new.apply_cut(*cutsite_pair) for cutsite_pair in cutsite_pairs)
result = tuple([new]) if strands and not result else result
return tuple(strands) + tuple(result)
[docs]
def melt_ss_dna(self, length) -> tuple["Dseq", list["Dseq"]]:
"""
Melt to separate single stranded DNA
Single stranded DNA molecules shorter or equal to `length` shed from
a double stranded DNA molecule without affecting the length of the
remaining molecule.
In the examples below, the middle 2 nt part is released from the
molecule.
::
tagaa ta gtatg tagaa gtatg ta
||||| || ||||| --> ||||| ||||| + ||
atcttcatccatac atcttcatccatac
tagaagtaggtatg tagaagtaggtatg
||||| || ||||| --> ||||| ||||| + ||
atctt at catac atctt catac at
Examples
--------
>>> from pydna.dseq import Dseq
>>> ds = Dseq("tagaaqtaqgtatg")
>>> ds
Dseq(-14)
tagaa ta gtatg
atcttcatccatac
>>> new, strands = ds.melt_ss_dna(2)
>>> new
Dseq(-14)
tagaa gtatg
atcttcatccatac
>>> strands[0]
Dseq(-2)
ta
||
>>> ds = Dseq("tagaaptapgtatg")
>>> ds
Dseq(-14)
tagaagtaggtatg
atctt at catac
>>> new, strands = ds.melt_ss_dna(2)
>>> new
Dseq(-14)
tagaagtaggtatg
atctt catac
>>> strands[0]
Dseq(-2)
||
at
"""
watsonnicks, cricknicks = self.get_ss_meltsites(length)
new, strands = self.shed_ss_dna(watsonnicks, cricknicks)
return new, strands
[docs]
def shed_ss_dna(
self,
watson_cutpairs: list[tuple[int, int]] = None,
crick_cutpairs: list[tuple[int, int]] = None,
):
"""
Separate parts of one of the DNA strands
Examples
--------
>>> from pydna.dseq import Dseq
>>> ds = Dseq("tagaagtaggtatg")
>>> ds
Dseq(-14)
tagaagtaggtatg
atcttcatccatac
>>> new, strands = ds.shed_ss_dna([(6, 8)],[])
>>> new
Dseq(-14)
tagaag ggtatg
atcttcatccatac
>>> strands[0]
Dseq(-2)
ta
||
>>> new, strands = ds.shed_ss_dna([],[(6, 8)])
>>> new
Dseq(-14)
tagaagtaggtatg
atcttc ccatac
>>> strands[0]
Dseq(-2)
||
at
>>> ds = Dseq("tagaagtaggtatg")
>>> new, (strand1, strand2) = ds.shed_ss_dna([(6, 8), (9, 11)],[])
>>> new
Dseq(-14)
tagaag g atg
atcttcatccatac
>>> strand1
Dseq(-2)
ta
||
>>> strand2
Dseq(-2)
gt
||
"""
watson_cutpairs = watson_cutpairs or list()
crick_cutpairs = crick_cutpairs or list()
strands = []
new = bytearray(self._data)
for x, y in watson_cutpairs:
stuffer = new[x:y]
ss = Dseq.quick(new[x:y].translate(dscode_to_watson_tail_table))
new[x:y] = stuffer.translate(dscode_to_crick_tail_table)
strands.append(ss)
for x, y in crick_cutpairs:
stuffer = new[x:y]
ss = Dseq.quick(stuffer.translate(dscode_to_crick_tail_table))
new[x:y] = stuffer.translate(dscode_to_watson_tail_table)
strands.append(ss)
return Dseq.quick(new), strands
[docs]
def apply_cut(self, left_cut: CutSiteType, right_cut: CutSiteType) -> "Dseq":
"""Extracts a subfragment of the sequence between two cuts.
For more detail see the documentation of get_cutsite_pairs.
Parameters
----------
left_cut : Union[tuple[tuple[int,int], _AbstractCut], None]
right_cut: Union[tuple[tuple[int,int], _AbstractCut], None]
Returns
-------
Dseq
Examples
--------
>>> from Bio.Restriction import EcoRI
>>> from pydna.dseq import Dseq
>>> dseq = Dseq('aaGAATTCaaGAATTCaa')
>>> cutsites = dseq.get_cutsites([EcoRI])
>>> cutsites
[((3, -4), EcoRI), ((11, -4), EcoRI)]
>>> p1, p2, p3 = dseq.get_cutsite_pairs(cutsites)
>>> p1
(None, ((3, -4), EcoRI))
>>> dseq.apply_cut(*p1)
Dseq(-7)
aaG
ttCTTAA
>>> p2
(((3, -4), EcoRI), ((11, -4), EcoRI))
>>> dseq.apply_cut(*p2)
Dseq(-12)
AATTCaaG
GttCTTAA
>>> p3
(((11, -4), EcoRI), None)
>>> dseq.apply_cut(*p3)
Dseq(-7)
AATTCaa
Gtt
>>> dseq = Dseq('TTCaaGAA', circular=True)
>>> cutsites = dseq.get_cutsites([EcoRI])
>>> cutsites
[((6, -4), EcoRI)]
>>> pair = dseq.get_cutsite_pairs(cutsites)[0]
>>> pair
(((6, -4), EcoRI), ((6, -4), EcoRI))
>>> dseq.apply_cut(*pair)
Dseq(-12)
AATTCaaG
GttCTTAA
"""
if cuts_overlap(left_cut, right_cut, len(self)):
raise ValueError("Cuts by {} {} overlap.".format(left_cut[1], right_cut[1]))
left_watson, left_crick, ovhg_left = self.get_cut_parameters(left_cut, True)
right_watson, right_crick, _ = self.get_cut_parameters(right_cut, False)
return Dseq(
self[left_watson:right_watson]._data.translate(dscode_to_watson_table),
self[left_crick:right_crick]
.reverse_complement()
._data.translate(dscode_to_watson_table),
ovhg=ovhg_left,
)
[docs]
def get_cutsite_pairs(
self, cutsites: List[CutSiteType]
) -> List[Tuple[Union[None, CutSiteType], Union[None, CutSiteType]]]:
"""Returns pairs of cutsites that render the edges of the resulting fragments.
A fragment produced by restriction is represented by a tuple of length 2 that
may contain cutsites or `None`:
- Two cutsites: represents the extraction of a fragment between those two
cutsites, in that orientation. To represent the opening of a circular
molecule with a single cutsite, we put the same cutsite twice.
- `None`, cutsite: represents the extraction of a fragment between the left
edge of linear sequence and the cutsite.
- cutsite, `None`: represents the extraction of a fragment between the cutsite
and the right edge of a linear sequence.
Parameters
----------
cutsites : list[tuple[tuple[int,int], _AbstractCut]]
Returns
-------
list[tuple[tuple[tuple[int,int], _AbstractCut]|None],tuple[tuple[int,int], _AbstractCut]|None]
Examples
--------
>>> from Bio.Restriction import EcoRI
>>> from pydna.dseq import Dseq
>>> dseq = Dseq('aaGAATTCaaGAATTCaa')
>>> cutsites = dseq.get_cutsites([EcoRI])
>>> cutsites
[((3, -4), EcoRI), ((11, -4), EcoRI)]
>>> dseq.get_cutsite_pairs(cutsites)
[(None, ((3, -4), EcoRI)), (((3, -4), EcoRI), ((11, -4), EcoRI)), (((11, -4), EcoRI), None)]
>>> dseq = Dseq('TTCaaGAA', circular=True)
>>> cutsites = dseq.get_cutsites([EcoRI])
>>> cutsites
[((6, -4), EcoRI)]
>>> dseq.get_cutsite_pairs(cutsites)
[(((6, -4), EcoRI), ((6, -4), EcoRI))]
"""
if len(cutsites) == 0:
return []
if not self.circular:
cutsites = [None, *cutsites, None]
else:
# Add the first cutsite at the end, for circular cuts
cutsites.append(cutsites[0])
return list(zip(cutsites, cutsites[1:]))
[docs]
def get_parts(self):
"""
Returns a DseqParts instance containing the parts (strings) of a dsDNA
sequence. DseqParts instance field names:
::
"sticky_left5"
|
| "sticky_right5"
| |
--- ---
GGGATCC
TAGGTCA
----
|
"middle"
"sticky_left3"
|
| "sticky_right3"
| |
--- ---
ATCCAGT
CCCTAGG
----
|
"middle"
"single_watson" (only an upper strand)
|
-------
ATCCAGT
|||||||
"single_crick" (only a lower strand)
|
-------
|||||||
CCCTAGG
Up to seven groups (0..6) are captured, but some are mutually exclusive
which means that one of them is an empty string:
0 or 1, not both, a DNA fragment has either 5' or 3' sticky end.
2 or 5 or 6, a DNA molecule has a ds region or is single stranded.
3 or 4, not both, either 5' or 3' sticky end.
Note that internal single stranded regions are not identified and will
be contained in the middle part if they are present.
Examples
--------
>>> from pydna.dseq import Dseq
>>> ds = Dseq("PPPATCFQZ")
>>> ds
Dseq(-9)
GGGATC
TAGTCA
>>> parts = ds.get_parts()
>>> parts
DseqParts(sticky_left5='PPP', sticky_left3='', middle='ATC', sticky_right3='', sticky_right5='FQZ', single_watson='', single_crick='')
>>> Dseq(parts.sticky_left5)
Dseq(-3)
GGG
|||
>>> Dseq(parts.middle)
Dseq(-3)
ATC
TAG
>>> Dseq(parts.sticky_right5)
Dseq(-3)
|||
TCA
Parameters
----------
datastring : str
A string with dscode.
Returns
-------
namedtuple
Seven string fields describing the DNA molecule.
fragment(sticky_left5='', sticky_left3='',
middle='',
sticky_right3='', sticky_right5='',
single_watson='', single_crick='')
"""
return get_parts(self._data.decode("ascii"))