Source code for pydna.alphabet

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
dscode - The nucleic acid alphabet used in pydna

This file serves to define dscode, the DNA alphabet used in pydna.
Each symbol represents a basepair (two opposing bases in the two antiparallel
DNA strands).

The alphabet is defined at the end of this docstring, which serves as the single
source of truth. The alphabet is used to construct the codestrings dictionary,
which has the following keys (strings) in the order indicated:

1. un_ambiguous_ds_dna
2. ds_rna
3. ambiguous_ds_dna
4. single_stranded_dna_rna
5. loops_dna_rna
6. mismatched_dna_rna
7. gap

Each value of the codestrings dictionary is a multiline string. This string
has five lines following this form:

::

    W             1   Watson symbol
    |             2   Pipe
    C             3   Crick symbol
    <empty line>  4
    S             5   dscode symbol

W (line 1) and C (line 3) are complementary bases in a double stranded DNA
molecule and S (line 5) is the symbol of the alphabet used to
describe the base pair above the symbol.

Line 2 must contain only the pipe character, indicating basepairing and
line 4 must be empty. The lines must be of equal length and a series of
tests are performed to ensure the integrity of the alphabet.

The string definition as well as the keys for the codestrings dict follow this
line and are contained in the last 13 lines of the docstring:

un_ambiguous_ds_dna
|    ds_rna
|    |  ambiguous_ds_dna
|    |  |           single_stranded_dna_rna
|    |  |           |          loops_dna_rna
|    |  |           |          |          mismatched_dna_rna
|    |  |           |          |          |                  gap
|    |  |           |          |          |                  |
GATC UA RYMKSWHBVDN GATC••••U• -----AGCTU AAACCCGGGTTTUUUGCT •
|||| || ||||||||||| |||||||||| |||||||||| |||||||||||||||||| |
CTAG AU YRKMSWDVBHN ••••CTAG•U AGCTU----- ACGACTAGTCGTGCTUUU •

GATC UO RYMKSWHBVDN PEXIQFZJ$% 0123456789 !#{}&*()<>@:?[]=_; •

"""
import re
from dataclasses import dataclass

__all__ = [
    # Core alphabet dictionaries
    "basepair_dict",
    "annealing_dict",
    "annealing_dict_w_holes",
    "complement_dict_for_dscode",
    # Translation tables (str.translate, bytes.translate)
    "complement_table_for_dscode",
    "dscode_to_watson_table",
    "dscode_to_crick_table",
    "dscode_to_watson_tail_table",
    "dscode_to_crick_tail_table",
    "dscode_to_full_sequence_table",
    # Alphabet subsets
    "ds_letters",
    "ss_letters_watson",
    "ss_letters_crick",
    # Regex helpers and factories
    "iupac_compl_regex",
    "regex_ss_melt_factory",
    "regex_ds_melt_factory",
    # Data structures
    "DseqParts",
    # Public helper functions
    "get_parts",
    "dsbreaks",
    "representation_tuple",
    "anneal_strands",
]


# A single ASCII space (code point 32). It stands for an empty strand position
# once the "•" placeholders of the docstring have been replaced (see below).
emptyspace = chr(32)

# ============================================================================
# Alphabet definition extracted from module docstring
# ============================================================================

# NOTE(review): this requires the module docstring to be available at import
# time. Python's -OO option strips docstrings (``__doc__`` is then None), in
# which case the next line cannot work -- confirm that this is acceptable.
lines = __doc__.rstrip().splitlines()[-13:]  # last 13 docstring lines are read

assert not lines[-2]  # line 4 has to be empty
assert set(lines[-4]) == {" ", "|"}  # line 2 has to have pipes only.

# Each of the rows below holds one whitespace-separated group per category.
uppers = lines[-5]  # Watson letters
pipes = lines[-4]  # pipes marking base pairing
lowers = lines[-3]  # Crick letters
dscode = lines[-1]  # dscode symbols

# Make sure the four rows hold the same number of whitespace-separated groups.
# (The length of each individual group is verified further down, per category.)
assert (
    len(uppers.split())
    == len(lowers.split())
    == len(pipes.split())
    == len(dscode.split())
)

# Extract the keys from the docstring: the first lines of the 13-line window,
# one per group, with the leading "|" guide characters and blanks removed.
names = [x.strip("| ") for x in lines[: len(dscode.split())]]

# ============================================================================
# Construct the codestrings dict
# ============================================================================

# Maps category name -> five-line string (Watson, pipes, Crick, blank, symbols).
codestrings = {}

for upper, pipe, lower, code, name in zip(
    uppers.split(), pipes.split(), lowers.split(), dscode.split(), names
):
    # "•" marks an empty position in the docstring (a real blank would break
    # the whitespace based splitting above); it becomes a space here.
    codestrings[name.strip()] = f"{upper}\n{pipe}\n{lower}\n\n{code}\n".replace(
        "•", emptyspace
    )


# ============================================================================
# Define ascii letters not used in the alphabet
# ============================================================================

# ASCII characters that are reserved, i.e. never used as dscode symbols.
letters_not_in_dscode = "lL\"',-./\\^`|+~"


# ============================================================================
# The for loop below carries out a series of consistency checks on each of
# the five-line code strings built from the module docstring.
# ============================================================================

for name, codestring in codestrings.items():

    lines = codestring.splitlines()

    assert len(lines) == 5, f'codestring["{name}"] does not have 5 lines'

    # Rows in order: Watson letters, pipes, Crick letters, blank separator and
    # dscode symbols.
    watsn, pipes, crick, empty, symbl = lines

    # The fourth row only separates the base pairs from their symbols.
    assert not empty, f'codestring["{name}"] has a non-empty line 4'

    # Check so that all letters are ascii symbols.
    assert all(
        ln.isascii() for ln in (watsn, crick, symbl)
    ), f'codestring["{name}"] has non-ascii letters'

    # Verify that no character is lowercase. The test is made character by
    # character: testing whole rows with str.isalpha()/str.isupper() skips
    # every row that also holds blanks, digits or punctuation.
    assert not any(
        ch.islower() for ln in (watsn, crick, symbl) for ch in ln
    ), f'codestring["{name}"] has non-uppercase letters'

    # check so that pipes contain only "|"
    assert set(pipes) == set(
        "|"
    ), f'codestring["{name}"] has non-pipe character(s) in line 2'

    # check so strings are the same length
    assert all(
        len(ln) == len(watsn) for ln in (watsn, pipes, crick, symbl)
    ), f'codestring["{name}"] has lines of unequal length'

    # Check that the letters in the letters_not_in_dscode string
    # are not used as symbols.
    assert not any(
        letter in letters_not_in_dscode for letter in symbl
    ), f'codestring["{name}"] has chars outside alphabet'


# ============================================================================
# The `codes` dictionary is a dict of dicts containing the information of the
# code strings in the form of a dict with string names as keys, each containing
# a {tuple: string} dict with this structure:
#
#                                  (Watson letter, Crick letter): dscode symbol
# ============================================================================

codes = {}

for name, codestring in codestrings.items():

    # Only rows 1 (Watson), 3 (Crick) and 5 (symbols) carry information;
    # the pipe row and the blank row are discarded.
    watsons, _, cricks, _, symbols = lines = codestring.splitlines()

    # Column i of the three rows becomes one entry:
    # (Watson letter, Crick letter) -> dscode symbol
    codes[name] = dict(zip(zip(watsons, cricks), symbols))

# ============================================================================
# The `basepair_dict` dictionary is a merge of a subset of the `codes`dict.
# ============================================================================

# (Watson letter, Crick letter) -> dscode symbol for the merged categories.
# The "mismatched_dna_rna" and "loops_dna_rna" categories are not merged in.
basepair_dict = {
    **codes["un_ambiguous_ds_dna"],
    **codes["ambiguous_ds_dna"],
    **codes["ds_rna"],
    **codes["single_stranded_dna_rna"],
    **codes["gap"],
}


# ============================================================================
# The `annealing_dict` dictionary contain letters for single stranded
# DNA and their dscode after annealing
# ============================================================================

# The annealing_dict is constructed below. It contains the information
# needed to tell if two DNA fragments (like a and b below) can anneal.

# This of course only concerns single stranded regions.

# The dict has the form (x, y): s

# Where x and y are bases in a and b and the symbol s is the resulting dscode
# symbol for the base pair that is formed.

# The letters x and y are from the values in the
# codes["single_stranded_dna_rna"] dictionary.

# For, example: One key-value pair is ('P', 'Q'): 'G' which matches the first
# of the four new base pairings formed between a and b in the example below.

#   (a)
#   gggPEXI    (dscode for a)

#   gggGATC
#   ccc
#           aaa (b)
#       CTAGttt

#       QFZJaaa (dscode for b)


#   gggGATCaaa  (annealing product between a and b)
#   cccCTAGttt

# This loops through the base pairs where the upper or lower
# positions are empty. (w, c), s would be ("G", " "), "P"
# in the first iteration.

annealing_dict = {}

# All regular double stranded DNA and RNA base pairs:
# (Watson letter, Crick letter) -> dscode symbol
temp = {**codes["un_ambiguous_ds_dna"], **codes["ds_rna"]}

# Short alias for the single stranded part of the alphabet.
d = codes["single_stranded_dna_rna"]

for (x, y), symbol in d.items():

    # Every single stranded entry must have exactly one occupied strand.
    if emptyspace not in (x, y):
        raise ValueError("This should not happen")

    if y == emptyspace:
        # Watson-only symbol: take the first Crick base that pairs with x and
        # look up the Crick-only symbol for that base.
        other = next(b for a, b in temp if a == x)
        symbol_other = d[emptyspace, other]
        annealing_dict[symbol, symbol_other] = annealing_dict[
            symbol_other, symbol
        ] = temp[x, other]
    else:
        # Crick-only symbol: the mirror image of the case above.
        other = next(a for a, b in temp if b == y)
        symbol_other = d[other, emptyspace]
        annealing_dict[symbol, symbol_other] = annealing_dict[
            symbol_other, symbol
        ] = temp[other, y]

del d, temp

# ============================================================================
# The `annealing_dict_w_holes`contains the `annealing_dict`
# and additional key pairs where one position is empty
# ============================================================================

# Extra entries where one of the two positions is empty: the symbol that is
# present is returned unchanged.
temp = {}

for x, y in annealing_dict:
    temp.update({(x, emptyspace): x, (emptyspace, y): y})

annealing_dict_w_holes = {**annealing_dict, **temp}

del temp


# ============================================================================
# translation tables
# ============================================================================

# A collection of translation tables are a practical way to obtain Watson and Crick
# from dscode or the reverse complement strands when needed.

# These are meant to be used by the str.translate or bytes.translate methods.


# ============================================================================
# The translation table "complement_table_for_dscode" is used to obtain the
# complement of a DNA sequence in dscode format.
# ============================================================================

# dscode symbol -> symbol of the base pair with Watson and Crick swapped.
complement_dict_for_dscode = {
    symbol: basepair_dict[pair[::-1]] for pair, symbol in basepair_dict.items()
}

from_letters = "".join(complement_dict_for_dscode)
to_letters = "".join(complement_dict_for_dscode.values())

# Append the lowercase variants so that the table handles both cases.
from_letters = from_letters + from_letters.lower()
to_letters = to_letters + to_letters.lower()

complement_table_for_dscode = bytes.maketrans(
    bytes(from_letters, "ascii"), bytes(to_letters, "ascii")
)


# ============================================================================
# dscode_to_watson_table and dscode_to_crick_table
# ============================================================================

# dscode_to_watson_table and dscode_to_crick_table are used to obtain the Watson
# and (reverse) Crick strands from dscode.

# Two extra letters (placeholder1 and placeholder2) are added to the
# tables. The representation_tuple function inserts them into long single
# stranded parts and the tables turn them into the range indicator (".."
# built from `interval`) in one strand and blanks in the other.

# Parallel strings, one character per entry of basepair_dict:
#   dscode_sense  dscode symbol
#   dscode_compl  symbol of the complementary base pair
#   watson/crick  letter of the base pair in each strand
# The *_lower strings are the lowercase counterparts; they only hold entries
# for symbols that have a distinct lowercase form.
dscode_sense = ""
dscode_compl = ""
watson = ""
crick = ""
dscode_sense_lower = ""
dscode_compl_lower = ""
watson_lower = ""
crick_lower = ""

# NOTE: the loop variable `dscode` rebinds the module level name `dscode`
# (the symbol row read from the docstring). The name is kept as it was.
for (w, c), dscode in basepair_dict.items():
    dscode_sense += dscode
    dscode_compl += basepair_dict[c, w]
    watson += w
    crick += c
    dscode_lower = dscode.lower()
    if dscode_lower in dscode_sense:
        # Caseless symbol (for example "$" or the gap): lowercasing gives the
        # symbol itself, which is already part of the table.
        continue
    dscode_sense_lower += dscode_lower
    watson_lower += w.lower()
    crick_lower += c.lower()
    # Append the complement of the *current* symbol only, so that this string
    # stays parallel to dscode_sense_lower.
    dscode_compl_lower += basepair_dict[c, w].lower()

placeholder1 = "~"
placeholder2 = "+"
interval = "."

# The helper characters must never collide with a dscode symbol.
assert placeholder1 in letters_not_in_dscode
assert placeholder2 in letters_not_in_dscode
assert interval in letters_not_in_dscode

# placeholder1 -> blank and placeholder2 -> interval in the Watson strand ...
dscode_to_watson_table = bytes.maketrans(
    (dscode_sense + dscode_sense_lower + placeholder1 + placeholder2).encode("ascii"),
    (watson + watson_lower + emptyspace + interval).encode("ascii"),
)

# ... and the other way around in the Crick strand.
dscode_to_crick_table = bytes.maketrans(
    (dscode_sense + dscode_sense_lower + placeholder1 + placeholder2).encode("ascii"),
    (crick + crick_lower + interval + emptyspace).encode("ascii"),
)


# ============================================================================
# dscode_to_watson_tail_table
# ============================================================================


# Watson letter -> symbol for that letter with an empty Crick position.
watson_tail_letter_dict = {
    base: symbol
    for (base, opposite), symbol in codes["single_stranded_dna_rna"].items()
    if opposite.isspace()
}

from_letters = "".join(watson_tail_letter_dict)
to_letters = "".join(watson_tail_letter_dict.values())

# Cover lowercase input as well.
from_letters, to_letters = (
    from_letters + from_letters.lower(),
    to_letters + to_letters.lower(),
)

dscode_to_watson_tail_table = bytes.maketrans(
    bytes(from_letters, "ascii"), bytes(to_letters, "ascii")
)

# The Watson-only symbols (both cases) are kept for later use, and they start
# the reverse mapping (single stranded symbol -> letter) that becomes
# dscode_to_full_sequence_table.
five_prime_ss_letters = to_letters
from_letters_full = to_letters
to_letters_full = from_letters

# ============================================================================
# dscode_to_crick_tail_table
# ============================================================================

# Complement symbol of the Crick letter -> symbol for that Crick letter with
# an empty Watson position.
crick_tail_letter_dict = {
    complement_dict_for_dscode[base]: symbol
    for (opposite, base), symbol in codes["single_stranded_dna_rna"].items()
    if opposite.isspace()
}

from_letters = "".join(crick_tail_letter_dict)
to_letters = "".join(crick_tail_letter_dict.values())

# Cover lowercase input as well.
from_letters, to_letters = (
    from_letters + from_letters.lower(),
    to_letters + to_letters.lower(),
)

dscode_to_crick_tail_table = bytes.maketrans(
    bytes(from_letters, "ascii"), bytes(to_letters, "ascii")
)

# The Crick-only symbols (both cases) are kept for later use and are appended
# to the reverse mapping started in the Watson tail section.
three_prime_ss_letters = to_letters
from_letters_full = from_letters_full + to_letters
to_letters_full = to_letters_full + from_letters


# ============================================================================
# dscode_to_full_sequence_table
# ============================================================================

# Maps every single stranded symbol back to the letter it was derived from in
# the two tail tables above.
dscode_to_full_sequence_table = bytes.maketrans(
    bytes(from_letters_full, "ascii"), bytes(to_letters_full, "ascii")
)


# Collect lower and mixed case variants of every base pair. The case of the
# symbol follows the Watson letter; when the Watson position is empty it
# follows the Crick letter instead.
mixed_case_dict = {}

for (x, y), symbol in basepair_dict.items():
    mixed_case_dict.update(
        {
            (x.lower(), y.lower()): symbol.lower(),
            (x.lower(), y.upper()): symbol.lower(),
            (x.upper(), y.lower()): symbol.upper(),
        }
    )

    if x == emptyspace:
        # A blank has no case, so the entries above collide; set them again
        # according to the case of the Crick letter.
        mixed_case_dict.update(
            {
                (x, y.lower()): symbol.lower(),
                (x, y.upper()): symbol.upper(),
            }
        )
    if y == emptyspace:
        mixed_case_dict.update(
            {
                (x.lower(), y): symbol.lower(),
                (x.upper(), y): symbol.upper(),
            }
        )

# Merge the new entries into basepair_dict (in place).
basepair_dict.update(mixed_case_dict)

mixed_case_dict = {}

# This loop adds lower and mixed case variants of the annealing symbols. The
# case of the result follows the first symbol of the pair.
for (x, y), symbol in annealing_dict.items():
    mixed_case_dict[x.lower(), y.lower()] = symbol.lower()
    mixed_case_dict[x.lower(), y.upper()] = symbol.lower()
    mixed_case_dict[x.upper(), y.lower()] = symbol.upper()

    if x.lower() == x.upper():
        # The first symbol has no case ("$" and "%"), so the three keys above
        # collide: the upper case pair would end up with a lower case result
        # and vice versa. Let the case of the second symbol decide instead,
        # the same rule that is used for blanks in basepair_dict.
        mixed_case_dict[x, y.lower()] = symbol.lower()
        mixed_case_dict[x, y.upper()] = symbol.upper()

# Add mixed case entries to the dict
annealing_dict.update(mixed_case_dict)

# Symbols for double stranded positions, in category order.
ds_letters = "".join(
    symbol
    for category in ("un_ambiguous_ds_dna", "ds_rna", "ambiguous_ds_dna")
    for symbol in codes[category].values()
)

# Symbols for positions where only the Watson strand is present ...
ss_letters_watson = "".join(
    symbol
    for pair, symbol in codes["single_stranded_dna_rna"].items()
    if pair[1] == emptyspace
)
# ... and where only the Crick strand is present.
ss_letters_crick = "".join(
    symbol
    for pair, symbol in codes["single_stranded_dna_rna"].items()
    if pair[0] == emptyspace
)

# Each subset also contains the lowercase form of its symbols.
ds_letters = ds_letters + ds_letters.lower()
ss_letters_watson = ss_letters_watson + ss_letters_watson.lower()
ss_letters_crick = ss_letters_crick + ss_letters_crick.lower()


# ============================================================================
# iupac_compl_regex dict of regexes below cover IUPAC Ambiguity Code
# complements and is used in the amplify module.
# ============================================================================

# Maps an IUPAC nucleotide code to a regex fragment (a non-capturing group)
# that matches every single letter able to stand opposite to it: the
# complementary bases plus, for ambiguity codes, the complementary code.
iupac_compl_regex = {
    "A": "(?:T|U)",
    "C": "(?:G)",
    "G": "(?:C)",
    "T": "(?:A)",
    "U": "(?:A)",
    "R": "(?:T|C|Y)",
    "Y": "(?:G|A|R)",
    "S": "(?:G|C|S)",
    "W": "(?:A|T|W)",
    # K is G or T, so its complement is C, A or M. The three alternatives
    # must be separated: "AM" would only match the two letter string "AM".
    "K": "(?:C|A|M)",
    "M": "(?:T|G|K)",
    "B": "(?:C|G|A|V)",
    "D": "(?:A|C|T|H)",
    "H": "(?:A|G|T|D)",
    "V": "(?:T|C|G|B)",
    "N": "(?:A|G|C|T|N)",
}

# This loop adds lower and mixed case variants to annealing_dict_w_holes.
# The helper dict is rebuilt from scratch so that the result does not depend
# on what an earlier section left in it.
mixed_case_dict = {}

for (x, y), symbol in annealing_dict_w_holes.items():
    mixed_case_dict[x.lower(), y.lower()] = symbol.lower()
    mixed_case_dict[x.lower(), y.upper()] = symbol.lower()
    mixed_case_dict[x.upper(), y.lower()] = symbol.upper()

    if x.lower() == x.upper():
        # The first member has no case (an empty position, "$" or "%"), so the
        # three keys above collide: (" ", "Q") would map to "q" and (" ", "q")
        # to "Q". Let the case of the second member decide instead, the same
        # rule that is used for blanks in basepair_dict.
        mixed_case_dict[x, y.lower()] = symbol.lower()
        mixed_case_dict[x, y.upper()] = symbol.upper()

# Add mixed case entries to the dict
annealing_dict_w_holes.update(mixed_case_dict)

# ============================================================================
# DseqParts dataclass
# ============================================================================



[docs]
@dataclass
class DseqParts:
    """
    The parts of a dscode string as returned by :func:`get_parts`.

    Instances behave like a seven item tuple: they can be unpacked,
    iterated over and indexed, always in the order the fields are declared.
    """

    # Single stranded symbols at the left end (Watson-only, Crick-only).
    sticky_left5: str
    sticky_left3: str
    # Everything between the first and the last double stranded symbol.
    middle: str
    # Single stranded symbols at the right end (Watson-only, Crick-only).
    sticky_right3: str
    sticky_right5: str
    # The whole string when it only holds Watson-only / Crick-only symbols.
    single_watson: str
    single_crick: str

    def __iter__(self):
        """
        Iterate over the seven fields, which makes unpacking possible.

        >>> from pydna.alphabet import get_parts
        >>> sticky_left5, sticky_left3, middle, sticky_right3, sticky_right5, single_watson, single_crick = get_parts("eeATCGuggCCGgg")
        >>> sticky_left5
        'ee'
        >>> middle
        'ATCGuggCCGgg'
        """
        parts = (
            self.sticky_left5,
            self.sticky_left3,
            self.middle,
            self.sticky_right3,
            self.sticky_right5,
            self.single_watson,
            self.single_crick,
        )
        return iter(parts)

    def __getitem__(self, index: int) -> str:
        """
        Return the field at position ``index`` (tuple semantics).

        >>> from pydna.alphabet import get_parts
        >>> parts = get_parts("eeATCGuggCCGgg")
        >>> parts[0]
        'ee'
        >>> parts[2]
        'ATCGuggCCGgg'
        """
        as_tuple = tuple(self)
        return as_tuple[index]




[docs]
def get_parts(datastring: str) -> DseqParts:
    """
    Split a dscode string into its single and double stranded parts.

    A regular expression captures the single stranded symbols at both ends
    and everything in between. The match is anchored at the start of
    ``datastring``. The figures below show which capture group (numbered
    from 0) ends up in which field of the returned DseqParts instance.

    ::

         group 0 "sticky_left5" (Watson-only symbols)
         |
         |      group 4 "sticky_right5" (Crick-only symbols)
         |      |
        ---    ---
        GGGATCC
           TAGGTCA
           ----
             |
             group 2 "middle"



         group 1 "sticky_left3" (Crick-only symbols)
         |
         |      group 3 "sticky_right3" (Watson-only symbols)
         |      |
        ---    ---
           ATCCAGT
        CCCTAGG
           ----
             |
             group 2 "middle"



           group 5 "single_watson" (only an upper strand)
           |
        -------
        ATCCAGT
        |||||||



           group 6 "single_crick" (only a lower strand)
           |
        -------

        |||||||
        CCCTAGG

    Groups that do not take part in the match are returned as empty strings.
    For a well formed molecule some groups exclude each other:

    0 or 1, a DNA end has either a 5' or a 3' sticky end.

    2 or 5 or 6, a molecule has a ds region or is entirely single stranded.

    3 or 4, either a 3' or a 5' sticky end.

    The pattern itself does not reject a string with both kinds of symbols
    at the same end; in that case both groups are filled.

    Note that internal single stranded regions are not identified and will
    be contained in the middle part if they are present.

    Examples
    --------
    >>> from pydna.alphabet import get_parts
    >>> parts = get_parts("eeATCGuggCCGgg")
    >>> parts.sticky_left5, parts.middle
    ('ee', 'ATCGuggCCGgg')
    >>> get_parts("PEXI").single_watson
    'PEXI'

    Parameters
    ----------
    datastring : str
        A string with dscode.

    Returns
    -------
    DseqParts
        Seven string fields describing the DNA molecule. All fields are
        empty strings if the pattern does not match:
        DseqParts(sticky_left5='', sticky_left3='',
                  middle='',
                  sticky_right3='', sticky_right5='',
                  single_watson='', single_crick='')

    """
    # Character classes for the three kinds of symbols.
    watson_ss = f"[{ss_letters_watson}]"
    crick_ss = f"[{ss_letters_crick}]"
    ds = f"[{ds_letters}]"

    pattern = (
        f"({watson_ss}*)({crick_ss}*)"  # groups 0, 1: left end
        f"(?={ds})(.*)(?<={ds})"  # group 2: starts and ends with a ds symbol
        f"({watson_ss}*)({crick_ss}*)"  # groups 3, 4: right end
        f"|({watson_ss}+)"  # group 5: only an upper strand
        f"|({crick_ss}+)"  # group 6: only a lower strand
    )

    match = re.match(pattern, datastring)

    # Groups that did not participate are reported as "".
    parts = match.groups(default="") if match else ("",) * 7

    return DseqParts(*parts)




[docs]
def dsbreaks(datastring: str) -> list[str]:
    """
    Find double strand breaks in DNA in dscode format.

    A Watson-only symbol directly next to a Crick-only symbol (in either
    order) means that neither strand continues across that position, i.e.
    the DNA is discontinuous. Each such site is reported together with up to
    three symbols of context on either side.

    >>> from pydna.alphabet import dsbreaks
    >>> x, = dsbreaks("GATPFTAA")
    >>> print(x)
    [0:8]
    GATG TAA
    CTA TATT
    >>> dsbreaks("GATC")
    []

    Parameters
    ----------
    datastring : str
        A string representing DNA in dscode format.

    Returns
    -------
    list[str]
        One string per break. Each string has three lines: the ``[start:end]``
        interval of the reported chunk in ``datastring`` followed by the
        Watson and the Crick strand of that chunk.

    """
    watson_only = re.escape(five_prime_ss_letters)
    crick_only = re.escape(three_prime_ss_letters)

    pattern = (
        "(.{0,3})"  # return context if present.
        f"([{watson_only}][{crick_only}]|[{crick_only}][{watson_only}])"  # find adjacent single strand chars.
        "(.{0,3})"  # return context if present.
    )

    return [
        f"[{mobj.start()}:{mobj.end()}]\n{w}\n{c}\n"
        for mobj in re.finditer(pattern, datastring)
        for w, c in [representation_tuple(mobj.group())]
    ]




[docs]
def _abbreviate(part: str, limit: int, chunk: int, filler: str) -> str:
    """Return ``part`` with its interior replaced by two ``filler`` characters if longer than ``limit``."""
    if len(part) > limit:
        return part[:chunk] + filler * 2 + part[-chunk:]
    return part


def representation_tuple(
    datastring: str = "", length_limit_for_repr: int = 30, chunk: int = 4
) -> tuple[str, str]:
    """
    Watson and Crick strings representing a sequence of dscode symbols.

    See the pydna.alphabet module for the definition of the pydna dscode
    alphabet. The dscode has a symbol (ascii character) for base pairs
    and single stranded DNA.

    If ``datastring`` is longer than ``length_limit_for_repr``, each sticky
    end and the middle part is inspected separately. A part longer than a
    third of the limit is abbreviated to its first and last ``chunk``
    symbols with two dots in between, for example::

        Dseq(-71)
        GAAA..AATCaaaa..aaaa
                  tttt..ttttCTAA..AAAG

    Strings made up of single stranded symbols only are never abbreviated.

    Examples
    --------
    >>> from pydna.alphabet import representation_tuple
    >>> representation_tuple("GATPFTAA")
    ('GATG TAA', 'CTA TATT')

    Parameters
    ----------
    datastring : str, optional
        A string with dscode. The default is "".
    length_limit_for_repr : int, optional
        Strings longer than this are abbreviated. The default is 30.
    chunk : int, optional
        Number of symbols kept at each side of an abbreviated part.
        The default is 4.

    Returns
    -------
    tuple[str, str]
        The Watson and the Crick strand, both left aligned and with trailing
        blanks removed.

    """
    # The order of the names has to follow the field order of DseqParts: at
    # the right end the Watson-only part (sticky_right3) comes first.
    (
        sticky_left5,
        sticky_left3,
        middle,
        sticky_right3,
        sticky_right5,
        single_watson,
        single_crick,
    ) = get_parts(datastring)

    if len(datastring) > length_limit_for_repr:
        # The representation has three parts (left end, middle, right end),
        # so each part gets a third of the limit.
        part_limit = length_limit_for_repr // 3

        # The placeholders are ordinary characters while the string is still
        # dscode. The translation tables turn placeholder2 into a dot in the
        # Watson strand and a blank in the Crick strand; placeholder1 does
        # the opposite. Watson-only parts therefore use placeholder2 and
        # Crick-only parts placeholder1.
        sticky_left5 = _abbreviate(sticky_left5, part_limit, chunk, placeholder2)
        sticky_left3 = _abbreviate(sticky_left3, part_limit, chunk, placeholder1)
        # `interval` is not a dscode symbol and passes through both tables
        # unchanged, which puts the dots in both strands.
        middle = _abbreviate(middle, part_limit, chunk, interval)
        sticky_right3 = _abbreviate(sticky_right3, part_limit, chunk, placeholder2)
        sticky_right5 = _abbreviate(sticky_right5, part_limit, chunk, placeholder1)

    # The processed string that will be used to obtain a watson and crick
    # strand. Normally only one of the two parts at each end is non-empty;
    # they are concatenated (in the order they were captured) so that no
    # symbol is dropped if both are present, as in the chunks that dsbreaks
    # reports.
    processed_dscode = (
        sticky_left5 + sticky_left3 + middle + sticky_right3 + sticky_right5
    ) or (single_watson + single_crick)

    watson = processed_dscode.translate(dscode_to_watson_table).rstrip()
    crick = processed_dscode.translate(dscode_to_crick_table).rstrip()

    return watson, crick




[docs]
def regex_ss_melt_factory(length: int) -> re.Pattern:
    """
    Regular expression for short double stranded patches whose melting sheds
    a single stranded fragment.

    The returned pattern works on dscode given as bytes. It finds runs of
    1 to ``length`` double stranded symbols where the same strand is missing
    on both sides of the run (see figure below). Melting such a patch
    releases the short strand as a single stranded fragment.

    ::

        GFTTAJA   <-- dscode representing the ds DNA below.

        G TTA A   <-- "TTA" is found by the regex for length <= 3
        CTAATGT

    The run is reported in the named group ``watson`` when the Watson strand
    is the one interrupted on both sides, as above, and in the named group
    ``crick`` in the mirrored situation. The group that did not match is None.

    Examples
    --------
    >>> from pydna.dseq import Dseq
    >>> regex = regex_ss_melt_factory(3)
    >>> s = Dseq("GFTTAJA")
    >>> s
    Dseq(-7)
    G TTA A
    CTAATGT
    >>> mobj = regex.search(s._data)
    >>> mobj.groupdict()
    {'watson': b'TTA', 'crick': None}


    Parameters
    ----------
    length : int
        Max length of double stranded region flanked by single stranded
        regions.

    Returns
    -------
    re.Pattern
        Compiled regular expression object (bytes pattern).

    """
    # A run of 1..length double stranded symbols.
    ds_run = f"([{ds_letters}]{{1,{length}}})"

    # Preceded by a Crick-only symbol and followed by a symbol that is
    # neither Watson-only nor double stranded.
    watson_alt = (
        f"(?P<watson>((?<=[{ss_letters_crick}]))"
        f"{ds_run}"
        f"((?=[^{ss_letters_watson}{ds_letters}])))"
    )

    # The mirror image for the other strand.
    crick_alt = (
        f"(?P<crick>((?<=[{ss_letters_watson}]))"
        f"{ds_run}"
        f"((?=[^{ss_letters_crick}{ds_letters}])))"
    )

    pattern = f"{watson_alt}|{crick_alt}"

    return re.compile(pattern.encode("ascii"))




[docs]
def regex_ds_melt_factory(length: int) -> re.Pattern:
    """
    Regular expression for short double stranded patches whose melting
    separates the molecule into several double stranded fragments.

    The returned pattern works on dscode given as bytes. It finds runs of
    1 to ``length`` double stranded symbols where opposite strands are
    missing on the two sides of the run (see figure below). The start or the
    end of the data is accepted in place of a flanking single stranded
    symbol. Melting such a patch separates the molecule into several
    double stranded fragments.

    ::

        aaaGFTTAIAttt   <-- dscode

        aaaG TTACAttt   <-- "TTA" is found by the regex for length <= 3
        tttCTAAT Taaa

    The run is reported in one of the named groups ``watson`` or ``crick``;
    the group that did not match is None.

    Examples
    --------

    >>> from pydna.dseq import Dseq
    >>> regex = regex_ds_melt_factory(3)
    >>> s = Dseq("aaaGFTTAIAttt")
    >>> s
    Dseq(-13)
    aaaG TTACAttt
    tttCTAAT Taaa
    >>> mobj = regex.search(s._data)
    >>> mobj.groupdict()
    {'watson': None, 'crick': b'TTA'}

    Parameters
    ----------
    length : int
        Max length of double stranded region flanked by single stranded
        regions.

    Returns
    -------
    re.Pattern
        Compiled regular expression object (bytes pattern).

    """
    # Both alternatives have the same shape: a run of 1..length double
    # stranded symbols that is preceded by a symbol from `ss` (or the start)
    # and followed by a symbol that is neither in `ss` nor double stranded
    # (or the end).
    alternatives = [
        f"(?P<{group}>((?<=[{ss}])|^)"
        f"([{ds_letters}]{{1,{length}}})"
        f"((?=[^{ss}{ds_letters}])|$))"
        for group, ss in (
            ("watson", ss_letters_watson),
            ("crick", ss_letters_crick),
        )
    ]

    return re.compile("|".join(alternatives).encode("ascii"))




[docs]
def anneal_strands(strand_a: str, strand_b: str) -> bool:
    """
    Test if two DNA strands containing dscode anneal or not.

    Both strands are assumed to be given in 5' -> 3' direction. The second
    strand is complemented, translated to its Crick letters and reversed, and
    is then compared to the first strand position by position, starting at
    the 5' end of ``strand_a``. Only positions present in both strands are
    compared, since the comparison stops at the end of the shorter one.

    Examples
    --------

    >>> from pydna.alphabet import anneal_strands
    >>> a = "TTA"
    >>> b = "AAT"[::-1]
    >>> anneal_strands(a, b)
    True
    >>> anneal_strands(b, a)
    True
    >>> c = "UUA"
    >>> anneal_strands(c, b)
    True
    >>> anneal_strands(a.lower(), b)
    True
    >>> anneal_strands("TG", "AA")
    False

    Parameters
    ----------
    strand_a : str
        A single DNA strand.
    strand_b : str
        A single DNA strand.

    Returns
    -------
    bool
        True if every compared pair of letters is a key of basepair_dict.

    """
    upper_strand = strand_a.translate(dscode_to_watson_table)

    complemented = strand_b.translate(complement_table_for_dscode)
    lower_strand = complemented.translate(dscode_to_crick_table)[::-1]

    return all(pair in basepair_dict for pair in zip(upper_strand, lower_strand))