Source code for pydna.cre_lox

# -*- coding: utf-8 -*-
from itertools import product
from pydna.dseqrecord import Dseqrecord
from Bio.Data.IUPACData import ambiguous_dna_values
from Bio.Seq import reverse_complement
from pydna.sequence_regex import compute_regex_site, dseqrecord_finditer
from Bio.SeqFeature import Location, SimpleLocation, SeqFeature
from pydna.utils import shift_location

# Reverse lookup for the IUPAC ambiguity codes: each key is the sorted string
# of bases a code stands for, each value is the code itself. If two codes
# expand to the same bases, the one listed later in ambiguous_dna_values wins.
# For example, ambiguous_base_dict['ACGT'] -> 'N'
ambiguous_base_dict = {
    "".join(sorted(bases)): ambiguous
    for ambiguous, bases in ambiguous_dna_values.items()
}

# A position that is already "N" produces the key "N", so map it to itself
ambiguous_base_dict["N"] = "N"

# The original loxP sequence, kept here for reference
LOXP_SEQUENCE = "ATAACTTCGTATAGCATACATTATACGAAGTTAT"

loxP_sequences = [
    # https://blog.addgene.org/plasmids-101-cre-lox
    # loxP
    "ATAACTTCGTATANNNTANNNTATACGAAGTTAT",
    # PMID:12202778
    # lox66
    "ATAACTTCGTATANNNTANNNTATACGAACGGTA",
    # lox71
    "TACCGTTCGTATANNNTANNNTATACGAAGTTAT",
]

# Collapse the patterns column by column: the distinct letters seen at each
# position are sorted, joined and looked up in ambiguous_base_dict, which
# yields one ambiguity code per position.
loxP_consensus = "".join(
    ambiguous_base_dict["".join(sorted(set(column)))]
    for column in zip(*loxP_sequences)
)

# Regex for the consensus as written, followed by the regex for its
# reverse complement
loxP_regex = tuple(
    compute_regex_site(site)
    for site in (loxP_consensus, reverse_complement(loxP_consensus))
)



[docs]
def cre_loxP_overlap(
    x: Dseqrecord, y: Dseqrecord, _l: None = None
) -> list[tuple[int, int, int]]:
    """Find matching loxP sites between two sequences.

    Both sequences are searched with each pattern in ``loxP_regex``. Every
    match in ``x`` is paired with every match in ``y`` found by the same
    pattern, and a pair is kept when the 8 characters at offsets 13-20 of the
    two matched strings are identical (the ``NNNTANNN`` stretch of the site
    patterns defined in this module).

    ``_l`` is never read by this function; presumably it is kept so the
    signature matches other overlap functions - TODO confirm.

    Returns a list of ``(start_in_x + 13, start_in_y + 13, 8)`` tuples,
    without duplicates, in the order they were first found.
    """
    core_offset = 13
    core_length = 8
    core = slice(core_offset, core_offset + core_length)

    hits = []
    for regex in loxP_regex:
        # product() exhausts both match iterables before pairing them
        pairs = product(dseqrecord_finditer(regex, x), dseqrecord_finditer(regex, y))
        hits.extend(
            (hit_x.start() + core_offset, hit_y.start() + core_offset, core_length)
            for hit_x, hit_y in pairs
            if hit_x.group()[core] == hit_y.group()[core]
        )

    # dict keys keep insertion order, so this removes repeats but keeps order
    return list(dict.fromkeys(hits))



# Named site patterns searched by find_loxP_sites. Each pattern is 34
# characters: a 13-character left arm, the 8-character "NNNTANNN" stretch and
# a 13-character right arm. The N positions are presumably expanded to
# "any base" by compute_regex_site - confirm against pydna.sequence_regex.
loxP_dict = {
    "loxP": "ATAACTTCGTATANNNTANNNTATACGAAGTTAT",
    # Same as loxP except for the last 5 characters of the right arm
    "lox66": "ATAACTTCGTATANNNTANNNTATACGAACGGTA",
    # Same as loxP except for the first 5 characters of the left arm
    "lox71": "TACCGTTCGTATANNNTANNNTATACGAAGTTAT",
    # Left arm of lox71 combined with the right arm of lox66
    "loxP_mutant": "TACCGTTCGTATANNNTANNNTATACGAACGGTA",
}



[docs]
def get_regex_dict(original_dict: dict[str, str]) -> dict[str, dict]:
    """Get the regex dictionary for the original dictionary.

    ``original_dict`` maps a site name to its consensus sequence.

    The result maps each site name to a dictionary with two keys:

    - ``"forward_regex"``: ``compute_regex_site`` applied to the consensus.
    - ``"reverse_regex"``: ``compute_regex_site`` applied to the reverse
      complement of the consensus, or ``None`` when the consensus equals its
      own reverse complement (a palindromic site), so the same pattern is
      not searched twice.

    Entries are in the same order as ``original_dict``.
    """
    regex_dict = {}
    for site, consensus_seq in original_dict.items():
        # Computed once and reused for the palindrome test and the regex
        rc_seq = reverse_complement(consensus_seq)
        is_palindromic = rc_seq == consensus_seq
        regex_dict[site] = {
            "forward_regex": compute_regex_site(consensus_seq),
            "reverse_regex": None if is_palindromic else compute_regex_site(rc_seq),
        }
    return regex_dict




[docs]
def find_loxP_sites(seq: Dseqrecord) -> dict[str, list[Location]]:
    """Find all loxP sites in a sequence, grouped by site name.

    Each pattern in ``loxP_dict`` is searched in ``seq``, first with its
    forward regex (strand 1) and then with its reverse regex (strand -1).
    Sites without any match are left out of the result.

    Returns a dictionary mapping the site name to the list of locations
    found, forward-strand matches first.
    """
    # Regex key paired with the strand given to the locations it produces
    orientations = (("forward_regex", 1), ("reverse_regex", -1))

    site_regexes = get_regex_dict(loxP_dict)
    found = {}
    for site in loxP_dict:
        for regex_key, strand in orientations:
            regex = site_regexes[site][regex_key]
            # Palindromic sequences have no reverse regex
            if regex is None:
                continue
            for hit in dseqrecord_finditer(regex, seq):
                location = SimpleLocation(hit.start(), hit.end(), strand)
                # NOTE(review): presumably wraps a location that runs past
                # the end of a circular sequence - confirm in pydna.utils
                location = shift_location(location, 0, len(seq))
                found.setdefault(site, []).append(location)
    return found




[docs]
def annotate_loxP_sites(seq: Dseqrecord) -> Dseqrecord:
    """Add a feature to ``seq`` for every loxP site found in it.

    Each location returned by ``find_loxP_sites`` becomes a ``protein_bind``
    feature whose ``label`` qualifier is the site name. A location is skipped
    when ``seq`` already holds a feature with the same location, type and
    label.

    ``seq.features`` is modified in place and the same ``seq`` object is
    returned.
    """
    for name, locations in find_loxP_sites(seq).items():
        for location in locations:
            # Don't add the same feature twice
            already_annotated = False
            for feature in seq.features:
                if (
                    feature.location == location
                    and feature.type == "protein_bind"
                    and feature.qualifiers.get("label", []) == [name]
                ):
                    already_annotated = True
                    break
            if already_annotated:
                continue
            new_feature = SeqFeature(
                location, type="protein_bind", qualifiers={"label": [name]}
            )
            seq.features.append(new_feature)
    return seq