Source code for pydna.cre_lox

# -*- coding: utf-8 -*-
from itertools import product
from pydna.dseqrecord import Dseqrecord
from Bio.Data.IUPACData import ambiguous_dna_values
from Bio.Seq import reverse_complement
from pydna.sequence_regex import compute_regex_site, dseqrecord_finditer
from Bio.SeqFeature import Location, SimpleLocation, SeqFeature
from pydna.utils import shift_location

# Reverse lookup from a group of bases to the ambiguity code that stands for
# it. Each key is the group's letters sorted and joined into one string,
# for example ambiguous_base_dict["ACGT"] -> "N".
# If two codes stand for the same group, the one that comes later in
# ambiguous_dna_values is the one kept.
ambiguous_base_dict = {
    "".join(sorted(bases)): code for code, bases in ambiguous_dna_values.items()
}

# The site templates contain the letter "N" itself, so a lookup of the
# single-letter key "N" must also resolve to "N".
ambiguous_base_dict["N"] = "N"

# Original loxP sequence, kept here for reference.
# Every literal in this section is written as three adjacent pieces that
# Python joins into one string: a 13-letter flank, the 8-letter core
# (indices 13-20, the slice compared in cre_loxP_overlap) and a second
# 13-letter flank.
LOXP_SEQUENCE = "ATAACTTCGTATA" "GCATACAT" "TATACGAAGTTAT"

# Site templates; "N" marks positions where any base is accepted.
loxP_sequences = [
    # https://blog.addgene.org/plasmids-101-cre-lox
    # loxP
    "ATAACTTCGTATA" "NNNTANNN" "TATACGAAGTTAT",
    # PMID:12202778
    # lox66 (differs from loxP in the last five letters)
    "ATAACTTCGTATA" "NNNTANNN" "TATACGAACGGTA",
    # lox71 (differs from loxP in the first five letters)
    "TACCGTTCGTATA" "NNNTANNN" "TATACGAAGTTAT",
]

# Collapse the site templates into one consensus string, column by column:
# the distinct letters found at a position are sorted, joined and looked up
# in ambiguous_base_dict to get the single code that represents them.
loxP_consensus = "".join(
    ambiguous_base_dict["".join(sorted(set(column)))]
    for column in zip(*loxP_sequences)
)

# Patterns for the consensus site: index 0 is built from the consensus as
# written, index 1 from its reverse complement.
loxP_regex = tuple(
    compute_regex_site(site)
    for site in (loxP_consensus, reverse_complement(loxP_consensus))
)


def cre_loxP_overlap(
    x: Dseqrecord, y: Dseqrecord, _l: None = None
) -> list[tuple[int, int, int]]:
    """Find matching loxP sites between two sequences.

    Both records are scanned with every pattern in ``loxP_regex``. A site
    found in ``x`` is paired with a site found in ``y`` when both were found
    by the same pattern and their 8-character cores (indices 13-20 of the
    matched text) are identical.

    Parameters
    ----------
    x, y : Dseqrecord
        The two sequences to search.
    _l : None
        Not used by this function.

    Returns
    -------
    list[tuple[int, int, int]]
        One ``(core_start_in_x, core_start_in_y, 8)`` tuple per distinct
        pair, where each start is the match start plus 13. Duplicates are
        removed and the order of first appearance is kept.
    """
    # Slice of a matched site that has to agree between the two sequences.
    core_start, core_end = 13, 21
    core_length = core_end - core_start

    overlaps = []
    for pattern in loxP_regex:
        hits_x = dseqrecord_finditer(pattern, x)
        hits_y = dseqrecord_finditer(pattern, y)
        for hit_x, hit_y in product(hits_x, hits_y):
            # NOTE(review): this comparison is case-sensitive. If the
            # patterns match case-insensitively, two sites that differ only
            # in letter case will not be paired -- confirm that is intended.
            if hit_x.group()[core_start:core_end] == hit_y.group()[core_start:core_end]:
                overlaps.append(
                    (
                        hit_x.start() + core_start,
                        hit_y.start() + core_start,
                        core_length,
                    )
                )

    # dict keys are unique and keep insertion order, so this removes
    # duplicates in one pass while preserving first-seen order.
    return list(dict.fromkeys(overlaps))
# Named lox site templates; "N" marks positions where any base is accepted.
# Each value is written as flank + core + flank (13 + 8 + 13 letters) using
# adjacent string literals, which Python joins into a single string.
loxP_dict = {
    "loxP": "ATAACTTCGTATA" "NNNTANNN" "TATACGAAGTTAT",
    "lox66": "ATAACTTCGTATA" "NNNTANNN" "TATACGAACGGTA",
    "lox71": "TACCGTTCGTATA" "NNNTANNN" "TATACGAAGTTAT",
    # First flank as in lox71, second flank as in lox66.
    "loxP_mutant": "TACCGTTCGTATA" "NNNTANNN" "TATACGAACGGTA",
}
# The inner "str | None" is quoted so the annotation does not need the
# union-operator syntax to be evaluated when the module is imported.
def get_regex_dict(
    original_dict: dict[str, str],
) -> dict[str, dict[str, "str | None"]]:
    """Get the regex dictionary for the original dictionary.

    Parameters
    ----------
    original_dict : dict[str, str]
        Maps a site name to its consensus sequence.

    Returns
    -------
    dict
        Maps each site name to a dict with two keys:

        * ``"forward_regex"``: ``compute_regex_site`` applied to the
          consensus sequence.
        * ``"reverse_regex"``: ``compute_regex_site`` applied to the reverse
          complement of the consensus sequence, or ``None`` when the
          sequence equals its own reverse complement (palindromic site).
    """
    regex_dict = {}
    for site, consensus_seq in original_dict.items():
        # Computed once and reused for both the palindrome check and the
        # reverse pattern.
        rc_seq = reverse_complement(consensus_seq)
        is_palindromic = consensus_seq == rc_seq
        regex_dict[site] = {
            "forward_regex": compute_regex_site(consensus_seq),
            "reverse_regex": None if is_palindromic else compute_regex_site(rc_seq),
        }
    return regex_dict
def find_loxP_sites(seq: Dseqrecord) -> dict[str, list[Location]]:
    """Find all loxP sites in a sequence, grouped by site name.

    Every site template in ``loxP_dict`` is searched in ``seq`` in both
    orientations.

    Parameters
    ----------
    seq : Dseqrecord
        The sequence to search.

    Returns
    -------
    dict[str, list[Location]]
        Maps a site name to the locations where it was found. Matches of the
        forward pattern get strand ``1``, matches of the reverse pattern get
        strand ``-1``. Site names without any match are not included.
    """
    patterns_by_site = get_regex_dict(loxP_dict)
    # Pattern key paired with the strand assigned to its matches.
    orientations = (("forward_regex", 1), ("reverse_regex", -1))

    found = {}
    for site_name in loxP_dict:
        for regex_key, strand in orientations:
            regex = patterns_by_site[site_name][regex_key]
            if regex is None:
                # Palindromic sequences have no reverse complement pattern.
                continue
            for hit in dseqrecord_finditer(regex, seq):
                # Zero shift: presumably normalises matches that run past the
                # end of the sequence -- confirm against shift_location.
                location = shift_location(
                    SimpleLocation(hit.start(), hit.end(), strand), 0, len(seq)
                )
                found.setdefault(site_name, []).append(location)
    return found
def annotate_loxP_sites(seq: Dseqrecord) -> Dseqrecord:
    """Add a ``protein_bind`` feature for every loxP site found in ``seq``.

    Sites are located with ``find_loxP_sites``. Each new feature is labelled
    with the site name. A site is skipped when ``seq`` already has a
    ``protein_bind`` feature with the same location and the same label.

    Parameters
    ----------
    seq : Dseqrecord
        The sequence to annotate; its ``features`` list is extended in place.

    Returns
    -------
    Dseqrecord
        The same ``seq`` object that was passed in.
    """
    for site_name, locations in find_loxP_sites(seq).items():
        expected_label = [site_name]
        for location in locations:
            # Don't add the same feature twice
            already_annotated = False
            for feature in seq.features:
                if (
                    feature.location == location
                    and feature.type == "protein_bind"
                    and feature.qualifiers.get("label", []) == expected_label
                ):
                    already_annotated = True
                    break
            if already_annotated:
                continue

            # A fresh label list per feature so features never share one.
            seq.features.append(
                SeqFeature(
                    location, type="protein_bind", qualifiers={"label": [site_name]}
                )
            )
    return seq