Source code for pydna.sequence_regex
# -*- coding: utf-8 -*-
import re
from collections.abc import Iterator

from Bio.Data.IUPACData import ambiguous_dna_values

from pydna.dseqrecord import Dseqrecord
# Copy of Biopython's ``ambiguous_dna_values`` with the entries for the four
# plain bases (A, C, G, T) removed, leaving only the remaining codes.
custom_ambiguous_only_dna_values = dict(ambiguous_dna_values)
for normal_base in "ACGT":
    custom_ambiguous_only_dna_values.pop(normal_base)
[docs]
def compute_regex_site(site: str) -> str:
    """
    Build a regex pattern from a string that may contain degenerate bases.

    The input is upper-cased, then every key of
    ``custom_ambiguous_only_dna_values`` whose value has more than one
    element is replaced by a character class listing those elements.

    Args:
        site: The string to convert to a regex pattern.

    Returns:
        The regex pattern, prefixed with the ``(?i)`` inline flag so that it
        matches case-insensitively.
    """
    pattern = site.upper()
    for code, bases in custom_ambiguous_only_dna_values.items():
        if len(bases) <= 1:
            # A single-valued entry needs no character class.
            continue
        char_class = f"[{''.join(bases)}]"
        pattern = pattern.replace(code, char_class)
    # Make case insensitive
    return f"(?i){pattern}"
[docs]
def dseqrecord_finditer(pattern: str, seq: Dseqrecord) -> Iterator[re.Match]:
    """
    Find the matches of a regex pattern in a Dseqrecord.

    For a circular sequence the search runs on the sequence concatenated
    with itself, so that matches spanning the origin are found. Only matches
    that start inside the first copy are returned, so each start position on
    the circle is reported at most once. Match coordinates refer to the
    searched string, so a match spanning the origin has an ``end()`` greater
    than ``len(seq)``.

    Args:
        pattern: The regex pattern to search for.
        seq: The Dseqrecord to search in.

    Returns:
        A lazy iterator (not a list) over the ``re.Match`` objects, in the
        order produced by ``re.finditer``.
    """
    sequence = str(seq.seq)
    if seq.circular:
        query = sequence * 2
        # Index len(seq) of the doubled string is the origin again; accepting
        # it would report a match starting at position 0 a second time.
        last_start = len(seq) - 1
    else:
        query = sequence
        # start == len(seq) is kept so that a zero-width match at the very
        # end of a linear sequence (e.g. "$") is still reported.
        last_start = len(seq)
    # re.finditer is called eagerly here, so an invalid pattern raises at
    # call time rather than on first iteration.
    return (m for m in re.finditer(pattern, query) if m.start() <= last_start)