Source code for pydna.opencloning_models

# -*- coding: utf-8 -*-
"""
This module provides classes that roughly map to the `OpenCloning <https://opencloning.org>`_
data model, which is defined using `LinkML <https://linkml.io>`, and available as a python
package `opencloning-linkml <https://pypi.org/project/opencloning-linkml/>`_. These classes
are documented there, and the ones in this module essentially replace the fields pointing to
sequences and primers (which use ids in the data model) to ``Dseqrecord`` and ``Primer``
objects, respectively. Similarly, it uses Location from ``Biopython`` instead of a string,
which is what the data model uses.

When using pydna to plan cloning, it stores the provenance of ``Dseqrecord`` objects in
their ``source`` attribute. Not all methods generate sources so far, so refer to the
documentation notebooks for examples on how to use this feature. The ``history`` method of
``Dseqrecord`` objects can be used to get a string representation of the provenance of the
sequence. You can also use the ``CloningStrategy`` class to create a JSON representation of
the cloning strategy. That ``CloningStrategy`` can be loaded in the OpenCloning web interface
to see a representation of the cloning strategy.


Contributing
============

Not all fields can be readily serialized to be converted to regular types in pydantic. For
instance, the ``coordinates`` field of the ``GenomeCoordinatesSource`` class is a
``SimpleLocation`` object, or the ``input`` field of ``Source`` is a list of ``SourceInput``
objects, which can be ``Dseqrecord`` or ``Primer`` objects, or ``AssemblyFragment`` objects.
For these type of fields, you have to define a ``field_serializer`` method to serialize them
to the correct type.

"""
from __future__ import annotations

from typing import Optional, Union, Any, ClassVar, Type
from pydantic_core import core_schema
from contextlib import contextmanager
from threading import local

from pydantic import BaseModel, ConfigDict, Field, field_serializer, field_validator

from opencloning_linkml.datamodel import (
    CloningStrategy as _BaseCloningStrategy,
    DatabaseSource as _DatabaseSource,
    Primer as _PrimerModel,
    Source as _Source,
    TextFileSequence as _TextFileSequence,
    AssemblySource as _AssemblySource,
    SourceInput as _SourceInput,
    AssemblyFragment as _AssemblyFragment,
    ManuallyTypedSource as _ManuallyTypedSource,
    RestrictionAndLigationSource as _RestrictionAndLigationSource,
    GibsonAssemblySource as _GibsonAssemblySource,
    RestrictionEnzymeDigestionSource as _RestrictionEnzymeDigestionSource,
    SequenceCutSource as _SequenceCutSource,
    RestrictionSequenceCut as _RestrictionSequenceCut,
    SequenceCut as _SequenceCut,
    InFusionSource as _InFusionSource,
    OverlapExtensionPCRLigationSource as _OverlapExtensionPCRLigationSource,
    InVivoAssemblySource as _InVivoAssemblySource,
    LigationSource as _LigationSource,
    GatewaySource as _GatewaySource,
    GatewayReactionType,
    AnnotationTool,
    HomologousRecombinationSource as _HomologousRecombinationSource,
    CreLoxRecombinationSource as _CreLoxRecombinationSource,
    PCRSource as _PCRSource,
    CRISPRSource as _CRISPRSource,
    RepositoryIdSource as _RepositoryIdSource,
    UploadedFileSource as _UploadedFileSource,
    AddgeneIdSource as _AddgeneIdSource,
    AddgeneSequenceType,
    BenchlingUrlSource as _BenchlingUrlSource,
    SnapGenePlasmidSource as _SnapGenePlasmidSource,
    EuroscarfSource as _EuroscarfSource,
    WekWikGeneIdSource as _WekWikGeneIdSource,
    SEVASource as _SEVASource,
    IGEMSource as _IGEMSource,
    OpenDNACollectionsSource as _OpenDNACollectionsSource,
    GenomeCoordinatesSource as _GenomeCoordinatesSource,
    OligoHybridizationSource as _OligoHybridizationSource,
    PolymeraseExtensionSource as _PolymeraseExtensionSource,
    AnnotationSource as _AnnotationSource,
    AnnotationReport as _AnnotationReport,
    PlannotateAnnotationReport as _PlannotateAnnotationReport,
    ReverseComplementSource as _ReverseComplementSource,
    NCBISequenceSource as _NCBISequenceSource,
)
from Bio.SeqFeature import Location, LocationParserError, SimpleLocation
from Bio.Restriction.Restriction import AbstractCut
import networkx as nx
from typing import List

from Bio.SeqIO.InsdcIO import _insdc_location_string as format_feature_location

from pydna.types import CutSiteType, SubFragmentRepresentationAssembly
from pydna.utils import create_location
from typing import TYPE_CHECKING

if TYPE_CHECKING:  # pragma: no cover
    from pydna.dseqrecord import Dseqrecord
    from pydna.primer import Primer


# Thread-local storage for ID strategy
_thread_local = local()



[docs]
@contextmanager
def id_mode(use_python_internal_id: bool = True):
    """Context manager that is used to determine how ids are assigned to objects when
    mapping them to the OpenCloning data model. If ``use_python_internal_id`` is True,
    the built-in python ``id()`` function is used to assign ids to objects. That function
    produces a unique integer for each object in python, so it's guaranteed to be unique.
    If ``use_python_internal_id`` is False, the object's ``.id`` attribute
    (must be a string integer) is used to assign ids to objects. This is useful
    when the objects already have meaningful ids,
    and you want to keep references to them in ``SourceInput`` objects (which sequences and
    primers are used in a particular source).

    Parameters
    ----------
    use_python_internal_id: bool
        If True, use Python's built-in id() function.
        If False, use the object's .id attribute (must be a string integer).

    Examples
    --------
    >>> from pydna.dseqrecord import Dseqrecord
    >>> from pydna.opencloning_models import get_id, id_mode
    >>> dseqr = Dseqrecord("ATGC")
    >>> dseqr.name = "my_sequence"
    >>> dseqr.id = "123"
    >>> get_id(dseqr) == id(dseqr)
    True
    >>> with id_mode(use_python_internal_id=False):
    ...     get_id(dseqr)
    123
    """
    old_value = getattr(_thread_local, "use_python_internal_id", True)
    _thread_local.use_python_internal_id = use_python_internal_id
    try:
        yield
    finally:
        _thread_local.use_python_internal_id = old_value




[docs]
def get_id(obj: "Primer" | "Dseqrecord") -> int:
    """Get ID using the current strategy from thread-local storage (see id_mode)
    Parameters
    ----------
    obj: Primer | Dseqrecord
        The object to get the id of

    Returns
    -------
    int: The id of the object

    """
    use_python_internal_id = getattr(_thread_local, "use_python_internal_id", True)
    if use_python_internal_id:
        return id(obj)
    if not isinstance(obj.id, str) or not obj.id.isdigit():
        raise ValueError(
            f"If use_python_internal_id is False, id must be a string representing an integer, "
            f"but object {obj} has an invalid id: {obj.id}"
        )
    return int(obj.id)




[docs]
class SequenceLocationStr(str):
    """A string representation of a sequence location, genbank-like."""


[docs]
    @classmethod
    def from_biopython_location(cls, location: Location):
        return cls(format_feature_location(location, None))



[docs]
    def to_biopython_location(self) -> Location:
        return Location.fromstring(self)



[docs]
    @classmethod
    def field_validator(cls, v):
        if isinstance(v, str):
            value = cls(v)
            try:
                value.to_biopython_location()
            except LocationParserError as err:
                raise ValueError(f"Location {v!r} is not a valid location") from err
            return value
        raise ValueError(f"Location must be a string or a {cls.__name__}")


    @classmethod
    def __get_pydantic_core_schema__(
        cls,
        source_type,
        handler,
    ) -> core_schema.CoreSchema:
        """Generate Pydantic core schema for SequenceLocationStr."""
        return core_schema.with_info_after_validator_function(
            cls._validate,
            core_schema.str_schema(),
        )

    @classmethod
    def _validate(cls, value: str, info):
        """Validate and create SequenceLocationStr instance."""
        return cls.field_validator(value)


[docs]
    @classmethod
    def from_start_and_end(
        cls, start: int, end: int, seq_len: int | None = None, strand: int | None = 1
    ):
        return cls.from_biopython_location(create_location(start, end, seq_len, strand))



[docs]
    def get_ncbi_format_coordinates(self) -> str:
        """Return start, end, strand in the same format as the NCBI eutils API (1-based, inclusive)"""
        return (
            self.to_biopython_location().start + 1,
            self.to_biopython_location().end,
            self.to_biopython_location().strand,
        )





[docs]
class ConfiguredBaseModel(BaseModel):
    model_config = ConfigDict(
        validate_assignment=True,
        validate_default=True,
        extra="forbid",
        arbitrary_types_allowed=True,
        use_enum_values=True,
        strict=False,
    )
    pass




[docs]
class TextFileSequence(_TextFileSequence):


[docs]
    @classmethod
    def from_dseqrecord(cls, dseqr: "Dseqrecord"):
        return cls(
            id=get_id(dseqr),
            sequence_file_format="genbank",
            overhang_crick_3prime=dseqr.seq.ovhg,
            overhang_watson_3prime=dseqr.seq.watson_ovhg,
            file_content=dseqr.format("genbank"),
        )





[docs]
class PrimerModel(_PrimerModel):


[docs]
    @classmethod
    def from_primer(cls, primer: "Primer"):
        return cls(
            id=get_id(primer),
            name=primer.name,
            sequence=str(primer.seq),
        )





[docs]
class SourceInput(ConfiguredBaseModel):
    sequence: object

    @field_validator("sequence")
    @classmethod
    def _validate_sequence_field(cls, value: Any):
        """Separate validation to avoid circular imports."""

        from pydna.dseqrecord import Dseqrecord
        from pydna.primer import Primer

        if isinstance(value, (Dseqrecord, Primer)):
            return value
        module = type(value).__module__
        name = type(value).__name__
        raise TypeError(f"sequence must be Dseqrecord or Primer; got {module}.{name}")


[docs]
    def to_pydantic_model(self) -> _SourceInput:
        return _SourceInput(sequence=get_id(self.sequence))





[docs]
class AssemblyFragment(SourceInput):

    left_location: Optional[Location] = Field(default=None)
    right_location: Optional[Location] = Field(default=None)
    reverse_complemented: bool


[docs]
    @staticmethod
    def from_biopython_location(location: Location | None):
        if location is None:
            return None
        return SequenceLocationStr.from_biopython_location(location)



[docs]
    def to_pydantic_model(self) -> _AssemblyFragment:
        return _AssemblyFragment(
            sequence=get_id(self.sequence),
            left_location=self.from_biopython_location(self.left_location),
            right_location=self.from_biopython_location(self.right_location),
            reverse_complemented=self.reverse_complemented,
        )





[docs]
class Source(ConfiguredBaseModel):
    input: list[Union[SourceInput, AssemblyFragment]] = Field(default_factory=list)
    TARGET_MODEL: ClassVar[Type[_Source]] = _Source


[docs]
    @field_serializer("input")
    def serialize_input(
        self, input: list[Union[SourceInput, AssemblyFragment]]
    ) -> list[_SourceInput | _AssemblyFragment]:
        return [fragment.to_pydantic_model() for fragment in input]



[docs]
    def to_pydantic_model(self, seq_id: int):
        model_dict = self.model_dump()
        model_dict["id"] = seq_id
        return self.TARGET_MODEL(**model_dict)



[docs]
    def to_unserialized_dict(self):
        """
        Converts into a dictionary without serializing the fields.
        This is used to be able to recast.
        """
        return {field: getattr(self, field) for field in self.__pydantic_fields__}



[docs]
    def add_to_history_graph(self, history_graph: nx.DiGraph, seq: "Dseqrecord"):
        """
        Add the source to the history graph.

        It does not use the get_id function, because it just uses it to have unique identifiers
        for graph nodes, not to store them anywhere.
        """
        from pydna.dseqrecord import Dseqrecord

        history_graph.add_node(id(seq), label=f"{seq.name} ({repr(seq)})")
        history_graph.add_node(id(self), label=str(self.TARGET_MODEL.__name__))
        history_graph.add_edge(id(seq), id(self))
        for fragment in self.input:
            fragment_seq = fragment.sequence
            # This could be a Primer as well, which doesn't have a source
            if isinstance(fragment_seq, Dseqrecord) and fragment_seq.source is not None:
                fragment_seq.source.add_to_history_graph(history_graph, fragment_seq)
            else:
                history_graph.add_node(
                    id(fragment_seq),
                    label=f"{fragment_seq.name} ({repr(fragment_seq)})",
                )
            history_graph.add_edge(id(self), id(fragment_seq))



[docs]
    def history_string(self, seq: "Dseqrecord"):
        """
        Returns a string representation of the cloning history of the sequence.
        See dseqrecord.history() for examples.
        """
        history_graph = nx.DiGraph()
        self.add_to_history_graph(history_graph, seq)
        return "\n".join(
            nx.generate_network_text(history_graph, with_labels=True, sources=[id(seq)])
        )





[docs]
class AssemblySource(Source):
    circular: bool

    TARGET_MODEL: ClassVar[Type[_AssemblySource]] = _AssemblySource


[docs]
    @classmethod
    def from_subfragment_representation(
        cls,
        assembly: SubFragmentRepresentationAssembly,
        fragments: list["Dseqrecord"],
        is_circular: bool,
    ):

        input_list = []
        for f_index, loc1, loc2 in assembly:
            input_list.append(
                AssemblyFragment(
                    sequence=fragments[abs(f_index) - 1],
                    left_location=loc1,
                    right_location=loc2,
                    reverse_complemented=f_index < 0,
                )
            )

        return AssemblySource(input=input_list, circular=is_circular)





[docs]
class DatabaseSource(Source):
    TARGET_MODEL: ClassVar[Type[_DatabaseSource]] = _DatabaseSource

    database_id: int




[docs]
class UploadedFileSource(Source):

    TARGET_MODEL: ClassVar[Type[_UploadedFileSource]] = _UploadedFileSource

    file_name: str
    index_in_file: int
    sequence_file_format: str




[docs]
class RepositoryIdSource(Source):

    TARGET_MODEL: ClassVar[Type[_RepositoryIdSource]] = _RepositoryIdSource

    repository_id: str

    # location: Location



[docs]
class RepositoryIdSourceWithSequenceFileUrl(RepositoryIdSource):
    """
    Auxiliary class to avoid code duplication in the sources that have
    a sequence file url.
    """

    sequence_file_url: Optional[str] = None




[docs]
class AddgeneIdSource(RepositoryIdSourceWithSequenceFileUrl):
    TARGET_MODEL: ClassVar[Type[_AddgeneIdSource]] = _AddgeneIdSource

    addgene_sequence_type: Optional[AddgeneSequenceType] = None




[docs]
class BenchlingUrlSource(RepositoryIdSource):
    TARGET_MODEL: ClassVar[Type[_BenchlingUrlSource]] = _BenchlingUrlSource




[docs]
class SnapGenePlasmidSource(RepositoryIdSource):
    TARGET_MODEL: ClassVar[Type[_SnapGenePlasmidSource]] = _SnapGenePlasmidSource




[docs]
class EuroscarfSource(RepositoryIdSource):
    TARGET_MODEL: ClassVar[Type[_EuroscarfSource]] = _EuroscarfSource




[docs]
class WekWikGeneIdSource(RepositoryIdSourceWithSequenceFileUrl):
    TARGET_MODEL: ClassVar[Type[_WekWikGeneIdSource]] = _WekWikGeneIdSource




[docs]
class SEVASource(RepositoryIdSourceWithSequenceFileUrl):
    TARGET_MODEL: ClassVar[Type[_SEVASource]] = _SEVASource




[docs]
class IGEMSource(RepositoryIdSourceWithSequenceFileUrl):
    TARGET_MODEL: ClassVar[Type[_IGEMSource]] = _IGEMSource




[docs]
class OpenDNACollectionsSource(RepositoryIdSourceWithSequenceFileUrl):
    TARGET_MODEL: ClassVar[Type[_OpenDNACollectionsSource]] = _OpenDNACollectionsSource




[docs]
class NCBISequenceSource(RepositoryIdSource):
    TARGET_MODEL: ClassVar[Type[_NCBISequenceSource]] = _NCBISequenceSource
    coordinates: SimpleLocation | None = None




[docs]
class GenomeCoordinatesSource(NCBISequenceSource):
    TARGET_MODEL: ClassVar[Type[_GenomeCoordinatesSource]] = _GenomeCoordinatesSource

    assembly_accession: Optional[str] = None
    locus_tag: Optional[str] = None
    gene_id: Optional[int] = None
    coordinates: SimpleLocation


[docs]
    @field_serializer("coordinates")
    def serialize_coordinates(self, coordinates: SimpleLocation) -> str:
        return SequenceLocationStr.from_biopython_location(coordinates)





[docs]
class RestrictionAndLigationSource(AssemblySource):
    restriction_enzymes: list[AbstractCut]

    TARGET_MODEL: ClassVar[Type[_RestrictionAndLigationSource]] = (
        _RestrictionAndLigationSource
    )


[docs]
    @field_serializer("restriction_enzymes")
    def serialize_restriction_enzymes(
        self, restriction_enzymes: list[AbstractCut]
    ) -> list[str]:
        return [str(enzyme) for enzyme in restriction_enzymes]





[docs]
class GibsonAssemblySource(AssemblySource):
    TARGET_MODEL: ClassVar[Type[_GibsonAssemblySource]] = _GibsonAssemblySource




[docs]
class InFusionSource(AssemblySource):
    TARGET_MODEL: ClassVar[Type[_InFusionSource]] = _InFusionSource




[docs]
class OverlapExtensionPCRLigationSource(AssemblySource):
    TARGET_MODEL: ClassVar[Type[_OverlapExtensionPCRLigationSource]] = (
        _OverlapExtensionPCRLigationSource
    )




[docs]
class InVivoAssemblySource(AssemblySource):
    TARGET_MODEL: ClassVar[Type[_InVivoAssemblySource]] = _InVivoAssemblySource




[docs]
class LigationSource(AssemblySource):
    TARGET_MODEL: ClassVar[Type[_LigationSource]] = _LigationSource




[docs]
class GatewaySource(AssemblySource):
    TARGET_MODEL: ClassVar[Type[_GatewaySource]] = _GatewaySource
    reaction_type: GatewayReactionType
    greedy: bool = Field(default=False)




[docs]
class HomologousRecombinationSource(AssemblySource):
    TARGET_MODEL: ClassVar[Type[_HomologousRecombinationSource]] = (
        _HomologousRecombinationSource
    )




[docs]
class CRISPRSource(HomologousRecombinationSource):
    TARGET_MODEL: ClassVar[Type[_CRISPRSource]] = _CRISPRSource




[docs]
class CreLoxRecombinationSource(AssemblySource):
    TARGET_MODEL: ClassVar[Type[_CreLoxRecombinationSource]] = (
        _CreLoxRecombinationSource
    )




[docs]
class RecombinaseSource(AssemblySource):
    pass




[docs]
class PCRSource(AssemblySource):
    TARGET_MODEL: ClassVar[Type[_PCRSource]] = _PCRSource
    add_primer_features: bool = Field(default=False)




[docs]
class SequenceCutSource(Source):
    left_edge: CutSiteType | None
    right_edge: CutSiteType | None

    @property
    def TARGET_MODEL(self):
        return (
            _RestrictionEnzymeDigestionSource
            if self._has_enzyme()
            else _SequenceCutSource
        )


[docs]
    @field_serializer("left_edge", "right_edge")
    def serialize_cut_site(
        self, cut_site: CutSiteType | None
    ) -> _RestrictionSequenceCut | _SequenceCut | None:
        return self._cutsite_to_model(cut_site)


    @staticmethod
    def _cutsite_to_model(cut_site: CutSiteType | None):
        if cut_site is None:
            return None
        watson, overhang = cut_site[0]
        enzyme_or_none = cut_site[1]
        if isinstance(enzyme_or_none, AbstractCut):
            return _RestrictionSequenceCut(
                cut_watson=watson,
                overhang=overhang,
                restriction_enzyme=str(enzyme_or_none),
            )
        return _SequenceCut(cut_watson=watson, overhang=overhang)


[docs]
    @classmethod
    def from_parent(
        cls, parent: "Dseqrecord", left_edge: CutSiteType, right_edge: CutSiteType
    ):
        return cls(
            input=[SourceInput(sequence=parent)],
            left_edge=left_edge,
            right_edge=right_edge,
        )


    def _has_enzyme(self) -> bool:
        def has_enzyme(edge):
            return edge is not None and isinstance(edge[1], AbstractCut)

        return has_enzyme(self.left_edge) or has_enzyme(self.right_edge)




[docs]
class OligoHybridizationSource(Source):
    TARGET_MODEL: ClassVar[Type[_OligoHybridizationSource]] = _OligoHybridizationSource

    overhang_crick_3prime: Optional[int] = None




[docs]
class PolymeraseExtensionSource(Source):
    TARGET_MODEL: ClassVar[Type[_PolymeraseExtensionSource]] = (
        _PolymeraseExtensionSource
    )




[docs]
class AnnotationSource(Source):
    TARGET_MODEL: ClassVar[Type[_AnnotationSource]] = _AnnotationSource

    annotation_tool: AnnotationTool
    annotation_tool_version: Optional[str] = None
    annotation_report: Optional[
        list[_AnnotationReport | _PlannotateAnnotationReport]
    ] = None




[docs]
class ReverseComplementSource(Source):
    TARGET_MODEL: ClassVar[Type[_ReverseComplementSource]] = _ReverseComplementSource




[docs]
class CloningStrategy(_BaseCloningStrategy):

    # For now, we don't add anything, but the classes will not have the new
    # methods if this is used
    # It will be used for validation for now
    primers: Optional[List[PrimerModel]] = Field(
        default_factory=list,
        description="""The primers that are used in the cloning strategy""",
        json_schema_extra={
            "linkml_meta": {"alias": "primers", "domain_of": ["CloningStrategy"]}
        },
    )


[docs]
    def add_primer(self, primer: "Primer"):
        existing_ids = {seq.id for seq in self.primers}
        if get_id(primer) in existing_ids:
            return
        self.primers.append(PrimerModel.from_primer(primer))



[docs]
    def add_dseqrecord(self, dseqr: "Dseqrecord"):
        from pydna.dseqrecord import Dseqrecord

        existing_ids = {seq.id for seq in self.sequences}
        if get_id(dseqr) in existing_ids:
            return
        self.sequences.append(TextFileSequence.from_dseqrecord(dseqr))
        if dseqr.source is not None:
            self.sources.append(dseqr.source.to_pydantic_model(get_id(dseqr)))
            this_source: Source = dseqr.source
            for source_input in this_source.input:
                if isinstance(source_input.sequence, Dseqrecord):
                    self.add_dseqrecord(source_input.sequence)
                else:
                    self.add_primer(source_input.sequence)
        else:
            self.sources.append(_ManuallyTypedSource(id=get_id(dseqr), input=[]))



[docs]
    def reassign_ids(self):
        all_ids = (
            {seq.id for seq in self.sequences}
            | {source.id for source in self.sources}
            | {primer.id for primer in self.primers}
        )
        id_mappings = {id: i + 1 for i, id in enumerate(sorted(all_ids))}
        for seq in self.sequences:
            seq.id = id_mappings[seq.id]
        for primer in self.primers:
            primer.id = id_mappings[primer.id]
        for source in self.sources:
            source.id = id_mappings[source.id]
            for assembly_fragment in source.input:
                assembly_fragment.sequence = id_mappings[assembly_fragment.sequence]



[docs]
    @classmethod
    def from_dseqrecords(cls, dseqrs: list["Dseqrecord"], description: str = ""):
        cloning_strategy = cls(sources=[], sequences=[], description=description)
        for dseqr in dseqrs:
            cloning_strategy.add_dseqrecord(dseqr)
        return cloning_strategy



[docs]
    def model_dump_json(self, *args, **kwargs):
        if getattr(_thread_local, "use_python_internal_id", True):
            # Make a deep copy of the cloning strategy and reassign ids
            cs = self.__deepcopy__()
            cs.reassign_ids()
            return super(CloningStrategy, cs).model_dump_json(*args, **kwargs)
        return super().model_dump_json(*args, **kwargs)



[docs]
    def model_dump(self, *args, **kwargs):
        if getattr(_thread_local, "use_python_internal_id", True):
            cs = self.__deepcopy__()
            cs.reassign_ids()
            return super(CloningStrategy, cs).model_dump(*args, **kwargs)
        return super().model_dump(*args, **kwargs)