Source code for dnachisel.DnaOptimizationProblem.DnaOptimizationProblem

"""Define the DnaOptimizationProblem class.

DnaOptimizationProblem is where the whole problem is defined: sequence,
constraints, objectives.
"""

from Bio.SeqRecord import SeqRecord
from proglog import default_bar_logger
from ..Specification.SpecificationSet import SpecificationSet
from ..biotools import sequences_differences_array
from ..MutationSpace import MutationSpace
from ..reports.optimization_reports import (
    write_optimization_report,
    write_no_solution_report,
)
from .NoSolutionError import NoSolutionError
from . import mixins



[docs]
class DnaOptimizationProblem(
    mixins.ConstraintsSolverMixin,
    mixins.ObjectivesMaximizerMixin,
    mixins.RecordRepresentationMixin,
):
    """Problem specifications: sequence, constraints, optimization objectives.

    The original constraints, objectives, and original sequence of the problem
    are stored in the DNA problem. This class also has methods to display
    reports on the constraints and objectives, as well as solving the
    constraints and objectives.

    Examples
    --------

    >>> from dnachisel import *
    >>> problem = DnaOptimizationProblem(
    >>>     sequence = "ATGCGTGTGTGC...",
    >>>     constraints = [constraint1, constraint2, ...],
    >>>     objectives = [objective1, objective2, ...]
    >>> )
    >>> problem.resolve_constraints()
    >>> problem.optimize()
    >>> print(problem.constraints_text_summary())
    >>> print(problem.objectives_text_summary())


    Parameters
    ----------

    sequence
      A string of ATGC characters (they must be upper case!), e.g. "ATTGTGTA"

    constraints
      A list of objects of type ``Specification``.

    objectives
      A list of objects of type ``Specification`` specifying what must be
      optimized in the problem. Note that each objective has a float ``boost``
      parameter. The larger the boost, the more the objective is taken into
      account during the optimization.

    logger
      Either None for no logger, 'bar' for a tqdm progress bar logger, or
      any ProgLog progress bar logger.

    mutations_space
      A MutationSpace indicating the possible mutations. In most case the
      mutation space will be left to None and computed at problem
      initialization (which can be slightly compute-intensive), however some
      core DNA Chisel methods will create optimization problems with a provided
      mutation_space to save computing time.

    Attributes
    ----------

    randomization_threshold
      The algorithm will use an exhaustive search when the size of the mutation
      space (=the number of possible variants) is above this threshold, and
      a (guided) random search when it is above.

    max_random_iters
      When using a random search, stop after this many iterations

    mutations_per_iteration
      When using a random search, produce this many sequence mutations each
      iteration.

    optimization_stagnation_tolerance
      When using a random search, stop if the score hasn't improved in
      the last "this many" iterations

    local_extensions
      Try local resolution several times if it fails, increasing the mutable
      zone by [N1, N2...] nucleotides on each side, until resolution works.
      (by default, an extension of 0bp is tried, then 5bp.

    Notes
    -----

    The dictionary ``self.possible_mutations`` is of the form
    ``{location1 : list1, location2: list2...}``
    where ``location`` is either a single index (e.g. 10) indicating the
    position of a nucleotide to be muted, or a couple ``(start, end)``
    indicating a whole segment whose sub-sequence should be replaced.
    The ``list`` s are lists of possible sequences to replace each location,
    e.g. for the mutation of a whole codon ``(3,6): ["ATT", "ACT", "AGT"]``.
    """

    randomization_threshold = 10000
    max_random_iters = 1000
    mutations_per_iteration = 2
    optimization_stagnation_tolerance = 100
    local_extensions = (0, 5)

    def __init__(
        self,
        sequence,
        constraints=None,
        objectives=None,
        logger="bar",
        mutation_space=None,
    ):
        """Initialize"""
        if isinstance(sequence, SeqRecord):
            self.record = sequence
            self.sequence = str(sequence.seq).upper()
        else:
            self.record = None
            self.sequence = sequence.upper()
        self.constraints = [] if constraints is None else list(constraints)
        self.objectives = [] if objectives is None else list(objectives)
        self.logger = default_bar_logger(
            logger,
            bars=("objective", "constraint", "location"),
            ignored_bars=("mutation",),
            min_time_interval=0.2,
        )
        self.mutation_space = mutation_space
        self.initialize()


[docs]
    def initialize(self):
        """Precompute specification sets, evaluations, and mutation space."""

        # Find the specifications (objectives, constraints) which are actually
        #  SpecificationSets, and unpack these to complete the lists of
        # objectives and constraints.
        for specs in (self.constraints, self.objectives):
            specsets = [spec for spec in specs if isinstance(spec, SpecificationSet)]
            specs_in_sets = [
                spec for specset in specsets for spec in specset.specifications.values()
            ]
            for specset in specsets:
                specs.remove(specset)
            specs.extend(specs_in_sets)

        # INITIALIZE THE CONSTRAINTS AND OBJECTIVES

        self.constraints = [
            constraint.initialized_on_problem(self, role="constraint")
            for constraint in self.constraints
        ]
        self.objectives = [
            objective.initialized_on_problem(self, role="objective")
            for objective in self.objectives
        ]

        # INITIALIZE THE "BEFORE" CLASS ATTRIBUTES, USED IN REPORTS

        self.sequence_before = self.sequence
        self._constraints_before = None
        self._objectives_before = None

        # INITIALIZE THE MUTATION SPACE

        if self.mutation_space is None:
            self.mutation_space = MutationSpace.from_optimization_problem(self)
            # If the original sequence is outside of the allowed mutations
            # space, replace the sequence by a sequence which complies with
            # the mutation space.
            self.sequence = self.mutation_space.constrain_sequence(self.sequence)


    def _replace_sequence(self, new_sequence):
        """Replace the current sequence of the problem.

        This method is subclassed in CircularDnaOptimization problem where
        is is more complex (changing the sequence in one location changes
        it in more locations).
        """
        self.sequence = new_sequence


[docs]
    def sequence_edits_as_array(self):
        """Return an array [False, False, True...] where True indicates an edit
        (i.e. a change at this position between the original problem sequence
        and the current one)."""
        return sequences_differences_array(self.sequence, self.sequence_before)



[docs]
    def number_of_edits(self):
        """Return the number of nucleotide differences between the original
        and current sequence."""
        return self.sequence_edits_as_array().sum()



[docs]
    def optimize_with_report(
        self,
        target,
        project_name="My project",
        file_path=None,
        file_content=None,
    ):
        """Resolve constraints, optimize objectives, write a multi-file report.

        The report's content may vary depending on the optimization's success.

        Parameters
        ----------

        target
          Either a path to a folder that will contain the report, or a path to
          a zip archive, or "@memory" to return raw data of a zip archive
          containing the report.

        project_name
          Project name to write on PDF reports

        Returns
        -------

        (success, message, zip_data)
          Triplet where success is True/False, message is a one-line string
          summary indication whether some clash was found, or some solution, or
          maybe no solution was found because the random searches were too short
        """
        self.logger(message="Solving constraints")
        try:
            self.resolve_constraints()
        except NoSolutionError as error:
            self.logger(message="No solution found: making report")
            data = write_no_solution_report(
                target,
                self,
                error,
                file_path=file_path,
                file_content=file_content,
            )
            start, end, s = error.location.to_tuple()
            message = "No solution found in zone [%d, %d]: %s." % (
                start,
                end,
                str(error),
            )
            return False, message, data
        self.logger(message="Now optimizing the sequence")
        self.optimize()
        self.logger(message="Success! Generating report.")
        data = write_optimization_report(
            target,
            self,
            project_name=project_name,
            file_path=file_path,
            file_content=file_content,
        )
        return True, "Optimization successful.", data