# Source code for dnachisel.builtin_specifications.codon_optimization.HarmonizeRCA

import numpy as np

from ...Specification.SpecEvaluation import SpecEvaluation
from .BaseCodonOptimizationClass import BaseCodonOptimizationClass



# [docs]
class HarmonizeRCA(BaseCodonOptimizationClass):
    """Codon-Harmonize a native sequence for a new host (Claassens method).

    This specification will optimize a Sequence 1 from Host 1 into a Sequence
    2 for target Host 2.

    In short, rare Host 1 codons will be replaced by rare Host 2 codons, and
    high-frequency Host 1 codons will get replaced by codons that are
    high-frequency in Host 2.

    More specifically, each codon along Sequence 1 gets replaced by the codon
    whose Relative Codon Adaptiveness (RCA) in Host 2 is the closest to the
    RCA of the original codon in Host 1. A codon's RCA in a given organism is
    defined by f/fmax where f is the codon's frequency in the organism and fmax
    is the highest frequency of all synonymous codons.

    The minimized quantity is sum_i abs(RCA(c_i, H1) - RCA(c'_i, H2))
    where c_i, c'_i represent the i-th codon before and after optimization.

    This method is taken from Claassens 2017, where they simplify a previous
    algorithm (Angov 2008), which was much more complicated as it involved
    predicting "ribosome pausing" sites in the sequence.

    Warning: always use this specification with an EnforceTranslation constraint
    defined over the same location, to preserve the amino acid sequence.


    Parameters
    ----------
    species
      Name or TaxID of the species for which to optimize the sequence. A custom
      codon_usage_table can be provided instead (or in addition, for species
      names whose codon usage table cannot be imported). A string of the form
      ``"original -> target"`` is also accepted, in which case it sets both
      ``original_species`` and ``species``.

    codon_usage_table
      Optional - can be provided instead of ``species``. A dict of the form
      ``{'*': {"TGA": 0.112, "TAA": 0.68}, 'K': ...}`` giving the codon
      frequency table (relative usage of each codon; frequencies add up to 1,
      separately for each amino acid).

    original_species
      Name or TaxID of the species the original sequence was taken from. This
      information will be used to spot codons which are supposed to be rare
      or common. A codon_usage_table can be provided instead (or in addition,
      for species names whose codon usage table cannot be imported).

    original_codon_usage_table
      A dict of the form ``{'*': {"TGA": 0.112, "TAA": 0.68}, 'K': ...}``
      giving the codon frequency table (relative usage of each codon;
      frequencies add up to 1, separately for each amino acid).

    location
      Location on which the specification applies

    boost
      Score multiplicator (=weight) for when the specification is used as an
      optimization objective alongside competing objectives.


    References
    ----------
    Claassens et al., Improving heterologous membrane protein
    production in Escherichia coli by combining transcriptional tuning and
    codon usage algorithms. PLOS One, 2017
    """

    shorthand_name = "harmonize_rca"

    def __init__(
        self,
        species=None,
        codon_usage_table=None,
        original_species=None,
        original_codon_usage_table=None,
        location=None,
        boost=1,
    ):
        # Shorthand notation: "original -> target" packed in ``species``.
        if isinstance(species, str) and "->" in species:
            original_species, species = species.split("->")
            species = species.strip()
            original_species = original_species.strip()
        BaseCodonOptimizationClass.__init__(
            self,
            species=species,
            codon_usage_table=codon_usage_table,
            location=location,
            boost=boost,
        )
        self.codons_synonyms = self.get_codons_synonyms()
        self.original_species = original_species
        self.original_codon_usage_table = self.get_codons_table(
            original_species, original_codon_usage_table
        )
        # The RCA lookup is stored on each table under the "RCA" key, and is
        # only computed when the table does not already carry one.
        for table in (self.codon_usage_table, self.original_codon_usage_table):
            if "RCA" not in table:
                table["RCA"] = self._compute_rca_table(table)

    @staticmethod
    def _compute_rca_table(table):
        """Return ``{codon: f / fmax}`` computed from a codon usage table.

        Only entries whose key is a single character (amino-acid letters and
        ``'*'``) are considered; ``fmax`` is the highest frequency among the
        codons listed under the same key.
        """
        rca = {}
        for aa, codons_frequencies in table.items():
            if len(aa) != 1:
                continue
            # Computed once per amino acid rather than once per codon.
            max_frequency = max(codons_frequencies.values())
            for codon, frequency in codons_frequencies.items():
                rca[codon] = frequency / max_frequency
        return rca

    def initialized_on_problem(self, problem, role):
        """Return a copy of the spec bound to the problem's initial codons.

        The codons read from ``problem`` at this point are stored as
        ``original_codons``. For each of them, the smallest discrepancy that
        any synonymous codon can reach is precomputed so that ``evaluate`` can
        tell which positions can still be improved.
        """
        new_spec = self._copy_with_full_span_if_no_location(problem)
        new_spec.original_codons = new_spec.get_codons(problem)
        rca = new_spec.codon_usage_table["RCA"]
        rca_o = new_spec.original_codon_usage_table["RCA"]
        # For each original codon, compare ITS RCA in the original host with
        # the RCA in the target host of every candidate replacement. The
        # reference is rca_o[codon], not rca_o[candidate]. The operand order
        # matches the one used in ``evaluate``.
        new_spec.smallest_possible_discrepancies = [
            min(
                abs(rca_o[codon] - rca[candidate])
                for candidate in self.codons_synonyms[codon]
            )
            for codon in new_spec.original_codons
        ]
        return new_spec

    def evaluate(self, problem):
        """Score the current codons against the original codons' RCA.

        The score is ``-sum_i abs(RCA_original(c_i) - RCA_target(c'_i))``
        where ``c_i`` are the stored original codons and ``c'_i`` the codons
        currently found in ``problem``. Returns a ``SpecEvaluation``.
        """
        codons = self.get_codons(problem)
        target_rca = self.codon_usage_table["RCA"]
        original_rca = self.original_codon_usage_table["RCA"]

        if len(codons) == 1:
            # Single-codon window: no arrays needed.
            rca_codon = target_rca[codons[0]]
            rca_original = original_rca[self.original_codons[0]]
            score = -abs(rca_codon - rca_original)
            return SpecEvaluation(
                self,
                problem,
                score=score,
                locations=[] if (score == 0) else [self.location],
                message="Codon harmonization on window %s scored %.02E"
                % (self.location, score),
            )

        rca_in_original_species = np.array(
            [original_rca[codon] for codon in self.original_codons]
        )
        rca_in_target_species = np.array(
            [target_rca[codon] for codon in codons]
        )
        discrepancies = abs(rca_in_original_species - rca_in_target_species)
        # A position is reported only if its discrepancy differs from the
        # best one achievable there, i.e. a better synonymous codon exists.
        non_optimality = (
            np.array(self.smallest_possible_discrepancies) - discrepancies
        )
        nonoptimal_indices = np.nonzero(non_optimality)[0]
        locations = self.codons_indices_to_locations(nonoptimal_indices)
        score = -discrepancies.sum()
        return SpecEvaluation(
            self,
            problem,
            score=score,
            locations=locations,
            message="Codon harmonization on %s scored %.02E"
            % (self.location, score),
        )

    def label_parameters(self):
        """Return the parameters shown in the specification's label."""
        if self.species is None:
            return ["(custom table)"]
        # The original species may be None when only a custom original table
        # was provided, so fall back to a placeholder instead of failing on
        # ``None + str``. %s formatting also accepts integer TaxIDs.
        if self.original_species is None:
            original_species = "custom table"
        else:
            original_species = self.original_species
        return ["%s -> %s" % (original_species, self.species)]

    def short_label(self):
        """Return a label of the form ``harmonize-rca (original -> target)``.

        A species that is ``None`` is shown as ``custom table``.
        """
        if self.species is not None:
            target_species = self.species
        else:
            target_species = "custom table"
        if self.original_species is not None:
            original_species = self.original_species
        else:
            original_species = "custom table"
        # %s formatting so that integer TaxIDs do not break the label.
        return "harmonize-rca (%s -> %s)" % (original_species, target_species)