Source code for dnachisel.builtin_specifications.EnforceGCContent

"""Implement EnforceGCContent."""

import numpy as np
import re

from ..biotools import gc_content, group_nearby_segments
from ..Location import Location
from ..Specification import Specification, SpecEvaluation



[docs]class EnforceGCContent(Specification): """Specification on the local or global proportion of G/C nucleotides. Shorthand for annotations: "gc". Examples -------- >>> # Enforce global GC content between 40 and 70 percent. >>> Specification = GCContentSpecification(0.4, 0.7) >>> # Enforce 30-80 percent local GC content over 50-nucleotides windows >>> Specification = GCContentSpecification(0.3, 0.8, window=50) Parameters ---------- mini Minimal proportion of G-C (e.g. ``0.35``) maxi Maximal proportion of G-C (e.g. ``0.75``) target Target proportion of GC (e.g. ``0.4``), which can be used instead of ``mini`` and ``maxi`` when using the specification as an optimization objective. window Length of the sliding window, in nucleotides, for local GC content. If not provided, the global GC content of the whole sequence is considered location Location objet indicating that the Specification only applies to a subsegment of the sequence. Make sure it is bigger than ``window`` if both parameters are provided """ best_possible_score = 0 locations_span = 50 # The resolution will use locations size shorthand_name = "gc" def __init__( self, mini=0, maxi=1.0, target=None, window=None, location=None, boost=1.0, ): """Initialize.""" if isinstance(mini, str): mini, maxi, target, _window = self.string_to_parameters(mini) if _window is not None: window = _window if target is not None: mini = maxi = target self.target = target self.mini = mini self.maxi = maxi self.window = window location = Location.from_data(location) if location is not None and (location.strand == -1): location = Location(location.start, location.end, 1) self.location = location self.boost = boost @staticmethod def string_to_parameters(pattern): target = None gc_pattern = r"(\d+)(|-(\d+))%($|/(\d+)bp)" match = re.match(gc_pattern, pattern) mini, _, maxi, _, window = match.groups() if maxi is None: target = mini mini = None mini, maxi, target = [ None if e is None else int(e) / 100.0 for e in (mini, maxi, target) ] window = None if window is None else int(window) return mini, maxi, target, window def initialized_on_problem(self, problem, role=None): return self._copy_with_full_span_if_no_location(problem) def evaluate(self, problem): """Return the sum of breaches extent for all windowed breaches.""" wstart, wend = self.location.start, self.location.end sequence = self.location.extract_sequence(problem.sequence) gc = gc_content(sequence, window_size=self.window) breaches = np.maximum(0, self.mini - gc) + np.maximum( 0, gc - self.maxi ) score = -breaches.sum() breaches_starts = wstart + (breaches > 0).nonzero()[0] if len(breaches_starts) == 0: breaches_locations = [] elif len(breaches_starts) == 1: if self.window is not None: start = breaches_starts[0] breaches_locations = [[start, start + self.window]] else: breaches_locations = [[wstart, wend]] else: segments = [(bs, bs + self.window) for bs in breaches_starts] groups = group_nearby_segments( segments, max_start_spread=max(1, self.locations_span) ) breaches_locations = [ (group[0][0], group[-1][-1]) for group in groups ] if breaches_locations == []: message = "Passed !" else: breaches_locations = [Location(*loc) for loc in breaches_locations] message = "Out of bound on segments " + ", ".join( [str(l) for l in breaches_locations] ) return SpecEvaluation( self, problem, score, locations=breaches_locations, message=message ) def localized(self, location, problem=None, with_righthand=True): """Localize the GC content evaluation. For a location, the GC content evaluation will be restricted to [start - window, end + window] """ # if self.location is not None: if self.window is None: # No specific location policy at the moment for non-windowed GC # content. The same specification is returned. # NOTE: this could be refined by computing # how much the local bounds should be in order for the global # sequence not get out of bound. This would be faster but much # more complicated. return self new_location = self.location.overlap_region(location) if new_location is None: return None else: extension = 0 if self.window is None else self.window - 1 extended_location = location.extended( extension, right=with_righthand ) new_location = self.location.overlap_region(extended_location) return self.copy_with_changes(location=new_location) def label_parameters(self): show_mini = self.mini is not None and (self.target is None) show_maxi = self.maxi is not None and (self.target is None) show_target = not (show_mini or show_maxi) show_window = self.window is not None return ( show_mini * [("mini", "%.2f" % self.mini)] + show_maxi * [("maxi", "%.2f" % self.maxi)] + show_target * [("target", ("%.2f" % self.target) if self.target else "")] + show_window * [("window", ("%d" % self.window) if self.window else "")] ) def short_label(self): if self.target is not None: result = "%d%% GC" % (np.round(100 * self.target)) else: result = "%d-%d%% GC" % ( np.round(100 * self.mini), np.round(100 * self.maxi), ) if self.window is not None: result += "/%dbp" % self.window return result def breach_label(self): if self.target is not None: result = "GC != %d%% " % (np.round(100 * self.target)) else: result = "GC outside %d-%d%%" % ( np.round(100 * self.mini), np.round(100 * self.maxi), ) if self.window is not None: result += "/%dbp" % self.window return result