Source code for dnachisel.builtin_specifications.EnforceSequence

"""Implement EnforceSequence (DO NOT USE YET: Work in progress, stabilizing)"""

# TODO: factorize with self.sequence ?

import numpy as np

from ..Specification import Specification, SpecEvaluation
from ..Location import Location
from ..biotools import group_nearby_indices, reverse_complement, IUPAC_NOTATION


[docs]class EnforceSequence(Specification): """Enforces a (possibly degenerate) sequence at some location. Shorthand for annotations: "sequence". Parameters ---------- sequence An ATGC string representing the wanted sequence, possibly degenerated, for instance ATTCGCGTYTTKWNAA location Location of the DNA segment on which to enforce the pattern e.g. ``Location(10, 45, 1)`` or simply ``(10, 45, 1)`` """ localization_interval_length = 6 # used when optimizing best_possible_score = 0 enforced_by_nucleotide_restrictions = True shorthand_name = "sequence" def __init__(self, sequence=None, location=None, boost=1.0): """Initialize.""" self.sequence = sequence self.location = Location.from_data(location) self.boost = boost def initialized_on_problem(self, problem, role): """Find out what sequence it is that we are supposed to conserve.""" return self._copy_with_full_span_if_no_location(problem) # if self.location is None: # result = self.copy_with_changes() # result.location = Location(0, len(problem.sequence), 1) # return result # else: # return self def evaluate(self, problem): """Return a score equal to -number_of modifications. Locations are "binned" modifications regions. Each bin has a length in nucleotides equal to ``localization_interval_length`.` """ sequence = self.location.extract_sequence(problem.sequence) discrepancies = np.array( [ i for i, nuc in enumerate(sequence) if nuc not in IUPAC_NOTATION[self.sequence[i]] ] ) if self.location.strand == -1: discrepancies = self.location.end - discrepancies else: discrepancies = discrepancies + self.location.start intervals = [ (r[0], r[-1] + 1) for r in group_nearby_indices( discrepancies, max_group_spread=self.localization_interval_length ) ] locations = [Location(start, end, 1) for start, end in intervals] return SpecEvaluation( self, problem, score=-len(discrepancies), locations=locations ) def localized(self, location, problem=None): """Localize the spec to the overlap of its location and the new.""" start, end = location.start, location.end new_location = self.location.overlap_region(location) if new_location is None: return None else: if self.location.strand == -1: start = self.location.end - new_location.end end = self.location.end - new_location.start else: start = new_location.start - self.location.start end = new_location.end - self.location.start new_sequence = self.sequence[start:end] return self.copy_with_changes(location=new_location, sequence=new_sequence) def restrict_nucleotides(self, sequence, location=None): """When localizing, forbid any nucleotide but the one already there.""" if location is not None: new_location = self.location.overlap_region(location) if new_location is None: return [] else: new_location = self.location start, end = new_location.start, new_location.end if self.location.strand == -1: lend = self.location.end return [ ( i, set( reverse_complement(n) for n in IUPAC_NOTATION[self.sequence[lend - i - 1]] ), ) for i in range(start, end) ] else: lstart = self.location.start return [ (i, IUPAC_NOTATION[self.sequence[i - lstart]]) for i in range(start, end) ] def __repr__(self): """Represent.""" return "EnforceSequence(%s)" % str(self.location) def __str__(self): """Represent.""" return "EnforceSequence(%s)" % str(self.location)