"""Implements the SequencePattern, DnaNotationPattern classes.
These classes are responsible for looking for a pattern in a sequence
(including overlapping patterns !), separating patterns with fixed size
and patterns with maximal size (from problem localization purposes).
The module also implements functions to specify common DNA patterns:
homopolymers, repeats, enzymatic restriction sites.
"""
import re
from ..biotools import reverse_complement
from ..Location import Location
class SequencePattern:
    """Pattern that will be looked for in a DNA sequence.

    Use this class for matching regular expression patterns, and
    DnaNotationPattern for matching explicit sequences or sequences using Ns
    etc.

    Examples
    --------
    >>> expression = "A[ATGC]{3,}"
    >>> pattern = SequencePattern(expression)
    >>> constraint = AvoidPattern(pattern)

    Parameters
    ----------
    expression
      Any string or regular expression for matching ATGC nucleotides.
      Note that multi-nucleotides symbols such as "N" (for A-T-G-C), or "K"
      are not supported by this class, see DnaNotationPattern instead.

    size
      Size of the pattern, in number of characters. The value is stored
      unchanged in ``self.size`` (``None`` when not provided; this class does
      not compute a default).
      The ``size`` is used to determine the size of windows when performing
      local optimization and constraint solving.
      It can be important to provide the size when the
      ``pattern`` string provided represents a complex regular expression whose
      maximal matching size cannot be easily evaluated.

    name
      Name of the pattern (will be displayed e.g. when the pattern is printed)

    lookahead
      Strategy used to find overlapping matches. With ``"loop"`` (default)
      the compiled expression is searched repeatedly, restarting one position
      after the start of each match. With ``"re"`` the expression is wrapped
      in a regex lookahead, ``(?=(expression))``, and all matches are
      collected in a single ``finditer`` pass.

    is_palyndromic
      When True, ``find_matches`` only searches the forward strand of a
      location: a strand ``-1`` location is searched as strand ``1`` and a
      strand ``0`` location skips the reverse-complement search.
    """

    registered_string_pattern_classes = []

    def __init__(
        self,
        expression,
        size=None,
        name=None,
        lookahead="loop",
        is_palyndromic=False,
    ):
        self.expression = expression
        self.lookahead = lookahead
        if lookahead == "re":
            # Zero-width lookahead so that finditer can report overlaps.
            expression = "(?=(%s))" % expression
        if "(" not in expression:
            expression = "(%s)" % expression
        self.lookahead_expression = expression
        self.compiled_expression = re.compile(self.lookahead_expression)
        self.size = size
        self.name = name
        self.is_palyndromic = is_palyndromic

    def find_matches(self, sequence, location=None, forced_strand=None):
        """Return the locations where the sequence matches the expression.

        Parameters
        ----------
        sequence
          A string of "ATGC..."

        location
          Location indicating a segment to which to restrict
          the search. Only patterns entirely included in the segment will be
          returned

        forced_strand
          Either 1 or -1 to search only that strand of ``location`` (the
          reverse complement of the segment is searched for -1). A
          ``location`` must be provided together with ``forced_strand``.

        Returns
        -------
        matches
          A list of ``Location`` objects, one per match.
        """
        # THE FUNCTION HAS BEEN CALLED WITH A LOCATION AND A FORCED STRAND
        if forced_strand is not None:
            subsequence = sequence[location.start : location.end]
            if forced_strand == 1:
                # Shift matches found in the segment back into sequence
                # coordinates (relies on Location supporting "+ int").
                return [
                    (loc + location.start)
                    for loc in self.find_matches(subsequence)
                ]
            if forced_strand == -1:
                subsequence = reverse_complement(subsequence)
                # Coordinates found on the reverse complement are mirrored
                # around the end of the segment.
                return [
                    Location(
                        location.end - loc.end,
                        location.end - loc.start,
                        strand=-1,
                    )
                    for loc in self.find_matches(subsequence)
                ]

        # THE FUNCTION HAS BEEN CALLED WITH A LOCATION ONLY
        if location is not None:
            strand = location.strand
            if strand == 1:
                return self.find_matches(sequence, location, 1)
            if strand == -1:
                if self.is_palyndromic:
                    return self.find_matches(sequence, location, 1)
                else:
                    return self.find_matches(sequence, location, -1)
            if strand == 0:
                matches = self.find_matches(sequence, location, 1)
                if not self.is_palyndromic:
                    matches += self.find_matches(sequence, location, -1)
                return matches

        # THE FUNCTION HAS BEEN CALLED WITH NO LOCATION/STRAND: WHOLE SEQUENCE
        matches = self.find_matches_in_string(sequence)
        return [Location(start, end, strand) for start, end, strand in matches]

    def find_matches_in_string(self, sequence):
        """Return all (possibly overlapping) matches found in ``sequence``.

        Parameters
        ----------
        sequence
          A string of "ATGC..."

        Returns
        -------
        matches
          A list of ``(start, end, 1)`` tuples in ``sequence`` coordinates,
          ordered by increasing start.
        """
        if self.lookahead == "loop":
            search = self.compiled_expression.search
            sequence_length = len(sequence)
            matches = []
            position = 0
            # Searching with a start offset (instead of re-slicing the
            # string) keeps anchors and look-behinds aware of the full
            # sequence and avoids copying the string at every step. The loop
            # bound guarantees termination for expressions that can match
            # the empty string.
            while position <= sequence_length:
                result = search(sequence, position)
                if result is None:
                    break
                start, end = result.span()
                matches.append((start, end, 1))
                # Restart just after the match start to catch overlaps.
                position = start + 1
            return matches
        else:
            # The lookahead itself is zero-width: the matched text is in the
            # first capturing group.
            return [
                (match.start(), match.start() + len(match.group(1)), 1)
                for match in self.compiled_expression.finditer(sequence)
            ]

    def __str__(self):
        """Return the expression, followed by the name in brackets if any."""
        return self.expression + (
            "" if self.name is None else " (%s)" % self.name
        )

    @classmethod
    def from_string(cls, string):
        """Build a pattern from a string.

        Each class in ``registered_string_pattern_classes`` is given a chance
        to parse the string through its own ``from_string``; the first
        non-None result is returned. If none applies, the string is used as
        the expression of a plain ``SequencePattern``.
        """
        for myclass in cls.registered_string_pattern_classes:
            pattern = myclass.from_string(string)
            if pattern is not None:
                return pattern
        return SequencePattern(string)