from Bio.Seq import Seq
from Bio import motifs
from Bio.Align.AlignInfo import PSSM
from .SequencePattern import SequencePattern
import numpy as np
[docs]class MotifPssmPattern(SequencePattern):
"""Motif pattern represented by a Position-Specific Scoring Matrix (PSSM).
This class is better instantiated by creating a pattern from a set of
sequence strings with ``MotifPssmPattern.from_sequences()``, or reading
pattern(s) for a file with ``MotifPssmPattern.from_file()``.
Parameters
----------
pssm
A Bio.Align.AlignInfo.PSSM object.
threshold
locations of the sequence with a PSSM score above this value will be
considered matches. For convenience, a relative_threshold can be
given instead.
relative_threshold
Value between 0 and 1 from which the threshold will be auto-computed.
0 means "match everything", 1 means "only match the one (or several)
sequence(s) with the absolute highest possible score".
"""
def __init__(
self, pssm, threshold=None, relative_threshold=None,
):
self.name = pssm.name
self.pssm = pssm.pssm
if relative_threshold is not None:
mini, maxi = self.pssm.min, self.pssm.max
threshold = mini + relative_threshold * (maxi - mini)
self.threshold = threshold
self.relative_threshold = relative_threshold
self.size = pssm.length
self.pssm_matrix = np.array([self.pssm[n] for n in "ATGC"])
self.is_palyndromic = False
[docs] @classmethod
def apply_pseudocounts(cls, motif, pseudocounts):
"""Add pseudocounts to the motif's pssm matrix.
Add nothing if pseudocounts is None, auto-compute pseudocounts if
pseudocounts="jaspar", else just attribute the pseudocounts as-is
to the motif.
"""
if pseudocounts is not None:
if pseudocounts == "jaspar":
pseudocounts = motifs.jaspar.calculate_pseudocounts(motif)
motif.pseudocounts = pseudocounts
[docs] def find_matches_in_string(self, sequence):
"""Find all positions where the PSSM score is above threshold."""
# NOTE: Before, I made my PSSM searches with Biopython. It was looong!
# Now I use Numpy and np.choice(), and I never looked back
# sequence = Seq(sequence, alphabet=alphabet)
# search = self.pssm.search(
# sequence, threshold=self.threshold, both=False
# )
indices = find_pssm_matches_with_numpy(
pssm_matrix=self.pssm_matrix, sequence=sequence, threshold=self.threshold,
)
return [(i, i + self.size, 1) for i in indices]
[docs] @classmethod
def from_sequences(
cls,
sequences,
name="unnamed",
pseudocounts="jaspar",
threshold=None,
relative_threshold=None,
):
"""Return a PSSM pattern computed from same-length sequences.
Parameters
----------
sequences
A list of same-length sequences.
name
Name to give to the pattern (will appear in reports etc.).
pseudocounts
Either a dict {"A": 0.01, "T": ...} or "jaspar" for automatic
pseudocounts from the Biopython.motifs.jaspar module (recommended),
or None for no pseudocounts at all (not recommended!).
threshold
locations of the sequence with a PSSM score above this value will be
considered matches. For convenience, a relative_threshold can be
given instead.
relative_threshold
Value between 0 and 1 from which the threshold will be auto-computed.
0 means "match everything", 1 means "only match the one (or several)
sequence(s) with the absolute highest possible score".
"""
sequences = [Seq(s) for s in sequences]
motif = motifs.create(sequences)
cls.apply_pseudocounts(motif, pseudocounts)
pssm = PSSM(motif.pssm)
pssm.name = name
return MotifPssmPattern(
pssm=pssm, threshold=threshold, relative_threshold=relative_threshold,
)
[docs] @classmethod
def list_from_file(
cls,
motifs_file,
file_format,
threshold=None,
pseudocounts="jaspar",
relative_threshold=None,
):
"""Return a list of PSSM patterns from a file in JASPAR, MEME, etc.
Parameters
----------
motifs_file
Path to a motifs file, or file handle.
file_format
File format. one of "jaspar", "meme", "TRANSFAC".
pseudocounts
Either a dict {"A": 0.01, "T": ...} or "jaspar" for automatic
pseudocounts from the Biopython.motifs.jaspar module (recommended),
or None for no pseudocounts at all (not recommended!).
threshold
locations of the sequence with a PSSM score above this value will be
considered matches. For convenience, a relative_threshold can be
given instead.
relative_threshold
Value between 0 and 1 from which the threshold will be auto-computed.
0 means "match everything", 1 means "only match the one (or several)
sequence(s) with the absolute highest possible score".
"""
if isinstance(motifs_file, str):
with open("./jaspar.txt", "r") as f:
motifs_list = motifs.parse(f, file_format)
else:
motifs_list = motifs.parse(motifs_file, file_format)
if pseudocounts is not None:
for motif in motifs_list:
cls.apply_pseudocounts(motif, pseudocounts)
return [
MotifPssmPattern(
pssm, threshold=threshold, relative_threshold=relative_threshold,
)
for pssm in motifs_list
]
def __str__(self):
if self.relative_threshold is not None:
threshold = "%d%%" % (100 * self.relative_threshold)
else:
threshold = "%.2f" % self.threshold
return "%s-PSSM(%s+)" % (self.name, threshold)
def __repr__(self):
if self.relative_threshold is not None:
threshold = "%d%%" % (100 * self.relative_threshold)
else:
threshold = "%.2f" % self.threshold
return "%s-PSSM(%s+)" % (self.name, threshold)
def find_pssm_matches_with_numpy(pssm_matrix, sequence, threshold):
"""Return every index in the +1 strand wit a PSSM score above threshold.
My numpy-based implementation is 10 times faster than Biopython for some
reason. Weird. Can someone else check?
Parameters:
-----------
pssm
A matrix whose rows give the frequency motif of ATGC (in this order).
sequence
A string representing a DNA sequence.
threshold
Every index with a score above this threshold will be returned.
"""
nucleotide_to_index = dict(zip("ATGC", range(4)))
len_pattern = len(pssm_matrix[0])
# If sequence is small, use normal python to avoid numpy overhead
if len(sequence) < 60:
nucl_indices = [nucleotide_to_index[n] for n in sequence]
return [
i
for i in range(len(sequence) - len_pattern)
if np.choose(nucl_indices[i : len_pattern + i], pssm_matrix).sum()
>= threshold
]
# If sequence is large, use Numpy for speed. tested experimentally
nucl_indices = np.array([nucleotide_to_index[n] for n in sequence], dtype="uint8")
len_pattern = len(pssm_matrix[0])
scores = np.array(
[
np.choose(nucl_indices[k : len_pattern + k], pssm_matrix).sum()
for k in range(len(sequence) - len_pattern)
]
)
return np.nonzero(scores >= threshold)[0]