Source code for dnacauldron.Fragment.HomologousFragment.HomologyChecker

import Levenshtein
import numpy as np


[docs]class HomologyChecker: tm_dict = {"A": 2, "T": 2, "G": 4, "C": 4} def __init__( self, min_size=15, max_size=80, min_tm=0, max_tm=None, max_distance=0 ): """Class to define which homologies are acceptable. This class is for instance used to detect and validate end-homologies for Gibson Assembly. Melting temperatures are currently computed using the naive but efficient technique of A/T = 2C, G/C= 4C. Parameters ========== min_size, max_size Acceptable size, in nucleotides, for an homology. The upper bound helps reduce computational times min_tm, max_tm Acceptable melting temperature range in Celsius for the homology. max_distance If >0, small 1-, or 2- nucleotide differences (edits, deletion) can be accepted in homologies. """ self.min_size = min_size self.max_size = max_size self.min_tm = min_tm self.max_tm = max_tm self.max_distance = max_distance
[docs] def compute_tm(self, sequence): """Compute the melting temp. of a sequence""" return sum([self.tm_dict[c] for c in sequence], 0)
def sequence_to_string(self, seq): if isinstance(seq, str): return seq if hasattr(seq, "seq"): return str(seq.seq) return str(seq)
[docs] def find_end_homologies(self, seq1, seq2): """Finds an homology between seq1's end and seq2's start. Return the size of the homology, or 0 if no homologies found. """ seq1 = self.sequence_to_string(seq1) seq2 = self.sequence_to_string(seq2) max_size = np.min([self.max_size, len(seq1), len(seq2)]) for homology_size in range(self.min_size, max_size): subseq1 = seq1[-homology_size:] subseq2 = seq2[:homology_size] if self.check_homology(subseq1, subseq2): return homology_size return 0
[docs] def check_homology(self, sequence, other_sequence=None): """Return whether there is an acceptable full-sequence homology between two sequences.""" sequence = self.sequence_to_string(sequence) if other_sequence is not None: other_sequence = self.sequence_to_string(other_sequence) if self.max_distance == 0: if sequence != other_sequence: return False else: distance = Levenshtein.distance(sequence, other_sequence) if distance > self.max_distance: return False if len(sequence) < self.min_size: return False if (self.max_size is not None) and len(sequence) > self.max_size: return False if self.min_tm > 0 or (self.max_tm is not None): tm = self.compute_tm(sequence) if tm < self.min_tm: return False if tm > (self.max_tm or 1e8): return False return True
[docs] def parameters_as_string(self): """Return a string of the parameters for errors and reports.""" return "%d-%dbp, %.1f-%sC Tm" % ( self.min_size, self.max_size, self.min_tm, "+" if (self.max_tm is None) else ("%.1f" % self.max_tm), )