Source code for easy_dna.matching

import itertools
from .biotables import NUCLEOTIDE_TO_REGEXPR, IUPAC_NOTATION
from .biological_operations import reverse_complement
from .record_operations import sequence_to_atgc_string


[docs]def dna_pattern_to_regexpr(dna_pattern): """Return a regular expression pattern for the provided DNA pattern. For instance ``dna_pattern_to_regexpr('ATTNN')`` returns ``"ATT[A|T|G|C][A|T|G|C]"``. """ return "".join([NUCLEOTIDE_TO_REGEXPR[n] for n in dna_pattern])
[docs]def all_iupac_variants(iupac_sequence): """Return all unambiguous possible versions of the given sequence. Examples ======== >>> all_iupac_variants('ATN') >>> ['ATA', 'ATC', 'ATG', 'ATT'] """ return [ "".join(nucleotides) for nucleotides in itertools.product( *[IUPAC_NOTATION[n] for n in iupac_sequence] ) ]
def find_index(seq, pattern): """Return the index of the first match of the pattern in seq. Returns -1 if there is no match. """ pattern = sequence_to_atgc_string(pattern) if hasattr(seq, "seq"): # Seq is a SeqRecord return seq.seq.find(pattern) else: return seq.find(pattern) def find_occurence(seq, pattern, strand="both"): if strand == 1: position = find_index(seq, pattern) if position == -1: return None else: return (position, position + len(pattern), 1) elif strand == -1: rev = reverse_complement(seq) occurrence = find_occurence(rev, pattern, strand=1) if occurrence is None: return None L = len(seq) return (L - occurrence[1], L - occurrence[0], -1) elif strand == "both": occurrence = find_occurence(seq, pattern, strand=1) if occurrence is None: occurrence = find_occurence(seq, pattern, strand=-1) return occurrence