Source code for dnacauldron.biotools.record_operations

from copy import copy

try:
    # Biopython <1.78
    from Bio.Alphabet import DNAAlphabet

    has_dna_alphabet = True
except ImportError:
    # Biopython >=1.78
    has_dna_alphabet = False
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation


[docs]def complement(dna_sequence): """Return the complement of the DNA sequence. For instance ``complement("ATGCCG")`` returns ``"TACGGC"``. Uses BioPython for speed. """ return str(Seq(dna_sequence).complement())
[docs]def set_record_topology(record, topology): """Set the Biopython record's topology, possibly passing if already set. This actually sets the ``record.annotations['topology']``.The ``topology`` parameter can be "circular", "linear", "default_to_circular" (will default to circular if ``annotations['topology']`` is not already set) or "default_to_linear". """ valid_topologies = [ "circular", "linear", "default_to_circular", "default_to_linear", ] if topology not in valid_topologies: raise ValueError("topology should be one of %s." % ", ".join(valid_topologies)) annotations = record.annotations default_prefix = "default_to_" if topology.startswith(default_prefix): if "topology" not in annotations: annotations["topology"] = topology[len(default_prefix) :] else: annotations["topology"] = topology
def record_is_linear(record, default=True): if "topology" not in record.annotations: return default else: return record.annotations["topology"] == "linear"
[docs]def reverse_complement(sequence): """Return the reverse-complement of the DNA sequence. For instance ``complement("ATGCCG")`` returns ``"GCCGTA"``. Uses BioPython for speed. """ return complement(sequence)[::-1]
[docs]def sequence_to_biopython_record( sequence, id="<unknown id>", name="same_as_id", features=() ): """Return a SeqRecord of the sequence, ready to be Genbanked.""" if has_dna_alphabet: seqrecord = SeqRecord( Seq(sequence, alphabet=DNAAlphabet()), id=id, name=id if name == "same_as_id" else name, features=list(features), ) else: seqrecord = SeqRecord( Seq(sequence), id=id, name=id if name == "same_as_id" else name, features=list(features), ) seqrecord.annotations["molecule_type"] = "DNA" return seqrecord
[docs]def annotate_record( seqrecord, location="full", feature_type="misc_feature", margin=0, **qualifiers ): """Add a feature to a Biopython SeqRecord. Parameters ---------- seqrecord The Biopython seqrecord to be annotated. location Either (start, end) or (start, end, strand). (strand defaults to +1) feature_type The type associated with the feature. margin Number of extra bases added on each side of the given location. qualifiers Dictionary that will be the Biopython feature's `qualifiers` attribute. """ if location == "full": location = (margin, len(seqrecord) - margin) strand = location[2] if len(location) == 3 else 1 seqrecord.features.append( SeqFeature( FeatureLocation(location[0], location[1], strand), qualifiers=qualifiers, type=feature_type, ) )
[docs]def crop_record_with_saddling_features(record, start, end, filters=()): """Crop the Biopython record, but keep features that are only partially in. Parameters ---------- record The Biopython record to crop. start, end Coordinates of the segment to crop. filters list of functions (feature=>True/False). Any feature that doesn't pass at least one filter will be filtered out. """ cropped = record[start:end] def is_saddling(f_start, f_end): return (f_start < start <= f_end) or (f_start <= end < f_end) saddling_features = [ copy(f) for f in record.features if all([fl(f) for fl in filters]) and f.location is not None and is_saddling(f.location.start, f.location.end) ] for f in saddling_features: f.location = FeatureLocation( start=max(f.location.start, start), end=min(f.location.end, end), strand=f.location.strand, ) cropped.features.append(f) return cropped