Source code for dnacauldron.biotools.record_operations

from copy import copy

try:
    # Biopython <1.78
    from Bio.Alphabet import DNAAlphabet

    has_dna_alphabet = True
except ImportError:
    # Biopython >=1.78
    has_dna_alphabet = False
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation



[docs]
def complement(dna_sequence):
    """Return the complement of the DNA sequence.

    For instance ``complement("ATGCCG")`` returns ``"TACGGC"``.

    Uses BioPython for speed.
    """
    return str(Seq(dna_sequence).complement())




[docs]
def set_record_topology(record, topology):
    """Set the Biopython record's topology, possibly passing if already set.

    This actually sets the ``record.annotations['topology']``.The ``topology``
    parameter can be "circular", "linear", "default_to_circular" (will default
    to circular if ``annotations['topology']`` is not already set) or
    "default_to_linear".
    """
    valid_topologies = [
        "circular",
        "linear",
        "default_to_circular",
        "default_to_linear",
    ]
    if topology not in valid_topologies:
        raise ValueError("topology should be one of %s." % ", ".join(valid_topologies))
    annotations = record.annotations
    default_prefix = "default_to_"
    if topology.startswith(default_prefix):
        if "topology" not in annotations:
            annotations["topology"] = topology[len(default_prefix) :]
    else:
        annotations["topology"] = topology



def record_is_linear(record, default=True):
    if "topology" not in record.annotations:
        return default
    else:
        return record.annotations["topology"] == "linear"



[docs]
def reverse_complement(sequence):
    """Return the reverse-complement of the DNA sequence.

    For instance ``complement("ATGCCG")`` returns ``"GCCGTA"``.

    Uses BioPython for speed.
    """
    return complement(sequence)[::-1]




[docs]
def sequence_to_biopython_record(
    sequence, id="<unknown id>", name="same_as_id", features=()
):
    """Return a SeqRecord of the sequence, ready to be Genbanked."""

    if has_dna_alphabet:
        seqrecord = SeqRecord(
            Seq(sequence, alphabet=DNAAlphabet()),
            id=id,
            name=id if name == "same_as_id" else name,
            features=list(features),
        )
    else:
        seqrecord = SeqRecord(
            Seq(sequence),
            id=id,
            name=id if name == "same_as_id" else name,
            features=list(features),
        )

    seqrecord.annotations["molecule_type"] = "DNA"

    return seqrecord




[docs]
def annotate_record(
    seqrecord, location="full", feature_type="misc_feature", margin=0, **qualifiers
):
    """Add a feature to a Biopython SeqRecord.

    Parameters
    ----------

    seqrecord
      The Biopython seqrecord to be annotated.

    location
      Either (start, end) or (start, end, strand). (strand defaults to +1)

    feature_type
      The type associated with the feature.

    margin
      Number of extra bases added on each side of the given location.

    qualifiers
      Dictionary that will be the Biopython feature's `qualifiers` attribute.
    """
    if location == "full":
        location = (margin, len(seqrecord) - margin)

    strand = location[2] if len(location) == 3 else 1
    seqrecord.features.append(
        SeqFeature(
            FeatureLocation(location[0], location[1], strand),
            qualifiers=qualifiers,
            type=feature_type,
        )
    )




[docs]
def crop_record_with_saddling_features(record, start, end, filters=()):
    """Crop the Biopython record, but keep features that are only partially in.

    Parameters
    ----------
    record
      The Biopython record to crop.

    start, end
      Coordinates of the segment to crop.

    filters
      list of functions (feature=>True/False). Any feature that doesn't pass
      at least one filter will be filtered out.
    """
    cropped = record[start:end]

    def is_saddling(f_start, f_end):
        return (f_start < start <= f_end) or (f_start <= end < f_end)

    saddling_features = [
        copy(f)
        for f in record.features
        if all([fl(f) for fl in filters])
        and f.location is not None
        and is_saddling(f.location.start, f.location.end)
    ]
    for f in saddling_features:
        f.location = FeatureLocation(
            start=max(f.location.start, start),
            end=min(f.location.end, end),
            strand=f.location.strand,
        )
        cropped.features.append(f)
    return cropped