Source code for genedom.biotools

import os
import re
from copy import deepcopy
import numpy as np

from snapgene_reader import snapgene_file_to_seqrecord

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

try:
    # Biopython <1.78
    from Bio.Alphabet import DNAAlphabet

    has_dna_alphabet = True
except ImportError:
    # Biopython >=1.78
    has_dna_alphabet = False
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import SeqIO


complements_dict = {"A": "T", "T": "A", "C": "G", "G": "C"}



[docs]
def random_dna_sequence(length, probas=None, seed=None):
    """Return a random DNA sequence ("ATGGCGT...") with the specified length.

    Parameters
    ----------

    length
      Length of the DNA sequence.

    proba
      Frequencies for the different nucleotides, for instance
      ``probas={"A":0.2, "T":0.3, "G":0.3, "C":0.2}``.
      If not specified, all nucleotides are equiprobable (p=0.25).

    seed
      The seed to feed to the random number generator. When a seed is provided
      the random results depend deterministically on the seed, thus enabling
      reproducibility.
    """
    if seed is not None:
        np.random.seed(seed)
    if probas is None:
        sequence = np.random.choice(list("ATCG"), length)
    else:
        bases, probas = zip(*probas.items())
        sequence = np.random.choice(bases, length, p=probas)
    return "".join(sequence)



formats_dict = {".fa": "fasta", ".gb": "genbank", ".gbk": "genbank", ".dna": "snapgene"}



[docs]
def load_record(filename, linear=True, name="unnamed", capitalize=True):
    no_extension, extension = os.path.splitext(filename)
    fmt = formats_dict[extension]
    if fmt == "snapgene":
        record = snapgene_file_to_seqrecord(filename)
    else:
        record = SeqIO.read(filename, fmt)
    if capitalize:
        record.seq = record.seq.upper()
    record.linear = linear
    record.id = name
    record.name = name.replace(" ", "_")[:20]

    return record




[docs]
def load_records(path, capitalize=True):
    if isinstance(path, (list, tuple)):
        return [record for p in path for record in load_records(p)]
    no_extension, extension = os.path.splitext(path)
    fmt = formats_dict[extension]
    if fmt == "snapgene":
        records = [snapgene_file_to_seqrecord(path)]
    else:
        records = list(SeqIO.parse(path, fmt))
    for i, record in enumerate(records):
        if capitalize:
            record.seq = record.seq.upper()
        if str(record.id) in ["None", "", "<unknown id>", ".", " "]:
            record.id = path.replace("/", "_").replace("\\", "_")
            if len(records) > 1:
                record.id += "_%04d" % i
    return records



def complement(sequence):
    return "".join(complements_dict[c] for c in sequence)


def reverse_complement(sequence):
    return complement(sequence)[::-1]


def sequence_to_record(sequence, features=()):
    if has_dna_alphabet:
        seq = Seq(sequence, alphabet=DNAAlphabet())
    else:
        seq = Seq(sequence)

    seqrecord = SeqRecord(seq, features=list(features))
    seqrecord.annotations["molecule_type"] = "DNA"

    return seqrecord


def annotate_record(
    seqrecord, location="full", feature_type="feature", margin=0, **qualifiers
):
    """Add a feature to a Biopython SeqRecord.

    Parameters
    ----------

    seqrecord
      The biopython seqrecord to be annotated.

    location
      Either (start, end) or (start, end, strand). (strand defaults to +1).

    feature_type
      The type associated with the feature.

    margin
      Number of extra bases added on each side of the given location.

    qualifiers
      Dictionary that will be the Biopython feature's `qualifiers` attribute.
    """

    if location == "full":
        location = (margin, len(seqrecord) - margin)

    strand = location[2] if len(location) == 3 else 1
    seqrecord.features.append(
        SeqFeature(
            FeatureLocation(location[0], location[1], strand),
            qualifiers=qualifiers,
            type=feature_type,
        )
    )


def sanitize_string(
    string, max_length=15, replacements=(("'", "p"), ("*", "s"), ("-", "_"))
):
    for old, new in replacements:
        string = string.replace(old, new)
    string = re.sub(r"[^a-zA-Z\d\S]", "_", string)
    return string[:max_length]


def sanitize_and_uniquify(
    strings, max_length, replacements=(("'", "p"), ("*", "s"), ("-", "_"))
):
    dejavu = set()
    table = {}
    for string in strings:
        newstring = sanitize_string(
            string, max_length=max_length, replacements=replacements
        )
        i = 1
        while newstring in dejavu:
            i += 1
            newstring = newstring[:-1] + str(i)
        dejavu.add(newstring)
        table[string] = newstring
    return table



[docs]
def write_record(record, target, fmt="genbank"):
    """Write a record as genbank, fasta, etc. via Biopython, with fixes."""
    record = deepcopy(record)
    record.name = record.name[:20]
    if has_dna_alphabet:
        if str(record.seq.alphabet.__class__.__name__) != "DNAAlphabet":
            record.seq.alphabet = DNAAlphabet()
    record.annotations["molecule_type"] = "DNA"
    if hasattr(target, "open"):
        target = target.open("w")
    SeqIO.write(record, target, fmt)