Source code for geneblocks.biotools

import tempfile
import subprocess

import numpy as np

try:
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation
    from Bio import SeqIO

    BIOPYTHON_AVAILABLE = True
except ImportError:
    BIOPYTHON_AVAILABLE = False

try:
    # Biopython <1.78
    from Bio.Alphabet import DNAAlphabet

    has_dna_alphabet = True
except ImportError:
    # Biopython >=1.78
    has_dna_alphabet = False


complements_dict = {"A": "T", "T": "A", "C": "G", "G": "C"}



[docs]
def random_dna_sequence(length, probas=None, seed=None):
    """Return a random DNA sequence ("ATGGCGT...") with the specified length.

    Parameters
    ----------

    length
      Length of the DNA sequence.

    proba
      Frequencies for the different nucleotides, for instance
      ``probas={"A":0.2, "T":0.3, "G":0.3, "C":0.2}``.
      If not specified, all nucleotides are equiprobable (p=0.25).

    seed
      The seed to feed to the random number generator. When a seed is provided
      the random results depend deterministically on the seed, thus enabling
      reproducibility

    """
    if seed is not None:
        np.random.seed(seed)
    if probas is None:
        sequence = np.random.choice(list("ATCG"), length)
    else:
        bases, probas = zip(*probas.items())
        sequence = np.random.choice(bases, length, p=probas)
    return "".join(sequence)



def load_record(filename, linear=True, name="id", upperize=True):
    formt = "genbank" if filename.endswith(("gb", "gbk")) else "fasta"
    record = SeqIO.read(filename, formt)
    if upperize:
        record.seq = record.seq.upper()
    record.linear = linear
    if name != "id":
        record.id = name
    record.name = record.id.replace(" ", "_")[:20]

    return record


def complement(sequence):
    return "".join(complements_dict[c] for c in sequence)


def reverse_complement(sequence):
    return complement(sequence)[::-1]


def sequence_to_record(sequence, record_id=None, name="unnamed", features=()):
    if not BIOPYTHON_AVAILABLE:
        raise ImportError("Creating records requires Biopython installed.")
    if has_dna_alphabet:  # Biopython <1.78
        sequence = Seq(sequence, alphabet=DNAAlphabet())
    else:
        sequence = Seq(sequence)

    seqrecord = SeqRecord(
        sequence,
        name=name,
        id=record_id,
        features=list(features),
    )
    seqrecord.annotations["molecule_type"] = "DNA"

    return seqrecord



[docs]
def annotate_record(
    seqrecord, location="full", feature_type="feature", margin=0, **qualifiers
):
    """Add a feature to a Biopython SeqRecord.

    Parameters
    ----------

    seqrecord
      The biopython seqrecord to be annotated.

    location
      Either (start, end) or (start, end, strand). (strand defaults to +1)

    feature_type
      The type associated with the feature

    margin
      Number of extra bases added on each side of the given location.

    qualifiers
      Dictionnary that will be the Biopython feature's `qualifiers` attribute.
    """

    if not BIOPYTHON_AVAILABLE:
        raise ImportError("Creating records requires Biopython installed.")

    if location == "full":
        location = (margin, len(seqrecord) - margin)

    strand = location[2] if len(location) == 3 else 1
    seqrecord.features.append(
        SeqFeature(
            FeatureLocation(location[0], location[1], strand),
            qualifiers=qualifiers,
            type=feature_type,
        )
    )




[docs]
def sequences_differences_array(seq1, seq2):
    """Return an array [0, 0, 1, 0, ...] with 1s for sequence differences.

    seq1, seq2 should both be ATGC strings.
    """
    if len(seq1) != len(seq2):
        raise ValueError(
            "Only use on same-size sequences (%d, %d)" % (len(seq1), len(seq2))
        )
    arr1 = np.fromstring(seq1, dtype="uint8")
    arr2 = np.fromstring(seq2, dtype="uint8")
    return arr1 != arr2




[docs]
def sequences_differences(seq1, seq2):
    """Return the number of nucleotides that differ in the two sequences.

    seq1, seq2 should be strings of DNA sequences e.g. "ATGCTGTGC"
    """
    return sequences_differences_array(seq1, seq2).sum()