Source code for geneblocks.biotools

import tempfile
import subprocess

import numpy as np

try:
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation
    from Bio import SeqIO

    BIOPYTHON_AVAILABLE = True
except ImportError:
    BIOPYTHON_AVAILABLE = False

try:
    # Biopython <1.78
    from Bio.Alphabet import DNAAlphabet

    has_dna_alphabet = True
except ImportError:
    # Biopython >=1.78
    has_dna_alphabet = False


complements_dict = {"A": "T", "T": "A", "C": "G", "G": "C"}


[docs]def random_dna_sequence(length, probas=None, seed=None): """Return a random DNA sequence ("ATGGCGT...") with the specified length. Parameters ---------- length Length of the DNA sequence. proba Frequencies for the different nucleotides, for instance ``probas={"A":0.2, "T":0.3, "G":0.3, "C":0.2}``. If not specified, all nucleotides are equiprobable (p=0.25). seed The seed to feed to the random number generator. When a seed is provided the random results depend deterministically on the seed, thus enabling reproducibility """ if seed is not None: np.random.seed(seed) if probas is None: sequence = np.random.choice(list("ATCG"), length) else: bases, probas = zip(*probas.items()) sequence = np.random.choice(bases, length, p=probas) return "".join(sequence)
def load_record(filename, linear=True, name="id", upperize=True): formt = "genbank" if filename.endswith(("gb", "gbk")) else "fasta" record = SeqIO.read(filename, formt) if upperize: record.seq = record.seq.upper() record.linear = linear if name != "id": record.id = name record.name = record.id.replace(" ", "_")[:20] return record def complement(sequence): return "".join(complements_dict[c] for c in sequence) def reverse_complement(sequence): return complement(sequence)[::-1] def sequence_to_record(sequence, record_id=None, name="unnamed", features=()): if not BIOPYTHON_AVAILABLE: raise ImportError("Creating records requires Biopython installed.") if has_dna_alphabet: # Biopython <1.78 sequence = Seq(sequence, alphabet=DNAAlphabet()) else: sequence = Seq(sequence) seqrecord = SeqRecord(sequence, name=name, id=record_id, features=list(features),) seqrecord.annotations["molecule_type"] = "DNA" return seqrecord
[docs]def annotate_record( seqrecord, location="full", feature_type="feature", margin=0, **qualifiers ): """Add a feature to a Biopython SeqRecord. Parameters ---------- seqrecord The biopython seqrecord to be annotated. location Either (start, end) or (start, end, strand). (strand defaults to +1) feature_type The type associated with the feature margin Number of extra bases added on each side of the given location. qualifiers Dictionnary that will be the Biopython feature's `qualifiers` attribute. """ if not BIOPYTHON_AVAILABLE: raise ImportError("Creating records requires Biopython installed.") if location == "full": location = (margin, len(seqrecord) - margin) strand = location[2] if len(location) == 3 else 1 seqrecord.features.append( SeqFeature( FeatureLocation(location[0], location[1], strand), qualifiers=qualifiers, type=feature_type, ) )
[docs]def sequences_differences_array(seq1, seq2): """Return an array [0, 0, 1, 0, ...] with 1s for sequence differences. seq1, seq2 should both be ATGC strings. """ if len(seq1) != len(seq2): raise ValueError( "Only use on same-size sequences (%d, %d)" % (len(seq1), len(seq2)) ) arr1 = np.fromstring(seq1, dtype="uint8") arr2 = np.fromstring(seq2, dtype="uint8") return arr1 != arr2
[docs]def sequences_differences(seq1, seq2): """Return the number of nucleotides that differ in the two sequences. seq1, seq2 should be strings of DNA sequences e.g. "ATGCTGTGC" """ return sequences_differences_array(seq1, seq2).sum()