Source code for genedom.biotools

import os
import re
from copy import deepcopy
import numpy as np

from snapgene_reader import snapgene_file_to_seqrecord

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import DNAAlphabet
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import SeqIO


complements_dict = {"A": "T", "T": "A", "C": "G", "G": "C"}


[docs]def random_dna_sequence(length, probas=None, seed=None): """Return a random DNA sequence ("ATGGCGT...") with the specified length. Parameters ---------- length Length of the DNA sequence. proba Frequencies for the different nucleotides, for instance ``probas={"A":0.2, "T":0.3, "G":0.3, "C":0.2}``. If not specified, all nucleotides are equiprobable (p=0.25). seed The seed to feed to the random number generator. When a seed is provided the random results depend deterministically on the seed, thus enabling reproducibility """ if seed is not None: np.random.seed(seed) if probas is None: sequence = np.random.choice(list("ATCG"), length) else: bases, probas = zip(*probas.items()) sequence = np.random.choice(bases, length, p=probas) return "".join(sequence)
formats_dict = { '.fa': 'fasta', '.gb': 'genbank', '.gbk': 'genbank', '.dna': 'snapgene' }
[docs]def load_record(filename, linear=True, name="unnamed", capitalize=True): no_extension, extension = os.path.splitext(filename) fmt = formats_dict[extension] if fmt == 'snapgene': record = snapgene_file_to_seqrecord(filename) else: record = SeqIO.read(filename, fmt) if capitalize: record.seq = record.seq.upper() record.linear = linear record.id = name record.name = name.replace(" ", "_")[:20] return record
[docs]def load_records(path, capitalize=True): if isinstance(path, (list, tuple)): return [record for p in path for record in load_records(p)] no_extension, extension = os.path.splitext(path) fmt = formats_dict[extension] if fmt == 'snapgene': records = [snapgene_file_to_seqrecord(path)] else: records = list(SeqIO.parse(path, fmt)) for i, record in enumerate(records): if capitalize: record.seq = record.seq.upper() if str(record.id) in ['None', '', "<unknown id>", '.', ' ']: record.id = path.replace("/", "_").replace("\\", "_") if len(records) > 1: record.id += "_%04d" % i return records
def complement(sequence): "Return the complement of the sequence" return "".join(complements_dict[c] for c in sequence) def reverse_complement(sequence): "Return the reverse-complement of the sequence" return complement(sequence)[::-1] def sequence_to_record(sequence, features=()): "Return a biopython record with the provided (ATGC string) sequence" return SeqRecord(Seq(sequence, alphabet=DNAAlphabet()), features=list(features)) def annotate_record(seqrecord, location="full", feature_type="feature", margin=0, **qualifiers): """Add a feature to a Biopython SeqRecord. Parameters ---------- seqrecord The biopython seqrecord to be annotated. location Either (start, end) or (start, end, strand). (strand defaults to +1) feature_type The type associated with the feature margin Number of extra bases added on each side of the given location. qualifiers Dictionnary that will be the Biopython feature's `qualifiers` attribute. """ if location == "full": location = (margin, len(seqrecord) - margin) strand = location[2] if len(location) == 3 else 1 seqrecord.features.append( SeqFeature( FeatureLocation(location[0], location[1], strand), qualifiers=qualifiers, type=feature_type ) ) def sanitize_string(string, max_length=15, replacements=(("'", "p"), ("*", "s"), ("-", "_"))): """Return a ordering-compatible string (not too long, no bad characters) This is used for creating sets of ordering-friendly names when companies refuse part names with more than 15 characters. """ for old, new in replacements: string = string.replace(old, new) string = re.sub(r'[^a-zA-Z\d\S]', '_', string) return string[:max_length] def sanitize_and_uniquify(strings, max_length=15, replacements=(("'", "p"), ("*", "s"), ("-", "_"))): """Sanitize many sequences and add numbers to non-unique names. This is used for creating sets of ordering-friendly names when companies refuse part names with more than 15 characters. """ dejavu = set() table = {} for string in strings: newstring = sanitize_string(string, max_length=max_length, replacements=replacements) i = 1 while newstring in dejavu: i += 1 newstring = newstring[:-1] + str(i) dejavu.add(newstring) table[string] = newstring return table
[docs]def write_record(record, target, fmt='genbank'): """Write a record as genbank, fasta, etc. via Biopython, with fixes""" record = deepcopy(record) record.name = record.name[:20] if str(record.seq.alphabet.__class__.__name__) != 'DNAAlphabet': record.seq.alphabet = DNAAlphabet() if hasattr(target, 'open'): target = target.open('w') SeqIO.write(record, target, fmt)