Examples

Note: all examples here are available in the examples/ folder of the code repository

Basic examples

Optimization with report

"""Example of optimization from a Genbank record, with report

In this example, the Genbank record data/example_sequence.gbk
gets optimized and a report is created at reports/optimization_with_report/
"""

import os
from dnachisel import DnaOptimizationProblem

genbank_path = os.path.join("data", "example_sequence.gbk")
report_path = "optimization_with_report.zip"  # could also be a folder

problem = DnaOptimizationProblem.from_record(genbank_path)
success, message, _ = problem.optimize_with_report(target=report_path)

print(message + " A report was generated in " + report_path)

Optimization of a circular sequence

"""Example of optimization of a circular sequence.

We demonstrates the use of DNA Chisel to solve a circular DNA optimization
problem:

- The sequence is designed to have a cross-origin BsmBI site that will need
  to be removed, because the location-less specification ``AvoidPattern``
  is interpreted as applying to the full circle.

- The specification ``EnforceGCContent`` is cross-origin since its location is
  1500-2500, and the sequence is ~2000bp long.

"""

import dnachisel as dc

dna_sequence = "CTC%sCGTCTC%sCGT" % (
    dc.random_dna_sequence(1000),
    dc.random_dna_sequence(1000),
)

constraints = [
    dc.AvoidPattern("BsmBI_site"),
    dc.EnforceGCContent(mini=0.4, maxi=0.6, location=(1500, 2500), window=50),
    dc.UniquifyAllKmers(k=9, location=(10, 1000)),
]

problem = dc.CircularDnaOptimizationProblem(
    sequence=dna_sequence, constraints=constraints
)

print("BEFORE OPTIMIZATION:\n\n", problem.constraints_text_summary())
problem.resolve_constraints()
print("AFTER OPTIMIZATION:\n\n", problem.constraints_text_summary())

Advanced examples

Sequence standardization/domestication

"""Example of use of the AvoidPAttern specification"""

from dnachisel import (
    DnaOptimizationProblem,
    random_protein_sequence,
    reverse_translate,
    CodonOptimize,
    EnforceTranslation,
    AvoidPattern,
    EnforceGCContent,
)

protein = random_protein_sequence(1000, seed=123)
sequence = reverse_translate(protein)
problem = DnaOptimizationProblem(
    sequence=sequence,
    constraints=[
        EnforceTranslation(),
        AvoidPattern("BsmBI_site"),
        EnforceGCContent(mini=0.4, maxi=0.6, window=60),
    ],
    objectives=[CodonOptimize(species="s_cerevisiae")],
)

print("\nBefore optimization:\n")
print(problem.constraints_text_summary())
print(problem.objectives_text_summary())

problem.resolve_constraints(final_check=True)
problem.optimize()

print("\nAfter optimization:\n")
print(problem.constraints_text_summary())
print(problem.objectives_text_summary())

Programmatic optimization of a full genome

from urllib import request
from io import StringIO
from dnachisel import (
    DnaOptimizationProblem,
    load_record,
    Location,
    EnforceTranslation,
    EnforceGCContent,
    AvoidPattern,
)

print("DOWNLOADING AND PARSING THE GENBANK DATA...")

url = (
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
    + "db=nucleotide&id=48994873&rettype=gb&retmode=txt"
)
genbank_data = request.urlopen(url).read().decode("utf-8")
genbank_record = load_record(StringIO(genbank_data), file_format="genbank")

print("INITIALIZING THE PROBLEM WITH CONSTRAINTS FOR EACH GENE...")

constraints = []
for feature in genbank_record.features:
    if feature.type == "gene" and len(feature.location.parts) == 1:
        location = Location.from_biopython_location(feature.location)
        if (len(location) % 3 == 0) and len(location) > 100:
            gene_constraints = [
                EnforceTranslation(location=location),
                AvoidPattern("BsmBI_site", location, strand="both"),
                EnforceGCContent(mini=0.40, maxi=0.60, window=150, location=location),
            ]
            constraints.extend(gene_constraints)
problem = DnaOptimizationProblem(genbank_record, constraints)

print("RESOLVING THE CONSTRAINTS...")

problem.logger.ignore_bars_under = 50
problem.resolve_constraints()
problem.to_record("ecoli_genes_optimization.gb")

Creating a collection of compatible primers

"""We will build a collection of 20 non-interannealing primers.

DNA Chisel is not originally meant for creating collections of sequences
(frameworks such as D-tailor were written with this purpose in mind), but
it is still possible to create collections of inter-compatible sequences.

Here we create 20 short sequences one after the other, while making sure
that each new primer is "compatible" with the other ones made so far.
Compatibility in this example means low heterodimerization (=the primers
could be used together without undesired annealing between them).

"""

from dnachisel import DnaOptimizationProblem, random_dna_sequence
from dnachisel.builtin_specifications import (
    AvoidHeterodimerization,
    EnforceGCContent,
    AvoidPattern,
)


def create_new_primer(existing_primers):
    """Create a new primer based on the primers created so far"""
    problem = DnaOptimizationProblem(
        sequence=random_dna_sequence(length=20),
        constraints=[
            AvoidHeterodimerization(existing_primers, tmax=3),
            AvoidPattern("3x3mer"),
            AvoidPattern("4xG"),
        ],
        objectives=[EnforceGCContent(target=0.6)],
        logger=None,
    )
    problem.resolve_constraints()
    problem.optimize()
    return problem.sequence


# MAIN LOOP, WHERE PRIMERS ARE CREATED ONE BY ONE

existing_primers = []
for i in range(20):
    new_primer = create_new_primer(existing_primers)
    existing_primers.append(new_primer)

print("PRIMERS GENERATED: \n\n%s\n" % "\n".join(existing_primers))

# (OPTIONAL VERIFICATION OF THE COLLECTION)

import itertools
import primer3
from dnachisel.biotools import gc_content

max_tm = max(
    primer3.calcHeterodimer(seq1, seq2).tm
    for seq1, seq2 in itertools.combinations(existing_primers, 2)
)
print("Max Tm heterodimerization between any 2 primers: %.2f" % max_tm)

gc_contents = [gc_content(p) for p in existing_primers]
print("GC content range %.2f-%.2f" % (min(gc_contents), max(gc_contents)))

Writing a custom Specification: 9-mer score

"""Example of DNA Chisel user-defined specification: Gen9's 9mer score.

Once upon the time there was a DNA synthesis company called Gen9, who accepted
or refused orders based on sequence features.

One of them was the "9mer score", defined as 9 * N / len(sequence) where N is
the number of non-unique kmers in the sequence.

The score can be interpreted as the density of nucleotides which are part of
a 9mer also present somewhere else in the sequence. These should be avoided
as they impede the synthesis (certainly at the oligo assembly stage). Too much
9mers are a... nightmare (ah ah ah pun).

In this script we show how to create a new objective ``MinimizeNinemersScore``
to minimize this score, and we use it to optimize a coding sequence.
"""

from collections import Counter
from dnachisel import (
    EnforceTranslation,
    Specification,
    SpecEvaluation,
    reverse_translate,
    random_protein_sequence,
    Location,
    DnaOptimizationProblem,
)


class MinimizeNinemersScore(Specification):
    """Minimize Gen9's "no-9mers" score."""

    def evaluate(self, problem):
        """Return Gen9's ninemer score for the problem' sequence"""
        sequence = problem.sequence
        all_9mers = [sequence[i : i + 9] for i in range(len(sequence) - 9)]
        number_of_non_unique_9mers = sum(
            [count for ninemer, count in Counter(all_9mers).items() if count > 1]
        )
        score = -(9.0 * number_of_non_unique_9mers) / len(sequence)
        return SpecEvaluation(
            self,
            problem,
            score=score,
            locations=[Location(0, len(sequence))],
            message="Score: %.02f (%d non-unique ninemers)"
            % (score, number_of_non_unique_9mers),
        )

    def __str__(self):
        """String representation."""
        return "MinimizeNinemersScore"


sequence = reverse_translate(random_protein_sequence(300))
problem = DnaOptimizationProblem(
    sequence=sequence,
    constraints=[EnforceTranslation()],
    objectives=[MinimizeNinemersScore()],
)

print("\n=== Status before optimization ===")
print(problem.objectives_text_summary())

problem.optimize()

print("\n=== Status after optimization ===")
print(problem.objectives_text_summary())
print(problem.constraints_text_summary(failed_only=True))

Creating a sequence without E. coli TF binding sites

from dnachisel import DnaOptimizationProblem, AvoidPattern, random_dna_sequence
from urllib import request
import pandas as pd
from io import StringIO

# DOWNLOAD THE LIST OF TF BINDING SITES
url = "http://regulondb.ccg.unam.mx/menu/download/datasets/files/BindingSiteSet.txt"
data = request.urlopen(url).read().decode("utf-8")
df = pd.read_csv(
    StringIO(data), sep="\t", skiprows=(range(0, 46)), header=None
)  # First 46 lines are description

# OBTAIN A LIST OF TF BINDING SEQUENCES
tf_column = df[13]  # 14th column contains TF binding sites
tf_column.dropna(inplace=True)
tf_list = tf_column.to_list()
# According to the description, the binding sites are in uppercase, so we remove lowercase:
tf_binding_sequences = ["".join(ch for ch in tf if not ch.islower()) for tf in tf_list]
# Remove single-nucleotide TFs:
tf_binding_sequences = [tf for tf in tf_binding_sequences if len(tf) > 1]

# DEFINE AND SOLVE THE OPTIMIZATION PROBLEM
problem = DnaOptimizationProblem(
    sequence=random_dna_sequence(50000),
    constraints=[AvoidPattern(pattern) for pattern in tf_binding_sequences],
)
problem.resolve_constraints()
problem.to_record("sequence_without_tf_binding_sites.gb")

Constraints reports example

import dnachisel as dc
import dnachisel.reports.constraints_reports as cr
import os

# IMPORT THE 10 RECORDS FROM THE genbanks/ FOLDER

records = [
    dc.load_record(os.path.join("genbanks", filename), name=filename)
    for filename in os.listdir("genbanks")
]

# DEFINE THE CONSTRAINTS TO BE CHECKED ON EACH RECORD

constraints = [
    dc.AvoidPattern("BsaI_site"),
    dc.AvoidPattern("BsmBI_site"),
    dc.AvoidPattern("BbsI_site"),
    dc.AvoidPattern("8x1mer"),
    dc.AvoidPattern("5x3mer"),
    dc.AvoidPattern("9x2mer"),
    dc.AvoidHairpins(stem_size=20, hairpin_window=200),
    dc.EnforceGCContent(mini=0.3, maxi=0.7, window=100),
]

# CREATE A SPREADSHEET AND PLOTS OF THE BREACHES

dataframe = cr.constraints_breaches_dataframe(constraints, records)
dataframe.to_excel("breaches.xlsx")
records = cr.records_from_breaches_dataframe(dataframe, records)
cr.breaches_records_to_pdf(records, "breaches_plots.pdf")

print("Done! Check breaches.xlsx and breaches_plots.pdf for results.")