Source code for dnachisel.reports.optimization_reports

"""Methods to generate optimization reports."""

import os
import textwrap
from collections import OrderedDict
import hashlib

from Bio import SeqIO
import flametree
import numpy as np

from ..biotools import (
    sequence_to_biopython_record,
    find_specification_label_in_feature,
)
from ..version import __version__
from .SpecAnnotationsTranslator import SpecAnnotationsTranslator
from .tools import install_extras_message
from ..Location import Location

try:
    import pandas
    PANDAS_AVAILABLE = True
except ImportError:
    PANDAS_AVAILABLE = False

try:
    from sequenticon import sequenticon

    SEQUENTICON_AVAILABLE = True
except ImportError:
    SEQUENTICON_AVAILABLE = False

MATPLOTLIB_AVAILABLE = False
try:
    import matplotlib.cm as cm
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages

    MATPLOTLIB_AVAILABLE = True
except ImportError:
    pass


try:
    from geneblocks import DiffBlocks

    GENEBLOCKS_AVAILABLE = True
except ImportError:
    GENEBLOCKS_AVAILABLE = False

try:
    from pdf_reports import ReportWriter
    import pdf_reports.tools as pdf_tools

    PDF_REPORTS_AVAILABLE = True
except ImportError:

    def ReportWriter(*a, **kw):
        return None

    PDF_REPORTS_AVAILABLE = False

THIS_DIR = os.path.dirname(os.path.realpath(__file__))
ASSETS_DIR = os.path.join(THIS_DIR, "assets")
TITLE_FONTDICT = fontdict = dict(size=14, weight="bold")

report_writer = ReportWriter(
    dnachisel_logo_url=os.path.join(ASSETS_DIR, "logo.png"),
    version=__version__,
    default_stylesheets=(os.path.join(ASSETS_DIR, "style.css"),),
)

install_reports_extra_message = (
    "Could not load %s (is it installed ?). You can install all "
    "dependencies for generating reports in DNA Chisel with this command:\n\n "
    "pip install dnachisel[reports]"
)


[docs]def write_no_solution_report( target, problem, error, file_content=None, file_path=None ): """Write a report on incompatibility found in the problem's constraints. The report comprises a PDF of plots of the sequence (global constraints, local constraints around the problem) and an annotated genbank. Parameters ---------- target Either a path to a folder, or a path to a zip archive, or "@memory" to return raw data of a zip archive containing the report. problem A DnaOptimizationProblem error A NoSolutionError (carries a message and a location) """ if not MATPLOTLIB_AVAILABLE: raise ImportError(install_extras_message("Matplotlib")) if isinstance(target, str): root = flametree.file_tree(target, replace=True) else: root = target # TRANSFER THE ORIGINAL FILE file_hash = None if file_path is not None: if file_content is None: with open(file_path, "rb") as f: file_content = f.read() basename = os.path.basename(file_path) file_hash = hashlib.md5(file_content).hexdigest()[:8] root._file("_".join([file_hash, basename])).write(file_content) translator = SpecAnnotationsTranslator() with PdfPages(root._file("plots.pdf").open("wb")) as pdf_io: # PLOT GLOBAL LOCATION OF ERROR record = problem.to_record() translator = SpecAnnotationsTranslator() graphical_record = translator.translate_record(record) ax, _ = graphical_record.plot(figure_width=min(20, 0.3 * len(record))) if len(record) < 60: graphical_record.plot_sequence(ax) if error.location is None: raise error start, end, strand = error.location.to_tuple() ax.fill_between( [start, end], -10, 10, zorder=-1000, facecolor="#ffcccc" ) title = "\n".join( textwrap.wrap( "No solution found in zone [%d, %d]:%s" % (start, end, str(error)), width=120, ) ) ax.set_title(title, fontdict=TITLE_FONTDICT) pdf_io.savefig(ax.figure, bbox_inches="tight", alpha=0.5) plt.close(ax.figure) # CREATE AND SAVE THE LOCAL CONSTRAINTS BREACHES RECORD record = error.problem.to_record( with_original_spec_features=False, with_constraints=False, with_objectives=False, ) start = max(0, error.location.start - 5) end = min(len(record), error.location.end + 4) focus_location = Location(start, end) def is_in_focus(location): return location.overlap_region(focus_location) is not None evals = error.problem.constraints_evaluations() passing = evals.filter("passing") record.features += passing.success_and_failures_as_features() failing = evals.filter("failing") record.features += failing.locations_as_features( label_prefix="BREACH", locations_filter=is_in_focus ) SeqIO.write( record, root._file("local_constraints_breaches.gb").open("w"), "genbank", ) # CREATE A FIGURE OF THE LOCAL CONSTRAINTS BREACHES AS A NEW PDF PAGE graphical_record = translator.translate_record(record) graphical_record = graphical_record.crop((start, end)) figure_width = min(20, 0.3 * (end - start)) ax, _ = graphical_record.plot(figure_width=figure_width) graphical_record.plot_sequence(ax) ax.set_title( "Local constraints breaches in [%d, %d]" % (start, end) + " (green = passing constraints)", fontdict=TITLE_FONTDICT, ) ax.set_ylim(top=ax.get_ylim()[1] + 1) pdf_io.savefig(ax.figure, bbox_inches="tight", alpha=0.5) plt.close(ax.figure) root._file("logs.txt").write(problem.logger.dump_logs()) # returns zip data if target == '@memory' if isinstance(target, str): return root._close()
def constraints_before_after_dataframe(problem, constraints_evaluations=None): if not PANDAS_AVAILABLE: raise ImportError("Install pandas to use this method.") if constraints_evaluations is None: constraints_evaluations = problem.constraints_evaluations() edits = problem.sequence_edits_as_array() def constraint_record(evaluation_before, evaluation_after): constraint = evaluation_before.specification start, end, _ = constraint.location.to_tuple() edits_sum = edits[start:end].sum() edits_percent = 100 * edits_sum / (end - start) label = constraint.label(use_short_form=True, with_location=False) return OrderedDict( [ ("constraint", label), ("start", start), ("end", end), ("before", "PASS" if evaluation_before.passes else "FAIL"), ("after", "PASS" if evaluation_after.passes else "FAIL"), ("edits", edits_sum), ("% edited", np.round(edits_percent, 2)), ] ) dataframe = pandas.DataFrame.from_records( [ constraint_record(before, after) for (before, after) in zip( problem.constraints_before, constraints_evaluations ) ] ) if len(dataframe): dataframe = dataframe.sort_values(by="start") return dataframe def objectives_before_after_dataframe(problem, objectives_evaluations=None): if objectives_evaluations is None: objectives_evaluations = problem.objectives_evaluations() edits = problem.sequence_edits_as_array() def objective_record(evaluation_before, evaluation_after): objective = evaluation_before.specification start, end, _ = objective.location.to_tuple() edits_sum = edits[start:end].sum() edits_percent = 100 * edits_sum / (end - start) label = objective.label(use_short_form=True, with_location=False) return OrderedDict( [ ("objective", label), ("boost", objective.boost), ("start", start), ("end", end), ("before", evaluation_before.score_to_formatted_string), ("after", evaluation_after.score_to_formatted_string), ("edits", edits_sum), ("% edited", np.round(edits_percent, 2)), ] ) dataframe = pandas.DataFrame.from_records( [ objective_record(before, after) for (before, after) in zip( problem.objectives_before, objectives_evaluations ) ] ) if len(dataframe): dataframe = dataframe.sort_values(by="start") return dataframe def plot_optimization_changes(problem): if not GENEBLOCKS_AVAILABLE: raise ImportError("Install Geneblocks to use plot_differences()") sequence_before = sequence_to_biopython_record(problem.sequence_before) sequence_after = problem.to_record() diffs = DiffBlocks.from_sequences(sequence_before, sequence_after) span = max(2, len(sequence_after) / 20) diffs = diffs.merged( blocks_per_span=(3, span), replace_gap=span / 2, change_gap=span / 2 ) _, diffs_ax = diffs.plot( translator_class=SpecAnnotationsTranslator, annotate_inline=True, figure_width=15, ) return diffs_ax
[docs]def write_optimization_report( target, problem, project_name="unnamed", plot_figure=True, constraints_evaluations=None, objectives_evaluations=None, figure_width=20, max_features_in_plots=300, file_path=None, file_content=None, ): """Write an optimization report with a PDF summary, plots, and genbanks. Parameters ---------- target Path to a directory or zip file, or "@memory" for returning raw data of a zip file created in-memory. problem A DnaOptimizationProblem to be solved and optimized project_name Name of the project that will appear on the PDF report constraints_evaluations Precomputed constraints evaluations. If None provided, they will be computed again from the problem. objectives_evaluations Precomputed objectives evaluations. If None provided, they will be computed again from the problem. figure_width Width of the report's figure, in inches. The more annotations there will be in the figure, the wider it should be. The default should work for most cases. max_features_in_plots Limit to the number of features to plot (plots with thousands of features may take ages to plot) file_path Path to the file from which the problem was created """ if not PDF_REPORTS_AVAILABLE: raise ImportError(install_extras_message("PDF Reports")) if not SEQUENTICON_AVAILABLE: raise ImportError(install_extras_message("Sequenticon")) if constraints_evaluations is None: constraints_evaluations = problem.constraints_evaluations() if objectives_evaluations is None: objectives_evaluations = problem.objectives_evaluations() if isinstance(target, str): root = flametree.file_tree(target, replace=True) else: root = target # TRANSFER THE ORIGINAL FILE file_hash = None if file_path is not None: if file_content is None: with open(file_path, "rb") as f: file_content = f.read() basename = os.path.basename(file_path) file_hash = hashlib.md5(file_content).hexdigest()[:8] root._file("_".join([file_hash, basename])).write(file_content) # CREATE FIGURES AND GENBANKS diffs_figure_data = None if GENEBLOCKS_AVAILABLE and plot_figure: diffs_ax = plot_optimization_changes(problem) diffs_figure_data = pdf_tools.figure_data(diffs_ax.figure, fmt="svg") plt.close(diffs_ax.figure) # GENERATE AND SAVE THE CONSTRAINTS SUMMARY constraints_before_after = constraints_before_after_dataframe( problem=problem, constraints_evaluations=constraints_evaluations ) filename = "constraints_before_and_after.csv" constraints_before_after.to_csv( root._file(filename).open("w"), index=False ) # GENERATE AND SAVE THE OBJECTIVES SUMMARY objectives_before_after = objectives_before_after_dataframe( problem=problem, objectives_evaluations=objectives_evaluations ) filename = "objectives_before_and_after.csv" objectives_before_after.to_csv(root._file(filename).open("w"), index=False) # CREATE PDF REPORT html = report_writer.pug_to_html( path=os.path.join(ASSETS_DIR, "optimization_report.pug"), project_name=project_name, problem=problem, constraints_evaluations=constraints_evaluations, objectives_evaluations=objectives_evaluations, constraints_before_after=constraints_before_after, objectives_before_after=objectives_before_after, edits=problem.sequence_edits_as_array().sum(), diffs_figure_data=diffs_figure_data, file_hash=file_hash, sequenticons={ label: sequenticon(seq, output_format="html_image", size=24) for label, seq in [ ("before", problem.sequence_before), ("after", problem.sequence), ] }, ) report_writer.write_report(html, root._file("Report.pdf")) # CREATE THE "SEQUENCE EDITS" REPORT record = problem.to_record(with_sequence_edits=True) breaches = problem.constraints_before.filter("failing") breaches_locations = breaches.locations_as_features( label_prefix="Breach from", merge_overlapping=True ) record.features += breaches_locations SeqIO.write( record, root._file("final_sequence_with_edits.gb").open("w"), "genbank" ) # CREATE THE "FINAL SEQUENCE" REPORT problem.to_record( root._file("final_sequence.gb").open("w"), with_constraints=False, with_objectives=False, ) if isinstance(target, str): return root._close()