Source code for bandwitch.ClonesObservations.ClonesObservations

"""Classes for parsing and analysis of DNA migration data."""

from collections import OrderedDict, Counter
import matplotlib.pyplot as plt
from io import BytesIO
import pandas
from matplotlib.backends.backend_pdf import PdfPages

from bandwagon import BandsPattern, BandsPatternsSet
import flametree

try:
    from plateo.exporters.plate_to_matplotlib_plots import PlateColorsPlotter
    from plateo.parsers import plate_from_platemap_spreadsheet
    from plateo.containers import Plate96

    PLATEO_AVAILABLE = True
except ImportError:
    PLATEO_AVAILABLE = False

try:
    import saboteurs

    SABOTEURS_AVAILABLE = True
except ImportError:
    SABOTEURS_AVAILABLE = False

from ..bands_predictions import predict_digestion_bands
from ..tools import load_record, all_subsets, record_is_linear
from .Clone import Clone
from .BandsObservation import BandsObservation



[docs]
class ClonesObservations:
    """All useful information for a collection of clones to be validated.

    Parameters
    ----------
    clones : list or dict
        Either a list of Clones (each with a unique name) or a dictionary
        ``{name: Clone}``.
    constructs_records : dict
        A dictionary ``{construct_name: biopython_record}`` indicating the
        sequence of the different constructs. For each construct, set the
        attribute ``construct.linear = False`` if the construct is circular.
    """

    def __init__(self, clones, constructs_records, partial_cutters=()):
        """Initialize."""
        if isinstance(clones, (list, tuple)):
            clones = OrderedDict([(clone.name, clone) for clone in clones])
        self.clones = clones
        clones_constructs = []
        for clone in list(clones.values()):
            if clone.construct_id not in clones_constructs:
                clones_constructs.append(clone.construct_id)
        self.constructs_records = OrderedDict(
            sorted(
                [
                    (construct_name, record)
                    for construct_name, record in constructs_records.items()
                ],
                key=lambda cst: (
                    +100000
                    if cst[0] not in clones_constructs
                    else clones_constructs.index(cst[0])
                ),
            )
        )
        self.constructs_digestions = {ct: {} for ct in constructs_records}
        self.partial_cutters = partial_cutters


[docs]
    def get_digestion_bands_for_construct(self, construct_id, digestion):
        """Return the bands resulting from the digestion of the construct.

        This function enables some memoization (no digestion is computed
        twice).

        Parameters
        ----------
        construct_id : str
            ID of the construct as it appears in this object's ``constructs_records``.
        digestion : tuple
            For instance ``('BamHI', 'XbaI')``.
        """
        construct_digestions = self.constructs_digestions[construct_id]
        if digestion not in construct_digestions:
            construct_record = self.constructs_records[construct_id]
            construct_digestions[digestion] = predict_digestion_bands(
                str(construct_record.seq),
                linear=record_is_linear(construct_record, default=True),
                enzymes=[
                    enzyme for enzyme in digestion if enzyme not in self.partial_cutters
                ],
                partial_cutters=self.partial_cutters,
            )
        return construct_digestions[digestion]



[docs]
    def get_clone_digestion_bands(self, clone, construct_id=None):
        """Return ``{digestion: bands}`` for all digestions of this clone."""
        if construct_id is None:
            construct_id = clone.construct_id
        return {
            digestion: self.get_digestion_bands_for_construct(
                construct_id=clone.construct_id, digestion=digestion
            )
            for digestion in clone.digestions
        }



[docs]
    def validate_all_clones(
        self,
        relative_tolerance=0.05,
        min_band_cutoff=None,
        max_band_cutoff=None,
    ):
        """Return ``{clone: CloneValidation}`` for all clones."""
        return OrderedDict(
            [
                (
                    clone_name,
                    clone.validate_bands(
                        bands_by_digestion=self.get_clone_digestion_bands(clone),
                        relative_tolerance=relative_tolerance,
                        min_band_cutoff=min_band_cutoff,
                        max_band_cutoff=max_band_cutoff,
                    ),
                )
                for clone_name, clone in self.clones.items()
            ]
        )



[docs]
    def identify_all_clones(
        self,
        relative_tolerance=0.05,
        min_band_cutoff=None,
        max_band_cutoff=None,
    ):
        """Return ``{clone: {construct_id: CloneValidation}}`` for all clones.

        Parameters
        ----------
        relative_tolerance : float
            Tolerance, as a ratio of the full ladder span. If =0.1, then the
            discrepancy will have a value of 1 when a band's nearest
            correspondent in the other pattern is more that 10% of the ladder
            span apart.
        min_band_cutoff : int
            Discrepancies involving at least one band below this minimal band
            size will be ignored. By default, it will be set to the smallest
            band size in the ladder.
        max_band_cutoff : int
            Discrepancies involving at least one band above this minimal band
            size will be ignored. By default, it will be set to the smallest
            band size in the ladder.
        """
        return OrderedDict(
            [
                (
                    clone_name,
                    {
                        construct_id: clone.validate_bands(
                            bands_by_digestion=self.get_clone_digestion_bands(
                                clone, construct_id=construct_id
                            ),
                            relative_tolerance=relative_tolerance,
                            min_band_cutoff=min_band_cutoff,
                            max_band_cutoff=max_band_cutoff,
                        )
                        for construct_id in self.constructs_records
                    },
                )
                for clone_name, clone in self.clones.items()
            ]
        )



[docs]
    def validations_summary(self, validations, sort_clones_by_score=True):
        """Return ``{construct_id: [CloneValidation, ...]}``.

        To each construct corresponds a list of the validation of all clones
        associated with that construct, from the best-scoring to the
        least-scoring.
        """
        results = OrderedDict()
        for clone_name, validation in validations.items():
            construct_id = self.clones[clone_name].construct_id
            if construct_id not in results:
                results[construct_id] = []
            results[construct_id].append(validation)
        if sort_clones_by_score:
            for cst_id, validations in results.items():
                results[cst_id] = sorted(validations, key=lambda v: v.max_discrepancy)
        return results



[docs]
    def plot_validations_plate_map(self, validations, target=None, ax=None):
        """Plot a map of the plate with passing/failing wells in green/red."""

        if not PLATEO_AVAILABLE:
            raise ImportError("This function requires Plateo installed")
        plate = Plate96("Validations map")
        constructs_colors = {}
        for well in plate.iter_wells():
            if well.name not in validations:
                well.data["bg_color"] = None
                well.data["pass_color"] = (1, 1, 1, 0.1)
            else:
                validation = validations[well.name]
                construct = validation.clone.construct_id
                if construct not in constructs_colors:
                    alpha = ((0.35 * len(constructs_colors) % 1) + 0.2) / 2
                    constructs_colors[construct] = (0.4, 0.4, 0.76, alpha)
                well.data["bg_color"] = constructs_colors[construct]
                well.data["pass_color"] = (
                    (0.43, 0.92, 0.49) if validation.passes else (0.91, 0.43, 0.43)
                )
        bg_plotter = PlateColorsPlotter(lambda w: w.data["bg_color"])
        ax, _ = bg_plotter.plot_plate(plate, figsize=(10, 5), ax=ax)
        pass_plotter = PlateColorsPlotter(
            lambda w: w.data["pass_color"], well_radius=250
        )
        pass_plotter.plot_plate(plate, ax=ax)
        if target is not None:
            ax.figure.savefig(target, bbox_inches="tight")
            plt.close(ax.figure)
        else:
            return ax


    def validations_summary_table(self, validations, target=None):
        validations_summary = self.validations_summary(validations)
        records = []
        for construct, data in validations_summary.items():
            best_clone = None
            valid_clones = [clone for clone in data if clone.passes]
            best_clone_name = None
            if len(valid_clones):
                best_clone = min(valid_clones, key=lambda c: c.max_discrepancy)
                best_clone_name = best_clone.clone.name
            other_valid_clones = ", ".join(
                [
                    clone.clone.name
                    for clone in valid_clones
                    if clone.clone.name != best_clone_name
                ]
            )
            records.append(
                dict(
                    construct=construct,
                    n_clones=len(data),
                    valid_clones=len(valid_clones),
                    best_clone=best_clone_name,
                    other_valid_clones=other_valid_clones,
                )
            )
        dataframe = pandas.DataFrame.from_records(
            records,
            columns=[
                "construct",
                "n_clones",
                "valid_clones",
                "best_clone",
                "other_valid_clones",
            ],
        )
        if target is not None:
            dataframe.to_csv(target, index=False)
        return dataframe


[docs]
    def plot_all_validations_patterns(
        self, validations, target=None, per_digestion_discrepancy=False
    ):
        """Plot a Graphic report of the gel validation.

        The report displays the patterns, with green and red backgrounds
        depending on whether they passed the validation test.

        Parameters
        ----------
        validations
            Validation results.
        target
            File path.
        per_digestion_discrepancy
            If true, each pattern for each clone will have an indication of
            how close it is from the expected pattern. If False, and the
            validation involves several digestions, the biggest discrepancy
            among all patterns is shown.
        """

        summary = self.validations_summary(validations)
        max_x = (
            Counter([clone.construct_id for clone in self.clones.values()]).most_common(
                1
            )[0][1]
            + 2
        )
        digestions_by_construct = {construct: [] for construct in summary}
        for construct, validations in summary.items():
            for validation in validations:
                for digestion in validation.discrepancies:
                    if digestion not in digestions_by_construct[construct]:
                        digestions_by_construct[construct].append(digestion)

        # ladder = list(validations[0].clone.digestions.values())[0]["ladder"]
        pdf_io = BytesIO()
        with PdfPages(pdf_io) as pdf:
            for construct_id, validations in summary.items():
                ladder = list(validations[0].clone.digestions.values())[0].ladder
                digestions = digestions_by_construct[construct_id]
                band_patterns = OrderedDict()
                seen_clones = set()
                for i, digestion in enumerate(digestions):
                    reference_bands = self.get_digestion_bands_for_construct(
                        construct_id, digestion
                    )
                    reference = BandsPattern(
                        bands=reference_bands,
                        ladder=ladder,
                        label="exp." if (i == 0) else None,
                        background_color="#c6dcff",
                        corner_note="Total: %d bp" % sum(reference_bands),
                        global_bands_props={"label_fontdict": {"size": 5}},
                    )
                    sorted_bands = sorted(reference.bands, key=lambda b: -b.dna_size)
                    for band_name, band in zip("abcdefghijklm", sorted_bands):
                        band.label = band_name

                    patterns = []
                    for validation in validations:
                        clone_name = validation.clone.name
                        label = None if clone_name in seen_clones else "auto"
                        pattern = validation.to_bandwagon_bandpattern(
                            digestion=digestion,
                            label=label,
                            per_digestion_discrepancy=per_digestion_discrepancy,
                        )
                        if pattern is not None:
                            seen_clones.add(clone_name)
                        patterns.append(pattern)
                    band_patterns[digestion] = BandsPatternsSet(
                        [reference] + patterns,
                        ladder=ladder,
                        label=" + ".join(digestion),
                        ladder_ticks=5,
                        global_patterns_props={"label_fontdict": {"rotation": 60}},
                    )

                digestions = digestions_by_construct[construct_id]
                fig, axes = plt.subplots(
                    len(digestions),
                    1,
                    sharex=True,
                    figsize=(1.1 * max_x, 3 * len(digestions)),
                )
                if len(digestions) == 1:
                    axes = [axes]
                axes[-1].set_xlim(right=max_x)

                for ax, (dig, pattern_set) in zip(axes, band_patterns.items()):
                    pattern_set.plot(ax)

                axes[-1].set_xlabel(
                    construct_id,
                    fontdict=dict(size=14, weight="bold"),
                    ha="left",
                    position=(0.0, 0.1),
                )
                pdf.attach_note(construct_id)

                # Temporary fix because of Matplotlib bug #12634
                pdf.savefig(fig, bbox_inches="tight")
                plt.close(fig)
        pdf_data = pdf_io.getvalue()
        if target is None:
            return pdf_data
        elif target == "base64":
            return "data:application/pdf;base64," + pdf_data.decode("utf-8")
        else:
            with open(target, "wb") as f:
                f.write(pdf_data)



[docs]
    @staticmethod
    def identify_bad_parts(
        validations,
        constructs_parts,
        constructs_records=None,
        report_target=None,
        extra_failures=None,
    ):
        """Identifies parts associated with failure in the validations.

        Uses the Saboteurs package:
        https://github.com/Edinburgh-Genome-Foundry/saboteurs

        Parameters
        ----------
        validations
            Validation results.

        constructs_parts
            Either a dict {construct_id: [list, of, part, names]} or a function
            (biopython_record => [list, of, part, names]).

        report_target
            Can be a path to file or file-like object where to write a PDF
            report. Can also be "@memory", at which case the raw binary PDF
            data is returned.

        Returns
        -------
        analysis, pdf_data
            Where ``analysis`` is the result of sabotage analysis (see the
            saboteurs library), and pdf_data is None unless report_target is
            set to "@memory" (see above).
        """
        if not SABOTEURS_AVAILABLE:
            raise ImportError(
                "You must install the saboteurs library to"
                "be able to identify bad parts. Try:\n\n"
                "(sudo) pip install saboteurs"
            )
        if hasattr(constructs_parts, "__call__"):
            constructs_parts = {
                record_id: constructs_parts(record)
                for record_id, record in constructs_records.items()
            }
        extra_failures = extra_failures or {}
        constructs_stats = OrderedDict()
        for clone_name, validation in validations.items():
            construct_id = validation.clone.construct_id
            if construct_id not in constructs_stats:
                constructs_stats[construct_id] = dict(
                    exp_id=construct_id,
                    attempts=0,
                    failures=0,
                    members=constructs_parts[construct_id],
                )
            stats = constructs_stats[construct_id]
            stats["attempts"] += 1
            stats["failures"] += not validation.passes
        for construct_id, weight in extra_failures.items():
            constructs_stats[construct_id] = dict(
                exp_id=construct_id,
                attempts=weight,
                failures=weight,
                members=constructs_parts[construct_id],
            )
        # import json
        # print (json.dumps(constructs_stats, indent=2))
        analysis = saboteurs.find_statistical_saboteurs(constructs_stats)
        report_data = None
        if report_target is not None:
            report_data = saboteurs.statistics_report(analysis, report_target)
        return analysis, report_data



[docs]
    def write_identification_report(
        self,
        target_file=None,
        relative_tolerance=0.05,
        min_band_cutoff=None,
        max_band_cutoff=None,
    ):
        """Plot a Graphic report of the gel validation.

        The report displays the patterns, with green and red backgrounds
        depending on whether they passed the validation test.

        Parameters
        ----------
        target_file
            File object or file path where to save the figure. If provided, the
            function returns none, else the function returns the axes array of
            the final figure.
        relative_tolerance
            Relative error tolerated on each band for the ovserved patterns to
            be considered similar to the expected patterns.
        min_band_cutoff
            Bands with a size below this value will not be considered.
        max_band_cutoff
            Bands with a size above this value will not be considered.
        """
        bands_validities = self.compute_all_bands_validities(
            relative_tolerance=relative_tolerance,
            min_band_cutoff=min_band_cutoff,
            max_band_cutoff=max_band_cutoff,
        )
        L = len(self.constructs_records)
        max_x = max(len(measures) for measures in self.observed_bands.values()) + 1
        fig, axes = plt.subplots(L, 1, figsize=(2.2 * max_x, 3 * L))
        axes_validities = zip(axes, bands_validities.items())
        for ax, (construct_id, validities) in axes_validities:
            reference = BandsPattern(
                self.expected_bands[construct_id],
                ladder=self.ladder,
                label="exp.",
                background_color="#c6dcff",
                corner_note="Total: %d bp" % sum(self.expected_bands[construct_id]),
                global_bands_props={"label_fontdict": {"size": 5}},
            )
            sorted_bands = sorted(reference.bands, key=lambda b: -b.dna_size)
            for band_name, band in zip("abcdefghijklm", sorted_bands):
                band.label = band_name
            patterns = [
                BandsPattern(
                    self.observed_bands[construct_id][measure_name],
                    corner_note="Total: %d bp"
                    % sum(self.observed_bands[construct_id][measure_name]),
                    ladder=self.ladder,
                    label=measure_name,
                    gel_image=self.migration_images[measure_name],
                    background_color=(
                        "#aaffaa" if validities[measure_name] else "#ffaaaa"
                    ),
                )
                for measure_name in self.observed_bands[construct_id]
            ]
            patterns_set = BandsPatternsSet(
                [reference] + patterns,
                ladder=self.ladder,
                label=construct_id,
                ladder_ticks=5,
                global_patterns_props={"label_fontdict": {"rotation": 60}},
            )
            ax.set_xlim(0.5, max_x + 2)
            patterns_set.plot(ax)
        fig.subplots_adjust(hspace=0.3)
        if target_file is not None:
            fig.savefig(target_file, bbox_inches="tight")
            plt.close(fig)
        else:
            return axes


    @staticmethod
    def from_files(
        records_path,
        constructs_map_path,
        aati_zip_path,
        digestions_map_path=None,
        digestion=None,
        direction="column",
        ignore_bands_under=None,
        topology="circular",
    ):
        constructs_records = {
            f._name_no_extension: load_record(
                f.open("r"),
                topology=topology,
                id=f._name_no_extension,
                file_format="genbank",
            )
            for f in flametree.file_tree(records_path)._all_files
        }
        constructs_records = OrderedDict(sorted(constructs_records.items()))

        constructs_plate = plate_from_platemap_spreadsheet(
            constructs_map_path, data_field="construct", headers=True
        )
        constructs_map = OrderedDict(
            [
                (well.name, well.data["construct"])
                for well in constructs_plate.iter_wells(direction=direction)
                if "construct" in well.data and str(well.data["construct"]) != "nan"
            ]
        )

        if digestion:
            digestion = tuple(digestion)
            digestions_map = OrderedDict(
                [(wellname, digestion) for wellname in constructs_map]
            )
        else:
            digestions_plate = plate_from_platemap_spreadsheet(
                digestions_map_path, data_field="digestion", headers=True
            )
            digestions_map = OrderedDict(
                [
                    (well.name, tuple(well.data["digestion"].split(", ")))
                    for well in digestions_plate.iter_wells(direction=direction)
                    if "digestion" in well.data and str(well.data["digestion"]) != "nan"
                ]
            )

        observations = BandsObservation.from_aati_fa_archive(
            aati_zip_path,
            direction=direction,
            ignore_bands_under=ignore_bands_under,
        )
        clones = Clone.from_bands_observations(
            observations, constructs_map, digestions_map
        )
        clones_observations = ClonesObservations(clones, constructs_records)
        return clones_observations


[docs]
    def partial_digests_analysis(self, relative_tolerance=0.05):
        """Compute good clones under different partial digest assumptions.

        Returns a dictionary ``{partial: {'valid_clones': 60, 'label': 'x'}}``
        where for a given scenario ``partial`` is a tuple of all enzymes
        considered to have partial activity in this scenario (it defines the
        scenario) ``valid_clones`` is the number of good clones under this
        assumption, and ``label`` is a string representation of all enzymes
        involved, with partial activity enzymes in parenthesis.

        This result can be fed to ``ClonesObservations``'s
        ``.plot_partial_digests_analysis`` method for plotting.
        """
        all_enzymes = set(
            enzyme
            for clone in self.clones.values()
            for digestion in clone.digestions
            for enzyme in digestion
        )
        all_enzymes
        results = {}
        for good_cutters in all_subsets(all_enzymes):
            partial_cutters = tuple([e for e in all_enzymes if e not in good_cutters])
            clones_observations = ClonesObservations(
                self.clones,
                self.constructs_records,
                partial_cutters=partial_cutters,
            )
            validations = clones_observations.validate_all_clones(
                relative_tolerance=relative_tolerance
            )
            label = " + ".join(
                sorted(good_cutters) + sorted(["(%s)" % c for c in partial_cutters])
            )
            valid_clones = sum([v.passes for name, v in validations.items()])

            results[partial_cutters] = {
                "label": label,
                "validations": validations,
                "valid_clones": valid_clones,
            }
        return results



[docs]
    @staticmethod
    def plot_partial_digests_analysis(analysis_results, ax=None):
        """Plot partial digest analysis results.

        Parameters
        ----------
        analysis_results
            results from ``ClonesObservations.partial_digest_analysis``.
        ax
            A Matplotlib ax. If none, one is created and returned at the end.
        """
        if ax is None:
            fig, ax = plt.subplots(figsize=(5, 0.5 * len(analysis_results)))
        ax.axis("off")
        values, labels = zip(
            *sorted(
                [
                    (data["valid_clones"], data["label"])
                    for partial, data in analysis_results.items()
                ]
            )
        )
        ax.barh(range(len(values)), values)
        for i, (value, label) in enumerate(zip(values, labels)):
            ax.text(
                0,
                i,
                label + " ",
                ha="right",
                va="center",
                fontweight=("normal" if "(" in label else "bold"),
            )
            ax.text(
                value,
                i,
                str(value) + " ",
                ha="right",
                va="center",
                fontdict={"color": "white", "weight": "bold"},
            )
        ax.set_ylim(-0.5, len(values) - 0.5)
        ax.set_title("Number of good clones, by scenario")
        return ax


    @staticmethod
    def merge_observations(observations_list, prefixes=None):
        prefixes = prefixes or (len(observations_list) * [""])

        clones = OrderedDict(
            [
                (pref + k, v)
                for pref, obs in zip(prefixes, observations_list)
                for k, v in obs.clones.items()
            ]
        )
        constructs_records = {
            name: record
            for obs in observations_list
            for name, record in obs.constructs_records.items()
        }
        partial_cutters = tuple(
            enzyme for obs in observations_list for enzyme in obs.partial_cutters
        )

        return ClonesObservations(clones, constructs_records, partial_cutters)