Module ediacara.Assembly

None

View Source

import os

import pandas as pd

from Bio import SeqFeature

from Bio import SeqIO

import dna_features_viewer

from .Comparator import ComparatorGroup

class AssemblyTranslator(dna_features_viewer.BiopythonTranslator):

    """Custom translator to display only the aligned parts."""

    def compute_feature_color(self, feature):

        ediacara_qualifier = "ediacara"

        if feature.qualifiers[ediacara_qualifier] == "wrong_part":

            return "#f5685e"

        elif feature.qualifiers[ediacara_qualifier] == "correct_part":

            return "#79d300"

        elif feature.qualifiers[ediacara_qualifier] == "unknown_part":

            return "#FFFFFF"

        elif feature.qualifiers[ediacara_qualifier] == "reference":

            return "#d3d3d3"

        else:

            return "#7245dc"  # default dna_features_viewer colour

class AssemblyBatch:

    """Batch of Assembly class instances.

    **Parameters**

    **assemblies**

    > List of `Assembly` instances.

    **name**

    > Name of the assembly analysis project (`str`).

    """

    def __init__(self, assemblies, name="Unnamed"):

        self.assemblies = assemblies

        self.name = name

    def perform_all_interpretations_in_group(self):

        for assembly in self.assemblies:

            assembly.interpret_alignment()

class Assembly:

    """Compare an assembly sequence to parts and simulated reference.

    **Parameters**

    **assembly_path**

    > Path to the *de novo* assembled FASTA sequence file (`str`).

    **reference**

    > Path to reference Genbank file (`str`).

    **alignment**

    > Path (`str`) to minimap2 alignment PAF file, created with the `-cx asm5` options.

    In this, the part and reference sequences are aligned against the de novo sequence.

    **assembly_plan**

    > Path of assembly plan CSV file (DNA Cauldron output format, with header line).

    May contain additional entries, the correct line is chosen by reference sequence ID.

    **use_file_names_as_ids**

    > If True, uses the Genbank file name as sequence ID.

    """

    def __init__(

        self,

        assembly_path,

        reference_path,

        alignment_path,

        assembly_plan=None,

        use_file_names_as_ids=True,

    ):

        self.assembly = SeqIO.read(handle=assembly_path, format="fasta")

        self.reference = SeqIO.read(handle=reference_path, format="genbank")

        if use_file_names_as_ids:

            basename = os.path.basename(reference_path)

            basename_no_extension = os.path.splitext(basename)[0]

            self.reference.id = basename_no_extension

        self.paf = ComparatorGroup.load_paf(alignment_path)

        self.paf.columns = self.paf.columns[:-1].to_list() + ["CIGAR"]  # -1 is last

        if assembly_plan is None:

            self.assembly_plan = None

            self.parts = []

            self.has_assembly_plan = False

        else:

            self.has_assembly_plan = True

            plan_df = pd.read_csv(assembly_plan, skiprows=1, header=None)  # skip header

            self.assembly_plan = plan_df[plan_df[0] == self.reference.id]

            if len(self.assembly_plan) == 0:

                raise ValueError("Error! Assembly plan doesn't contain the reference!")

            if len(self.assembly_plan) > 1:

                raise ValueError(

                    "Error! More than one assembly plan entry matches the reference!"

                )

            self.parts = self.assembly_plan.iloc[0].to_list()  # has only one line

    def interpret_alignment(self):

        seq_matches = 0  # counts signed matches to reference for evaluating orientation

        for index, row in self.paf.iterrows():

            # May be useful to exclude reference:

            # if row["query_name"] == self.reference.id:

            #     continue

            part_type = self.evaluate_part_name(row["query_name"])

            location = SeqFeature.FeatureLocation(

                row["target_start"], row["target_end"]

            )

            feature = SeqFeature.SeqFeature(

                location=location,

                type="misc_feature",

                id=row["query_name"],

                qualifiers={"label": row["query_name"], "ediacara": part_type},

            )

            self.assembly.features.append(feature)

            if part_type == "reference":

                if row["strand"] == "+":

                    sign = 1

                else:

                    sign = -1

                seq_matches += sign * int(row["mapping_matches"])

        self.reverse_complement = True if seq_matches < 0 else False

        self.assembly_figure = self.plot_assembly()

    def subset_paf(self):

        selected_columns = [

            "query_name",

            "query_length",

            "query_start",

            "query_end",

            "strand",

            "target_start",

            "target_end",

            "mapping_matches",

            "mapping_size",

            "mapping_quality",

        ]

        paf_subset = self.paf[selected_columns]

        new_columnnames = [

            "Name",

            "Length",

            "Start",

            "End",

            "Strand",

            "T Start",

            "T End",

            "Matches",

            "Size",

            "Quality",

        ]

        paf_subset.columns = new_columnnames

        return paf_subset

    def evaluate_part_name(self, name):

        if self.assembly_plan is None:

            return "unknown_part"

        elif name == self.reference.id:

            return "reference"

        elif name in self.parts:

            return "correct_part"

        else:

            return "wrong_part"

    def plot_assembly(self):

        graphic_record = AssemblyTranslator().translate_record(self.assembly)

        ax, _ = graphic_record.plot(figure_width=8, strand_in_label_threshold=7)

        return ax

Classes

Assembly

class Assembly(
    assembly_path,
    reference_path,
    alignment_path,
    assembly_plan=None,
    use_file_names_as_ids=True
)

View Source

class Assembly:

    """Compare an assembly sequence to parts and simulated reference.

    **Parameters**

    **assembly_path**

    > Path to the *de novo* assembled FASTA sequence file (`str`).

    **reference**

    > Path to reference Genbank file (`str`).

    **alignment**

    > Path (`str`) to minimap2 alignment PAF file, created with the `-cx asm5` options.

    In this, the part and reference sequences are aligned against the de novo sequence.

    **assembly_plan**

    > Path of assembly plan CSV file (DNA Cauldron output format, with header line).

    May contain additional entries, the correct line is chosen by reference sequence ID.

    **use_file_names_as_ids**

    > If True, uses the Genbank file name as sequence ID.

    """

    def __init__(

        self,

        assembly_path,

        reference_path,

        alignment_path,

        assembly_plan=None,

        use_file_names_as_ids=True,

    ):

        self.assembly = SeqIO.read(handle=assembly_path, format="fasta")

        self.reference = SeqIO.read(handle=reference_path, format="genbank")

        if use_file_names_as_ids:

            basename = os.path.basename(reference_path)

            basename_no_extension = os.path.splitext(basename)[0]

            self.reference.id = basename_no_extension

        self.paf = ComparatorGroup.load_paf(alignment_path)

        self.paf.columns = self.paf.columns[:-1].to_list() + ["CIGAR"]  # -1 is last

        if assembly_plan is None:

            self.assembly_plan = None

            self.parts = []

            self.has_assembly_plan = False

        else:

            self.has_assembly_plan = True

            plan_df = pd.read_csv(assembly_plan, skiprows=1, header=None)  # skip header

            self.assembly_plan = plan_df[plan_df[0] == self.reference.id]

            if len(self.assembly_plan) == 0:

                raise ValueError("Error! Assembly plan doesn't contain the reference!")

            if len(self.assembly_plan) > 1:

                raise ValueError(

                    "Error! More than one assembly plan entry matches the reference!"

                )

            self.parts = self.assembly_plan.iloc[0].to_list()  # has only one line

    def interpret_alignment(self):

        seq_matches = 0  # counts signed matches to reference for evaluating orientation

        for index, row in self.paf.iterrows():

            # May be useful to exclude reference:

            # if row["query_name"] == self.reference.id:

            #     continue

            part_type = self.evaluate_part_name(row["query_name"])

            location = SeqFeature.FeatureLocation(

                row["target_start"], row["target_end"]

            )

            feature = SeqFeature.SeqFeature(

                location=location,

                type="misc_feature",

                id=row["query_name"],

                qualifiers={"label": row["query_name"], "ediacara": part_type},

            )

            self.assembly.features.append(feature)

            if part_type == "reference":

                if row["strand"] == "+":

                    sign = 1

                else:

                    sign = -1

                seq_matches += sign * int(row["mapping_matches"])

        self.reverse_complement = True if seq_matches < 0 else False

        self.assembly_figure = self.plot_assembly()

    def subset_paf(self):

        selected_columns = [

            "query_name",

            "query_length",

            "query_start",

            "query_end",

            "strand",

            "target_start",

            "target_end",

            "mapping_matches",

            "mapping_size",

            "mapping_quality",

        ]

        paf_subset = self.paf[selected_columns]

        new_columnnames = [

            "Name",

            "Length",

            "Start",

            "End",

            "Strand",

            "T Start",

            "T End",

            "Matches",

            "Size",

            "Quality",

        ]

        paf_subset.columns = new_columnnames

        return paf_subset

    def evaluate_part_name(self, name):

        if self.assembly_plan is None:

            return "unknown_part"

        elif name == self.reference.id:

            return "reference"

        elif name in self.parts:

            return "correct_part"

        else:

            return "wrong_part"

    def plot_assembly(self):

        graphic_record = AssemblyTranslator().translate_record(self.assembly)

        ax, _ = graphic_record.plot(figure_width=8, strand_in_label_threshold=7)

        return ax

Methods

evaluate_part_name

def evaluate_part_name(
    self,
    name
)

View Source

    def evaluate_part_name(self, name):

        if self.assembly_plan is None:

            return "unknown_part"

        elif name == self.reference.id:

            return "reference"

        elif name in self.parts:

            return "correct_part"

        else:

            return "wrong_part"

interpret_alignment

def interpret_alignment(
    self
)

View Source

    def interpret_alignment(self):

        seq_matches = 0  # counts signed matches to reference for evaluating orientation

        for index, row in self.paf.iterrows():

            # May be useful to exclude reference:

            # if row["query_name"] == self.reference.id:

            #     continue

            part_type = self.evaluate_part_name(row["query_name"])

            location = SeqFeature.FeatureLocation(

                row["target_start"], row["target_end"]

            )

            feature = SeqFeature.SeqFeature(

                location=location,

                type="misc_feature",

                id=row["query_name"],

                qualifiers={"label": row["query_name"], "ediacara": part_type},

            )

            self.assembly.features.append(feature)

            if part_type == "reference":

                if row["strand"] == "+":

                    sign = 1

                else:

                    sign = -1

                seq_matches += sign * int(row["mapping_matches"])

        self.reverse_complement = True if seq_matches < 0 else False

        self.assembly_figure = self.plot_assembly()

plot_assembly

def plot_assembly(
    self
)

View Source

    def plot_assembly(self):

        graphic_record = AssemblyTranslator().translate_record(self.assembly)

        ax, _ = graphic_record.plot(figure_width=8, strand_in_label_threshold=7)

        return ax

subset_paf

def subset_paf(
    self
)

View Source

    def subset_paf(self):

        selected_columns = [

            "query_name",

            "query_length",

            "query_start",

            "query_end",

            "strand",

            "target_start",

            "target_end",

            "mapping_matches",

            "mapping_size",

            "mapping_quality",

        ]

        paf_subset = self.paf[selected_columns]

        new_columnnames = [

            "Name",

            "Length",

            "Start",

            "End",

            "Strand",

            "T Start",

            "T End",

            "Matches",

            "Size",

            "Quality",

        ]

        paf_subset.columns = new_columnnames

        return paf_subset

AssemblyBatch

class AssemblyBatch(
    assemblies,
    name='Unnamed'
)

View Source

class AssemblyBatch:

    """Batch of Assembly class instances.

    **Parameters**

    **assemblies**

    > List of `Assembly` instances.

    **name**

    > Name of the assembly analysis project (`str`).

    """

    def __init__(self, assemblies, name="Unnamed"):

        self.assemblies = assemblies

        self.name = name

    def perform_all_interpretations_in_group(self):

        for assembly in self.assemblies:

            assembly.interpret_alignment()

Methods

perform_all_interpretations_in_group

def perform_all_interpretations_in_group(
    self
)

View Source

    def perform_all_interpretations_in_group(self):

        for assembly in self.assemblies:

            assembly.interpret_alignment()

AssemblyTranslator

class AssemblyTranslator(
    features_filters=(),
    features_properties=None
)

View Source

class AssemblyTranslator(dna_features_viewer.BiopythonTranslator):

    """Custom translator to display only the aligned parts."""

    def compute_feature_color(self, feature):

        ediacara_qualifier = "ediacara"

        if feature.qualifiers[ediacara_qualifier] == "wrong_part":

            return "#f5685e"

        elif feature.qualifiers[ediacara_qualifier] == "correct_part":

            return "#79d300"

        elif feature.qualifiers[ediacara_qualifier] == "unknown_part":

            return "#FFFFFF"

        elif feature.qualifiers[ediacara_qualifier] == "reference":

            return "#d3d3d3"

        else:

            return "#7245dc"  # default dna_features_viewer colour

Ancestors (in MRO)

dna_features_viewer.BiopythonTranslator.BiopythonTranslator.BiopythonTranslator
dna_features_viewer.BiopythonTranslator.BiopythonTranslatorBase.BiopythonTranslatorBase

Class variables

default_feature_color

graphic_record_parameters

ignored_features_types

label_fields

Static methods

quick_class_plot

def quick_class_plot(
    record,
    figure_width=12,
    **kwargs
)

Allows super quick and dirty plotting of Biopython records.

This is really meant for use in a Jupyter/Ipython notebook with the "%matplotlib inline" setting.

from dna_features_viewer import BiopythonTranslator BiopythonTranslator.quick_plot(my_record)

View Source

    @classmethod

    def quick_class_plot(cls, record, figure_width=12, **kwargs):

        """Allows super quick and dirty plotting of Biopython records.

        This is really meant for use in a Jupyter/Ipython notebook with

        the "%matplotlib inline" setting.

        >>> from dna_features_viewer import BiopythonTranslator

        >>> BiopythonTranslator.quick_plot(my_record)

        """

        graphic_record = cls().translate_record(record)

        ax, _ = graphic_record.plot(figure_width=figure_width, **kwargs)

        return ax

Methods

compute_feature_box_color

def compute_feature_box_color(
    self,
    feature
)

Compute a box_color for this feature.

View Source

    def compute_feature_box_color(self, feature):

        """Compute a box_color for this feature."""

        return "auto"

compute_feature_box_linewidth

def compute_feature_box_linewidth(
    self,
    feature
)

Compute a box_linewidth for this feature.

View Source

    def compute_feature_box_linewidth(self, feature):

        """Compute a box_linewidth for this feature."""

        return 0.3

compute_feature_color

def compute_feature_color(
    self,
    feature
)

Compute a color for this feature.

If the feature has a color qualifier it will be used. Otherwise, the classe's default_feature_color is used.

To change the behaviour, create a subclass of BiopythonTranslator and overwrite this method.

View Source

    def compute_feature_color(self, feature):

        ediacara_qualifier = "ediacara"

        if feature.qualifiers[ediacara_qualifier] == "wrong_part":

            return "#f5685e"

        elif feature.qualifiers[ediacara_qualifier] == "correct_part":

            return "#79d300"

        elif feature.qualifiers[ediacara_qualifier] == "unknown_part":

            return "#FFFFFF"

        elif feature.qualifiers[ediacara_qualifier] == "reference":

            return "#d3d3d3"

        else:

            return "#7245dc"  # default dna_features_viewer colour

compute_feature_fontdict

def compute_feature_fontdict(
    self,
    feature
)

Compute a font dict for this feature.

View Source

    def compute_feature_fontdict(self, feature):

        """Compute a font dict for this feature."""

        return None

compute_feature_html

def compute_feature_html(
    self,
    feature
)

Gets the 'label' of the feature.

View Source

    def compute_feature_html(self, feature):

        """Gets the 'label' of the feature."""

        return self.compute_feature_label(feature)

compute_feature_label

def compute_feature_label(
    self,
    feature
)

Compute the label of the feature.

View Source

    def compute_feature_label(self, feature):

        """Compute the label of the feature."""

        label = feature.type

        for key in self.label_fields:

            if key in feature.qualifiers and len(feature.qualifiers[key]):

                label = feature.qualifiers[key]

                break

        if isinstance(label, list):

            label = "|".join(label)

        return label

compute_feature_label_link_color

def compute_feature_label_link_color(
    self,
    feature
)

Compute the color of the line linking the label to its feature.

View Source

    def compute_feature_label_link_color(self, feature):

        """Compute the color of the line linking the label to its feature."""

        return "black"

compute_feature_legend_text

def compute_feature_legend_text(
    self,
    feature
)

View Source

    def compute_feature_legend_text(self, feature):

        return None

compute_feature_linewidth

def compute_feature_linewidth(
    self,
    feature
)

Compute the edge width of the feature's arrow/rectangle.

View Source

    def compute_feature_linewidth(self, feature):

        """Compute the edge width of the feature's arrow/rectangle."""

        return 1.0

compute_filtered_features

def compute_filtered_features(
    self,
    features
)

Return the list of features minus the ignored ones.

By the method keeps any feature whose type is not in ignored_features_types and for which all filter(f) pass.

View Source

    def compute_filtered_features(self, features):

        """Return the list of features minus the ignored ones.

        By the method keeps any feature whose type is not in

        ignored_features_types and for which all filter(f) pass.

        """

        return [

            f

            for f in features

            if all([fl(f) for fl in self.features_filters])

            and f.type not in self.ignored_features_types

        ]

quick_plot

def quick_plot(
    self,
    record,
    figure_width=12,
    **kwargs
)

Allows super quick and dirty plotting of Biopython records.

This is really meant for use in a Jupyter/Ipython notebook with the "%matplotlib inline" setting.

from dna_features_viewer import BiopythonTranslator BiopythonTranslator.quick_plot(my_record)

View Source

    def quick_plot(self, record, figure_width=12, **kwargs):

        """Allows super quick and dirty plotting of Biopython records.

        This is really meant for use in a Jupyter/Ipython notebook with

        the "%matplotlib inline" setting.

        >>> from dna_features_viewer import BiopythonTranslator

        >>> BiopythonTranslator.quick_plot(my_record)

        """

        graphic_record = self.translate_record(record)

        ax, _ = graphic_record.plot(figure_width=figure_width, **kwargs)

        return ax

translate_feature

def translate_feature(
    self,
    feature
)

Translate a Biopython feature into a Dna Features Viewer feature.

View Source

    def translate_feature(self, feature):

        """Translate a Biopython feature into a Dna Features Viewer feature."""

        properties = dict(

            label=self.compute_feature_label(feature),

            color=self.compute_feature_color(feature),

            html=self.compute_feature_html(feature),

            fontdict=self.compute_feature_fontdict(feature),

            box_linewidth=self.compute_feature_box_linewidth(feature),

            box_color=self.compute_feature_box_color(feature),

            linewidth=self.compute_feature_linewidth(feature),

            label_link_color=self.compute_feature_label_link_color(feature),

            legend_text=self.compute_feature_legend_text(feature),

        )

        if self.features_properties is not None:

            other_properties = self.features_properties

            if hasattr(other_properties, "__call__"):

                other_properties = other_properties(feature)

            properties.update(other_properties)

        return GraphicFeature(

            start=feature.location.start,

            end=feature.location.end,

            strand=feature.location.strand,

            **properties

        )

translate_record

def translate_record(
    self,
    record,
    record_class=None,
    filetype=None
)

Create a new GraphicRecord from a BioPython Record object.

Parameters:

Name	Type	Description	Default
record	None	A BioPython Record object or the path to a Genbank or a GFF file.	None
record_class	None	The graphic record class to use, e.g. GraphicRecord (default) or
CircularGraphicRecord. Strings 'circular' and 'linear' can also be
provided.	None
filetype	None	Used only when a Genbank or a GFF file is provided; one of "genbank"
or "gff" to be used. Default None infers from file extension.	None

View Source

    def translate_record(self, record, record_class=None, filetype=None):

        """Create a new GraphicRecord from a BioPython Record object.

        Parameters

        ----------

        record

          A BioPython Record object or the path to a Genbank or a GFF file.

        record_class

          The graphic record class to use, e.g. GraphicRecord (default) or

          CircularGraphicRecord. Strings 'circular' and 'linear' can also be

          provided.

        filetype

          Used only when a Genbank or a GFF file is provided; one of "genbank"

          or "gff" to be used. Default None infers from file extension.

        """

        classes = {

            "linear": GraphicRecord,

            "circular": CircularGraphicRecord,

            None: GraphicRecord,

        }

        if record_class in classes:

            record_class = classes[record_class]

        if isinstance(record, str) or hasattr(record, "read"):

            record = load_record(record, filetype=filetype)

        filtered_features = self.compute_filtered_features(record.features)

        return record_class(

            sequence_length=len(record),

            sequence=str(record.seq),

            features=[

                self.translate_feature(feature)

                for feature in filtered_features

                if feature.location is not None

            ],

            **self.graphic_record_parameters

        )