Module ediacara.Assembly
None
None
View Source
import os
import pandas as pd
from Bio import SeqFeature
from Bio import SeqIO
import dna_features_viewer
from .Comparator import ComparatorGroup
class AssemblyTranslator(dna_features_viewer.BiopythonTranslator):
"""Custom translator to display only the aligned parts."""
def compute_feature_color(self, feature):
ediacara_qualifier = "ediacara"
if feature.qualifiers[ediacara_qualifier] == "wrong_part":
return "#f5685e"
elif feature.qualifiers[ediacara_qualifier] == "correct_part":
return "#79d300"
elif feature.qualifiers[ediacara_qualifier] == "unknown_part":
return "#FFFFFF"
elif feature.qualifiers[ediacara_qualifier] == "reference":
return "#d3d3d3"
else:
return "#7245dc" # default dna_features_viewer colour
class AssemblyBatch:
"""Batch of Assembly class instances.
**Parameters**
**assemblies**
> List of `Assembly` instances.
**name**
> Name of the assembly analysis project (`str`).
"""
def __init__(self, assemblies, name="Unnamed"):
self.assemblies = assemblies
self.name = name
def perform_all_interpretations_in_group(self):
for assembly in self.assemblies:
assembly.interpret_alignment()
class Assembly:
"""Compare an assembly sequence to parts and simulated reference.
**Parameters**
**assembly_path**
> Path to the *de novo* assembled FASTA sequence file (`str`).
**reference**
> Path to reference Genbank file (`str`).
**alignment**
> Path (`str`) to minimap2 alignment PAF file, created with the `-cx asm5` options.
In this, the part and reference sequences are aligned against the de novo sequence.
**assembly_plan**
> Path of assembly plan CSV file (DNA Cauldron output format, with header line).
May contain additional entries, the correct line is chosen by reference sequence ID.
**use_file_names_as_ids**
> If True, uses the Genbank file name as sequence ID.
"""
def __init__(
self,
assembly_path,
reference_path,
alignment_path,
assembly_plan=None,
use_file_names_as_ids=True,
):
self.assembly = SeqIO.read(handle=assembly_path, format="fasta")
self.reference = SeqIO.read(handle=reference_path, format="genbank")
if use_file_names_as_ids:
basename = os.path.basename(reference_path)
basename_no_extension = os.path.splitext(basename)[0]
self.reference.id = basename_no_extension
self.paf = ComparatorGroup.load_paf(alignment_path)
self.paf.columns = self.paf.columns[:-1].to_list() + ["CIGAR"] # -1 is last
if assembly_plan is None:
self.assembly_plan = None
self.parts = []
self.has_assembly_plan = False
else:
self.has_assembly_plan = True
plan_df = pd.read_csv(assembly_plan, skiprows=1, header=None) # skip header
self.assembly_plan = plan_df[plan_df[0] == self.reference.id]
if len(self.assembly_plan) == 0:
raise ValueError("Error! Assembly plan doesn't contain the reference!")
if len(self.assembly_plan) > 1:
raise ValueError(
"Error! More than one assembly plan entry matches the reference!"
)
self.parts = self.assembly_plan.iloc[0].to_list() # has only one line
def interpret_alignment(self):
seq_matches = 0 # counts signed matches to reference for evaluating orientation
for index, row in self.paf.iterrows():
# May be useful to exclude reference:
# if row["query_name"] == self.reference.id:
# continue
part_type = self.evaluate_part_name(row["query_name"])
location = SeqFeature.FeatureLocation(
row["target_start"], row["target_end"]
)
feature = SeqFeature.SeqFeature(
location=location,
type="misc_feature",
id=row["query_name"],
qualifiers={"label": row["query_name"], "ediacara": part_type},
)
self.assembly.features.append(feature)
if part_type == "reference":
if row["strand"] == "+":
sign = 1
else:
sign = -1
seq_matches += sign * int(row["mapping_matches"])
self.reverse_complement = True if seq_matches < 0 else False
self.assembly_figure = self.plot_assembly()
def subset_paf(self):
selected_columns = [
"query_name",
"query_length",
"query_start",
"query_end",
"strand",
"target_start",
"target_end",
"mapping_matches",
"mapping_size",
"mapping_quality",
]
paf_subset = self.paf[selected_columns]
new_columnnames = [
"Name",
"Length",
"Start",
"End",
"Strand",
"T Start",
"T End",
"Matches",
"Size",
"Quality",
]
paf_subset.columns = new_columnnames
return paf_subset
def evaluate_part_name(self, name):
if self.assembly_plan is None:
return "unknown_part"
elif name == self.reference.id:
return "reference"
elif name in self.parts:
return "correct_part"
else:
return "wrong_part"
def plot_assembly(self):
graphic_record = AssemblyTranslator().translate_record(self.assembly)
ax, _ = graphic_record.plot(figure_width=8, strand_in_label_threshold=7)
return ax
Classes
Assembly
class Assembly(
assembly_path,
reference_path,
alignment_path,
assembly_plan=None,
use_file_names_as_ids=True
)
View Source
class Assembly:
"""Compare an assembly sequence to parts and simulated reference.
**Parameters**
**assembly_path**
> Path to the *de novo* assembled FASTA sequence file (`str`).
**reference**
> Path to reference Genbank file (`str`).
**alignment**
> Path (`str`) to minimap2 alignment PAF file, created with the `-cx asm5` options.
In this, the part and reference sequences are aligned against the de novo sequence.
**assembly_plan**
> Path of assembly plan CSV file (DNA Cauldron output format, with header line).
May contain additional entries, the correct line is chosen by reference sequence ID.
**use_file_names_as_ids**
> If True, uses the Genbank file name as sequence ID.
"""
def __init__(
self,
assembly_path,
reference_path,
alignment_path,
assembly_plan=None,
use_file_names_as_ids=True,
):
self.assembly = SeqIO.read(handle=assembly_path, format="fasta")
self.reference = SeqIO.read(handle=reference_path, format="genbank")
if use_file_names_as_ids:
basename = os.path.basename(reference_path)
basename_no_extension = os.path.splitext(basename)[0]
self.reference.id = basename_no_extension
self.paf = ComparatorGroup.load_paf(alignment_path)
self.paf.columns = self.paf.columns[:-1].to_list() + ["CIGAR"] # -1 is last
if assembly_plan is None:
self.assembly_plan = None
self.parts = []
self.has_assembly_plan = False
else:
self.has_assembly_plan = True
plan_df = pd.read_csv(assembly_plan, skiprows=1, header=None) # skip header
self.assembly_plan = plan_df[plan_df[0] == self.reference.id]
if len(self.assembly_plan) == 0:
raise ValueError("Error! Assembly plan doesn't contain the reference!")
if len(self.assembly_plan) > 1:
raise ValueError(
"Error! More than one assembly plan entry matches the reference!"
)
self.parts = self.assembly_plan.iloc[0].to_list() # has only one line
def interpret_alignment(self):
seq_matches = 0 # counts signed matches to reference for evaluating orientation
for index, row in self.paf.iterrows():
# May be useful to exclude reference:
# if row["query_name"] == self.reference.id:
# continue
part_type = self.evaluate_part_name(row["query_name"])
location = SeqFeature.FeatureLocation(
row["target_start"], row["target_end"]
)
feature = SeqFeature.SeqFeature(
location=location,
type="misc_feature",
id=row["query_name"],
qualifiers={"label": row["query_name"], "ediacara": part_type},
)
self.assembly.features.append(feature)
if part_type == "reference":
if row["strand"] == "+":
sign = 1
else:
sign = -1
seq_matches += sign * int(row["mapping_matches"])
self.reverse_complement = True if seq_matches < 0 else False
self.assembly_figure = self.plot_assembly()
def subset_paf(self):
selected_columns = [
"query_name",
"query_length",
"query_start",
"query_end",
"strand",
"target_start",
"target_end",
"mapping_matches",
"mapping_size",
"mapping_quality",
]
paf_subset = self.paf[selected_columns]
new_columnnames = [
"Name",
"Length",
"Start",
"End",
"Strand",
"T Start",
"T End",
"Matches",
"Size",
"Quality",
]
paf_subset.columns = new_columnnames
return paf_subset
def evaluate_part_name(self, name):
if self.assembly_plan is None:
return "unknown_part"
elif name == self.reference.id:
return "reference"
elif name in self.parts:
return "correct_part"
else:
return "wrong_part"
def plot_assembly(self):
graphic_record = AssemblyTranslator().translate_record(self.assembly)
ax, _ = graphic_record.plot(figure_width=8, strand_in_label_threshold=7)
return ax
Methods
evaluate_part_name
def evaluate_part_name(
self,
name
)
View Source
def evaluate_part_name(self, name):
if self.assembly_plan is None:
return "unknown_part"
elif name == self.reference.id:
return "reference"
elif name in self.parts:
return "correct_part"
else:
return "wrong_part"
interpret_alignment
def interpret_alignment(
self
)
View Source
def interpret_alignment(self):
seq_matches = 0 # counts signed matches to reference for evaluating orientation
for index, row in self.paf.iterrows():
# May be useful to exclude reference:
# if row["query_name"] == self.reference.id:
# continue
part_type = self.evaluate_part_name(row["query_name"])
location = SeqFeature.FeatureLocation(
row["target_start"], row["target_end"]
)
feature = SeqFeature.SeqFeature(
location=location,
type="misc_feature",
id=row["query_name"],
qualifiers={"label": row["query_name"], "ediacara": part_type},
)
self.assembly.features.append(feature)
if part_type == "reference":
if row["strand"] == "+":
sign = 1
else:
sign = -1
seq_matches += sign * int(row["mapping_matches"])
self.reverse_complement = True if seq_matches < 0 else False
self.assembly_figure = self.plot_assembly()
plot_assembly
def plot_assembly(
self
)
View Source
def plot_assembly(self):
graphic_record = AssemblyTranslator().translate_record(self.assembly)
ax, _ = graphic_record.plot(figure_width=8, strand_in_label_threshold=7)
return ax
subset_paf
def subset_paf(
self
)
View Source
def subset_paf(self):
selected_columns = [
"query_name",
"query_length",
"query_start",
"query_end",
"strand",
"target_start",
"target_end",
"mapping_matches",
"mapping_size",
"mapping_quality",
]
paf_subset = self.paf[selected_columns]
new_columnnames = [
"Name",
"Length",
"Start",
"End",
"Strand",
"T Start",
"T End",
"Matches",
"Size",
"Quality",
]
paf_subset.columns = new_columnnames
return paf_subset
AssemblyBatch
class AssemblyBatch(
assemblies,
name='Unnamed'
)
View Source
class AssemblyBatch:
"""Batch of Assembly class instances.
**Parameters**
**assemblies**
> List of `Assembly` instances.
**name**
> Name of the assembly analysis project (`str`).
"""
def __init__(self, assemblies, name="Unnamed"):
self.assemblies = assemblies
self.name = name
def perform_all_interpretations_in_group(self):
for assembly in self.assemblies:
assembly.interpret_alignment()
Methods
perform_all_interpretations_in_group
def perform_all_interpretations_in_group(
self
)
View Source
def perform_all_interpretations_in_group(self):
for assembly in self.assemblies:
assembly.interpret_alignment()
AssemblyTranslator
class AssemblyTranslator(
features_filters=(),
features_properties=None
)
View Source
class AssemblyTranslator(dna_features_viewer.BiopythonTranslator):
"""Custom translator to display only the aligned parts."""
def compute_feature_color(self, feature):
ediacara_qualifier = "ediacara"
if feature.qualifiers[ediacara_qualifier] == "wrong_part":
return "#f5685e"
elif feature.qualifiers[ediacara_qualifier] == "correct_part":
return "#79d300"
elif feature.qualifiers[ediacara_qualifier] == "unknown_part":
return "#FFFFFF"
elif feature.qualifiers[ediacara_qualifier] == "reference":
return "#d3d3d3"
else:
return "#7245dc" # default dna_features_viewer colour
Ancestors (in MRO)
- dna_features_viewer.BiopythonTranslator.BiopythonTranslator.BiopythonTranslator
- dna_features_viewer.BiopythonTranslator.BiopythonTranslatorBase.BiopythonTranslatorBase
Class variables
default_feature_color
graphic_record_parameters
ignored_features_types
label_fields
Static methods
quick_class_plot
def quick_class_plot(
record,
figure_width=12,
**kwargs
)
Allows super quick and dirty plotting of Biopython records.
This is really meant for use in a Jupyter/Ipython notebook with the "%matplotlib inline" setting.
from dna_features_viewer import BiopythonTranslator BiopythonTranslator.quick_plot(my_record)
View Source
@classmethod
def quick_class_plot(cls, record, figure_width=12, **kwargs):
"""Allows super quick and dirty plotting of Biopython records.
This is really meant for use in a Jupyter/Ipython notebook with
the "%matplotlib inline" setting.
>>> from dna_features_viewer import BiopythonTranslator
>>> BiopythonTranslator.quick_plot(my_record)
"""
graphic_record = cls().translate_record(record)
ax, _ = graphic_record.plot(figure_width=figure_width, **kwargs)
return ax
Methods
compute_feature_box_color
def compute_feature_box_color(
self,
feature
)
Compute a box_color for this feature.
View Source
def compute_feature_box_color(self, feature):
"""Compute a box_color for this feature."""
return "auto"
compute_feature_box_linewidth
def compute_feature_box_linewidth(
self,
feature
)
Compute a box_linewidth for this feature.
View Source
def compute_feature_box_linewidth(self, feature):
"""Compute a box_linewidth for this feature."""
return 0.3
compute_feature_color
def compute_feature_color(
self,
feature
)
Compute a color for this feature.
If the feature has a color
qualifier it will be used. Otherwise,
the classe's default_feature_color
is used.
To change the behaviour, create a subclass of BiopythonTranslator
and overwrite this method.
View Source
def compute_feature_color(self, feature):
ediacara_qualifier = "ediacara"
if feature.qualifiers[ediacara_qualifier] == "wrong_part":
return "#f5685e"
elif feature.qualifiers[ediacara_qualifier] == "correct_part":
return "#79d300"
elif feature.qualifiers[ediacara_qualifier] == "unknown_part":
return "#FFFFFF"
elif feature.qualifiers[ediacara_qualifier] == "reference":
return "#d3d3d3"
else:
return "#7245dc" # default dna_features_viewer colour
compute_feature_fontdict
def compute_feature_fontdict(
self,
feature
)
Compute a font dict for this feature.
View Source
def compute_feature_fontdict(self, feature):
"""Compute a font dict for this feature."""
return None
compute_feature_html
def compute_feature_html(
self,
feature
)
Gets the 'label' of the feature.
View Source
def compute_feature_html(self, feature):
"""Gets the 'label' of the feature."""
return self.compute_feature_label(feature)
compute_feature_label
def compute_feature_label(
self,
feature
)
Compute the label of the feature.
View Source
def compute_feature_label(self, feature):
"""Compute the label of the feature."""
label = feature.type
for key in self.label_fields:
if key in feature.qualifiers and len(feature.qualifiers[key]):
label = feature.qualifiers[key]
break
if isinstance(label, list):
label = "|".join(label)
return label
compute_feature_label_link_color
def compute_feature_label_link_color(
self,
feature
)
Compute the color of the line linking the label to its feature.
View Source
def compute_feature_label_link_color(self, feature):
"""Compute the color of the line linking the label to its feature."""
return "black"
compute_feature_legend_text
def compute_feature_legend_text(
self,
feature
)
View Source
def compute_feature_legend_text(self, feature):
return None
compute_feature_linewidth
def compute_feature_linewidth(
self,
feature
)
Compute the edge width of the feature's arrow/rectangle.
View Source
def compute_feature_linewidth(self, feature):
"""Compute the edge width of the feature's arrow/rectangle."""
return 1.0
compute_filtered_features
def compute_filtered_features(
self,
features
)
Return the list of features minus the ignored ones.
By the method keeps any feature whose type is not in ignored_features_types and for which all filter(f) pass.
View Source
def compute_filtered_features(self, features):
"""Return the list of features minus the ignored ones.
By the method keeps any feature whose type is not in
ignored_features_types and for which all filter(f) pass.
"""
return [
f
for f in features
if all([fl(f) for fl in self.features_filters])
and f.type not in self.ignored_features_types
]
quick_plot
def quick_plot(
self,
record,
figure_width=12,
**kwargs
)
Allows super quick and dirty plotting of Biopython records.
This is really meant for use in a Jupyter/Ipython notebook with the "%matplotlib inline" setting.
from dna_features_viewer import BiopythonTranslator BiopythonTranslator.quick_plot(my_record)
View Source
def quick_plot(self, record, figure_width=12, **kwargs):
"""Allows super quick and dirty plotting of Biopython records.
This is really meant for use in a Jupyter/Ipython notebook with
the "%matplotlib inline" setting.
>>> from dna_features_viewer import BiopythonTranslator
>>> BiopythonTranslator.quick_plot(my_record)
"""
graphic_record = self.translate_record(record)
ax, _ = graphic_record.plot(figure_width=figure_width, **kwargs)
return ax
translate_feature
def translate_feature(
self,
feature
)
Translate a Biopython feature into a Dna Features Viewer feature.
View Source
def translate_feature(self, feature):
"""Translate a Biopython feature into a Dna Features Viewer feature."""
properties = dict(
label=self.compute_feature_label(feature),
color=self.compute_feature_color(feature),
html=self.compute_feature_html(feature),
fontdict=self.compute_feature_fontdict(feature),
box_linewidth=self.compute_feature_box_linewidth(feature),
box_color=self.compute_feature_box_color(feature),
linewidth=self.compute_feature_linewidth(feature),
label_link_color=self.compute_feature_label_link_color(feature),
legend_text=self.compute_feature_legend_text(feature),
)
if self.features_properties is not None:
other_properties = self.features_properties
if hasattr(other_properties, "__call__"):
other_properties = other_properties(feature)
properties.update(other_properties)
return GraphicFeature(
start=feature.location.start,
end=feature.location.end,
strand=feature.location.strand,
**properties
)
translate_record
def translate_record(
self,
record,
record_class=None,
filetype=None
)
Create a new GraphicRecord from a BioPython Record object.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
record | None | A BioPython Record object or the path to a Genbank or a GFF file. | None |
record_class | None | The graphic record class to use, e.g. GraphicRecord (default) or | |
CircularGraphicRecord. Strings 'circular' and 'linear' can also be | |||
provided. | None | ||
filetype | None | Used only when a Genbank or a GFF file is provided; one of "genbank" | |
or "gff" to be used. Default None infers from file extension. | None |
View Source
def translate_record(self, record, record_class=None, filetype=None):
"""Create a new GraphicRecord from a BioPython Record object.
Parameters
----------
record
A BioPython Record object or the path to a Genbank or a GFF file.
record_class
The graphic record class to use, e.g. GraphicRecord (default) or
CircularGraphicRecord. Strings 'circular' and 'linear' can also be
provided.
filetype
Used only when a Genbank or a GFF file is provided; one of "genbank"
or "gff" to be used. Default None infers from file extension.
"""
classes = {
"linear": GraphicRecord,
"circular": CircularGraphicRecord,
None: GraphicRecord,
}
if record_class in classes:
record_class = classes[record_class]
if isinstance(record, str) or hasattr(record, "read"):
record = load_record(record, filetype=filetype)
filtered_features = self.compute_filtered_features(record.features)
return record_class(
sequence_length=len(record),
sequence=str(record.seq),
features=[
self.translate_feature(feature)
for feature in filtered_features
if feature.location is not None
],
**self.graphic_record_parameters
)