Module epijinn.Bedmethyl
View Source
# Copyright 2024 Edinburgh Genome Foundry, University of Edinburgh
#
# This file is part of EpiJinn.
#
# EpiJinn is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
#
# EpiJinn is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with Ediacara. If not, see <https://www.gnu.org/licenses/>.
import copy
import os
import pandas
import matplotlib.pyplot as plt
import Bio
import dna_features_viewer
from .Methyl import METHYLASES
from .Methyl import annotate_methylation
COMPLEMENTS = {
"A": "T",
"C": "G",
"G": "C",
"T": "A",
}
# From https://github.com/nanoporetech/modkit/blob/master/book/src/intro_bedmethyl.md
BEDMETHYL_HEADER = [
"chrom",
"start_position",
"end_position",
"modified_base_code_and_motif",
"score",
"strand",
"strand_start_position",
"strand_end_position",
"color",
"Nvalid_cov",
"percent_modified",
"Nmod",
"Ncanonical",
"Nother_mod",
"Ndelete",
"Nfail",
"Ndiff",
"Nnocall",
]
# Remove duplicate and unnecessary columns from report:
columns_for_pdf_report = [
BEDMETHYL_HEADER[i] for i in [1, 5, 9, 10, 11, 12, 13, 14, 15, 16, 17]
] + [
"status"
] # added during binarization
DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
# For looking up modified_base_code_and_motif entries in bedmethyl:
MODIFICATION_CODES = pandas.read_csv(os.path.join(DATA_DIR, "mod_base_codes.csv"))
# Adapted From https://github.com/samtools/hts-specs/blob/master/SAMtags.pdf
class CustomTranslator(dna_features_viewer.BiopythonTranslator):
"""Custom translator used for the main plot."""
def compute_filtered_features(self, features):
"""Display only selected features."""
filtered_features = []
n = 8 # a good number of features to display
# Keep longest features
if len(features) > n:
feature_lengths = []
for feature in features:
feature_lengths += [len(feature.location)]
feature_lengths.sort(reverse=True)
max_length = feature_lengths[n]
for feature in features:
if len(feature.location) > max_length:
filtered_features += [feature]
else: # no need to do anything if not enough features
filtered_features = features
return filtered_features
class PatternTranslator(dna_features_viewer.BiopythonTranslator):
"""Custom translator used for the pattern annotation plot."""
def compute_feature_color(self, feature):
status = "status"
# Values are from binarize_bed() :
if feature.qualifiers[status] == "1":
return "red"
elif feature.qualifiers[status] == "U":
return "yellow"
elif feature.qualifiers[status] == "0":
return "grey"
# to be implemented later:
elif feature.qualifiers[status] == "?":
return "tab:cyan"
# There should be no other options.
class BedResult:
"""Results of a bedmethyl table analysis."""
def __init__(self, modification, bed, record):
self.feature_cutoff = 50 # do not create a plot if there are more features
self.modification = modification
self.mod_abbreviation = MODIFICATION_CODES.loc[
MODIFICATION_CODES["Code"] == self.modification
].Abbreviation.iloc[
0
] # there is only one entry so we take the first
self.mod_name = MODIFICATION_CODES.loc[
MODIFICATION_CODES["Code"] == self.modification
].Name.iloc[0]
self.mod_chebi = MODIFICATION_CODES.loc[
MODIFICATION_CODES["Code"] == self.modification
].ChEBI.iloc[0]
self.unmodified_base = MODIFICATION_CODES.loc[
MODIFICATION_CODES["Code"] == self.modification
].Unmodified_base.iloc[0]
self.bed = bed
self.record = record
# FILTER (ANNOTATED) RECORD AND ADD STATUS:
filtered_features = []
labelstart = (
"@epijinn(" + self.unmodified_base + ", strand="
) # unfinished to account for +/- strands
for feature in record.features:
if feature.id == "@epijinn": # as annotated by the function
if feature.qualifiers["label"].startswith(labelstart):
# To display seq in report:
feature.qualifiers["label"] = self.unmodified_base
# Assign status:
# no need to check for strand as the complement won't be modified
location_start = int(feature.location.start)
# expect exactly one bed table entry per modified nucleotide:
status_symbol = self.bed.loc[
self.bed["LOC"] == location_start
].STATUS.iloc[0]
# used by a custom BiopythonTranslator to colour the annotation:
feature.qualifiers["status"] = status_symbol
filtered_features += [feature]
record.features = filtered_features
if len(self.record.features) > self.feature_cutoff:
self.img_created = False
else:
self.img_created = True
fig, ax1 = plt.subplots(1, 1, figsize=(8, 3))
graphic_record = PatternTranslator().translate_record(self.record)
graphic_record.plot(ax=ax1, with_ruler=False, strand_in_label_threshold=4)
self.plot = fig
class BedmethylItem:
"""Analyse a bedmethyl file.
**Parameters**
**sample**
> The name of the sample, for example a barcode id (`str`).
**reference**
> A Biopython SeqRecord.
**bedmethyl**
> A pandas dataframe of a bedmethyl file for the reference.
"""
def __init__(self, sample, reference, bedmethyl):
self.sample = sample
self.record = reference
self.name = reference.name
self.id = reference.id
self.bed = bedmethyl
self.reference_length = len(self.record)
def perform_analysis(self, parameter_dict):
"""Perform analysis and plot the sequence."""
methylase_str = parameter_dict["methylases"]
methylase = METHYLASES[methylase_str]
self.fig = self.plot_record()
self.results = [] # BedResult instances. For easy reference in pug template.
for modification in self.bed.modified_base_code_and_motif.unique():
# RUN BED SUBSET ETC
bed_subtype = self.subset_bed_to_mod_subtype(self.bed, mod=modification)
annotated_record, bed_pattern_match = self.subset_bed_to_pattern_match(
methylase, bed=bed_subtype
)
bed_binarized = self.binarize_bed(
bed_pattern_match,
met_cutoff=parameter_dict["methylated_cutoff"],
nonmet_cutoff=parameter_dict["unmethylated_cutoff"],
)
final_bed = subset_bed_columns(bed_binarized)
bedresult = BedResult(modification, bed=final_bed, record=annotated_record)
self.results += [bedresult]
def annotate_record(self):
return self.record
def plot_record(self):
fig, ax1 = plt.subplots(1, 1, figsize=(8, 2))
graphic_record = CustomTranslator().translate_record(self.record)
graphic_record.plot(ax=ax1, with_ruler=False, strand_in_label_threshold=4)
return fig
@staticmethod
def subset_bed_to_mod_subtype(bed, mod):
# Columnname from Bedmethyl file specification
bed_subtype = bed.loc[bed["modified_base_code_and_motif"] == mod]
return bed_subtype
def subset_bed_to_base_matches(self, bed=None, modified_base="C"):
if bed is None: # optional bed allows linking multiple bed subset methods
bed = self.bed
mod_base = modified_base.upper()
mod_base_complement = COMPLEMENTS[mod_base]
# POSITIVE STRAND
matching_positions_on_positive_strand = [
pos
for pos, base in enumerate(str(self.record.seq.upper()))
if base == mod_base
]
# Columnname from Bedmethyl file specification:
positive_strand_filter = bed["start_position"].isin(
matching_positions_on_positive_strand
) & (bed["strand"] == "+")
# NEGATIVE STRAND
matching_positions_on_negative_strand = [
pos
for pos, base in enumerate(str(self.record.seq.upper()))
if base == mod_base_complement
]
negative_strand_filter = bed["start_position"].isin(
matching_positions_on_negative_strand
) & (bed["strand"] == "-")
bed_basematch = bed.loc[positive_strand_filter | negative_strand_filter]
return bed_basematch
def subset_bed_to_pattern_match(self, methylase, bed=None):
if bed is None: # optional bed allows linking multiple bed subset methods
bed = self.bed
methylated_index = methylase.index_pos
mod_base = methylase.sequence[methylated_index].upper()
record_copy = copy.deepcopy(self.record)
annotated_record = annotate_methylation(record_copy, methylases=[methylase])
# CREATE LIST OF POSITIONS
positive_strand_locations = []
negative_strand_locations = []
label_pos = "@epijinn(" + mod_base + ", strand=1)"
label_neg = "@epijinn(" + mod_base + ", strand=-1)"
for feature in annotated_record.features:
if feature.id == "@epijinn": # as annotated by function above
if feature.qualifiers["label"] == label_pos:
positive_strand_locations += [feature.location.start]
elif feature.qualifiers["label"] == label_neg:
negative_strand_locations += [feature.location.start]
# SUBSET USING POSITIONS
# Columnname from Bedmethyl file specification:
positive_strand_filter = bed["start_position"].isin(
positive_strand_locations
) & (bed["strand"] == "+")
negative_strand_filter = bed["start_position"].isin(
negative_strand_locations
) & (bed["strand"] == "-")
bed_pattern_match = bed.loc[positive_strand_filter | negative_strand_filter]
return annotated_record, bed_pattern_match
@staticmethod
def binarize_bed(bed, met_cutoff, nonmet_cutoff):
bed["status"] = "U" # prefill undetermined
for index, row in bed.iterrows():
if row["percent_modified"] >= float(met_cutoff) * 100:
bed.loc[index, "status"] = "1" # methylated, symbol may change
elif row["percent_modified"] <= float(nonmet_cutoff) * 100:
bed.loc[index, "status"] = "0" # unmethylated
# symbol "?" reserved for low coverage to be implemented later
return bed
def read_sample_sheet(
sample_sheet, genbank_dir="", bedmethyl_dir="", parameter_sheet=""
):
"""Read a sample sheet into a BedmethylItemGroup.
**Parameters**
**sample_sheet**
> CSV file path (`str`). No header and columns must be in this order: projectname,
sample, Genbank name (without extension), bedmethyl file.
**genbank_dir**
> Directory of the Genbank files (`str`). Default: local directory.
**bedmethyl_dir**
> Directory of the bedmethyl files (`str`). Default: local directory.
**parameter_sheet**
> CSV file path (`str`). Use 'Parameter', 'Value' header for columns. If a
'projectname' is specified, it overwrites the sample sheet value.
"""
# READ PARAMETERS
param_df = pandas.read_csv(parameter_sheet, usecols=["Parameter", "Value"])
parameter_dict = dict(param_df.values)
# READ SAMPLES
sample_df = pandas.read_csv(sample_sheet, header=None)
# add columnnames
# CREATE ITEMS
# We allow Sequeduct to specify the projectname as a command parameter as well;
if not "projectname" in parameter_dict:
# first entry of the first column (contains projectname):
parameter_dict["projectname"] = sample_df.iloc[0, 0]
# Allow the user to not specify these in the sheet:
# The default cutoffs are based on preliminary data.
if not "methylated_cutoff" in parameter_dict:
parameter_dict["methylated_cutoff"] = 0.7
if not "unmethylated_cutoff" in parameter_dict:
parameter_dict["unmethylated_cutoff"] = 0.3
bedmethylitems = []
for index, row in sample_df.iterrows():
genbank_name = row[2] # number specified by the sample sheet format
genbank_path = os.path.join(genbank_dir, genbank_name + ".gb") # Genbank ext
record = Bio.SeqIO.read(genbank_path, "genbank")
record.id = genbank_name
record.name = genbank_name
record.annotations["molecule_type"] = "DNA"
bed_name = row[3] # number specified by the sample sheet format
bed_path = os.path.join(bedmethyl_dir, bed_name)
bed_df = pandas.read_csv(bed_path, header=None, delimiter="\t")
bed_df.columns = BEDMETHYL_HEADER
bedmethylitems += [
BedmethylItem(sample=row[1], reference=record, bedmethyl=bed_df)
] # number specified by the sample sheet format
bedmethylitemgroup = BedmethylItemGroup(
bedmethylitems=bedmethylitems, parameter_dict=parameter_dict
)
return bedmethylitemgroup
def subset_bed_columns(bed):
bed_report = bed[columns_for_pdf_report]
# These were designed to be more informative and fit the report:
new_columnnames = [
"LOC",
"Strand",
"COV",
"% mod",
"MOD",
"STD",
"OTH",
"del",
"fail",
"diff",
"nocall",
"STATUS",
]
bed_report.columns = new_columnnames
return bed_report
class BedmethylItemGroup:
"""A group of BedmethylItem instances for reporting.
**Parameters**
**bedmethylitems**
> A list of BedmethylItem instances.
**parameter_dict**
> A dictionary of analysis parameters (`dict`).
"""
def __init__(self, bedmethylitems, parameter_dict):
self.bedmethylitems = bedmethylitems
self.parameter_dict = parameter_dict
self.number_of_samples = len(bedmethylitems)
def perform_all_analysis_in_bedmethylitemgroup(self):
for bedmethylitem in self.bedmethylitems:
bedmethylitem.perform_analysis(parameter_dict=self.parameter_dict)
self.comparisons_performed = True
Variables
BEDMETHYL_HEADER
COMPLEMENTS
DATA_DIR
METHYLASES
MODIFICATION_CODES
columns_for_pdf_report
Functions
read_sample_sheet
def read_sample_sheet(
sample_sheet,
genbank_dir='',
bedmethyl_dir='',
parameter_sheet=''
)
Read a sample sheet into a BedmethylItemGroup.
Parameters
sample_sheet
CSV file path (
str
). No header and columns must be in this order: projectname, sample, Genbank name (without extension), bedmethyl file.
genbank_dir
Directory of the Genbank files (
str
). Default: local directory.
bedmethyl_dir
Directory of the bedmethyl files (
str
). Default: local directory.
parameter_sheet
CSV file path (
str
). Use 'Parameter', 'Value' header for columns. If a 'projectname' is specified, it overwrites the sample sheet value.
View Source
def read_sample_sheet(
sample_sheet, genbank_dir="", bedmethyl_dir="", parameter_sheet=""
):
"""Read a sample sheet into a BedmethylItemGroup.
**Parameters**
**sample_sheet**
> CSV file path (`str`). No header and columns must be in this order: projectname,
sample, Genbank name (without extension), bedmethyl file.
**genbank_dir**
> Directory of the Genbank files (`str`). Default: local directory.
**bedmethyl_dir**
> Directory of the bedmethyl files (`str`). Default: local directory.
**parameter_sheet**
> CSV file path (`str`). Use 'Parameter', 'Value' header for columns. If a
'projectname' is specified, it overwrites the sample sheet value.
"""
# READ PARAMETERS
param_df = pandas.read_csv(parameter_sheet, usecols=["Parameter", "Value"])
parameter_dict = dict(param_df.values)
# READ SAMPLES
sample_df = pandas.read_csv(sample_sheet, header=None)
# add columnnames
# CREATE ITEMS
# We allow Sequeduct to specify the projectname as a command parameter as well;
if not "projectname" in parameter_dict:
# first entry of the first column (contains projectname):
parameter_dict["projectname"] = sample_df.iloc[0, 0]
# Allow the user to not specify these in the sheet:
# The default cutoffs are based on preliminary data.
if not "methylated_cutoff" in parameter_dict:
parameter_dict["methylated_cutoff"] = 0.7
if not "unmethylated_cutoff" in parameter_dict:
parameter_dict["unmethylated_cutoff"] = 0.3
bedmethylitems = []
for index, row in sample_df.iterrows():
genbank_name = row[2] # number specified by the sample sheet format
genbank_path = os.path.join(genbank_dir, genbank_name + ".gb") # Genbank ext
record = Bio.SeqIO.read(genbank_path, "genbank")
record.id = genbank_name
record.name = genbank_name
record.annotations["molecule_type"] = "DNA"
bed_name = row[3] # number specified by the sample sheet format
bed_path = os.path.join(bedmethyl_dir, bed_name)
bed_df = pandas.read_csv(bed_path, header=None, delimiter="\t")
bed_df.columns = BEDMETHYL_HEADER
bedmethylitems += [
BedmethylItem(sample=row[1], reference=record, bedmethyl=bed_df)
] # number specified by the sample sheet format
bedmethylitemgroup = BedmethylItemGroup(
bedmethylitems=bedmethylitems, parameter_dict=parameter_dict
)
return bedmethylitemgroup
subset_bed_columns
def subset_bed_columns(
bed
)
View Source
def subset_bed_columns(bed):
bed_report = bed[columns_for_pdf_report]
# These were designed to be more informative and fit the report:
new_columnnames = [
"LOC",
"Strand",
"COV",
"% mod",
"MOD",
"STD",
"OTH",
"del",
"fail",
"diff",
"nocall",
"STATUS",
]
bed_report.columns = new_columnnames
return bed_report
Classes
BedResult
class BedResult(
modification,
bed,
record
)
Results of a bedmethyl table analysis.
View Source
class BedResult:
"""Results of a bedmethyl table analysis."""
def __init__(self, modification, bed, record):
self.feature_cutoff = 50 # do not create a plot if there are more features
self.modification = modification
self.mod_abbreviation = MODIFICATION_CODES.loc[
MODIFICATION_CODES["Code"] == self.modification
].Abbreviation.iloc[
0
] # there is only one entry so we take the first
self.mod_name = MODIFICATION_CODES.loc[
MODIFICATION_CODES["Code"] == self.modification
].Name.iloc[0]
self.mod_chebi = MODIFICATION_CODES.loc[
MODIFICATION_CODES["Code"] == self.modification
].ChEBI.iloc[0]
self.unmodified_base = MODIFICATION_CODES.loc[
MODIFICATION_CODES["Code"] == self.modification
].Unmodified_base.iloc[0]
self.bed = bed
self.record = record
# FILTER (ANNOTATED) RECORD AND ADD STATUS:
filtered_features = []
labelstart = (
"@epijinn(" + self.unmodified_base + ", strand="
) # unfinished to account for +/- strands
for feature in record.features:
if feature.id == "@epijinn": # as annotated by the function
if feature.qualifiers["label"].startswith(labelstart):
# To display seq in report:
feature.qualifiers["label"] = self.unmodified_base
# Assign status:
# no need to check for strand as the complement won't be modified
location_start = int(feature.location.start)
# expect exactly one bed table entry per modified nucleotide:
status_symbol = self.bed.loc[
self.bed["LOC"] == location_start
].STATUS.iloc[0]
# used by a custom BiopythonTranslator to colour the annotation:
feature.qualifiers["status"] = status_symbol
filtered_features += [feature]
record.features = filtered_features
if len(self.record.features) > self.feature_cutoff:
self.img_created = False
else:
self.img_created = True
fig, ax1 = plt.subplots(1, 1, figsize=(8, 3))
graphic_record = PatternTranslator().translate_record(self.record)
graphic_record.plot(ax=ax1, with_ruler=False, strand_in_label_threshold=4)
self.plot = fig
BedmethylItem
class BedmethylItem(
sample,
reference,
bedmethyl
)
Analyse a bedmethyl file.
Parameters
sample
The name of the sample, for example a barcode id (
str
).
reference
A Biopython SeqRecord.
bedmethyl
A pandas dataframe of a bedmethyl file for the reference.
View Source
class BedmethylItem:
"""Analyse a bedmethyl file.
**Parameters**
**sample**
> The name of the sample, for example a barcode id (`str`).
**reference**
> A Biopython SeqRecord.
**bedmethyl**
> A pandas dataframe of a bedmethyl file for the reference.
"""
def __init__(self, sample, reference, bedmethyl):
self.sample = sample
self.record = reference
self.name = reference.name
self.id = reference.id
self.bed = bedmethyl
self.reference_length = len(self.record)
def perform_analysis(self, parameter_dict):
"""Perform analysis and plot the sequence."""
methylase_str = parameter_dict["methylases"]
methylase = METHYLASES[methylase_str]
self.fig = self.plot_record()
self.results = [] # BedResult instances. For easy reference in pug template.
for modification in self.bed.modified_base_code_and_motif.unique():
# RUN BED SUBSET ETC
bed_subtype = self.subset_bed_to_mod_subtype(self.bed, mod=modification)
annotated_record, bed_pattern_match = self.subset_bed_to_pattern_match(
methylase, bed=bed_subtype
)
bed_binarized = self.binarize_bed(
bed_pattern_match,
met_cutoff=parameter_dict["methylated_cutoff"],
nonmet_cutoff=parameter_dict["unmethylated_cutoff"],
)
final_bed = subset_bed_columns(bed_binarized)
bedresult = BedResult(modification, bed=final_bed, record=annotated_record)
self.results += [bedresult]
def annotate_record(self):
return self.record
def plot_record(self):
fig, ax1 = plt.subplots(1, 1, figsize=(8, 2))
graphic_record = CustomTranslator().translate_record(self.record)
graphic_record.plot(ax=ax1, with_ruler=False, strand_in_label_threshold=4)
return fig
@staticmethod
def subset_bed_to_mod_subtype(bed, mod):
# Columnname from Bedmethyl file specification
bed_subtype = bed.loc[bed["modified_base_code_and_motif"] == mod]
return bed_subtype
def subset_bed_to_base_matches(self, bed=None, modified_base="C"):
if bed is None: # optional bed allows linking multiple bed subset methods
bed = self.bed
mod_base = modified_base.upper()
mod_base_complement = COMPLEMENTS[mod_base]
# POSITIVE STRAND
matching_positions_on_positive_strand = [
pos
for pos, base in enumerate(str(self.record.seq.upper()))
if base == mod_base
]
# Columnname from Bedmethyl file specification:
positive_strand_filter = bed["start_position"].isin(
matching_positions_on_positive_strand
) & (bed["strand"] == "+")
# NEGATIVE STRAND
matching_positions_on_negative_strand = [
pos
for pos, base in enumerate(str(self.record.seq.upper()))
if base == mod_base_complement
]
negative_strand_filter = bed["start_position"].isin(
matching_positions_on_negative_strand
) & (bed["strand"] == "-")
bed_basematch = bed.loc[positive_strand_filter | negative_strand_filter]
return bed_basematch
def subset_bed_to_pattern_match(self, methylase, bed=None):
if bed is None: # optional bed allows linking multiple bed subset methods
bed = self.bed
methylated_index = methylase.index_pos
mod_base = methylase.sequence[methylated_index].upper()
record_copy = copy.deepcopy(self.record)
annotated_record = annotate_methylation(record_copy, methylases=[methylase])
# CREATE LIST OF POSITIONS
positive_strand_locations = []
negative_strand_locations = []
label_pos = "@epijinn(" + mod_base + ", strand=1)"
label_neg = "@epijinn(" + mod_base + ", strand=-1)"
for feature in annotated_record.features:
if feature.id == "@epijinn": # as annotated by function above
if feature.qualifiers["label"] == label_pos:
positive_strand_locations += [feature.location.start]
elif feature.qualifiers["label"] == label_neg:
negative_strand_locations += [feature.location.start]
# SUBSET USING POSITIONS
# Columnname from Bedmethyl file specification:
positive_strand_filter = bed["start_position"].isin(
positive_strand_locations
) & (bed["strand"] == "+")
negative_strand_filter = bed["start_position"].isin(
negative_strand_locations
) & (bed["strand"] == "-")
bed_pattern_match = bed.loc[positive_strand_filter | negative_strand_filter]
return annotated_record, bed_pattern_match
@staticmethod
def binarize_bed(bed, met_cutoff, nonmet_cutoff):
bed["status"] = "U" # prefill undetermined
for index, row in bed.iterrows():
if row["percent_modified"] >= float(met_cutoff) * 100:
bed.loc[index, "status"] = "1" # methylated, symbol may change
elif row["percent_modified"] <= float(nonmet_cutoff) * 100:
bed.loc[index, "status"] = "0" # unmethylated
# symbol "?" reserved for low coverage to be implemented later
return bed
Static methods
binarize_bed
def binarize_bed(
bed,
met_cutoff,
nonmet_cutoff
)
View Source
@staticmethod
def binarize_bed(bed, met_cutoff, nonmet_cutoff):
bed["status"] = "U" # prefill undetermined
for index, row in bed.iterrows():
if row["percent_modified"] >= float(met_cutoff) * 100:
bed.loc[index, "status"] = "1" # methylated, symbol may change
elif row["percent_modified"] <= float(nonmet_cutoff) * 100:
bed.loc[index, "status"] = "0" # unmethylated
# symbol "?" reserved for low coverage to be implemented later
return bed
subset_bed_to_mod_subtype
def subset_bed_to_mod_subtype(
bed,
mod
)
View Source
@staticmethod
def subset_bed_to_mod_subtype(bed, mod):
# Columnname from Bedmethyl file specification
bed_subtype = bed.loc[bed["modified_base_code_and_motif"] == mod]
return bed_subtype
Methods
annotate_record
def annotate_record(
self
)
View Source
def annotate_record(self):
return self.record
perform_analysis
def perform_analysis(
self,
parameter_dict
)
Perform analysis and plot the sequence.
View Source
def perform_analysis(self, parameter_dict):
"""Perform analysis and plot the sequence."""
methylase_str = parameter_dict["methylases"]
methylase = METHYLASES[methylase_str]
self.fig = self.plot_record()
self.results = [] # BedResult instances. For easy reference in pug template.
for modification in self.bed.modified_base_code_and_motif.unique():
# RUN BED SUBSET ETC
bed_subtype = self.subset_bed_to_mod_subtype(self.bed, mod=modification)
annotated_record, bed_pattern_match = self.subset_bed_to_pattern_match(
methylase, bed=bed_subtype
)
bed_binarized = self.binarize_bed(
bed_pattern_match,
met_cutoff=parameter_dict["methylated_cutoff"],
nonmet_cutoff=parameter_dict["unmethylated_cutoff"],
)
final_bed = subset_bed_columns(bed_binarized)
bedresult = BedResult(modification, bed=final_bed, record=annotated_record)
self.results += [bedresult]
plot_record
def plot_record(
self
)
View Source
def plot_record(self):
fig, ax1 = plt.subplots(1, 1, figsize=(8, 2))
graphic_record = CustomTranslator().translate_record(self.record)
graphic_record.plot(ax=ax1, with_ruler=False, strand_in_label_threshold=4)
return fig
subset_bed_to_base_matches
def subset_bed_to_base_matches(
self,
bed=None,
modified_base='C'
)
View Source
def subset_bed_to_base_matches(self, bed=None, modified_base="C"):
if bed is None: # optional bed allows linking multiple bed subset methods
bed = self.bed
mod_base = modified_base.upper()
mod_base_complement = COMPLEMENTS[mod_base]
# POSITIVE STRAND
matching_positions_on_positive_strand = [
pos
for pos, base in enumerate(str(self.record.seq.upper()))
if base == mod_base
]
# Columnname from Bedmethyl file specification:
positive_strand_filter = bed["start_position"].isin(
matching_positions_on_positive_strand
) & (bed["strand"] == "+")
# NEGATIVE STRAND
matching_positions_on_negative_strand = [
pos
for pos, base in enumerate(str(self.record.seq.upper()))
if base == mod_base_complement
]
negative_strand_filter = bed["start_position"].isin(
matching_positions_on_negative_strand
) & (bed["strand"] == "-")
bed_basematch = bed.loc[positive_strand_filter | negative_strand_filter]
return bed_basematch
subset_bed_to_pattern_match
def subset_bed_to_pattern_match(
self,
methylase,
bed=None
)
View Source
def subset_bed_to_pattern_match(self, methylase, bed=None):
if bed is None: # optional bed allows linking multiple bed subset methods
bed = self.bed
methylated_index = methylase.index_pos
mod_base = methylase.sequence[methylated_index].upper()
record_copy = copy.deepcopy(self.record)
annotated_record = annotate_methylation(record_copy, methylases=[methylase])
# CREATE LIST OF POSITIONS
positive_strand_locations = []
negative_strand_locations = []
label_pos = "@epijinn(" + mod_base + ", strand=1)"
label_neg = "@epijinn(" + mod_base + ", strand=-1)"
for feature in annotated_record.features:
if feature.id == "@epijinn": # as annotated by function above
if feature.qualifiers["label"] == label_pos:
positive_strand_locations += [feature.location.start]
elif feature.qualifiers["label"] == label_neg:
negative_strand_locations += [feature.location.start]
# SUBSET USING POSITIONS
# Columnname from Bedmethyl file specification:
positive_strand_filter = bed["start_position"].isin(
positive_strand_locations
) & (bed["strand"] == "+")
negative_strand_filter = bed["start_position"].isin(
negative_strand_locations
) & (bed["strand"] == "-")
bed_pattern_match = bed.loc[positive_strand_filter | negative_strand_filter]
return annotated_record, bed_pattern_match
BedmethylItemGroup
class BedmethylItemGroup(
bedmethylitems,
parameter_dict
)
A group of BedmethylItem instances for reporting.
Parameters
bedmethylitems
A list of BedmethylItem instances.
parameter_dict
A dictionary of analysis parameters (
dict
).
View Source
class BedmethylItemGroup:
"""A group of BedmethylItem instances for reporting.
**Parameters**
**bedmethylitems**
> A list of BedmethylItem instances.
**parameter_dict**
> A dictionary of analysis parameters (`dict`).
"""
def __init__(self, bedmethylitems, parameter_dict):
self.bedmethylitems = bedmethylitems
self.parameter_dict = parameter_dict
self.number_of_samples = len(bedmethylitems)
def perform_all_analysis_in_bedmethylitemgroup(self):
for bedmethylitem in self.bedmethylitems:
bedmethylitem.perform_analysis(parameter_dict=self.parameter_dict)
self.comparisons_performed = True
Methods
perform_all_analysis_in_bedmethylitemgroup
def perform_all_analysis_in_bedmethylitemgroup(
self
)
View Source
def perform_all_analysis_in_bedmethylitemgroup(self):
for bedmethylitem in self.bedmethylitems:
bedmethylitem.perform_analysis(parameter_dict=self.parameter_dict)
self.comparisons_performed = True
CustomTranslator
class CustomTranslator(
features_filters=(),
features_properties=None
)
Custom translator used for the main plot.
View Source
class CustomTranslator(dna_features_viewer.BiopythonTranslator):
"""Custom translator used for the main plot."""
def compute_filtered_features(self, features):
"""Display only selected features."""
filtered_features = []
n = 8 # a good number of features to display
# Keep longest features
if len(features) > n:
feature_lengths = []
for feature in features:
feature_lengths += [len(feature.location)]
feature_lengths.sort(reverse=True)
max_length = feature_lengths[n]
for feature in features:
if len(feature.location) > max_length:
filtered_features += [feature]
else: # no need to do anything if not enough features
filtered_features = features
return filtered_features
Ancestors (in MRO)
- dna_features_viewer.BiopythonTranslator.BiopythonTranslator.BiopythonTranslator
- dna_features_viewer.BiopythonTranslator.BiopythonTranslatorBase.BiopythonTranslatorBase
Class variables
default_feature_color
graphic_record_parameters
ignored_features_types
label_fields
Static methods
quick_class_plot
def quick_class_plot(
record,
figure_width=12,
**kwargs
)
Allows super quick and dirty plotting of Biopython records.
This is really meant for use in a Jupyter/Ipython notebook with the "%matplotlib inline" setting.
from dna_features_viewer import BiopythonTranslator BiopythonTranslator.quick_plot(my_record)
View Source
@classmethod
def quick_class_plot(cls, record, figure_width=12, **kwargs):
"""Allows super quick and dirty plotting of Biopython records.
This is really meant for use in a Jupyter/Ipython notebook with
the "%matplotlib inline" setting.
>>> from dna_features_viewer import BiopythonTranslator
>>> BiopythonTranslator.quick_plot(my_record)
"""
graphic_record = cls().translate_record(record)
ax, _ = graphic_record.plot(figure_width=figure_width, **kwargs)
return ax
Methods
compute_feature_box_color
def compute_feature_box_color(
self,
feature
)
Compute a box_color for this feature.
View Source
def compute_feature_box_color(self, feature):
"""Compute a box_color for this feature."""
return "auto"
compute_feature_box_linewidth
def compute_feature_box_linewidth(
self,
feature
)
Compute a box_linewidth for this feature.
View Source
def compute_feature_box_linewidth(self, feature):
"""Compute a box_linewidth for this feature."""
return 0.3
compute_feature_color
def compute_feature_color(
self,
feature
)
Compute a color for this feature.
If the feature has a color
qualifier it will be used. Otherwise,
the classe's default_feature_color
is used.
To change the behaviour, create a subclass of BiopythonTranslator
and overwrite this method.
View Source
def compute_feature_color(self, feature):
"""Compute a color for this feature.
If the feature has a ``color`` qualifier it will be used. Otherwise,
the classe's ``default_feature_color`` is used.
To change the behaviour, create a subclass of ``BiopythonTranslator``
and overwrite this method.
"""
if "color" in feature.qualifiers:
color = feature.qualifiers["color"]
if isinstance(color[0], str):
return "".join(feature.qualifiers["color"])
else:
return color
else:
return self.default_feature_color
compute_feature_fontdict
def compute_feature_fontdict(
self,
feature
)
Compute a font dict for this feature.
View Source
def compute_feature_fontdict(self, feature):
"""Compute a font dict for this feature."""
return None
compute_feature_html
def compute_feature_html(
self,
feature
)
Gets the 'label' of the feature.
View Source
def compute_feature_html(self, feature):
"""Gets the 'label' of the feature."""
return self.compute_feature_label(feature)
compute_feature_label
def compute_feature_label(
self,
feature
)
Compute the label of the feature.
View Source
def compute_feature_label(self, feature):
"""Compute the label of the feature."""
label = feature.type
for key in self.label_fields:
if key in feature.qualifiers and len(feature.qualifiers[key]):
label = feature.qualifiers[key]
break
if isinstance(label, list):
label = "|".join(label)
return label
compute_feature_label_link_color
def compute_feature_label_link_color(
self,
feature
)
Compute the color of the line linking the label to its feature.
View Source
def compute_feature_label_link_color(self, feature):
"""Compute the color of the line linking the label to its feature."""
return "black"
compute_feature_legend_text
def compute_feature_legend_text(
self,
feature
)
View Source
def compute_feature_legend_text(self, feature):
return None
compute_feature_linewidth
def compute_feature_linewidth(
self,
feature
)
Compute the edge width of the feature's arrow/rectangle.
View Source
def compute_feature_linewidth(self, feature):
"""Compute the edge width of the feature's arrow/rectangle."""
return 1.0
compute_filtered_features
def compute_filtered_features(
self,
features
)
Display only selected features.
View Source
def compute_filtered_features(self, features):
"""Display only selected features."""
filtered_features = []
n = 8 # a good number of features to display
# Keep longest features
if len(features) > n:
feature_lengths = []
for feature in features:
feature_lengths += [len(feature.location)]
feature_lengths.sort(reverse=True)
max_length = feature_lengths[n]
for feature in features:
if len(feature.location) > max_length:
filtered_features += [feature]
else: # no need to do anything if not enough features
filtered_features = features
return filtered_features
quick_plot
def quick_plot(
self,
record,
figure_width=12,
**kwargs
)
Allows super quick and dirty plotting of Biopython records.
This is really meant for use in a Jupyter/Ipython notebook with the "%matplotlib inline" setting.
from dna_features_viewer import BiopythonTranslator BiopythonTranslator.quick_plot(my_record)
View Source
def quick_plot(self, record, figure_width=12, **kwargs):
"""Allows super quick and dirty plotting of Biopython records.
This is really meant for use in a Jupyter/Ipython notebook with
the "%matplotlib inline" setting.
>>> from dna_features_viewer import BiopythonTranslator
>>> BiopythonTranslator.quick_plot(my_record)
"""
graphic_record = self.translate_record(record)
ax, _ = graphic_record.plot(figure_width=figure_width, **kwargs)
return ax
translate_feature
def translate_feature(
self,
feature
)
Translate a Biopython feature into a Dna Features Viewer feature.
View Source
def translate_feature(self, feature):
"""Translate a Biopython feature into a Dna Features Viewer feature."""
properties = dict(
label=self.compute_feature_label(feature),
color=self.compute_feature_color(feature),
html=self.compute_feature_html(feature),
fontdict=self.compute_feature_fontdict(feature),
box_linewidth=self.compute_feature_box_linewidth(feature),
box_color=self.compute_feature_box_color(feature),
linewidth=self.compute_feature_linewidth(feature),
label_link_color=self.compute_feature_label_link_color(feature),
legend_text=self.compute_feature_legend_text(feature),
)
if self.features_properties is not None:
other_properties = self.features_properties
if hasattr(other_properties, "__call__"):
other_properties = other_properties(feature)
properties.update(other_properties)
return GraphicFeature(
start=feature.location.start,
end=feature.location.end,
strand=feature.location.strand,
**properties
)
translate_record
def translate_record(
self,
record,
record_class=None,
filetype=None
)
Create a new GraphicRecord from a BioPython Record object.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
record | None | A BioPython Record object or the path to a Genbank or a GFF file. | None |
record_class | None | The graphic record class to use, e.g. GraphicRecord (default) or CircularGraphicRecord. Strings 'circular' and 'linear' can also be provided. |
None |
filetype | None | Used only when a Genbank or a GFF file is provided; one of "genbank" or "gff" to be used. Default None infers from file extension. |
None |
View Source
def translate_record(self, record, record_class=None, filetype=None):
"""Create a new GraphicRecord from a BioPython Record object.
Parameters
----------
record
A BioPython Record object or the path to a Genbank or a GFF file.
record_class
The graphic record class to use, e.g. GraphicRecord (default) or
CircularGraphicRecord. Strings 'circular' and 'linear' can also be
provided.
filetype
Used only when a Genbank or a GFF file is provided; one of "genbank"
or "gff" to be used. Default None infers from file extension.
"""
classes = {
"linear": GraphicRecord,
"circular": CircularGraphicRecord,
None: GraphicRecord,
}
if record_class in classes:
record_class = classes[record_class]
if isinstance(record, str) or hasattr(record, "read"):
record = load_record(record, filetype=filetype)
filtered_features = self.compute_filtered_features(record.features)
return record_class(
sequence_length=len(record),
sequence=str(record.seq) if record.seq.defined else None,
features=[
self.translate_feature(feature)
for feature in filtered_features
if feature.location is not None
],
**self.graphic_record_parameters
)
PatternTranslator
class PatternTranslator(
features_filters=(),
features_properties=None
)
Custom translator used for the pattern annotation plot.
View Source
class PatternTranslator(dna_features_viewer.BiopythonTranslator):
"""Custom translator used for the pattern annotation plot."""
def compute_feature_color(self, feature):
status = "status"
# Values are from binarize_bed() :
if feature.qualifiers[status] == "1":
return "red"
elif feature.qualifiers[status] == "U":
return "yellow"
elif feature.qualifiers[status] == "0":
return "grey"
# to be implemented later:
elif feature.qualifiers[status] == "?":
return "tab:cyan"
# There should be no other options.
Ancestors (in MRO)
- dna_features_viewer.BiopythonTranslator.BiopythonTranslator.BiopythonTranslator
- dna_features_viewer.BiopythonTranslator.BiopythonTranslatorBase.BiopythonTranslatorBase
Class variables
default_feature_color
graphic_record_parameters
ignored_features_types
label_fields
Static methods
quick_class_plot
def quick_class_plot(
record,
figure_width=12,
**kwargs
)
Allows super quick and dirty plotting of Biopython records.
This is really meant for use in a Jupyter/Ipython notebook with the "%matplotlib inline" setting.
from dna_features_viewer import BiopythonTranslator BiopythonTranslator.quick_plot(my_record)
View Source
@classmethod
def quick_class_plot(cls, record, figure_width=12, **kwargs):
"""Allows super quick and dirty plotting of Biopython records.
This is really meant for use in a Jupyter/Ipython notebook with
the "%matplotlib inline" setting.
>>> from dna_features_viewer import BiopythonTranslator
>>> BiopythonTranslator.quick_plot(my_record)
"""
graphic_record = cls().translate_record(record)
ax, _ = graphic_record.plot(figure_width=figure_width, **kwargs)
return ax
Methods
compute_feature_box_color
def compute_feature_box_color(
self,
feature
)
Compute a box_color for this feature.
View Source
def compute_feature_box_color(self, feature):
"""Compute a box_color for this feature."""
return "auto"
compute_feature_box_linewidth
def compute_feature_box_linewidth(
self,
feature
)
Compute a box_linewidth for this feature.
View Source
def compute_feature_box_linewidth(self, feature):
"""Compute a box_linewidth for this feature."""
return 0.3
compute_feature_color
def compute_feature_color(
self,
feature
)
Compute a color for this feature.
If the feature has a color
qualifier it will be used. Otherwise,
the classe's default_feature_color
is used.
To change the behaviour, create a subclass of BiopythonTranslator
and overwrite this method.
View Source
def compute_feature_color(self, feature):
status = "status"
# Values are from binarize_bed() :
if feature.qualifiers[status] == "1":
return "red"
elif feature.qualifiers[status] == "U":
return "yellow"
elif feature.qualifiers[status] == "0":
return "grey"
# to be implemented later:
elif feature.qualifiers[status] == "?":
return "tab:cyan"
# There should be no other options.
compute_feature_fontdict
def compute_feature_fontdict(
self,
feature
)
Compute a font dict for this feature.
View Source
def compute_feature_fontdict(self, feature):
"""Compute a font dict for this feature."""
return None
compute_feature_html
def compute_feature_html(
self,
feature
)
Gets the 'label' of the feature.
View Source
def compute_feature_html(self, feature):
"""Gets the 'label' of the feature."""
return self.compute_feature_label(feature)
compute_feature_label
def compute_feature_label(
self,
feature
)
Compute the label of the feature.
View Source
def compute_feature_label(self, feature):
"""Compute the label of the feature."""
label = feature.type
for key in self.label_fields:
if key in feature.qualifiers and len(feature.qualifiers[key]):
label = feature.qualifiers[key]
break
if isinstance(label, list):
label = "|".join(label)
return label
compute_feature_label_link_color
def compute_feature_label_link_color(
self,
feature
)
Compute the color of the line linking the label to its feature.
View Source
def compute_feature_label_link_color(self, feature):
"""Compute the color of the line linking the label to its feature."""
return "black"
compute_feature_legend_text
def compute_feature_legend_text(
self,
feature
)
View Source
def compute_feature_legend_text(self, feature):
return None
compute_feature_linewidth
def compute_feature_linewidth(
self,
feature
)
Compute the edge width of the feature's arrow/rectangle.
View Source
def compute_feature_linewidth(self, feature):
"""Compute the edge width of the feature's arrow/rectangle."""
return 1.0
compute_filtered_features
def compute_filtered_features(
self,
features
)
Return the list of features minus the ignored ones.
By the method keeps any feature whose type is not in ignored_features_types and for which all filter(f) pass.
View Source
def compute_filtered_features(self, features):
"""Return the list of features minus the ignored ones.
By the method keeps any feature whose type is not in
ignored_features_types and for which all filter(f) pass.
"""
return [
f
for f in features
if all([fl(f) for fl in self.features_filters])
and f.type not in self.ignored_features_types
]
quick_plot
def quick_plot(
self,
record,
figure_width=12,
**kwargs
)
Allows super quick and dirty plotting of Biopython records.
This is really meant for use in a Jupyter/Ipython notebook with the "%matplotlib inline" setting.
from dna_features_viewer import BiopythonTranslator BiopythonTranslator.quick_plot(my_record)
View Source
def quick_plot(self, record, figure_width=12, **kwargs):
"""Allows super quick and dirty plotting of Biopython records.
This is really meant for use in a Jupyter/Ipython notebook with
the "%matplotlib inline" setting.
>>> from dna_features_viewer import BiopythonTranslator
>>> BiopythonTranslator.quick_plot(my_record)
"""
graphic_record = self.translate_record(record)
ax, _ = graphic_record.plot(figure_width=figure_width, **kwargs)
return ax
translate_feature
def translate_feature(
self,
feature
)
Translate a Biopython feature into a Dna Features Viewer feature.
View Source
def translate_feature(self, feature):
"""Translate a Biopython feature into a Dna Features Viewer feature."""
properties = dict(
label=self.compute_feature_label(feature),
color=self.compute_feature_color(feature),
html=self.compute_feature_html(feature),
fontdict=self.compute_feature_fontdict(feature),
box_linewidth=self.compute_feature_box_linewidth(feature),
box_color=self.compute_feature_box_color(feature),
linewidth=self.compute_feature_linewidth(feature),
label_link_color=self.compute_feature_label_link_color(feature),
legend_text=self.compute_feature_legend_text(feature),
)
if self.features_properties is not None:
other_properties = self.features_properties
if hasattr(other_properties, "__call__"):
other_properties = other_properties(feature)
properties.update(other_properties)
return GraphicFeature(
start=feature.location.start,
end=feature.location.end,
strand=feature.location.strand,
**properties
)
translate_record
def translate_record(
self,
record,
record_class=None,
filetype=None
)
Create a new GraphicRecord from a BioPython Record object.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
record | None | A BioPython Record object or the path to a Genbank or a GFF file. | None |
record_class | None | The graphic record class to use, e.g. GraphicRecord (default) or CircularGraphicRecord. Strings 'circular' and 'linear' can also be provided. |
None |
filetype | None | Used only when a Genbank or a GFF file is provided; one of "genbank" or "gff" to be used. Default None infers from file extension. |
None |
View Source
def translate_record(self, record, record_class=None, filetype=None):
"""Create a new GraphicRecord from a BioPython Record object.
Parameters
----------
record
A BioPython Record object or the path to a Genbank or a GFF file.
record_class
The graphic record class to use, e.g. GraphicRecord (default) or
CircularGraphicRecord. Strings 'circular' and 'linear' can also be
provided.
filetype
Used only when a Genbank or a GFF file is provided; one of "genbank"
or "gff" to be used. Default None infers from file extension.
"""
classes = {
"linear": GraphicRecord,
"circular": CircularGraphicRecord,
None: GraphicRecord,
}
if record_class in classes:
record_class = classes[record_class]
if isinstance(record, str) or hasattr(record, "read"):
record = load_record(record, filetype=filetype)
filtered_features = self.compute_filtered_features(record.features)
return record_class(
sequence_length=len(record),
sequence=str(record.seq) if record.seq.defined else None,
features=[
self.translate_feature(feature)
for feature in filtered_features
if feature.location is not None
],
**self.graphic_record_parameters
)