Module epijinn.Bedmethyl

View Source

# Copyright 2024 Edinburgh Genome Foundry, University of Edinburgh

#

# This file is part of EpiJinn.

#

# EpiJinn is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

#

# EpiJinn is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

#

# You should have received a copy of the GNU General Public License along with Ediacara. If not, see <https://www.gnu.org/licenses/>.

import copy

import os

import pandas

import matplotlib.pyplot as plt

import Bio

import dna_features_viewer

from .Methyl import METHYLASES

from .Methyl import annotate_methylation

COMPLEMENTS = {

    "A": "T",

    "C": "G",

    "G": "C",

    "T": "A",

}

# From https://github.com/nanoporetech/modkit/blob/master/book/src/intro_bedmethyl.md

BEDMETHYL_HEADER = [

    "chrom",

    "start_position",

    "end_position",

    "modified_base_code_and_motif",

    "score",

    "strand",

    "strand_start_position",

    "strand_end_position",

    "color",

    "Nvalid_cov",

    "percent_modified",

    "Nmod",

    "Ncanonical",

    "Nother_mod",

    "Ndelete",

    "Nfail",

    "Ndiff",

    "Nnocall",

]

# Remove duplicate and unnecessary columns from report:

columns_for_pdf_report = [

    BEDMETHYL_HEADER[i] for i in [1, 5, 9, 10, 11, 12, 13, 14, 15, 16, 17]

] + [

    "status"

]  # added during binarization

DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")

# For looking up modified_base_code_and_motif entries in bedmethyl:

MODIFICATION_CODES = pandas.read_csv(os.path.join(DATA_DIR, "mod_base_codes.csv"))

# Adapted From https://github.com/samtools/hts-specs/blob/master/SAMtags.pdf

class CustomTranslator(dna_features_viewer.BiopythonTranslator):

    """Custom translator used for the main plot."""

    def compute_filtered_features(self, features):

        """Display only selected features."""

        filtered_features = []

        n = 8  # a good number of features to display

        # Keep longest features

        if len(features) > n:

            feature_lengths = []

            for feature in features:

                feature_lengths += [len(feature.location)]

            feature_lengths.sort(reverse=True)

            max_length = feature_lengths[n]

            for feature in features:

                if len(feature.location) > max_length:

                    filtered_features += [feature]

        else:  # no need to do anything if not enough features

            filtered_features = features

        return filtered_features

class PatternTranslator(dna_features_viewer.BiopythonTranslator):

    """Custom translator used for the pattern annotation plot."""

    def compute_feature_color(self, feature):

        status = "status"

        # Values are from binarize_bed() :

        if feature.qualifiers[status] == "1":

            return "red"

        elif feature.qualifiers[status] == "U":

            return "yellow"

        elif feature.qualifiers[status] == "0":

            return "grey"

        # to be implemented later:

        elif feature.qualifiers[status] == "?":

            return "tab:cyan"

        # There should be no other options.

class BedResult:

    """Results of a bedmethyl table analysis."""

    def __init__(self, modification, bed, record):

        self.feature_cutoff = 50  # do not create a plot if there are more features

        self.modification = modification

        self.mod_abbreviation = MODIFICATION_CODES.loc[

            MODIFICATION_CODES["Code"] == self.modification

        ].Abbreviation.iloc[

            0

        ]  # there is only one entry so we take the first

        self.mod_name = MODIFICATION_CODES.loc[

            MODIFICATION_CODES["Code"] == self.modification

        ].Name.iloc[0]

        self.mod_chebi = MODIFICATION_CODES.loc[

            MODIFICATION_CODES["Code"] == self.modification

        ].ChEBI.iloc[0]

        self.unmodified_base = MODIFICATION_CODES.loc[

            MODIFICATION_CODES["Code"] == self.modification

        ].Unmodified_base.iloc[0]

        self.bed = bed

        self.record = record

        # FILTER (ANNOTATED) RECORD AND ADD STATUS:

        filtered_features = []

        labelstart = (

            "@epijinn(" + self.unmodified_base + ", strand="

        )  # unfinished to account for +/- strands

        for feature in record.features:

            if feature.id == "@epijinn":  # as annotated by the function

                if feature.qualifiers["label"].startswith(labelstart):

                    # To display seq in report:

                    feature.qualifiers["label"] = self.unmodified_base

                    # Assign status:

                    # no need to check for strand as the complement won't be modified

                    location_start = int(feature.location.start)

                    # expect exactly one bed table entry per modified nucleotide:

                    status_symbol = self.bed.loc[

                        self.bed["LOC"] == location_start

                    ].STATUS.iloc[0]

                    # used by a custom BiopythonTranslator to colour the annotation:

                    feature.qualifiers["status"] = status_symbol

                    filtered_features += [feature]

        record.features = filtered_features

        if len(self.record.features) > self.feature_cutoff:

            self.img_created = False

        else:

            self.img_created = True

            fig, ax1 = plt.subplots(1, 1, figsize=(8, 3))

            graphic_record = PatternTranslator().translate_record(self.record)

            graphic_record.plot(ax=ax1, with_ruler=False, strand_in_label_threshold=4)

            self.plot = fig

class BedmethylItem:

    """Analyse a bedmethyl file.

    **Parameters**

    **sample**

    > The name of the sample, for example a barcode id (`str`).

    **reference**

    > A Biopython SeqRecord.

    **bedmethyl**

    > A pandas dataframe of a bedmethyl file for the reference.

    """

    def __init__(self, sample, reference, bedmethyl):

        self.sample = sample

        self.record = reference

        self.name = reference.name

        self.id = reference.id

        self.bed = bedmethyl

        self.reference_length = len(self.record)

    def perform_analysis(self, parameter_dict):

        """Perform analysis and plot the sequence."""

        methylase_str = parameter_dict["methylases"]

        methylase = METHYLASES[methylase_str]

        self.fig = self.plot_record()

        self.results = []  # BedResult instances. For easy reference in pug template.

        for modification in self.bed.modified_base_code_and_motif.unique():

            # RUN BED SUBSET ETC

            bed_subtype = self.subset_bed_to_mod_subtype(self.bed, mod=modification)

            annotated_record, bed_pattern_match = self.subset_bed_to_pattern_match(

                methylase, bed=bed_subtype

            )

            bed_binarized = self.binarize_bed(

                bed_pattern_match,

                met_cutoff=parameter_dict["methylated_cutoff"],

                nonmet_cutoff=parameter_dict["unmethylated_cutoff"],

            )

            final_bed = subset_bed_columns(bed_binarized)

            bedresult = BedResult(modification, bed=final_bed, record=annotated_record)

            self.results += [bedresult]

    def annotate_record(self):

        return self.record

    def plot_record(self):

        fig, ax1 = plt.subplots(1, 1, figsize=(8, 2))

        graphic_record = CustomTranslator().translate_record(self.record)

        graphic_record.plot(ax=ax1, with_ruler=False, strand_in_label_threshold=4)

        return fig

    @staticmethod

    def subset_bed_to_mod_subtype(bed, mod):

        # Columnname from Bedmethyl file specification

        bed_subtype = bed.loc[bed["modified_base_code_and_motif"] == mod]

        return bed_subtype

    def subset_bed_to_base_matches(self, bed=None, modified_base="C"):

        if bed is None:  # optional bed allows linking multiple bed subset methods

            bed = self.bed

        mod_base = modified_base.upper()

        mod_base_complement = COMPLEMENTS[mod_base]

        # POSITIVE STRAND

        matching_positions_on_positive_strand = [

            pos

            for pos, base in enumerate(str(self.record.seq.upper()))

            if base == mod_base

        ]

        # Columnname from Bedmethyl file specification:

        positive_strand_filter = bed["start_position"].isin(

            matching_positions_on_positive_strand

        ) & (bed["strand"] == "+")

        # NEGATIVE STRAND

        matching_positions_on_negative_strand = [

            pos

            for pos, base in enumerate(str(self.record.seq.upper()))

            if base == mod_base_complement

        ]

        negative_strand_filter = bed["start_position"].isin(

            matching_positions_on_negative_strand

        ) & (bed["strand"] == "-")

        bed_basematch = bed.loc[positive_strand_filter | negative_strand_filter]

        return bed_basematch

    def subset_bed_to_pattern_match(self, methylase, bed=None):

        if bed is None:  # optional bed allows linking multiple bed subset methods

            bed = self.bed

        methylated_index = methylase.index_pos

        mod_base = methylase.sequence[methylated_index].upper()

        record_copy = copy.deepcopy(self.record)

        annotated_record = annotate_methylation(record_copy, methylases=[methylase])

        # CREATE LIST OF POSITIONS

        positive_strand_locations = []

        negative_strand_locations = []

        label_pos = "@epijinn(" + mod_base + ", strand=1)"

        label_neg = "@epijinn(" + mod_base + ", strand=-1)"

        for feature in annotated_record.features:

            if feature.id == "@epijinn":  # as annotated by function above

                if feature.qualifiers["label"] == label_pos:

                    positive_strand_locations += [feature.location.start]

                elif feature.qualifiers["label"] == label_neg:

                    negative_strand_locations += [feature.location.start]

        # SUBSET USING POSITIONS

        # Columnname from Bedmethyl file specification:

        positive_strand_filter = bed["start_position"].isin(

            positive_strand_locations

        ) & (bed["strand"] == "+")

        negative_strand_filter = bed["start_position"].isin(

            negative_strand_locations

        ) & (bed["strand"] == "-")

        bed_pattern_match = bed.loc[positive_strand_filter | negative_strand_filter]

        return annotated_record, bed_pattern_match

    @staticmethod

    def binarize_bed(bed, met_cutoff, nonmet_cutoff):

        bed["status"] = "U"  # prefill undetermined

        for index, row in bed.iterrows():

            if row["percent_modified"] >= float(met_cutoff) * 100:

                bed.loc[index, "status"] = "1"  # methylated, symbol may change

            elif row["percent_modified"] <= float(nonmet_cutoff) * 100:

                bed.loc[index, "status"] = "0"  # unmethylated

            # symbol "?" reserved for low coverage to be implemented later

        return bed

def read_sample_sheet(

    sample_sheet, genbank_dir="", bedmethyl_dir="", parameter_sheet=""

):

    """Read a sample sheet into a BedmethylItemGroup.

    **Parameters**

    **sample_sheet**

    > CSV file path (`str`). No header and columns must be in this order: projectname,

    sample, Genbank name (without extension), bedmethyl file.

    **genbank_dir**

    > Directory of the Genbank files (`str`). Default: local directory.

    **bedmethyl_dir**

    > Directory of the bedmethyl files (`str`). Default: local directory.

    **parameter_sheet**

    > CSV file path (`str`). Use 'Parameter', 'Value' header for columns. If a

    'projectname' is specified, it overwrites the sample sheet value.

    """

    # READ PARAMETERS

    param_df = pandas.read_csv(parameter_sheet, usecols=["Parameter", "Value"])

    parameter_dict = dict(param_df.values)

    # READ SAMPLES

    sample_df = pandas.read_csv(sample_sheet, header=None)

    # add columnnames

    # CREATE ITEMS

    # We allow Sequeduct to specify the projectname as a command parameter as well;

    if not "projectname" in parameter_dict:

        # first entry of the first column (contains projectname):

        parameter_dict["projectname"] = sample_df.iloc[0, 0]

    # Allow the user to not specify these in the sheet:

    # The default cutoffs are based on preliminary data.

    if not "methylated_cutoff" in parameter_dict:

        parameter_dict["methylated_cutoff"] = 0.7

    if not "unmethylated_cutoff" in parameter_dict:

        parameter_dict["unmethylated_cutoff"] = 0.3

    bedmethylitems = []

    for index, row in sample_df.iterrows():

        genbank_name = row[2]  # number specified by the sample sheet format

        genbank_path = os.path.join(genbank_dir, genbank_name + ".gb")  # Genbank ext

        record = Bio.SeqIO.read(genbank_path, "genbank")

        record.id = genbank_name

        record.name = genbank_name

        record.annotations["molecule_type"] = "DNA"

        bed_name = row[3]  # number specified by the sample sheet format

        bed_path = os.path.join(bedmethyl_dir, bed_name)

        bed_df = pandas.read_csv(bed_path, header=None, delimiter="\t")

        bed_df.columns = BEDMETHYL_HEADER

        bedmethylitems += [

            BedmethylItem(sample=row[1], reference=record, bedmethyl=bed_df)

        ]  # number specified by the sample sheet format

    bedmethylitemgroup = BedmethylItemGroup(

        bedmethylitems=bedmethylitems, parameter_dict=parameter_dict

    )

    return bedmethylitemgroup

def subset_bed_columns(bed):

    bed_report = bed[columns_for_pdf_report]

    # These were designed to be more informative and fit the report:

    new_columnnames = [

        "LOC",

        "Strand",

        "COV",

        "% mod",

        "MOD",

        "STD",

        "OTH",

        "del",

        "fail",

        "diff",

        "nocall",

        "STATUS",

    ]

    bed_report.columns = new_columnnames

    return bed_report

class BedmethylItemGroup:

    """A group of BedmethylItem instances for reporting.

    **Parameters**

    **bedmethylitems**

    > A list of BedmethylItem instances.

    **parameter_dict**

    > A dictionary of analysis parameters (`dict`).

    """

    def __init__(self, bedmethylitems, parameter_dict):

        self.bedmethylitems = bedmethylitems

        self.parameter_dict = parameter_dict

        self.number_of_samples = len(bedmethylitems)

    def perform_all_analysis_in_bedmethylitemgroup(self):

        for bedmethylitem in self.bedmethylitems:

            bedmethylitem.perform_analysis(parameter_dict=self.parameter_dict)

        self.comparisons_performed = True

Variables

BEDMETHYL_HEADER

COMPLEMENTS

DATA_DIR

METHYLASES

MODIFICATION_CODES

columns_for_pdf_report

Functions

read_sample_sheet

def read_sample_sheet(
    sample_sheet,
    genbank_dir='',
    bedmethyl_dir='',
    parameter_sheet=''
)

Read a sample sheet into a BedmethylItemGroup.

Parameters

sample_sheet

CSV file path (str). No header and columns must be in this order: projectname, sample, Genbank name (without extension), bedmethyl file.

genbank_dir

Directory of the Genbank files (str). Default: local directory.

bedmethyl_dir

Directory of the bedmethyl files (str). Default: local directory.

parameter_sheet

CSV file path (str). Use 'Parameter', 'Value' header for columns. If a 'projectname' is specified, it overwrites the sample sheet value.

View Source

def read_sample_sheet(

    sample_sheet, genbank_dir="", bedmethyl_dir="", parameter_sheet=""

):

    """Read a sample sheet into a BedmethylItemGroup.

    **Parameters**

    **sample_sheet**

    > CSV file path (`str`). No header and columns must be in this order: projectname,

    sample, Genbank name (without extension), bedmethyl file.

    **genbank_dir**

    > Directory of the Genbank files (`str`). Default: local directory.

    **bedmethyl_dir**

    > Directory of the bedmethyl files (`str`). Default: local directory.

    **parameter_sheet**

    > CSV file path (`str`). Use 'Parameter', 'Value' header for columns. If a

    'projectname' is specified, it overwrites the sample sheet value.

    """

    # READ PARAMETERS

    param_df = pandas.read_csv(parameter_sheet, usecols=["Parameter", "Value"])

    parameter_dict = dict(param_df.values)

    # READ SAMPLES

    sample_df = pandas.read_csv(sample_sheet, header=None)

    # add columnnames

    # CREATE ITEMS

    # We allow Sequeduct to specify the projectname as a command parameter as well;

    if not "projectname" in parameter_dict:

        # first entry of the first column (contains projectname):

        parameter_dict["projectname"] = sample_df.iloc[0, 0]

    # Allow the user to not specify these in the sheet:

    # The default cutoffs are based on preliminary data.

    if not "methylated_cutoff" in parameter_dict:

        parameter_dict["methylated_cutoff"] = 0.7

    if not "unmethylated_cutoff" in parameter_dict:

        parameter_dict["unmethylated_cutoff"] = 0.3

    bedmethylitems = []

    for index, row in sample_df.iterrows():

        genbank_name = row[2]  # number specified by the sample sheet format

        genbank_path = os.path.join(genbank_dir, genbank_name + ".gb")  # Genbank ext

        record = Bio.SeqIO.read(genbank_path, "genbank")

        record.id = genbank_name

        record.name = genbank_name

        record.annotations["molecule_type"] = "DNA"

        bed_name = row[3]  # number specified by the sample sheet format

        bed_path = os.path.join(bedmethyl_dir, bed_name)

        bed_df = pandas.read_csv(bed_path, header=None, delimiter="\t")

        bed_df.columns = BEDMETHYL_HEADER

        bedmethylitems += [

            BedmethylItem(sample=row[1], reference=record, bedmethyl=bed_df)

        ]  # number specified by the sample sheet format

    bedmethylitemgroup = BedmethylItemGroup(

        bedmethylitems=bedmethylitems, parameter_dict=parameter_dict

    )

    return bedmethylitemgroup

subset_bed_columns

def subset_bed_columns(
    bed
)

View Source

def subset_bed_columns(bed):

    bed_report = bed[columns_for_pdf_report]

    # These were designed to be more informative and fit the report:

    new_columnnames = [

        "LOC",

        "Strand",

        "COV",

        "% mod",

        "MOD",

        "STD",

        "OTH",

        "del",

        "fail",

        "diff",

        "nocall",

        "STATUS",

    ]

    bed_report.columns = new_columnnames

    return bed_report

Classes

BedResult

class BedResult(
    modification,
    bed,
    record
)

Results of a bedmethyl table analysis.

View Source

class BedResult:

    """Results of a bedmethyl table analysis."""

    def __init__(self, modification, bed, record):

        self.feature_cutoff = 50  # do not create a plot if there are more features

        self.modification = modification

        self.mod_abbreviation = MODIFICATION_CODES.loc[

            MODIFICATION_CODES["Code"] == self.modification

        ].Abbreviation.iloc[

            0

        ]  # there is only one entry so we take the first

        self.mod_name = MODIFICATION_CODES.loc[

            MODIFICATION_CODES["Code"] == self.modification

        ].Name.iloc[0]

        self.mod_chebi = MODIFICATION_CODES.loc[

            MODIFICATION_CODES["Code"] == self.modification

        ].ChEBI.iloc[0]

        self.unmodified_base = MODIFICATION_CODES.loc[

            MODIFICATION_CODES["Code"] == self.modification

        ].Unmodified_base.iloc[0]

        self.bed = bed

        self.record = record

        # FILTER (ANNOTATED) RECORD AND ADD STATUS:

        filtered_features = []

        labelstart = (

            "@epijinn(" + self.unmodified_base + ", strand="

        )  # unfinished to account for +/- strands

        for feature in record.features:

            if feature.id == "@epijinn":  # as annotated by the function

                if feature.qualifiers["label"].startswith(labelstart):

                    # To display seq in report:

                    feature.qualifiers["label"] = self.unmodified_base

                    # Assign status:

                    # no need to check for strand as the complement won't be modified

                    location_start = int(feature.location.start)

                    # expect exactly one bed table entry per modified nucleotide:

                    status_symbol = self.bed.loc[

                        self.bed["LOC"] == location_start

                    ].STATUS.iloc[0]

                    # used by a custom BiopythonTranslator to colour the annotation:

                    feature.qualifiers["status"] = status_symbol

                    filtered_features += [feature]

        record.features = filtered_features

        if len(self.record.features) > self.feature_cutoff:

            self.img_created = False

        else:

            self.img_created = True

            fig, ax1 = plt.subplots(1, 1, figsize=(8, 3))

            graphic_record = PatternTranslator().translate_record(self.record)

            graphic_record.plot(ax=ax1, with_ruler=False, strand_in_label_threshold=4)

            self.plot = fig

BedmethylItem

class BedmethylItem(
    sample,
    reference,
    bedmethyl
)

Analyse a bedmethyl file.

Parameters

sample

The name of the sample, for example a barcode id (str).

reference

A Biopython SeqRecord.

bedmethyl

A pandas dataframe of a bedmethyl file for the reference.

View Source

class BedmethylItem:

    """Analyse a bedmethyl file.

    **Parameters**

    **sample**

    > The name of the sample, for example a barcode id (`str`).

    **reference**

    > A Biopython SeqRecord.

    **bedmethyl**

    > A pandas dataframe of a bedmethyl file for the reference.

    """

    def __init__(self, sample, reference, bedmethyl):

        self.sample = sample

        self.record = reference

        self.name = reference.name

        self.id = reference.id

        self.bed = bedmethyl

        self.reference_length = len(self.record)

    def perform_analysis(self, parameter_dict):

        """Perform analysis and plot the sequence."""

        methylase_str = parameter_dict["methylases"]

        methylase = METHYLASES[methylase_str]

        self.fig = self.plot_record()

        self.results = []  # BedResult instances. For easy reference in pug template.

        for modification in self.bed.modified_base_code_and_motif.unique():

            # RUN BED SUBSET ETC

            bed_subtype = self.subset_bed_to_mod_subtype(self.bed, mod=modification)

            annotated_record, bed_pattern_match = self.subset_bed_to_pattern_match(

                methylase, bed=bed_subtype

            )

            bed_binarized = self.binarize_bed(

                bed_pattern_match,

                met_cutoff=parameter_dict["methylated_cutoff"],

                nonmet_cutoff=parameter_dict["unmethylated_cutoff"],

            )

            final_bed = subset_bed_columns(bed_binarized)

            bedresult = BedResult(modification, bed=final_bed, record=annotated_record)

            self.results += [bedresult]

    def annotate_record(self):

        return self.record

    def plot_record(self):

        fig, ax1 = plt.subplots(1, 1, figsize=(8, 2))

        graphic_record = CustomTranslator().translate_record(self.record)

        graphic_record.plot(ax=ax1, with_ruler=False, strand_in_label_threshold=4)

        return fig

    @staticmethod

    def subset_bed_to_mod_subtype(bed, mod):

        # Columnname from Bedmethyl file specification

        bed_subtype = bed.loc[bed["modified_base_code_and_motif"] == mod]

        return bed_subtype

    def subset_bed_to_base_matches(self, bed=None, modified_base="C"):

        if bed is None:  # optional bed allows linking multiple bed subset methods

            bed = self.bed

        mod_base = modified_base.upper()

        mod_base_complement = COMPLEMENTS[mod_base]

        # POSITIVE STRAND

        matching_positions_on_positive_strand = [

            pos

            for pos, base in enumerate(str(self.record.seq.upper()))

            if base == mod_base

        ]

        # Columnname from Bedmethyl file specification:

        positive_strand_filter = bed["start_position"].isin(

            matching_positions_on_positive_strand

        ) & (bed["strand"] == "+")

        # NEGATIVE STRAND

        matching_positions_on_negative_strand = [

            pos

            for pos, base in enumerate(str(self.record.seq.upper()))

            if base == mod_base_complement

        ]

        negative_strand_filter = bed["start_position"].isin(

            matching_positions_on_negative_strand

        ) & (bed["strand"] == "-")

        bed_basematch = bed.loc[positive_strand_filter | negative_strand_filter]

        return bed_basematch

    def subset_bed_to_pattern_match(self, methylase, bed=None):

        if bed is None:  # optional bed allows linking multiple bed subset methods

            bed = self.bed

        methylated_index = methylase.index_pos

        mod_base = methylase.sequence[methylated_index].upper()

        record_copy = copy.deepcopy(self.record)

        annotated_record = annotate_methylation(record_copy, methylases=[methylase])

        # CREATE LIST OF POSITIONS

        positive_strand_locations = []

        negative_strand_locations = []

        label_pos = "@epijinn(" + mod_base + ", strand=1)"

        label_neg = "@epijinn(" + mod_base + ", strand=-1)"

        for feature in annotated_record.features:

            if feature.id == "@epijinn":  # as annotated by function above

                if feature.qualifiers["label"] == label_pos:

                    positive_strand_locations += [feature.location.start]

                elif feature.qualifiers["label"] == label_neg:

                    negative_strand_locations += [feature.location.start]

        # SUBSET USING POSITIONS

        # Columnname from Bedmethyl file specification:

        positive_strand_filter = bed["start_position"].isin(

            positive_strand_locations

        ) & (bed["strand"] == "+")

        negative_strand_filter = bed["start_position"].isin(

            negative_strand_locations

        ) & (bed["strand"] == "-")

        bed_pattern_match = bed.loc[positive_strand_filter | negative_strand_filter]

        return annotated_record, bed_pattern_match

    @staticmethod

    def binarize_bed(bed, met_cutoff, nonmet_cutoff):

        bed["status"] = "U"  # prefill undetermined

        for index, row in bed.iterrows():

            if row["percent_modified"] >= float(met_cutoff) * 100:

                bed.loc[index, "status"] = "1"  # methylated, symbol may change

            elif row["percent_modified"] <= float(nonmet_cutoff) * 100:

                bed.loc[index, "status"] = "0"  # unmethylated

            # symbol "?" reserved for low coverage to be implemented later

        return bed

Static methods

binarize_bed

def binarize_bed(
    bed,
    met_cutoff,
    nonmet_cutoff
)

View Source

    @staticmethod

    def binarize_bed(bed, met_cutoff, nonmet_cutoff):

        bed["status"] = "U"  # prefill undetermined

        for index, row in bed.iterrows():

            if row["percent_modified"] >= float(met_cutoff) * 100:

                bed.loc[index, "status"] = "1"  # methylated, symbol may change

            elif row["percent_modified"] <= float(nonmet_cutoff) * 100:

                bed.loc[index, "status"] = "0"  # unmethylated

            # symbol "?" reserved for low coverage to be implemented later

        return bed

subset_bed_to_mod_subtype

def subset_bed_to_mod_subtype(
    bed,
    mod
)

View Source

    @staticmethod

    def subset_bed_to_mod_subtype(bed, mod):

        # Columnname from Bedmethyl file specification

        bed_subtype = bed.loc[bed["modified_base_code_and_motif"] == mod]

        return bed_subtype

Methods

annotate_record

def annotate_record(
    self
)

View Source

    def annotate_record(self):

        return self.record

perform_analysis

def perform_analysis(
    self,
    parameter_dict
)

Perform analysis and plot the sequence.

View Source

    def perform_analysis(self, parameter_dict):

        """Perform analysis and plot the sequence."""

        methylase_str = parameter_dict["methylases"]

        methylase = METHYLASES[methylase_str]

        self.fig = self.plot_record()

        self.results = []  # BedResult instances. For easy reference in pug template.

        for modification in self.bed.modified_base_code_and_motif.unique():

            # RUN BED SUBSET ETC

            bed_subtype = self.subset_bed_to_mod_subtype(self.bed, mod=modification)

            annotated_record, bed_pattern_match = self.subset_bed_to_pattern_match(

                methylase, bed=bed_subtype

            )

            bed_binarized = self.binarize_bed(

                bed_pattern_match,

                met_cutoff=parameter_dict["methylated_cutoff"],

                nonmet_cutoff=parameter_dict["unmethylated_cutoff"],

            )

            final_bed = subset_bed_columns(bed_binarized)

            bedresult = BedResult(modification, bed=final_bed, record=annotated_record)

            self.results += [bedresult]

plot_record

def plot_record(
    self
)

View Source

    def plot_record(self):

        fig, ax1 = plt.subplots(1, 1, figsize=(8, 2))

        graphic_record = CustomTranslator().translate_record(self.record)

        graphic_record.plot(ax=ax1, with_ruler=False, strand_in_label_threshold=4)

        return fig

subset_bed_to_base_matches

def subset_bed_to_base_matches(
    self,
    bed=None,
    modified_base='C'
)

View Source

    def subset_bed_to_base_matches(self, bed=None, modified_base="C"):

        if bed is None:  # optional bed allows linking multiple bed subset methods

            bed = self.bed

        mod_base = modified_base.upper()

        mod_base_complement = COMPLEMENTS[mod_base]

        # POSITIVE STRAND

        matching_positions_on_positive_strand = [

            pos

            for pos, base in enumerate(str(self.record.seq.upper()))

            if base == mod_base

        ]

        # Columnname from Bedmethyl file specification:

        positive_strand_filter = bed["start_position"].isin(

            matching_positions_on_positive_strand

        ) & (bed["strand"] == "+")

        # NEGATIVE STRAND

        matching_positions_on_negative_strand = [

            pos

            for pos, base in enumerate(str(self.record.seq.upper()))

            if base == mod_base_complement

        ]

        negative_strand_filter = bed["start_position"].isin(

            matching_positions_on_negative_strand

        ) & (bed["strand"] == "-")

        bed_basematch = bed.loc[positive_strand_filter | negative_strand_filter]

        return bed_basematch

subset_bed_to_pattern_match

def subset_bed_to_pattern_match(
    self,
    methylase,
    bed=None
)

View Source

    def subset_bed_to_pattern_match(self, methylase, bed=None):

        if bed is None:  # optional bed allows linking multiple bed subset methods

            bed = self.bed

        methylated_index = methylase.index_pos

        mod_base = methylase.sequence[methylated_index].upper()

        record_copy = copy.deepcopy(self.record)

        annotated_record = annotate_methylation(record_copy, methylases=[methylase])

        # CREATE LIST OF POSITIONS

        positive_strand_locations = []

        negative_strand_locations = []

        label_pos = "@epijinn(" + mod_base + ", strand=1)"

        label_neg = "@epijinn(" + mod_base + ", strand=-1)"

        for feature in annotated_record.features:

            if feature.id == "@epijinn":  # as annotated by function above

                if feature.qualifiers["label"] == label_pos:

                    positive_strand_locations += [feature.location.start]

                elif feature.qualifiers["label"] == label_neg:

                    negative_strand_locations += [feature.location.start]

        # SUBSET USING POSITIONS

        # Columnname from Bedmethyl file specification:

        positive_strand_filter = bed["start_position"].isin(

            positive_strand_locations

        ) & (bed["strand"] == "+")

        negative_strand_filter = bed["start_position"].isin(

            negative_strand_locations

        ) & (bed["strand"] == "-")

        bed_pattern_match = bed.loc[positive_strand_filter | negative_strand_filter]

        return annotated_record, bed_pattern_match

BedmethylItemGroup

class BedmethylItemGroup(
    bedmethylitems,
    parameter_dict
)

A group of BedmethylItem instances for reporting.

Parameters

bedmethylitems

A list of BedmethylItem instances.

parameter_dict

A dictionary of analysis parameters (dict).

View Source

class BedmethylItemGroup:

    """A group of BedmethylItem instances for reporting.

    **Parameters**

    **bedmethylitems**

    > A list of BedmethylItem instances.

    **parameter_dict**

    > A dictionary of analysis parameters (`dict`).

    """

    def __init__(self, bedmethylitems, parameter_dict):

        self.bedmethylitems = bedmethylitems

        self.parameter_dict = parameter_dict

        self.number_of_samples = len(bedmethylitems)

    def perform_all_analysis_in_bedmethylitemgroup(self):

        for bedmethylitem in self.bedmethylitems:

            bedmethylitem.perform_analysis(parameter_dict=self.parameter_dict)

        self.comparisons_performed = True

Methods

perform_all_analysis_in_bedmethylitemgroup

def perform_all_analysis_in_bedmethylitemgroup(
    self
)

View Source

    def perform_all_analysis_in_bedmethylitemgroup(self):

        for bedmethylitem in self.bedmethylitems:

            bedmethylitem.perform_analysis(parameter_dict=self.parameter_dict)

        self.comparisons_performed = True

CustomTranslator

class CustomTranslator(
    features_filters=(),
    features_properties=None
)

Custom translator used for the main plot.

View Source

class CustomTranslator(dna_features_viewer.BiopythonTranslator):

    """Custom translator used for the main plot."""

    def compute_filtered_features(self, features):

        """Display only selected features."""

        filtered_features = []

        n = 8  # a good number of features to display

        # Keep longest features

        if len(features) > n:

            feature_lengths = []

            for feature in features:

                feature_lengths += [len(feature.location)]

            feature_lengths.sort(reverse=True)

            max_length = feature_lengths[n]

            for feature in features:

                if len(feature.location) > max_length:

                    filtered_features += [feature]

        else:  # no need to do anything if not enough features

            filtered_features = features

        return filtered_features

Ancestors (in MRO)

dna_features_viewer.BiopythonTranslator.BiopythonTranslator.BiopythonTranslator
dna_features_viewer.BiopythonTranslator.BiopythonTranslatorBase.BiopythonTranslatorBase

Class variables

default_feature_color

graphic_record_parameters

ignored_features_types

label_fields

Static methods

quick_class_plot

def quick_class_plot(
    record,
    figure_width=12,
    **kwargs
)

Allows super quick and dirty plotting of Biopython records.

This is really meant for use in a Jupyter/Ipython notebook with the "%matplotlib inline" setting.

from dna_features_viewer import BiopythonTranslator BiopythonTranslator.quick_plot(my_record)

View Source

    @classmethod

    def quick_class_plot(cls, record, figure_width=12, **kwargs):

        """Allows super quick and dirty plotting of Biopython records.

        This is really meant for use in a Jupyter/Ipython notebook with

        the "%matplotlib inline" setting.

        >>> from dna_features_viewer import BiopythonTranslator

        >>> BiopythonTranslator.quick_plot(my_record)

        """

        graphic_record = cls().translate_record(record)

        ax, _ = graphic_record.plot(figure_width=figure_width, **kwargs)

        return ax

Methods

compute_feature_box_color

def compute_feature_box_color(
    self,
    feature
)

Compute a box_color for this feature.

View Source

    def compute_feature_box_color(self, feature):

        """Compute a box_color for this feature."""

        return "auto"

compute_feature_box_linewidth

def compute_feature_box_linewidth(
    self,
    feature
)

Compute a box_linewidth for this feature.

View Source

    def compute_feature_box_linewidth(self, feature):

        """Compute a box_linewidth for this feature."""

        return 0.3

compute_feature_color

def compute_feature_color(
    self,
    feature
)

Compute a color for this feature.

If the feature has a color qualifier it will be used. Otherwise, the classe's default_feature_color is used.

To change the behaviour, create a subclass of BiopythonTranslator and overwrite this method.

View Source

    def compute_feature_color(self, feature):

        """Compute a color for this feature.

        If the feature has a ``color`` qualifier it will be used. Otherwise,

        the classe's ``default_feature_color`` is used.

        To change the behaviour, create a subclass of ``BiopythonTranslator``

        and overwrite this method.

        """

        if "color" in feature.qualifiers:

            color = feature.qualifiers["color"]

            if isinstance(color[0], str):

                return "".join(feature.qualifiers["color"])

            else:

                return color

        else:

            return self.default_feature_color

compute_feature_fontdict

def compute_feature_fontdict(
    self,
    feature
)

Compute a font dict for this feature.

View Source

    def compute_feature_fontdict(self, feature):

        """Compute a font dict for this feature."""

        return None

compute_feature_html

def compute_feature_html(
    self,
    feature
)

Gets the 'label' of the feature.

View Source

    def compute_feature_html(self, feature):

        """Gets the 'label' of the feature."""

        return self.compute_feature_label(feature)

compute_feature_label

def compute_feature_label(
    self,
    feature
)

Compute the label of the feature.

View Source

    def compute_feature_label(self, feature):

        """Compute the label of the feature."""

        label = feature.type

        for key in self.label_fields:

            if key in feature.qualifiers and len(feature.qualifiers[key]):

                label = feature.qualifiers[key]

                break

        if isinstance(label, list):

            label = "|".join(label)

        return label

compute_feature_label_link_color

def compute_feature_label_link_color(
    self,
    feature
)

Compute the color of the line linking the label to its feature.

View Source

    def compute_feature_label_link_color(self, feature):

        """Compute the color of the line linking the label to its feature."""

        return "black"

compute_feature_legend_text

def compute_feature_legend_text(
    self,
    feature
)

View Source

    def compute_feature_legend_text(self, feature):

        return None

compute_feature_linewidth

def compute_feature_linewidth(
    self,
    feature
)

Compute the edge width of the feature's arrow/rectangle.

View Source

    def compute_feature_linewidth(self, feature):

        """Compute the edge width of the feature's arrow/rectangle."""

        return 1.0

compute_filtered_features

def compute_filtered_features(
    self,
    features
)

Display only selected features.

View Source

    def compute_filtered_features(self, features):

        """Display only selected features."""

        filtered_features = []

        n = 8  # a good number of features to display

        # Keep longest features

        if len(features) > n:

            feature_lengths = []

            for feature in features:

                feature_lengths += [len(feature.location)]

            feature_lengths.sort(reverse=True)

            max_length = feature_lengths[n]

            for feature in features:

                if len(feature.location) > max_length:

                    filtered_features += [feature]

        else:  # no need to do anything if not enough features

            filtered_features = features

        return filtered_features

quick_plot

def quick_plot(
    self,
    record,
    figure_width=12,
    **kwargs
)

Allows super quick and dirty plotting of Biopython records.

This is really meant for use in a Jupyter/Ipython notebook with the "%matplotlib inline" setting.

from dna_features_viewer import BiopythonTranslator BiopythonTranslator.quick_plot(my_record)

View Source

    def quick_plot(self, record, figure_width=12, **kwargs):

        """Allows super quick and dirty plotting of Biopython records.

        This is really meant for use in a Jupyter/Ipython notebook with

        the "%matplotlib inline" setting.

        >>> from dna_features_viewer import BiopythonTranslator

        >>> BiopythonTranslator.quick_plot(my_record)

        """

        graphic_record = self.translate_record(record)

        ax, _ = graphic_record.plot(figure_width=figure_width, **kwargs)

        return ax

translate_feature

def translate_feature(
    self,
    feature
)

Translate a Biopython feature into a Dna Features Viewer feature.

View Source

    def translate_feature(self, feature):

        """Translate a Biopython feature into a Dna Features Viewer feature."""

        properties = dict(

            label=self.compute_feature_label(feature),

            color=self.compute_feature_color(feature),

            html=self.compute_feature_html(feature),

            fontdict=self.compute_feature_fontdict(feature),

            box_linewidth=self.compute_feature_box_linewidth(feature),

            box_color=self.compute_feature_box_color(feature),

            linewidth=self.compute_feature_linewidth(feature),

            label_link_color=self.compute_feature_label_link_color(feature),

            legend_text=self.compute_feature_legend_text(feature),

        )

        if self.features_properties is not None:

            other_properties = self.features_properties

            if hasattr(other_properties, "__call__"):

                other_properties = other_properties(feature)

            properties.update(other_properties)

        return GraphicFeature(

            start=feature.location.start,

            end=feature.location.end,

            strand=feature.location.strand,

            **properties

        )

translate_record

def translate_record(
    self,
    record,
    record_class=None,
    filetype=None
)

Create a new GraphicRecord from a BioPython Record object.

Parameters:

Name	Type	Description	Default
record	None	A BioPython Record object or the path to a Genbank or a GFF file.	None
record_class	None	The graphic record class to use, e.g. GraphicRecord (default) or CircularGraphicRecord. Strings 'circular' and 'linear' can also be provided.	None
filetype	None	Used only when a Genbank or a GFF file is provided; one of "genbank" or "gff" to be used. Default None infers from file extension.	None

View Source

    def translate_record(self, record, record_class=None, filetype=None):

        """Create a new GraphicRecord from a BioPython Record object.

        Parameters

        ----------

        record

          A BioPython Record object or the path to a Genbank or a GFF file.

        record_class

          The graphic record class to use, e.g. GraphicRecord (default) or

          CircularGraphicRecord. Strings 'circular' and 'linear' can also be

          provided.

        filetype

          Used only when a Genbank or a GFF file is provided; one of "genbank"

          or "gff" to be used. Default None infers from file extension.

        """

        classes = {

            "linear": GraphicRecord,

            "circular": CircularGraphicRecord,

            None: GraphicRecord,

        }

        if record_class in classes:

            record_class = classes[record_class]

        if isinstance(record, str) or hasattr(record, "read"):

            record = load_record(record, filetype=filetype)

        filtered_features = self.compute_filtered_features(record.features)

        return record_class(

            sequence_length=len(record),

            sequence=str(record.seq) if record.seq.defined else None,

            features=[

                self.translate_feature(feature)

                for feature in filtered_features

                if feature.location is not None

            ],

            **self.graphic_record_parameters

        )

PatternTranslator

class PatternTranslator(
    features_filters=(),
    features_properties=None
)

Custom translator used for the pattern annotation plot.

View Source

class PatternTranslator(dna_features_viewer.BiopythonTranslator):

    """Custom translator used for the pattern annotation plot."""

    def compute_feature_color(self, feature):

        status = "status"

        # Values are from binarize_bed() :

        if feature.qualifiers[status] == "1":

            return "red"

        elif feature.qualifiers[status] == "U":

            return "yellow"

        elif feature.qualifiers[status] == "0":

            return "grey"

        # to be implemented later:

        elif feature.qualifiers[status] == "?":

            return "tab:cyan"

        # There should be no other options.

Ancestors (in MRO)

dna_features_viewer.BiopythonTranslator.BiopythonTranslator.BiopythonTranslator
dna_features_viewer.BiopythonTranslator.BiopythonTranslatorBase.BiopythonTranslatorBase

Class variables

default_feature_color

graphic_record_parameters

ignored_features_types

label_fields

Static methods

quick_class_plot

def quick_class_plot(
    record,
    figure_width=12,
    **kwargs
)

Allows super quick and dirty plotting of Biopython records.

This is really meant for use in a Jupyter/Ipython notebook with the "%matplotlib inline" setting.

from dna_features_viewer import BiopythonTranslator BiopythonTranslator.quick_plot(my_record)

View Source

    @classmethod

    def quick_class_plot(cls, record, figure_width=12, **kwargs):

        """Allows super quick and dirty plotting of Biopython records.

        This is really meant for use in a Jupyter/Ipython notebook with

        the "%matplotlib inline" setting.

        >>> from dna_features_viewer import BiopythonTranslator

        >>> BiopythonTranslator.quick_plot(my_record)

        """

        graphic_record = cls().translate_record(record)

        ax, _ = graphic_record.plot(figure_width=figure_width, **kwargs)

        return ax

Methods

compute_feature_box_color

def compute_feature_box_color(
    self,
    feature
)

Compute a box_color for this feature.

View Source

    def compute_feature_box_color(self, feature):

        """Compute a box_color for this feature."""

        return "auto"

compute_feature_box_linewidth

def compute_feature_box_linewidth(
    self,
    feature
)

Compute a box_linewidth for this feature.

View Source

    def compute_feature_box_linewidth(self, feature):

        """Compute a box_linewidth for this feature."""

        return 0.3

compute_feature_color

def compute_feature_color(
    self,
    feature
)

Compute a color for this feature.

If the feature has a color qualifier it will be used. Otherwise, the classe's default_feature_color is used.

To change the behaviour, create a subclass of BiopythonTranslator and overwrite this method.

View Source

    def compute_feature_color(self, feature):

        status = "status"

        # Values are from binarize_bed() :

        if feature.qualifiers[status] == "1":

            return "red"

        elif feature.qualifiers[status] == "U":

            return "yellow"

        elif feature.qualifiers[status] == "0":

            return "grey"

        # to be implemented later:

        elif feature.qualifiers[status] == "?":

            return "tab:cyan"

        # There should be no other options.

compute_feature_fontdict

def compute_feature_fontdict(
    self,
    feature
)

Compute a font dict for this feature.

View Source

    def compute_feature_fontdict(self, feature):

        """Compute a font dict for this feature."""

        return None

compute_feature_html

def compute_feature_html(
    self,
    feature
)

Gets the 'label' of the feature.

View Source

    def compute_feature_html(self, feature):

        """Gets the 'label' of the feature."""

        return self.compute_feature_label(feature)

compute_feature_label

def compute_feature_label(
    self,
    feature
)

Compute the label of the feature.

View Source

    def compute_feature_label(self, feature):

        """Compute the label of the feature."""

        label = feature.type

        for key in self.label_fields:

            if key in feature.qualifiers and len(feature.qualifiers[key]):

                label = feature.qualifiers[key]

                break

        if isinstance(label, list):

            label = "|".join(label)

        return label

compute_feature_label_link_color

def compute_feature_label_link_color(
    self,
    feature
)

Compute the color of the line linking the label to its feature.

View Source

    def compute_feature_label_link_color(self, feature):

        """Compute the color of the line linking the label to its feature."""

        return "black"

compute_feature_legend_text

def compute_feature_legend_text(
    self,
    feature
)

View Source

    def compute_feature_legend_text(self, feature):

        return None

compute_feature_linewidth

def compute_feature_linewidth(
    self,
    feature
)

Compute the edge width of the feature's arrow/rectangle.

View Source

    def compute_feature_linewidth(self, feature):

        """Compute the edge width of the feature's arrow/rectangle."""

        return 1.0

compute_filtered_features

def compute_filtered_features(
    self,
    features
)

Return the list of features minus the ignored ones.

By the method keeps any feature whose type is not in ignored_features_types and for which all filter(f) pass.

View Source

    def compute_filtered_features(self, features):

        """Return the list of features minus the ignored ones.

        By the method keeps any feature whose type is not in

        ignored_features_types and for which all filter(f) pass.

        """

        return [

            f

            for f in features

            if all([fl(f) for fl in self.features_filters])

            and f.type not in self.ignored_features_types

        ]

quick_plot

def quick_plot(
    self,
    record,
    figure_width=12,
    **kwargs
)

Allows super quick and dirty plotting of Biopython records.

This is really meant for use in a Jupyter/Ipython notebook with the "%matplotlib inline" setting.

from dna_features_viewer import BiopythonTranslator BiopythonTranslator.quick_plot(my_record)

View Source

    def quick_plot(self, record, figure_width=12, **kwargs):

        """Allows super quick and dirty plotting of Biopython records.

        This is really meant for use in a Jupyter/Ipython notebook with

        the "%matplotlib inline" setting.

        >>> from dna_features_viewer import BiopythonTranslator

        >>> BiopythonTranslator.quick_plot(my_record)

        """

        graphic_record = self.translate_record(record)

        ax, _ = graphic_record.plot(figure_width=figure_width, **kwargs)

        return ax

translate_feature

def translate_feature(
    self,
    feature
)

Translate a Biopython feature into a Dna Features Viewer feature.

View Source

    def translate_feature(self, feature):

        """Translate a Biopython feature into a Dna Features Viewer feature."""

        properties = dict(

            label=self.compute_feature_label(feature),

            color=self.compute_feature_color(feature),

            html=self.compute_feature_html(feature),

            fontdict=self.compute_feature_fontdict(feature),

            box_linewidth=self.compute_feature_box_linewidth(feature),

            box_color=self.compute_feature_box_color(feature),

            linewidth=self.compute_feature_linewidth(feature),

            label_link_color=self.compute_feature_label_link_color(feature),

            legend_text=self.compute_feature_legend_text(feature),

        )

        if self.features_properties is not None:

            other_properties = self.features_properties

            if hasattr(other_properties, "__call__"):

                other_properties = other_properties(feature)

            properties.update(other_properties)

        return GraphicFeature(

            start=feature.location.start,

            end=feature.location.end,

            strand=feature.location.strand,

            **properties

        )

translate_record

def translate_record(
    self,
    record,
    record_class=None,
    filetype=None
)

Create a new GraphicRecord from a BioPython Record object.

Parameters:

Name	Type	Description	Default
record	None	A BioPython Record object or the path to a Genbank or a GFF file.	None
record_class	None	The graphic record class to use, e.g. GraphicRecord (default) or CircularGraphicRecord. Strings 'circular' and 'linear' can also be provided.	None
filetype	None	Used only when a Genbank or a GFF file is provided; one of "genbank" or "gff" to be used. Default None infers from file extension.	None

View Source

    def translate_record(self, record, record_class=None, filetype=None):

        """Create a new GraphicRecord from a BioPython Record object.

        Parameters

        ----------

        record

          A BioPython Record object or the path to a Genbank or a GFF file.

        record_class

          The graphic record class to use, e.g. GraphicRecord (default) or

          CircularGraphicRecord. Strings 'circular' and 'linear' can also be

          provided.

        filetype

          Used only when a Genbank or a GFF file is provided; one of "genbank"

          or "gff" to be used. Default None infers from file extension.

        """

        classes = {

            "linear": GraphicRecord,

            "circular": CircularGraphicRecord,

            None: GraphicRecord,

        }

        if record_class in classes:

            record_class = classes[record_class]

        if isinstance(record, str) or hasattr(record, "read"):

            record = load_record(record, filetype=filetype)

        filtered_features = self.compute_filtered_features(record.features)

        return record_class(

            sequence_length=len(record),

            sequence=str(record.seq) if record.seq.defined else None,

            features=[

                self.translate_feature(feature)

                for feature in filtered_features

                if feature.location is not None

            ],

            **self.graphic_record_parameters

        )