Module epijinn.Methyl
View Source
# Copyright 2024 Edinburgh Genome Foundry, University of Edinburgh
#
# This file is part of EpiJinn.
#
# EpiJinn is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
#
# EpiJinn is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with Ediacara. If not, see <https://www.gnu.org/licenses/>.
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
import dnachisel
class Methylase:
"""Methylase enzyme class.
**Parameters**
**name**
> Name of the enzyme (`str`).
**sequence**
> Sequence of extended nucleotide characters (`str`).
**index_pos**
> Index of the methylated base on the positive strand (`int`).
**index_neg**
> Index of the methylated base on the negative strand (`int`).
"""
complement_table = {
"A": "T",
"G": "C",
"C": "G",
"T": "A",
"Y": "R",
"R": "Y",
"W": "W",
"S": "S",
"K": "M",
"M": "K",
"D": "H",
"V": "B",
"H": "D",
"B": "V",
"X": "X",
"N": "N",
}
def __init__(self, name, sequence, index_pos, index_neg):
self.name = name
self.sequence = sequence
self.rc = self.reverse_complement(sequence)
self.index_pos = index_pos
self.index_neg = index_neg
@staticmethod
def reverse(sequence):
reverse = sequence[::-1]
return reverse
@staticmethod
def complement(sequence):
complement_letters = [Methylase.complement_table[letter] for letter in sequence]
complement = "".join(complement_letters)
return complement
@staticmethod
def reverse_complement(sequence):
r = Methylase.reverse(sequence)
rc = Methylase.complement(r)
return rc
class Methylator:
"""Class for finding methylation sites within a pattern (site) in a sequence.
**Parameters**
**sequence**
> Sequence of ATGC characters (`str`).
**site**
> Sequence of restriction enzyme recognition site (`str`).
**methylases**
> Methylase class instances (`list`). Default uses built-in methylases.
"""
def __init__(self, sequence, site, methylases=None):
self.sequence = sequence
if methylases is None:
self.methylases = list(METHYLASES.values())
else:
self.methylases = methylases
self.site = site
self.pattern = dnachisel.SequencePattern(site)
self.regions_seq = self.pattern.find_matches(self.sequence)
self.site_rc = Methylase.reverse_complement(site)
self.pattern_rc = dnachisel.SequencePattern(self.site_rc)
self.regions_rc = self.pattern_rc.find_matches(self.sequence)
self.regions = self.regions_seq + self.regions_rc
self.report = ""
def find_methylation_sites_in_pattern(self):
"""Run find_one_methylation_site_in_pattern() for each enzyme in methylases."""
self.report += "Matches against methylase enzyme sites:\n\n"
for methylase in self.methylases:
self.find_one_methylation_site_in_pattern(methylase)
self.report += "\n"
def find_one_methylation_site_in_pattern(self, methylase):
"""Find overlapping methylation and restriction sites."""
extended_regions = self.extend_restriction_regions(methylase)
# For matching against positive strand of methylation pattern:
expression = dnachisel.DnaNotationPattern.dna_sequence_to_regexpr(
methylase.sequence
)
pattern = dnachisel.SequencePattern(expression)
# For matching against negative strand of methylation pattern:
expression_rc = dnachisel.DnaNotationPattern.dna_sequence_to_regexpr(
methylase.rc
)
pattern_rc = dnachisel.SequencePattern(expression_rc)
self.report += methylase.name + "\n"
self.report += "=" * len(methylase.name) + "\n"
for region in extended_regions:
region_sequence = self.sequence[region.start : region.end]
self.report += "Region:" + str(region) + "\n"
match_location = pattern.find_matches(region_sequence)
if len(match_location) != 0:
self.report += "Match in positive strand: %s\n" % region_sequence
else:
self.report += "Positive strand: -\n"
match_location_rc = pattern_rc.find_matches(region_sequence)
if len(match_location_rc) != 0:
self.report += "Match in negative strand: %s\n" % region_sequence
else:
self.report += "Negative strand: -\n"
self.report += "\n"
def extend_restriction_regions(self, methylase):
"""Modify list of dnachisel.Location of restriction sites to include
flanking nucleotides around restriction sites.
"""
extended_regions = []
for region in self.regions:
region = region.extended(
methylase.index_neg, left=True, right=False
) # extension upstream
m = len(methylase.sequence) - (methylase.index_pos + 1)
region = region.extended(
m, upper_limit=len(self.sequence), left=False, right=True
) # extension downstream
extended_regions.append(region)
return extended_regions
def annotate_methylation(seqrecord, methylases=None):
if methylases is None:
methylases = list(METHYLASES.values())
for methylase in methylases:
name = methylase.name
regex = dnachisel.DnaNotationPattern.dna_sequence_to_regexpr(methylase.sequence)
pattern = dnachisel.SequencePattern(regex)
match_location = pattern.find_matches(str(seqrecord.seq))
if len(match_location) != 0:
for match in match_location:
label = "@epijinn(" + methylase.name + ")"
seqrecord.features.append(
SeqFeature(
FeatureLocation(match.start, match.end), # dnachisel.Location
type="misc_feature",
id="@epijinn",
qualifiers={"label": label, "note": name},
)
)
# Mark the methylation site for checking overlap with restriction site
methylated_position = match.start + methylase.index_pos
methylated_nucleotide = str(methylase.sequence[methylase.index_pos])
label = "@epijinn(" + methylated_nucleotide + ", strand=1)"
seqrecord.features.append(
SeqFeature(
FeatureLocation(
methylated_position, methylated_position + 1, strand=1
),
type="misc_feature",
id="@epijinn",
qualifiers={"label": label, "note": name},
)
)
# Negative strand:
methylated_position = match.start + methylase.index_neg
methylated_nucleotide = str(
Seq(methylase.sequence[methylase.index_neg]).reverse_complement()
)
label = "@epijinn(" + methylated_nucleotide + ", strand=-1)"
seqrecord.features.append(
SeqFeature(
FeatureLocation(
methylated_position, methylated_position + 1, strand=-1
),
type="misc_feature",
id="@epijinn",
qualifiers={"label": label, "note": name},
)
)
# Repeat for reverse complement, if not palindromic:
if Seq(methylase.sequence) != Seq(methylase.sequence).reverse_complement():
name = methylase.name
methylase_rc_seq = dnachisel.reverse_complement(methylase.sequence)
regex = dnachisel.DnaNotationPattern.dna_sequence_to_regexpr(
methylase_rc_seq
)
pattern = dnachisel.SequencePattern(regex)
match_location = pattern.find_matches(str(seqrecord.seq))
if len(match_location) != 0:
for match in match_location:
label = "@epijinn_rc(" + methylase.name + ")"
seqrecord.features.append(
SeqFeature(
FeatureLocation(
match.start, match.end
), # dnachisel.Location
type="misc_feature",
id="@epijinn_rc",
qualifiers={"label": label, "note": name},
)
)
# Mark the methylation site:
# reverse complement so need to count backwards, and strand=-1
# subtract 1 to account for range
methylated_position = match.end - 1 - methylase.index_pos
methylated_nucleotide = str(methylase.sequence[methylase.index_pos])
label = "@epijinn(" + methylated_nucleotide + ", strand=-1)"
seqrecord.features.append(
SeqFeature(
FeatureLocation(
methylated_position, methylated_position + 1, strand=-1
),
type="misc_feature",
id="@epijinn",
qualifiers={"label": label, "note": name},
)
)
# Mark methylation in antisense of the enzyme site:
# reverse complement so need to count backwards, and strand=1
# subtract 1 to account for range
methylated_position = match.end - 1 - methylase.index_neg
methylated_nucleotide = str(
Seq(
methylase.sequence[methylase.index_neg]
).reverse_complement()
)
label = "@epijinn(" + methylated_nucleotide + ", strand=1)"
seqrecord.features.append(
SeqFeature(
FeatureLocation(
methylated_position, methylated_position + 1, strand=1
),
type="misc_feature",
id="@epijinn",
qualifiers={"label": label, "note": name},
)
)
return seqrecord
# Cytosine:
AluI = Methylase(name="AluI", sequence="AGCT", index_pos=2, index_neg=1)
BamHI = Methylase(name="BamHI", sequence="GGATCC", index_pos=4, index_neg=1)
CpG = Methylase(name="CpG", sequence="CG", index_pos=0, index_neg=1)
EcoKDcm = Methylase(name="EcoKDcm", sequence="CCWGG", index_pos=1, index_neg=3)
GpC = Methylase(name="GpC", sequence="GC", index_pos=1, index_neg=0)
HaeIII = Methylase(name="HaeIII", sequence="GGCC", index_pos=2, index_neg=1)
Hhal = Methylase(name="Hhal", sequence="GCGC", index_pos=1, index_neg=2)
HpaII = Methylase(name="HpaII", sequence="CCGG", index_pos=1, index_neg=2)
MspI = Methylase(name="MspI", sequence="CCGG", index_pos=0, index_neg=3)
# Adenine:
EcoBI = Methylase(name="EcoBI", sequence="TGANNNNNNNNTGCT", index_pos=2, index_neg=11)
# EcoGII = Methylase(name="EcoGII", sequence="A", index_pos=0, index_neg=) # no valid index_neg value
EcoKDam = Methylase(name="EcoKDam", sequence="GATC", index_pos=1, index_neg=2)
EcoKI = Methylase(name="EcoKI", sequence="AACNNNNNNGTGC", index_pos=1, index_neg=10)
EcoRI = Methylase(name="EcoRI", sequence="GAATTC", index_pos=2, index_neg=3)
TaqI = Methylase(name="TaqI", sequence="TCGA", index_pos=3, index_neg=0)
METHYLASES = {
AluI.name: AluI,
BamHI.name: BamHI,
CpG.name: CpG,
EcoKDcm.name: EcoKDcm,
GpC.name: GpC,
HaeIII.name: HaeIII,
Hhal.name: Hhal,
HpaII.name: HpaII,
MspI.name: MspI,
EcoBI.name: EcoBI,
# EcoGII.name: EcoGII,
EcoKDam.name: EcoKDam,
EcoKI.name: EcoKI,
EcoRI.name: EcoRI,
TaqI.name: TaqI,
}
Variables
AluI
BamHI
CpG
EcoBI
EcoKDam
EcoKDcm
EcoKI
EcoRI
GpC
HaeIII
Hhal
HpaII
METHYLASES
MspI
TaqI
Functions
annotate_methylation
def annotate_methylation(
seqrecord,
methylases=None
)
View Source
def annotate_methylation(seqrecord, methylases=None):
if methylases is None:
methylases = list(METHYLASES.values())
for methylase in methylases:
name = methylase.name
regex = dnachisel.DnaNotationPattern.dna_sequence_to_regexpr(methylase.sequence)
pattern = dnachisel.SequencePattern(regex)
match_location = pattern.find_matches(str(seqrecord.seq))
if len(match_location) != 0:
for match in match_location:
label = "@epijinn(" + methylase.name + ")"
seqrecord.features.append(
SeqFeature(
FeatureLocation(match.start, match.end), # dnachisel.Location
type="misc_feature",
id="@epijinn",
qualifiers={"label": label, "note": name},
)
)
# Mark the methylation site for checking overlap with restriction site
methylated_position = match.start + methylase.index_pos
methylated_nucleotide = str(methylase.sequence[methylase.index_pos])
label = "@epijinn(" + methylated_nucleotide + ", strand=1)"
seqrecord.features.append(
SeqFeature(
FeatureLocation(
methylated_position, methylated_position + 1, strand=1
),
type="misc_feature",
id="@epijinn",
qualifiers={"label": label, "note": name},
)
)
# Negative strand:
methylated_position = match.start + methylase.index_neg
methylated_nucleotide = str(
Seq(methylase.sequence[methylase.index_neg]).reverse_complement()
)
label = "@epijinn(" + methylated_nucleotide + ", strand=-1)"
seqrecord.features.append(
SeqFeature(
FeatureLocation(
methylated_position, methylated_position + 1, strand=-1
),
type="misc_feature",
id="@epijinn",
qualifiers={"label": label, "note": name},
)
)
# Repeat for reverse complement, if not palindromic:
if Seq(methylase.sequence) != Seq(methylase.sequence).reverse_complement():
name = methylase.name
methylase_rc_seq = dnachisel.reverse_complement(methylase.sequence)
regex = dnachisel.DnaNotationPattern.dna_sequence_to_regexpr(
methylase_rc_seq
)
pattern = dnachisel.SequencePattern(regex)
match_location = pattern.find_matches(str(seqrecord.seq))
if len(match_location) != 0:
for match in match_location:
label = "@epijinn_rc(" + methylase.name + ")"
seqrecord.features.append(
SeqFeature(
FeatureLocation(
match.start, match.end
), # dnachisel.Location
type="misc_feature",
id="@epijinn_rc",
qualifiers={"label": label, "note": name},
)
)
# Mark the methylation site:
# reverse complement so need to count backwards, and strand=-1
# subtract 1 to account for range
methylated_position = match.end - 1 - methylase.index_pos
methylated_nucleotide = str(methylase.sequence[methylase.index_pos])
label = "@epijinn(" + methylated_nucleotide + ", strand=-1)"
seqrecord.features.append(
SeqFeature(
FeatureLocation(
methylated_position, methylated_position + 1, strand=-1
),
type="misc_feature",
id="@epijinn",
qualifiers={"label": label, "note": name},
)
)
# Mark methylation in antisense of the enzyme site:
# reverse complement so need to count backwards, and strand=1
# subtract 1 to account for range
methylated_position = match.end - 1 - methylase.index_neg
methylated_nucleotide = str(
Seq(
methylase.sequence[methylase.index_neg]
).reverse_complement()
)
label = "@epijinn(" + methylated_nucleotide + ", strand=1)"
seqrecord.features.append(
SeqFeature(
FeatureLocation(
methylated_position, methylated_position + 1, strand=1
),
type="misc_feature",
id="@epijinn",
qualifiers={"label": label, "note": name},
)
)
return seqrecord
Classes
Methylase
class Methylase(
name,
sequence,
index_pos,
index_neg
)
Methylase enzyme class.
Parameters
name
Name of the enzyme (
str
).
sequence
Sequence of extended nucleotide characters (
str
).
index_pos
Index of the methylated base on the positive strand (
int
).
index_neg
Index of the methylated base on the negative strand (
int
).
View Source
class Methylase:
"""Methylase enzyme class.
**Parameters**
**name**
> Name of the enzyme (`str`).
**sequence**
> Sequence of extended nucleotide characters (`str`).
**index_pos**
> Index of the methylated base on the positive strand (`int`).
**index_neg**
> Index of the methylated base on the negative strand (`int`).
"""
complement_table = {
"A": "T",
"G": "C",
"C": "G",
"T": "A",
"Y": "R",
"R": "Y",
"W": "W",
"S": "S",
"K": "M",
"M": "K",
"D": "H",
"V": "B",
"H": "D",
"B": "V",
"X": "X",
"N": "N",
}
def __init__(self, name, sequence, index_pos, index_neg):
self.name = name
self.sequence = sequence
self.rc = self.reverse_complement(sequence)
self.index_pos = index_pos
self.index_neg = index_neg
@staticmethod
def reverse(sequence):
reverse = sequence[::-1]
return reverse
@staticmethod
def complement(sequence):
complement_letters = [Methylase.complement_table[letter] for letter in sequence]
complement = "".join(complement_letters)
return complement
@staticmethod
def reverse_complement(sequence):
r = Methylase.reverse(sequence)
rc = Methylase.complement(r)
return rc
Descendants
- epijinn.dnd.Dnd
Class variables
complement_table
Static methods
complement
def complement(
sequence
)
View Source
@staticmethod
def complement(sequence):
complement_letters = [Methylase.complement_table[letter] for letter in sequence]
complement = "".join(complement_letters)
return complement
reverse
def reverse(
sequence
)
View Source
@staticmethod
def reverse(sequence):
reverse = sequence[::-1]
return reverse
reverse_complement
def reverse_complement(
sequence
)
View Source
@staticmethod
def reverse_complement(sequence):
r = Methylase.reverse(sequence)
rc = Methylase.complement(r)
return rc
Methylator
class Methylator(
sequence,
site,
methylases=None
)
Class for finding methylation sites within a pattern (site) in a sequence.
Parameters
sequence
Sequence of ATGC characters (
str
).
site
Sequence of restriction enzyme recognition site (
str
).
methylases
Methylase class instances (
list
). Default uses built-in methylases.
View Source
class Methylator:
"""Class for finding methylation sites within a pattern (site) in a sequence.
**Parameters**
**sequence**
> Sequence of ATGC characters (`str`).
**site**
> Sequence of restriction enzyme recognition site (`str`).
**methylases**
> Methylase class instances (`list`). Default uses built-in methylases.
"""
def __init__(self, sequence, site, methylases=None):
self.sequence = sequence
if methylases is None:
self.methylases = list(METHYLASES.values())
else:
self.methylases = methylases
self.site = site
self.pattern = dnachisel.SequencePattern(site)
self.regions_seq = self.pattern.find_matches(self.sequence)
self.site_rc = Methylase.reverse_complement(site)
self.pattern_rc = dnachisel.SequencePattern(self.site_rc)
self.regions_rc = self.pattern_rc.find_matches(self.sequence)
self.regions = self.regions_seq + self.regions_rc
self.report = ""
def find_methylation_sites_in_pattern(self):
"""Run find_one_methylation_site_in_pattern() for each enzyme in methylases."""
self.report += "Matches against methylase enzyme sites:\n\n"
for methylase in self.methylases:
self.find_one_methylation_site_in_pattern(methylase)
self.report += "\n"
def find_one_methylation_site_in_pattern(self, methylase):
"""Find overlapping methylation and restriction sites."""
extended_regions = self.extend_restriction_regions(methylase)
# For matching against positive strand of methylation pattern:
expression = dnachisel.DnaNotationPattern.dna_sequence_to_regexpr(
methylase.sequence
)
pattern = dnachisel.SequencePattern(expression)
# For matching against negative strand of methylation pattern:
expression_rc = dnachisel.DnaNotationPattern.dna_sequence_to_regexpr(
methylase.rc
)
pattern_rc = dnachisel.SequencePattern(expression_rc)
self.report += methylase.name + "\n"
self.report += "=" * len(methylase.name) + "\n"
for region in extended_regions:
region_sequence = self.sequence[region.start : region.end]
self.report += "Region:" + str(region) + "\n"
match_location = pattern.find_matches(region_sequence)
if len(match_location) != 0:
self.report += "Match in positive strand: %s\n" % region_sequence
else:
self.report += "Positive strand: -\n"
match_location_rc = pattern_rc.find_matches(region_sequence)
if len(match_location_rc) != 0:
self.report += "Match in negative strand: %s\n" % region_sequence
else:
self.report += "Negative strand: -\n"
self.report += "\n"
def extend_restriction_regions(self, methylase):
"""Modify list of dnachisel.Location of restriction sites to include
flanking nucleotides around restriction sites.
"""
extended_regions = []
for region in self.regions:
region = region.extended(
methylase.index_neg, left=True, right=False
) # extension upstream
m = len(methylase.sequence) - (methylase.index_pos + 1)
region = region.extended(
m, upper_limit=len(self.sequence), left=False, right=True
) # extension downstream
extended_regions.append(region)
return extended_regions
Methods
extend_restriction_regions
def extend_restriction_regions(
self,
methylase
)
Modify list of dnachisel.Location of restriction sites to include
flanking nucleotides around restriction sites.
View Source
def extend_restriction_regions(self, methylase):
"""Modify list of dnachisel.Location of restriction sites to include
flanking nucleotides around restriction sites.
"""
extended_regions = []
for region in self.regions:
region = region.extended(
methylase.index_neg, left=True, right=False
) # extension upstream
m = len(methylase.sequence) - (methylase.index_pos + 1)
region = region.extended(
m, upper_limit=len(self.sequence), left=False, right=True
) # extension downstream
extended_regions.append(region)
return extended_regions
find_methylation_sites_in_pattern
def find_methylation_sites_in_pattern(
self
)
Run find_one_methylation_site_in_pattern() for each enzyme in methylases.
View Source
def find_methylation_sites_in_pattern(self):
"""Run find_one_methylation_site_in_pattern() for each enzyme in methylases."""
self.report += "Matches against methylase enzyme sites:\n\n"
for methylase in self.methylases:
self.find_one_methylation_site_in_pattern(methylase)
self.report += "\n"
find_one_methylation_site_in_pattern
def find_one_methylation_site_in_pattern(
self,
methylase
)
Find overlapping methylation and restriction sites.
View Source
def find_one_methylation_site_in_pattern(self, methylase):
"""Find overlapping methylation and restriction sites."""
extended_regions = self.extend_restriction_regions(methylase)
# For matching against positive strand of methylation pattern:
expression = dnachisel.DnaNotationPattern.dna_sequence_to_regexpr(
methylase.sequence
)
pattern = dnachisel.SequencePattern(expression)
# For matching against negative strand of methylation pattern:
expression_rc = dnachisel.DnaNotationPattern.dna_sequence_to_regexpr(
methylase.rc
)
pattern_rc = dnachisel.SequencePattern(expression_rc)
self.report += methylase.name + "\n"
self.report += "=" * len(methylase.name) + "\n"
for region in extended_regions:
region_sequence = self.sequence[region.start : region.end]
self.report += "Region:" + str(region) + "\n"
match_location = pattern.find_matches(region_sequence)
if len(match_location) != 0:
self.report += "Match in positive strand: %s\n" % region_sequence
else:
self.report += "Positive strand: -\n"
match_location_rc = pattern_rc.find_matches(region_sequence)
if len(match_location_rc) != 0:
self.report += "Match in negative strand: %s\n" % region_sequence
else:
self.report += "Negative strand: -\n"
self.report += "\n"