Module overhang.tools
View Source
import itertools
import tatapov
import numpy as np
import matplotlib.pyplot as plt
# Base-pairing table: maps each DNA base (A, T, C, G) to its complementary
# base. Used by `reverse_complement`.
complements = {"A": "T", "T": "A", "C": "G", "G": "C"}
# Maps an enzyme name to the key under which its dataset is looked up in
# `tatapov.annealing_data["37C"]` (see `filter_overhangs`).
enzyme_tatapov_lookup = {
    "BsaI": "2020_01h_BsaI",
    "BsmBI": "2020_01h_BsmBI",
    "Esp3I": "2020_01h_Esp3I",
    "BbsI": "2020_01h_BbsI",
}
def reverse_complement(sequence):
    """Compute the reverse complement of a DNA sequence.

    Example: `reverse_complement("ATGC")` gives `"GCAT"`.


    **Parameters**

    **sequence**
    > A string made of the letters A, T, G and C (`str`).


    **Returns**
    > The reverse-complemented sequence (`str`).
    """
    # Walk the sequence backwards, swapping each base for its partner.
    paired_bases = (complements[base] for base in reversed(sequence))
    return "".join(paired_bases)
def gc_content(sequence):
    """Return the proportion of G and C in the sequence (between 0 and 1).

    For non-empty input this function is equivalent to
    `goldenhinges.biotools.gc_content()`. An empty sequence has no bases to
    count, so `0.0` is returned instead of raising `ZeroDivisionError`.


    **Parameters**

    **sequence**
    > An ATGC string (`str`).


    **Returns**
    > Fraction of characters that are G or C (`float`).
    """
    total = len(sequence)
    if total == 0:
        # Guard against division by zero on empty input.
        return 0.0
    gc_count = sum(1 for base in sequence if base in "GC")
    return float(gc_count) / total
def order_overhangs(seq):
    """Pair an overhang with its reverse complement, smallest first.

    The two sequences are compared as strings, e.g. AATA < TATT.


    **Parameters**

    **seq**
    > ACGT sequence (`str`).


    **Returns**
    > A 2-tuple holding `seq` and its reverse complement in ascending
    string order (`tuple`).
    """
    partner = reverse_complement(seq)
    # Only swap when the reverse complement sorts strictly before `seq`.
    if partner < seq:
        return partner, seq
    return seq, partner
def generate_overhang_pairs(overhang_length=4):
    """Build every overhang / reverse-complement pair of a given length.


    **Parameters**

    **overhang_length**
    > Length of overhangs (`int`).


    **Returns**
    > A `set` of `frozenset`s, each holding an overhang and its reverse
    complement. A sequence that equals its own reverse complement yields a
    one-element `frozenset`.
    """
    # One "ACGT" alphabet per position (approach adapted from goldenhinges).
    alphabets = ("ACGT",) * overhang_length
    all_overhangs = map("".join, itertools.product(*alphabets))
    # An overhang and its reverse complement produce the same frozenset, so
    # collecting into a set removes the duplicate pairs.
    return {
        frozenset((sequence, reverse_complement(sequence)))
        for sequence in all_overhangs
    }
def subset_data_for_overhang(dataframe, overhang, horizontal=True, filter=True):
    """Extract the part of a Tatapov dataframe that concerns one overhang.


    **Parameters**

    **dataframe**
    > Tatapov dataset, for example `tatapov.annealing_data["25C"]["01h"]`

    **overhang**
    > Overhang class instance (`Overhang`). Its `overhang` and `overhang_rc`
    attributes are used as labels.

    **horizontal**
    > Orientation of returned dataframe (`bool`). If True, the two labels
    select rows; otherwise they select columns.

    **filter**
    > If True, keep only columns (if horizontal=True) or rows (if horizontal=False)
    with nonzero sums (`bool`).


    **Returns**
    > The selected (and optionally filtered) part of `dataframe`.
    """
    labels = [overhang.overhang, overhang.overhang_rc]

    if not horizontal:
        # Vertical layout: the overhang and its reverse complement are columns.
        selected = dataframe[labels]
        if filter:
            nonzero_rows = selected.sum(axis=1) != 0
            selected = selected.loc[nonzero_rows, :]
        return selected

    # Horizontal layout: the overhang and its reverse complement are rows.
    selected = dataframe.loc[labels]
    if filter:
        nonzero_columns = selected.sum(axis=0) != 0
        selected = selected.loc[:, nonzero_columns]
    return selected
def plot_data(df, ax=None, colorbar=True, figwidth=8, plot_color="Reds"):
    """Draw a (restricted) Tatapov dataframe as a log-scaled heatmap.


    **Parameters**

    **df**
    > One of the data sheets provided by tatapov, e.g. ``annealing_data["37C"]["01h"]``.
    Or a restriction using ``data_subset``.

    **ax**
    > A Matplotlib ax. If none is provided, one is created on a new figure
    of size `(figwidth, 1.5)`.

    **colorbar**
    > If True, the figure will have a colorbar.

    **figwidth**
    > Custom width of the figure (only used when `ax` is not provided).

    **plot_color**
    > A Matplotlib colormap name.


    **Returns**
    > A tuple `(ax, im)`: the ax that was drawn on and the image object
    returned by `ax.imshow`.
    """
    # Adapted from tatapov.plot_data()
    if ax is None:
        _figure, ax = plt.subplots(1, figsize=(figwidth, 1.5))

    # Rows are drawn in reverse order. Values are raised to at least 0.5
    # before the logarithm so that zero counts do not produce -inf.
    flipped_counts = df.values[::-1]
    log_counts = np.log10(np.maximum(0.5, flipped_counts))
    image = ax.imshow(log_counts, cmap=plot_color)
    if colorbar:
        ax.figure.colorbar(image, label="log10( occurrences )")

    # Column labels go on the top edge, rotated to stay readable.
    column_labels = df.columns
    n_columns = len(column_labels)
    ax.set_xticks(range(n_columns))
    ax.set_xticklabels(column_labels, rotation=90)
    ax.xaxis.tick_top()
    ax.set_xlim(right=n_columns - 0.5)

    # Row labels are reversed to match the flipped data.
    row_labels = df.index[::-1]
    n_rows = len(row_labels)
    ax.set_yticks(range(n_rows))
    ax.set_yticklabels(row_labels)
    ax.set_ylim(-0.5, n_rows - 0.5)

    # Closes the current pyplot figure; the ax and image are still returned.
    plt.close()
    return ax, image
def filter_overhangs(overhangs, enzyme="Esp3I"):
    """Select well-behaved overhangs using the Tatapov package.

    Overhangs that anneal weakly with their reverse complement, or that
    misanneal with themselves, are removed.


    **Parameters**

    **overhangs**
    > List of Overhang instances (`list`).

    **enzyme**
    > Enzyme used with the overhangs (`str`). See `overhang.tools.enzyme_tatapov_lookup`
    for options.


    **Returns**
    > The Overhang instances that pass both checks, in their input order (`list`).
    """
    dataset_key = enzyme_tatapov_lookup[enzyme]
    annealing_counts = tatapov.annealing_data["37C"][dataset_key]
    sequences = [entry.overhang for entry in overhangs]
    subset = tatapov.data_subset(annealing_counts, sequences, add_reverse=True)

    # WEAK ANNEALS
    # See cutoff 400 in Pryor et al. Figure 2.
    strong_overhangs = [
        entry
        for entry in overhangs
        if subset[entry.overhang][entry.overhang_rc] >= 400
    ]

    # SELF-MISANNEALS
    # Use 0 as cutoff: neither the overhang nor its reverse complement may
    # have any recorded annealing with itself.
    return [
        entry
        for entry in strong_overhangs
        if subset[entry.overhang][entry.overhang] == 0
        and subset[entry.overhang_rc][entry.overhang_rc] == 0
    ]
Variables
complements
enzyme_tatapov_lookup
Functions
filter_overhangs
def filter_overhangs(
overhangs,
enzyme='Esp3I'
)
Filter overhangs using the Tatapov package.
Filter out the weakly annealing and self-misannealing overhangs.
Parameters
overhangs
List of Overhang instances (
list
).
enzyme
Enzyme used with the overhangs (
str
). See overhang.tools.enzyme_tatapov_lookup
for options.
View Source
def filter_overhangs(overhangs, enzyme="Esp3I"):
    """Select well-behaved overhangs using the Tatapov package.

    Overhangs that anneal weakly with their reverse complement, or that
    misanneal with themselves, are removed.


    **Parameters**

    **overhangs**
    > List of Overhang instances (`list`).

    **enzyme**
    > Enzyme used with the overhangs (`str`). See `overhang.tools.enzyme_tatapov_lookup`
    for options.


    **Returns**
    > The Overhang instances that pass both checks, in their input order (`list`).
    """
    dataset_key = enzyme_tatapov_lookup[enzyme]
    annealing_counts = tatapov.annealing_data["37C"][dataset_key]
    sequences = [entry.overhang for entry in overhangs]
    subset = tatapov.data_subset(annealing_counts, sequences, add_reverse=True)

    # WEAK ANNEALS
    # See cutoff 400 in Pryor et al. Figure 2.
    strong_overhangs = [
        entry
        for entry in overhangs
        if subset[entry.overhang][entry.overhang_rc] >= 400
    ]

    # SELF-MISANNEALS
    # Use 0 as cutoff: neither the overhang nor its reverse complement may
    # have any recorded annealing with itself.
    return [
        entry
        for entry in strong_overhangs
        if subset[entry.overhang][entry.overhang] == 0
        and subset[entry.overhang_rc][entry.overhang_rc] == 0
    ]
gc_content
def gc_content(
sequence
)
Return the proportion of G and C in the sequence (between 0 and 1).
This function is equivalent to goldenhinges.biotools.gc_content()
.
Parameters
sequence
An ATGC string (
str
).
View Source
def gc_content(sequence):
    """Return the proportion of G and C in the sequence (between 0 and 1).

    For non-empty input this function is equivalent to
    `goldenhinges.biotools.gc_content()`. An empty sequence has no bases to
    count, so `0.0` is returned instead of raising `ZeroDivisionError`.


    **Parameters**

    **sequence**
    > An ATGC string (`str`).


    **Returns**
    > Fraction of characters that are G or C (`float`).
    """
    total = len(sequence)
    if total == 0:
        # Guard against division by zero on empty input.
        return 0.0
    gc_count = sum(1 for base in sequence if base in "GC")
    return float(gc_count) / total
generate_overhang_pairs
def generate_overhang_pairs(
overhang_length=4
)
Generate all overhang pairs of given length.
Parameters
overhang_length
Length of overhangs (
int
).
View Source
def generate_overhang_pairs(overhang_length=4):
    """Build every overhang / reverse-complement pair of a given length.


    **Parameters**

    **overhang_length**
    > Length of overhangs (`int`).


    **Returns**
    > A `set` of `frozenset`s, each holding an overhang and its reverse
    complement. A sequence that equals its own reverse complement yields a
    one-element `frozenset`.
    """
    # One "ACGT" alphabet per position (approach adapted from goldenhinges).
    alphabets = ("ACGT",) * overhang_length
    all_overhangs = map("".join, itertools.product(*alphabets))
    # An overhang and its reverse complement produce the same frozenset, so
    # collecting into a set removes the duplicate pairs.
    return {
        frozenset((sequence, reverse_complement(sequence)))
        for sequence in all_overhangs
    }
order_overhangs
def order_overhangs(
seq
)
Create an overhang's reverse complement, and return them in order.
Overhangs are ordered by the letters, e.g. AATA < TATT.
Parameters
seq
ACGT sequence (
str
).
View Source
def order_overhangs(seq):
    """Pair an overhang with its reverse complement, smallest first.

    The two sequences are compared as strings, e.g. AATA < TATT.


    **Parameters**

    **seq**
    > ACGT sequence (`str`).


    **Returns**
    > A 2-tuple holding `seq` and its reverse complement in ascending
    string order (`tuple`).
    """
    partner = reverse_complement(seq)
    # Only swap when the reverse complement sorts strictly before `seq`.
    if partner < seq:
        return partner, seq
    return seq, partner
plot_data
def plot_data(
df,
ax=None,
colorbar=True,
figwidth=8,
plot_color='Reds'
)
Plot a (restricted) Tatapov dataframe.
Parameters
df
One of the data sheets provided by tatapov, e.g.
annealing_data["37C"]["01h"]
. Or a restriction using data_subset
.
ax
A Matplotlib ax. If none is provided, one will be created and returned at the end.
colorbar
If True, the figure will have a colorbar.
figwidth
Custom width of the figure.
plot_color
A Matplotlib colormap name.
View Source
def plot_data(df, ax=None, colorbar=True, figwidth=8, plot_color="Reds"):
    """Draw a (restricted) Tatapov dataframe as a log-scaled heatmap.


    **Parameters**

    **df**
    > One of the data sheets provided by tatapov, e.g. ``annealing_data["37C"]["01h"]``.
    Or a restriction using ``data_subset``.

    **ax**
    > A Matplotlib ax. If none is provided, one is created on a new figure
    of size `(figwidth, 1.5)`.

    **colorbar**
    > If True, the figure will have a colorbar.

    **figwidth**
    > Custom width of the figure (only used when `ax` is not provided).

    **plot_color**
    > A Matplotlib colormap name.


    **Returns**
    > A tuple `(ax, im)`: the ax that was drawn on and the image object
    returned by `ax.imshow`.
    """
    # Adapted from tatapov.plot_data()
    if ax is None:
        _figure, ax = plt.subplots(1, figsize=(figwidth, 1.5))

    # Rows are drawn in reverse order. Values are raised to at least 0.5
    # before the logarithm so that zero counts do not produce -inf.
    flipped_counts = df.values[::-1]
    log_counts = np.log10(np.maximum(0.5, flipped_counts))
    image = ax.imshow(log_counts, cmap=plot_color)
    if colorbar:
        ax.figure.colorbar(image, label="log10( occurrences )")

    # Column labels go on the top edge, rotated to stay readable.
    column_labels = df.columns
    n_columns = len(column_labels)
    ax.set_xticks(range(n_columns))
    ax.set_xticklabels(column_labels, rotation=90)
    ax.xaxis.tick_top()
    ax.set_xlim(right=n_columns - 0.5)

    # Row labels are reversed to match the flipped data.
    row_labels = df.index[::-1]
    n_rows = len(row_labels)
    ax.set_yticks(range(n_rows))
    ax.set_yticklabels(row_labels)
    ax.set_ylim(-0.5, n_rows - 0.5)

    # Closes the current pyplot figure; the ax and image are still returned.
    plt.close()
    return ax, image
reverse_complement
def reverse_complement(
sequence
)
Return the reverse complement of a DNA sequence.
For instance reverse_complement("ATGC")
returns "GCAT"
.
Parameters
sequence
An ATGC string (
str
).
View Source
def reverse_complement(sequence):
    """Compute the reverse complement of a DNA sequence.

    Example: `reverse_complement("ATGC")` gives `"GCAT"`.


    **Parameters**

    **sequence**
    > A string made of the letters A, T, G and C (`str`).


    **Returns**
    > The reverse-complemented sequence (`str`).
    """
    # Walk the sequence backwards, swapping each base for its partner.
    paired_bases = (complements[base] for base in reversed(sequence))
    return "".join(paired_bases)
subset_data_for_overhang
def subset_data_for_overhang(
dataframe,
overhang,
horizontal=True,
filter=True
)
Subset Tatapov dataframe for given overhang.
Parameters
dataframe
Tatapov dataset, for example
tatapov.annealing_data["25C"]["01h"]
overhang
Overhang class instance (
Overhang
)
horizontal
Orientation of returned dataframe (
bool
).
filter
If True, keep only columns (if horizontal=True) or rows (if horizontal=False) with nonzero values (
bool
).
View Source
def subset_data_for_overhang(dataframe, overhang, horizontal=True, filter=True):
    """Extract the part of a Tatapov dataframe that concerns one overhang.


    **Parameters**

    **dataframe**
    > Tatapov dataset, for example `tatapov.annealing_data["25C"]["01h"]`

    **overhang**
    > Overhang class instance (`Overhang`). Its `overhang` and `overhang_rc`
    attributes are used as labels.

    **horizontal**
    > Orientation of returned dataframe (`bool`). If True, the two labels
    select rows; otherwise they select columns.

    **filter**
    > If True, keep only columns (if horizontal=True) or rows (if horizontal=False)
    with nonzero sums (`bool`).


    **Returns**
    > The selected (and optionally filtered) part of `dataframe`.
    """
    labels = [overhang.overhang, overhang.overhang_rc]

    if not horizontal:
        # Vertical layout: the overhang and its reverse complement are columns.
        selected = dataframe[labels]
        if filter:
            nonzero_rows = selected.sum(axis=1) != 0
            selected = selected.loc[nonzero_rows, :]
        return selected

    # Horizontal layout: the overhang and its reverse complement are rows.
    selected = dataframe.loc[labels]
    if filter:
        nonzero_columns = selected.sum(axis=0) != 0
        selected = selected.loc[:, nonzero_columns]
    return selected