Source code for goldenhinges.clique_methods

import networkx
import numpy as np
from tqdm import tqdm
import itertools
from .biotools import (
    sequences_differences,
    gc_content,
    list_overhangs,
    memo_reverse_complement,
)


[docs]def find_large_compatible_subset( all_elements, mandatory_elements=(), compatibility_conditions=(), elements_filters=(), solution_validity_conditions=(), score="subset_size", n_solutions_considered=5000, progress_bar=False, randomize=False, ): """Return a maximal subset of `all_elements` where all elements are valid and inter-compatibles. The algorithm takes in a set of elements `all_elements` and filters out some elements using the filters in `elements_filters`. Then it creates a graph whose nodes are the remaining elements. Edges are added between all pairs of elements which are "compatible" as defined by the `compatibility_conditions`. Finally we look for the "cliques" i.e. subsets of the graph made of elements that are all inter-compatibles. We consider a number `n_solutions_considered` of these, and return the one which scored highest as defined by the `score` function. Parameters ---------- all_elements A tuple or list of all possible elements. mandatory_elements A tuple or list of elements contained in `all_elements` that must be included in the final solution. elements_filters Functions used to pre-filter the list of `all_elements`. Must be a list or tuple of functions `fun(element)->True/False`. Only elements of `all_elements` such that `fun(element)` is True for all filters are considered. compatibility_conditions Functions used to determine whether two elements are compatible, i.e. whether a solution can feature these two elements at the same time. Must be a list or tuple of functions `fun(e1, e2)->True/False`. Two elements of `all_elements` are compatible when `fun(e1, e2)` is True for all functions in `compatibility_conditions`. solution_validity_conditions Additional validity conditions used to filter out some solutions. Must be a list or tuple of functions `fun(solution)->True/False`, where the `solution` is a list of elements of `all_elements`. A solution is considered valid when `fun(solution)` is True for all functions in `solution_validity_conditions`. score Of all the solutions explored, the which scores highest is returned. Can be either a function `fun(solution)->float` where `solution` is a list of elements, or it can be the default "subset_size" which means the score will be the length of the solution found (the solution returned will have as many elements as could be found) n_solutions_considered Number of cliques of the graph that are itered through (set to None if you want to consider all cliques in the graph, which may take a very long time) progress_bar If True, progress bars are displayed as the edges are computed and the graph cliques are explored. """ def progress(iterator, title=None, total_loops=None): """Wrap the iterator into a tqdm progress bar""" if progress_bar: return tqdm(iterator, desc=title, total=total_loops) else: return iterator if score == "subset_size": def score(subset): return len(subset) def elements_are_compatible(e1, e2): """Gather all compatibility conditions into a single function""" return all(test(e1, e2) for test in compatibility_conditions) def compatible_with_all_mandatory(element): """Test whether an element is compatible with all (other) mandatory elements""" return all( elements_are_compatible(element, m) for m in mandatory_elements if (m != element) ) elements_filters += [compatible_with_all_mandatory] def element_is_valid(element): """Gather all elements pre-filters into a single function.""" return all(fl(element) for fl in elements_filters) def solution_is_valid(elements_set): """Gather all validity conditions into a single function.""" return all(test(elements_set) for test in solution_validity_conditions) filtered_elements = list(filter(element_is_valid, all_elements)) wrong_mandatories = [ m for m in mandatory_elements if m not in filtered_elements ] if wrong_mandatories != []: raise ValueError( "These mandatory elements don't pass your tests: " + str(wrong_mandatories) ) graph = networkx.Graph() L = len(filtered_elements) for (e1, e2) in progress( itertools.combinations(filtered_elements, 2), title="Computing graph edges", total_loops=L * (L - 1) / 2, ): if elements_are_compatible(e1, e2): graph.add_edge(e1, e2) best_solution = [] best_score = None if randomize: ind_to_oh = {i: oh for i, oh in enumerate(graph.nodes())} oh_to_ind = {oh: i for i, oh in ind_to_oh.items()} graph = networkx.Graph( [(oh_to_ind[n1], oh_to_ind[n2]) for (n1, n2) in graph.edges()] ) def cliques_generator(): """Return random cliques from the connections graph. The randomness is introduced by permuting the nodes names, running `networkx.circular_paths` once, permuting the nodes names again, etc. """ # print (original_adj) while True: permutation = np.arange(len(graph.nodes())) np.random.shuffle(permutation) antipermutation = np.argsort(permutation) new_graph = networkx.Graph( [ (permutation[n1], permutation[n2]) for (n1, n2) in graph.edges() ] ) for clique in networkx.find_cliques(new_graph): clique = [antipermutation[i] for i in clique] clique = [ind_to_oh[i] for i in clique] yield clique break cliques_iterator = cliques_generator() else: cliques_iterator = networkx.find_cliques(graph) for i, clique in progress( enumerate(cliques_iterator), title="Exploring graph cliques", total_loops=n_solutions_considered, ): if solution_is_valid(clique): clique_score = score(clique) if (best_score is None) or (clique_score > best_score): best_solution = clique best_score = clique_score if (n_solutions_considered is not None) and ( i > n_solutions_considered ): break return best_solution
[docs]def find_compatible_overhangs( overhangs_size=4, mandatory_overhangs=(), forbidden_overhangs=(), min_gc_content=0, max_gc_content=1, min_overhangs_differences=2, min_reverse_overhangs_differences=2, elements_filters=(), compatibility_conditions=(), solution_validity_conditions=(), n_solutions_considered=5000, score="subset_size", progress_bar=False, randomize=False, ): """Return a list of compatible overhangs for (Golden Gate) assembly, satisfying all the specified conditions. Parameters ---------- overhangs_size The size of the overhangs. Four is the default and the most common case. mandatory_overhangs A list `["ATGC", "TTGC"...]` of the overhangs that must be part of the final solution. If these do not respect the other conditions or if they are not compatible between themselves, an error will be raised. forbidden_overhangs A list `["ATGC", "TTGC"...]` of overhangs that should NOT be part of the final solution. min_gc_content Float between 0.0 and 1.0 indicating the minimum proportion of G and C that valid overhangs should contain. max_gc_content Float between 0.0 and 1.0 indicating the maximum proportion of G and C that valid overhangs should contain. min_overhangs_differences Minimal number of different basepairs between two overhangs for them to be compatible (1 is an acceptable value but 2 is advised to really ensure the specificity of the assembly). min_reverse_overhangs_differences=2 Minimal number of different basepairs between an overhang and the reverse-complement of a second overhang for these two overhangs to be to be compatible (1 is an acceptable value but 2 is advised to really ensure the specificity of the assembly). elements_filters Additional filters to narrow down the possible overhangs. Must be a list or tuple of functions `fun(element)->True/False`. Only overhangs such that `fun(element)` is True for all filters are kept. compatibility_conditions Additional conditions to determine whether two overhands are compatible, i.e. whether a solution can feature these two elements at the same time. Must be a list or tuple of functions `fun(e1, e2)->True/False`. Two overhangs are compatible when `fun(e1, e2)` is True for all functions in `compatibility_conditions`. solution_validity_conditions Additional validity conditions used to filter out some solutions. Must be a list or tuple of functions `fun(solution)->True/False`, where the `solution` is a list of elements of `all_elements`. A solution is considered valid when `fun(solution)` is True for all functions in `solution_validity_conditions`. score Of all the solutions explored, the which scores highest is returned. Can be either a function `fun(solution)->float` where `solution` is a list of overhangs, or it can be the default "subset_size" which means the score will be the length of the solution found (the solution returned will have as many different overhangs as could be found) n_solutions_considered Number of solutions considered (set to None if you want to consider all possible solutions which may take a very long time) progress_bar If True, progress bars are displayed as the edges are computed and the graph cliques are explored. (see `find_best_compatible_subset`) """ all_elements = list_overhangs(overhangs_size) elements_filters = list(elements_filters) + [ lambda e: e not in forbidden_overhangs, lambda e: min_gc_content <= gc_content(e) <= max_gc_content, lambda e: ( sequences_differences(e, memo_reverse_complement(e)) >= min_reverse_overhangs_differences ), ] compatibility_conditions = list(compatibility_conditions) + [ lambda e1, e2: ( sequences_differences(e1, e2) >= min_overhangs_differences ), lambda e1, e2: ( sequences_differences(e1, memo_reverse_complement(e2)) >= min_reverse_overhangs_differences ), ] return find_large_compatible_subset( all_elements, mandatory_elements=mandatory_overhangs, elements_filters=elements_filters, compatibility_conditions=compatibility_conditions, solution_validity_conditions=solution_validity_conditions, n_solutions_considered=n_solutions_considered, score=score, progress_bar=progress_bar, randomize=randomize, )