Source code for dnachisel.biotools.indices_operations

"""Generic methods for grouping locations and sets of indices"""


[docs]def windows_overlap(window1, window2): """Return the overlap span between two windows. Parameters ---------- window1, window2 Each window is a couple of the form (start, end) indicating the range of a segment of integers. Returns ------- None In case the two windows do not overlap. [start, end] The coordinates of the overlap segment if there is one. """ start1, end1 = window1 start2, end2 = window2 if start2 < start1: return windows_overlap(window2, window1) if start1 <= start2 <= end1: return [start2, min(end1, end2)] else: return None
[docs]def subdivide_window(window, max_span): """Subdivide a window (start, end) into windows of size < max_span (start, i_1), (i_1, i_2), ... (i_n, end)""" start, end = window inds = list(range(start, end, max_span)) + [end] return list(zip(inds, inds[1:]))
[docs]def group_nearby_indices(indices, max_gap=None, max_group_spread=None): """Return a list of groups of the different indices. Indices are considered from smaller to larger and placed into groups Parameters ---------- max_gap Maximal allowed difference between two consecutive numbers of a group max_group_spread Maximal allowed difference between the smallest and largest elements of a group. """ if len(indices) == 0: return [] indices = sorted(indices) current_group = [indices[0]] groups = [current_group] for ind in indices[1:]: gap_small_enough = (max_gap is None) or ( ind - current_group[-1] < max_gap ) spread_small_enough = (max_group_spread is None) or ( ind - current_group[0] < max_group_spread ) if gap_small_enough and spread_small_enough: current_group.append(ind) else: current_group = [ind] groups.append(current_group) return groups
[docs]def group_nearby_segments(segments, max_start_gap=None, max_start_spread=None): """Return a list of groups of the different indices. Indices are considered from smaller to larger and placed into groups Parameters ---------- max_gap Maximal allowed difference between two consecutive numbers of a group max_group_spread Maximal allowed difference between the smallest and largest elements of a group. """ if len(segments) == 0: return [] segments = sorted(segments) current_group = [segments[0]] groups = [current_group] for seg in segments[1:]: gap_small_enough = (max_start_gap is None) or ( seg[0] - current_group[-1][0] < max_start_gap ) spread_small_enough = (max_start_spread is None) or ( seg[0] - current_group[0][0] < max_start_spread ) if gap_small_enough and spread_small_enough: current_group.append(seg) else: current_group = [seg] groups.append(current_group) return groups