Source code for dnachisel.biotools.gc_content

import numpy as np


[docs]def gc_content(sequence, window_size=None): """Compute global or local GC content. Parameters ---------- sequence An ATGC DNA sequence (upper case!) window_size If provided, the local GC content for the different sliding windows of this size is returned, else the global GC content is returned. Returns -------- A number between 0 and 1 indication the proportion of GC content. If window_size is provided, returns a list of len(sequence)-window_size values indicating the local GC contents (sliding-window method). The i-th value indicates the GC content in the window [i, i+window_size] """ # The code is a little cryptic as it uses numpy array operations # but the speed gain is 300x compared with pure-python string operations arr = np.frombuffer((sequence + "").encode(), dtype="uint8") arr_GCs = (arr == 71) | (arr == 67) # 67=C, 71=G if window_size is None: return 1.0 * arr_GCs.sum() / len(sequence) else: cs = np.cumsum(arr_GCs) a = cs[window_size - 1 :] b = np.hstack([[0], cs[:-window_size]]) return 1.0 * (a - b) / window_size