Source code for seqwalk.filtering

def filter_rc_3letter(library, k):
    """
    filter library to be RC free (Supplementary note X)

    A sequence is dropped when any of its length-k windows contains neither
    "C" nor "G" and has "A" as its middle base. Every window is examined,
    including the one that ends on the last base of the sequence.

    Args:
        library: list of sequences
        k: SSM k value (must be odd so that a window has a single middle base)

    Returns:
        list of strings : filtered_library
            list of sequences without reverse complementary k-mers

    Raises:
        AssertionError: if k is even
    """
    assert (k % 2 == 1), "SSM k must be odd for RC filtering"

    # 0-based offset of the middle base inside a length-k window
    centre = (k - 1) // 2

    def _has_flagged_window(seq):
        # "+ 1" so the final window seq[len(seq)-k:] is checked as well;
        # without it the last k-mer of every sequence would be skipped.
        for start in range(len(seq) - k + 1):
            window = seq[start:start + k]
            if "C" in window or "G" in window:
                continue
            if seq[start + centre] == "A":
                # one flagged window is enough to reject the sequence
                return True
        return False

    return [seq for seq in library if not _has_flagged_window(seq)]
def filter_gc(library, gc_min, gc_max):
    """
    filters library for sequences that have desired GC content

    Args:
        library: list of sequences in string representation
        gc_min: minimum number of GC bases (int)
        gc_max: maximum number of GC bases (int)

    Returns:
        list of strings : filtered_library
            list of sequences in string representation whose count of
            "C"/"G" bases lies in [gc_min, gc_max] (inclusive); an empty
            list when library is empty

    Raises:
        AssertionError: if gc_min > gc_max, or if gc_max exceeds the length
            of the first sequence in library
    """
    assert (gc_min <= gc_max), "gc_min cannot be greater than gc_max"

    # An empty library has nothing to filter; returning early also avoids
    # an IndexError from the library[0] length check below.
    if len(library) == 0:
        return []

    # Only the first sequence's length is used for this sanity check.
    assert (gc_max <= len(library[0])), "gc_max cannot be greater than seq length"

    def _gc_count(seq):
        return sum(1 for base in seq if base == "C" or base == "G")

    return [seq for seq in library if gc_min <= _gc_count(seq) <= gc_max]
def filter_pattern(library, pattern):
    """
    filters library to remove specific patterns

    Args:
        library: list of sequences in string representation
        pattern: sequence pattern to be prevented

    Returns:
        list of strings : filtered_library
            the sequences of library, in their original order, that do not
            contain pattern as a substring
    """
    # Keep only sequences in which the forbidden pattern never occurs.
    return [candidate for candidate in library if pattern not in candidate]