from itertools import product
import random
[docs]
def rc(seq):
"""
reverse complement of DNA sequence
Args:
seq: string with letters in {A, C, G, T}
Returns:
string corresponding to reverse complement
"""
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
return "".join(complement.get(base, base) for base in reversed(seq))
[docs]
def partition_path(seq, L, k):
"""
partitions self-avoiding walk into appropriate length sequences
Args:
seq: self avoiding walk (string or list)
L: length of desired sequences
k: SSM k-value
Returns:
seqs: list of strings, each a length L seq
"""
seqs = []
for i in range(0, len(seq)-L+1, L-k+1):
seqs.append(seq[i:i+L])
return seqs
[docs]
def is_necklace(seq):
"""
computes if a sequence is a necklace (as defined in Wong 2017)
Args:
seq: typically list of ints
"""
p = 1
for i in range(1, len(seq)):
if seq[i-p] < seq[i]:
p = i + 1
elif seq[i-p] > seq[i]:
return False
if (len(seq) % (p)) == 0:
return True
return False
[docs]
def f(seq, k):
"""
helper function defined in Wong 2017
* note confusing notation switch
Args:
seq: list of ints
k: alphabet length
Returns:
int corresponding to next element of seq
"""
p = 1
if seq[0] == k:
if sum(seq[1:]) == len(seq) - 1:
return 1
for i in range(0, k-1):
if not is_necklace(seq[1:] + [k-i]):
return k-i
return 1
elif is_necklace(seq[1:] + [(seq[0] % k) + 1]):
return (seq[0] % k) + 1
else:
return seq[0]
[docs]
def simple_shift(n, k):
"""
simple shift rule from Wong 2017
* note confusing notation switch
Args:
n: SSM k value
k: alphabet size
Returns:
list of integers corresponding to H. path
"""
seq = [1]*n
while True:
seq.append(f(seq[-n:], k))
if seq[-n:] == seq[:n]:
return seq
[docs]
def out_edges(v, alphabet):
"""
lists outedges for a node in a kmer graph
Args:
v: string corresponding to node
alphabet: string containing all valid letters
Returns:
list of outedges represented as strings
"""
return [v + l for l in alphabet]
[docs]
def adapted_hierholzer(k, alphabet):
"""
finds RC-free path through 4-letter kmer graph using modified hierholzer
(see Supplementary Note X)
Args:
k: SSM k (integer)
alphabet: string containing all valid letters
Returns:
path: string corresponding to RC-free self-avoiding walk
"""
assert (k % 2 == 1), "k must be odd for adapted Hierholzer"
# dictionary to store visited nodes
# list of all nodes
marked = {"".join(l) : 0 for l in product(alphabet, repeat=k)}
nodes = ["".join(l) for l in product(alphabet, repeat=(k-1))]
# initialize stack with a random starting node
v_stack = [random.choice(nodes)]
path = ""
while len(v_stack) != 0:
v = v_stack.pop()
unmarked = [s for s in out_edges(v, alphabet) if not marked[s]]
if unmarked == []:
if path == "":
path = v
else:
path = v[0] + path
else:
n_edge = random.choice(unmarked)
v_stack.append(v)
v_stack.append(n_edge[1:])
marked[n_edge] = 1
marked[rc(n_edge)] = 1
return path