Remove rare kmers from merge file
bewt85 opened this issue · comments
Ben Taylor commented
Please could you add a command so that I can remove rare kmers from a merge file? I've got a couple of merge files which I'd like to merge but I run out of RAM. If possible I'd like to remove all the kmers which are in fewer than 80% of input sequences because I think that'll make the merge (and later alignment) much more efficient.
ska weed -d 0.8 my_data.kmerge
Ben Taylor commented
I made a hacky python script to remove any kmer which occurs in fewer than 80% of samples. It ran in between 1m50 and 2m20 on merge files of size 579M and 469M respectivly (around 1200 Salmonella Enterica assemblies)
import math
import sys
INPUT = sys.argv[1]
OUTPUT = sys.argv[2]
THRESHOLD = 0.8
def parse_count(ascii_field):
bits = ''.join(['{0:06b}'.format(ord(c)-33) for c in ascii_field])
return bits.count('1')
kmer_counts = {}
with open(INPUT, 'r') as f_in, open(OUTPUT, 'w') as f_out:
k = int(f_in.readline().strip())
kmer_length = 1 + ((k * 2) // 3)
samples = f_in.readline().strip().split(' ')
nSamples = len(samples)
threshold = nSamples * THRESHOLD
ascii_length = math.ceil(nSamples / 6)
n_kmers = 2
def parse_line(line):
ascii_ = line[:ascii_length]
count = parse_count(line[:ascii_length])
kmers = ((line[i], line[i+1:i+kmer_length]) for i in range(ascii_length, len(line), kmer_length))
return ascii_, count, kmers
for line in f_in:
_, count, kmers = parse_line(line.rstrip())
for _, kmer in kmers:
kmer_counts[kmer] = kmer_counts.get(kmer, 0) + count
n_kmers += 1
if (n_kmers % 1000000) == 0:
print(f"Parsed {n_kmers} kmers")
kmers_to_keep = {kmer for kmer, count in kmer_counts.items() if count > threshold}
print(f"Found {len(kmers_to_keep)} kmers to keep")
f_in.seek(0)
# HEADERS
f_out.write(f_in.readline())
f_out.write(f_in.readline())
for line in f_in:
ascii_, _, kmers = parse_line(line.rstrip())
outline = "".join([ascii_] + [ _ for snp, kmer in kmers for _ in (snp, kmer) if kmer in kmers_to_keep] + ["\n"])
if len(outline) > ascii_length + 1:
f_out.write(outline)
Simon Harris commented
This has been added to ska weed.