Source code for riboraptor.kmer
from collections import Counter
from Bio.SeqIO.QualityIO import FastqGeneralIterator
from tqdm import tqdm
import gzip
import pandas as pd
import numpy as np
[docs]def fastq_kmer_histogram(fastq_file,
kmer_length=range(5, 31),
five_prime=False,
max_seq=1000000,
offset=0,
drop_probability=0):
"""Get a histogram of kmers from a fastq file
Parameters
----------
fastq_file: string
Location of .fastq or .fastq.gz
kmer_length: range
Range of kmer to consider
five_prime: bool
Should consider sequences from 5' end?
Default: False (uses sequence from 3' end)
max_seq: int
Maximum number of sequences to consider
offset: int
Offset to ignore at the 5' end or 3'end
Example: If the offset is 3, the first 3 bases will be skipped
and kmers will start only from the 4th position
For 3'end if the offset is 3, the last 3 bases will be skipped
drop_probability: float [0-1]
Reads are randomly droppped with this probability to
simulate sampling
Returns
-------
kmers: Series
Sorted series with most frequent kmer
"""
cur_count = 0
should_continue = True
if '.gz' in fastq_file:
# Open as a gzip file
handle = gzip.open(fastq_file, 'rt')
else:
handle = open(fastq_file, 'r')
histogram = {k: Counter() for k in kmer_length}
with tqdm(total=max_seq) as pbar:
for title, seq, qual in FastqGeneralIterator(handle):
if not should_continue:
break
if drop_probability > 0:
should_drop = np.random.choice(
[1, 0], p=[drop_probability, 1 - drop_probability])
if should_drop:
continue
cur_count += 1
for k in kmer_length:
if not five_prime:
if not offset:
k_seq = seq[-k:]
else:
k_seq = seq[-k - offset:-offset]
else:
k_seq = seq[offset:k + offset]
histogram[k][k_seq] += 1
if cur_count >= max_seq:
should_continue = False
pbar.update()
handle.close()
kmers = {}
for k, v in histogram.items():
kmers[k] = pd.Series(v).sort_values(ascending=False) / max_seq * 100
return kmers