Source code for riboraptor.kmer

from collections import Counter
from Bio.SeqIO.QualityIO import FastqGeneralIterator
from tqdm import tqdm
import gzip
import pandas as pd
import numpy as np


[docs]def fastq_kmer_histogram(fastq_file, kmer_length=range(5, 31), five_prime=False, max_seq=1000000, offset=0, drop_probability=0): """Get a histogram of kmers from a fastq file Parameters ---------- fastq_file: string Location of .fastq or .fastq.gz kmer_length: range Range of kmer to consider five_prime: bool Should consider sequences from 5' end? Default: False (uses sequence from 3' end) max_seq: int Maximum number of sequences to consider offset: int Offset to ignore at the 5' end or 3'end Example: If the offset is 3, the first 3 bases will be skipped and kmers will start only from the 4th position For 3'end if the offset is 3, the last 3 bases will be skipped drop_probability: float [0-1] Reads are randomly droppped with this probability to simulate sampling Returns ------- kmers: Series Sorted series with most frequent kmer """ cur_count = 0 should_continue = True if '.gz' in fastq_file: # Open as a gzip file handle = gzip.open(fastq_file, 'rt') else: handle = open(fastq_file, 'r') histogram = {k: Counter() for k in kmer_length} with tqdm(total=max_seq) as pbar: for title, seq, qual in FastqGeneralIterator(handle): if not should_continue: break if drop_probability > 0: should_drop = np.random.choice( [1, 0], p=[drop_probability, 1 - drop_probability]) if should_drop: continue cur_count += 1 for k in kmer_length: if not five_prime: if not offset: k_seq = seq[-k:] else: k_seq = seq[-k - offset:-offset] else: k_seq = seq[offset:k + offset] histogram[k][k_seq] += 1 if cur_count >= max_seq: should_continue = False pbar.update() handle.close() kmers = {} for k, v in histogram.items(): kmers[k] = pd.Series(v).sort_values(ascending=False) / max_seq * 100 return kmers