Source code for riboraptor.infer_protocol
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import pysam
from .helpers import is_read_uniq_mapping, create_bam_index
from .helpers import read_refseq_bed, read_bed_as_intervaltree
from collections import Counter
import numpy as np
[docs]def infer_protocol(bam, bed, n_reads=500000, drop_probability=0.2):
"""Infer strandedness protocol given a bam file
Parameters
----------
bam: string
Path to bam file
bed: string
Path to gene bed file
n_reads: int
Number of reads to use (downsampled)
drop_probability: float [0-1]
Reads are randomly droppped with this probability to
simulate sampling
Returns
-------
protocol: string
unstranded/forward/reverse
forward_mapped_reads: float
Proportion of reads of type + mapping to + (++) or - mapping to - (--)
reverse_mapped_reads: float
Proportion of reads of type + mapping to - (+-) or - mapping to + (-+)
"""
np.random.seed(42)
iteration = 0
create_bam_index(bam)
bam = pysam.AlignmentFile(bam, 'rb')
bed = read_bed_as_intervaltree(bed)
strandedness = Counter()
for read in bam.fetch():
if drop_probability > 0:
should_drop = np.random.choice(
[1, 0], p=[drop_probability, 1 - drop_probability])
if should_drop:
continue
if not is_read_uniq_mapping(read):
continue
if read.is_reverse:
mapped_strand = '-'
else:
mapped_strand = '+'
mapped_start = read.reference_start
mapped_end = read.reference_end
chrom = read.reference_name
gene_strand = list(set(bed[chrom].find(mapped_start, mapped_end)))
if len(gene_strand) != 1:
# Filter out genes with ambiguous strand info
# (those) that have a tx_start on opposite strands
continue
gene_strand = gene_strand[0]
strandedness['{}{}'.format(mapped_strand, gene_strand)] += 1
iteration += 1
if iteration >= n_reads:
break
## Add pseudocounts
strandedness['++'] += 1
strandedness['--'] += 1
strandedness['+-'] += 1
strandedness['-+'] += 1
total = sum(strandedness.values())
forward_mapped_reads = (strandedness['++'] + strandedness['--']) / total
reverse_mapped_reads = (strandedness['-+'] + strandedness['+-']) / total
ratio = forward_mapped_reads / reverse_mapped_reads
# Prefer checking for unstrandedness
# Check if the forward mapped reads - 0.5 is small,
# this threhold is defined to be 0.1
if np.isclose([np.abs(forward_mapped_reads - 0.5)], [0], atol=0.1):
return 'unstranded', forward_mapped_reads, reverse_mapped_reads, total
elif forward_mapped_reads >= 0.5:
return 'forward', forward_mapped_reads, reverse_mapped_reads, total
else:
return 'reverse', forward_mapped_reads, reverse_mapped_reads, total