Source code for riboraptor.helpers

"""All functions that are not so useful, but still useful."""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
from collections import Counter
from collections import OrderedDict
from collections import defaultdict
import errno
import itertools
import math
import os
import re
import sys
import ntpath
import pickle
import subprocess

from scipy import stats
import numpy as np
import pandas as pd
import six
import pybedtools
import pysam
import pyBigWig
from bx.intervals.intersection import IntervalTree

import warnings
from .interval import Interval

# Unmapped, Unmapped+Reverse strand, Not primary alignment,
# Not primary alignment + reverse strand, supplementary alignment

# Source: https://broadinstitute.github.io/picard/explain-flags.html
__SAM_NOT_UNIQ_FLAGS__ = [4, 20, 256, 272, 2048]

CBB_PALETTE = [
    "#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2",
    "#D55E00", "#CC79A7"
]


def order_dataframe(df, columns):
    """Order a dataframe

    Order a dataframe by moving the `columns` to the front

    Parameters
    ----------
    df : Dataframe
        Dataframe
    columns : list
        List of columns that need to be put in front
    """
    if isinstance(columns, six.string_types):
        # Let the method accept a string or a list
        columns = [columns]
    remaining_columns = [w for w in df.columns if w not in columns]
    df = df[columns + remaining_columns]
    return df
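# A minimal usage sketch of order_dataframe on a toy dataframe
# (illustrative only):
#
#   >>> df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]})
#   >>> order_dataframe(df, 'c').columns.tolist()
#   ['c', 'a', 'b']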
def _fix_bed_coltype(bed):
    """Fix bed chrom and name columns to be string

    This is necessary since the chromosome numbers are often
    interpreted as int
    """
    bed['chrom'] = bed['chrom'].astype(str)
    bed['name'] = bed['name'].astype(str)
    return bed
def check_file_exists(filepath):
    """Check if file exists.

    Parameters
    ----------
    filepath : str
        Path to file
    """
    return os.path.isfile(os.path.abspath(filepath))
def list_to_ranges(list_of_int):
    """Convert a list of integers to (start, end) range tuples

    Parameters
    ----------
    list_of_int : list
        List of integers to be squeezed into ranges

    Yields
    ------
    range_tuple : tuple
        (start, end) of each run of consecutive integers
    """
    sorted_list = sorted(set(list_of_int))
    # Within a run of consecutive integers, value - index stays constant,
    # so groupby on that difference splits the list into runs
    for key, group in itertools.groupby(
            enumerate(sorted_list), lambda x: x[1] - x[0]):
        group = list(group)
        yield group[0][1], group[-1][1]
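# A minimal usage sketch of list_to_ranges (illustrative only):
#
#   >>> list(list_to_ranges([1, 2, 3, 7, 8, 10]))
#   [(1, 3), (7, 8), (10, 10)]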
def create_ideal_periodic_signal(signal_length):
    """Create ideal ribo-seq signal.

    Parameters
    ----------
    signal_length : int
        Length of signal to create

    Returns
    -------
    signal : array_like
        1-0-0 signal
    """
    uniform_signal = np.array([4 / 6.0] * signal_length)
    uniform_signal[list(range(1, len(uniform_signal), 3))] = 1 / 6.0
    uniform_signal[list(range(2, len(uniform_signal), 3))] = 1 / 6.0
    return uniform_signal
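# A minimal sketch of the ideal periodic signal: each codon gets 4/6 of its
# signal at frame 0 and 1/6 at frames 1 and 2 (illustrative only):
#
#   >>> create_ideal_periodic_signal(6).round(2)
#   array([0.67, 0.17, 0.17, 0.67, 0.17, 0.17])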
def identify_peaks(coverage):
    """Given a coverage array, find the site of maximum density"""
    # Search the window of positions [-18, -10) upstream of the start codon
    return np.argmax(coverage[list(range(-18, -10))])
def millify(n):
    """Convert integer to human readable format.

    Parameters
    ----------
    n : int

    Returns
    -------
    millidx : str
        Formatted integer
    """
    if n is None or np.isnan(n):
        return 'NaN'
    millnames = ['', ' K', ' M', ' B', ' T']
    # Source: http://stackoverflow.com/a/3155023/756986
    n = float(n)
    millidx = max(
        0,
        min(
            len(millnames) - 1,
            int(math.floor(0 if n == 0 else math.log10(abs(n)) / 3))))
    return '{:.1f}{}'.format(n / 10**(3 * millidx), millnames[millidx])
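# A minimal usage sketch of millify (illustrative only):
#
#   >>> millify(1234567)
#   '1.2 M'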
def mkdir_p(path):
    """Python version of mkdir -p

    Parameters
    ----------
    path : str
    """
    if path:
        try:
            os.makedirs(path)
        except OSError as exc:  # Python >2.5
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
def r2(x, y):
    """Calculate squared Pearson correlation between two vectors.

    Parameters
    ----------
    x : array_like
        Input
    y : array_like
        Input
    """
    return stats.pearsonr(x, y)[0]**2
def round_to_nearest(x, base=5):
    """Round to the nearest base.

    Parameters
    ----------
    x : float
        Input

    Returns
    -------
    v : int
        Output
    """
    return int(base * round(float(x) / base))
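# A minimal usage sketch of round_to_nearest (illustrative only):
#
#   >>> round_to_nearest(12)
#   10
#   >>> round_to_nearest(13)
#   15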
def set_xrotation(ax, degrees):
    """Rotate labels on x-axis.

    Parameters
    ----------
    ax : matplotlib.Axes
        Axes object
    degrees : int
        Rotation degrees
    """
    for i in ax.get_xticklabels():
        i.set_rotation(degrees)
def summary_stats_two_arrays_welch(old_mean_array,
                                   new_array,
                                   old_var_array=None,
                                   old_n_counter=None,
                                   carried_forward_observations=None):
    """Average two arrays using Welch's method

    Parameters
    ----------
    old_mean_array : Series
        Series of previous means with index as positions
    new_array : array like
        Series of new observations
    old_var_array : Series
        Series of previous variances with index as positions
    old_n_counter : Series
        Counts of the number of observations at each position

    Returns
    -------
    m : array like
        Column wise mean array
    var : array like
        Column wise variance

    Consider an example: [1,2,3], [1,2,3,4], [1,2,3,4,5]

    old = [1,2,3]
    new = [1,2,3,4]
    counter = [1,1,1]
    mean = [1,2,3,4]
    var = [na, na, na, na]
    carried_forward = [[1,1], [2,2], [3,3], [4]]

    old = [1,2,3,4]
    new = [1,2,3,4,5]
    counter = [2,2,2,1]
    mean = [1,2,3,4,5]
    var = [0,0,0, na, na]
    carried_forward = [[], [], [], [4,4], [5]]
    """
    if not isinstance(old_mean_array, pd.Series):
        old_mean_array = pd.Series(old_mean_array)
    if not isinstance(new_array, pd.Series):
        new_array = pd.Series(new_array)
    if old_n_counter is not None and not isinstance(old_n_counter, pd.Series):
        old_n_counter = pd.Series(old_n_counter)

    len_old, len_new = len(old_mean_array), len(new_array)
    if old_n_counter is None:
        # Initialized from the current series
        old_n_counter = pd.Series(
            np.zeros(len(old_mean_array)) + 1, index=old_mean_array.index)
    if old_var_array is None:
        # Initialized from the current series
        old_var_array = pd.Series(
            np.zeros(len(old_mean_array)) + np.nan,
            index=old_mean_array.index)
    # Update position counts based on new_array
    new_n_counter = old_n_counter.add(
        pd.Series(np.zeros(len(new_array)) + 1, index=new_array.index),
        fill_value=0)
    if len_old > len_new:
        len_diff = len_old - len_new
        # Pad the incoming array
        # We append NAs to the end of new_array since it will mostly be
        # used in the metagene context
        max_index = np.max(new_array.index.tolist())
        new_index = np.arange(max_index + 1, max_index + 1 + len_diff)
        new_array = new_array.append(
            pd.Series(np.zeros(len_diff) + np.nan, index=new_index),
            verify_integrity=True)
    elif len_old < len_new:
        len_diff = len_new - len_old
        # Pad the old array
        if len_old == 0:
            old_mean_array = pd.Series([])
        else:
            max_index = np.max(old_mean_array.index.tolist())
            new_index = np.arange(max_index + 1, max_index + 1 + len_diff)
            old_mean_array = old_mean_array.append(
                pd.Series(np.zeros(len_diff) + np.nan, index=new_index),
                verify_integrity=True)

    if not (old_mean_array.index == new_array.index).all():
        print('old array index: {}'.format(old_mean_array))
        print('new array index: {}'.format(new_array))
    positions_with_less_than3_obs = defaultdict(list)
    for index, counts in six.iteritems(new_n_counter):
        # Which positions have <3 counts for calculating variance
        if counts <= 3:
            # Fetch the exact observations from history
            try:
                last_observations = carried_forward_observations[index]
            except (KeyError, TypeError):
                # No carried forward observations were passed
                if not np.isnan(old_mean_array[index]):
                    last_observations = [old_mean_array[index]]
                else:
                    last_observations = []
            # Add entry from new_array only if it is not NaN
            if not np.isnan(new_array[index]):
                last_observations.append(new_array[index])
            positions_with_less_than3_obs[index] = last_observations

    # delta = x_n - mean(x_{n-1})
    delta = new_array.subtract(old_mean_array)
    # delta_normalized = delta / n
    delta_normalized = delta.divide(new_n_counter)
    # mean(x_n) = mean(x_{n-1}) + delta/n
    new_mean_array = old_mean_array.add(delta_normalized)
    for index, value in six.iteritems(new_mean_array):
        if np.isnan(value):
            if not np.isnan(old_mean_array[index]):
                new_mean_array[index] = old_mean_array[index]
            else:
                new_mean_array[index] = new_array[index]
    # mean_difference_current = x_n - mean(x_n)
    # mean_difference_previous = x_n - mean(x_{n-1})
    mean_difference_current = new_array.fillna(0) - new_mean_array.fillna(0)
    mean_difference_previous = new_array.fillna(0) - old_mean_array.fillna(0)
    # (x_n - mean(x_n)) * (x_n - mean(x_{n-1}))
    product = np.multiply(mean_difference_current, mean_difference_previous)
    # (n-1)S_n^2 - (n-2)S_{n-1}^2 = (x_n - mean(x_n))(x_n - mean(x_{n-1}))
    # old_sum_of_sq = (n-2)S_{n-1}^2
    old_sum_of_sq = (old_n_counter - 2).multiply(old_var_array.fillna(0))
    # new_sum_of_sq = (n-1)S_n^2
    new_sum_of_sq = old_sum_of_sq + product
    # If the count is less than 3, set the sum of squares to NA
    new_sum_of_sq[new_n_counter < 3] = np.nan
    # If the count just became 3, compute the variance
    for index, counts in six.iteritems(new_n_counter):
        if counts == 3:
            observations = positions_with_less_than3_obs[index]
            variance = np.var(observations)
            new_sum_of_sq[index] = variance
            # Delete it from the history
            del positions_with_less_than3_obs[index]

    new_var_array = new_sum_of_sq.divide(new_n_counter - 1)
    new_var_array[new_var_array == np.inf] = np.nan
    new_var_array[new_n_counter < 3] = np.nan
    return new_mean_array, new_var_array, new_n_counter, positions_with_less_than3_obs
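# A minimal usage sketch of the streaming mean/variance above, following the
# worked example in the docstring (three samples of unequal length):
#
#   mean, var, counts, carried = summary_stats_two_arrays_welch(
#       pd.Series([1, 2, 3]), pd.Series([1, 2, 3, 4]))
#   mean, var, counts, carried = summary_stats_two_arrays_welch(
#       mean, pd.Series([1, 2, 3, 4, 5]), var, counts, carried)
#
# After the second update the running mean is [1, 2, 3, 4, 5]; positions with
# fewer than three observations still report NaN variance.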
def path_leaf(path):
    """Get path's tail from a filepath"""
    head, tail = ntpath.split(path)
    return tail or ntpath.basename(head)
def parse_star_logs(infile, outfile=None):
    """Parse STAR logs into a dict

    Parameters
    ----------
    infile : str
        Path to Log.final.out file

    Returns
    -------
    star_info : dict
        Dict with necessary records parsed
    """
    ANNOTATIONS = [
        'total_reads', 'uniquely_mapped', 'uniquely_mapped_percent',
        'multi_mapped_percent', 'unmapped_percent', 'multi_mapped'
    ]
    star_info = OrderedDict()
    with open(infile) as fh:
        for line in fh:
            line = line.strip()
            if line.startswith('Number of input reads'):
                star_info[ANNOTATIONS[0]] = int(line.strip().split('\t')[1])
            elif line.startswith('Uniquely mapped reads number'):
                star_info[ANNOTATIONS[1]] = int(line.strip().split('\t')[1])
            elif line.startswith('Uniquely mapped reads %'):
                star_info[ANNOTATIONS[2]] = round(
                    float(line.strip('%').split('\t')[1]), 2)
            elif line.startswith('Number of reads mapped to multiple loci'):
                star_info[ANNOTATIONS[5]] = int(line.strip().split('\t')[1])
            elif line.startswith('Number of reads mapped to too many loci'):
                star_info[ANNOTATIONS[5]] += int(line.strip().split('\t')[1])
            elif line.startswith('% of reads mapped to multiple loci'):
                star_info[ANNOTATIONS[3]] = round(
                    float(line.strip('%').split('\t')[1]), 2)
            elif line.startswith('% of reads mapped to too many loci'):
                star_info[ANNOTATIONS[3]] += round(
                    float(line.strip('%').split('\t')[1]), 2)
            elif line.startswith('% of reads unmapped: too many mismatches'):
                star_info[ANNOTATIONS[4]] = round(
                    float(line.strip('%').split('\t')[1]), 2)
            elif line.startswith('% of reads unmapped: too short'):
                star_info[ANNOTATIONS[4]] += round(
                    float(line.strip('%').split('\t')[1]), 2)
            elif line.startswith('% of reads unmapped: other'):
                star_info[ANNOTATIONS[4]] += round(
                    float(line.strip('%').split('\t')[1]), 2)

    star_info = {
        key: round(star_info[key], 2)
        for key in list(star_info.keys())
    }
    if outfile is None:
        return star_info
    filename = path_leaf(infile)
    # Drop the 'Log.final.out' suffix to recover the sample name
    # (str.strip would strip a *set of characters*, not the suffix)
    filename = filename.replace('Log.final.out', '')
    counts_df = pd.DataFrame.from_dict(star_info, orient='index').T
    counts_df.index = [filename]
    if outfile:
        counts_df.to_csv(outfile, sep=str('\t'), index=True, header=True)
    return counts_df
def get_strandedness(filepath):
    """Parse output of infer_experiment.py from RSeQC to get strandedness.

    Parameters
    ----------
    filepath : str
        Path to infer_experiment.py output

    Returns
    -------
    strandedness : str
        reverse or forward or none
    """
    with open(filepath) as f:
        data = f.read()
    splitted = [x.strip() for x in data.split('\n') if len(x.strip()) >= 1]
    assert splitted[0] == 'This is SingleEnd Data'
    fwd_percentage = None
    rev_percentage = None
    for line in splitted[1:]:
        if 'Fraction of reads failed to determine:' in line:
            continue
        elif 'Fraction of reads explained by "++,--":' in line:
            fwd_percentage = float(line.split(':')[1])
        elif 'Fraction of reads explained by "+-,-+":' in line:
            rev_percentage = float(line.split(':')[1])

    assert rev_percentage is not None
    assert fwd_percentage is not None

    ratio = fwd_percentage / rev_percentage

    if np.isclose([ratio], [1]):
        return 'none'
    elif ratio >= 0.5:
        return 'forward'
    else:
        return 'reverse'
def load_pickle(filepath):
    """Read pickled files easily in Python 2/3"""
    if '.tsv' in filepath:
        raise IndexError('Expected a pickle file, got .tsv: {}'.format(filepath))
    if sys.version_info > (3, 0):
        pickled = pickle.load(open(filepath, 'rb'), encoding='latin1')
    else:
        pickled = pickle.load(open(filepath, 'rb'))
    return pickled
def pad_or_truncate(some_list, target_len):
    """Pad or truncate a list up to a given target length

    Parameters
    ----------
    some_list : list
        Input list
    target_len : int
        Final length of list

    If being extended, returns list padded with NAs.
    """
    return some_list[:target_len] + [np.nan] * (target_len - len(some_list))
def pad_five_prime_or_truncate(some_list, offset_5p, target_len):
    """Pad first the 5' end and then the 3' end, or truncate

    Parameters
    ----------
    some_list : list
        Input list
    offset_5p : int
        5' offset
    target_len : int
        Final length of list

    If being extended, returns list padded with NAs.
    """
    some_list = list(some_list)
    padded_5p = [np.nan] * offset_5p + some_list
    return padded_5p[:target_len] + [np.nan] * (target_len - len(padded_5p))
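# A minimal usage sketch of pad_five_prime_or_truncate (illustrative only):
#
#   >>> pad_five_prime_or_truncate([1, 2, 3], offset_5p=2, target_len=7)
#   [nan, nan, 1, 2, 3, nan, nan]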
def codon_to_anticodon(codon):
    """Codon to anticodon.

    Parameters
    ----------
    codon : string
        Input codon
    """
    pairs = {'A': 'T', 'C': 'G', 'T': 'A', 'G': 'C', 'N': 'N'}
    # Complement each base, then reverse to read 5'->3'
    return ''.join(pairs[c] for c in codon)[::-1]
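# A minimal usage sketch of codon_to_anticodon (illustrative only):
#
#   >>> codon_to_anticodon('ATG')
#   'CAT'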
def merge_intervals(intervals,
                    chromosome_lengths=None,
                    offset_5p=0,
                    offset_3p=0,
                    zero_based=True):
    """Collapse intervals into a non-overlapping set

    Parameters
    ----------
    intervals : list of Interval
    chromosome_lengths : dict
        A map of each chromosome's length
        Only used when offset_5p or offset_3p > 0
    offset_5p : int (positive)
        Number of bases to count upstream (5')
    offset_3p : int (positive)
        Number of bases to count downstream (3')
    zero_based : bool
        Indicate if the intervals are zero-based
        True means zero-based half open
        False means one-based fully closed

    Returns
    -------
    intervals_combined : list of Interval sorted by the start
        A merged version of intervals
        This is useful when the annotations are overlapping.
        Example:
        chr1 310 320 gene1 +
        chr1 319 324 gene1 +
        Returns:
        chr1 310 324 gene1 +
    gene_offset_5p : int
        Gene-wise 5 prime offset
        This might be different from `offset_5p` in cases where `offset_5p`
        leads to a negative coordinate
    gene_offset_3p : int
        Gene-wise 3 prime offset
        This might be different from `offset_3p` in cases where `offset_3p`
        leads to a position beyond the chromosome length
    """
    if not intervals:
        return ([], offset_5p, offset_3p)

    chroms = list(set([i.chrom for i in intervals]))
    strands = list(set([i.strand for i in intervals]))

    if len(chroms) != 1:
        sys.stderr.write('Error: chromosomes should be unique')
        return ([], offset_5p, offset_3p)
    if len(strands) != 1:
        sys.stderr.write('Error: strands should be unique')
        return ([], offset_5p, offset_3p)

    chrom = chroms[0]
    strand = strands[0]

    # Sort intervals by start
    intervals.sort(key=lambda x: x.start)

    # Find the first interval and the interval with the largest end
    first_interval = intervals[0]
    last_interval = intervals[-1]
    for i in intervals:
        if i.end > last_interval.end:
            last_interval = i

    if offset_5p != 0 or offset_3p != 0:
        if chromosome_lengths is not None and str(chrom) in chromosome_lengths:
            chrom_length = chromosome_lengths[str(chrom)]
        else:
            warnings.warn('Chromosome {} does not exist'.format(chrom),
                          UserWarning)
            chrom_length = np.inf
    else:
        chrom_length = np.inf

    if zero_based:
        lower_bound = 0
    else:
        lower_bound = 1
    upper_bound = chrom_length

    if strand == '+':
        if first_interval.start - offset_5p >= lower_bound:
            first_interval.start -= offset_5p
            gene_offset_5p = offset_5p
        else:
            gene_offset_5p = first_interval.start - lower_bound
            first_interval.start = lower_bound
        if last_interval.end + offset_3p <= upper_bound:
            last_interval.end += offset_3p
            gene_offset_3p = offset_3p
        else:
            gene_offset_3p = upper_bound - last_interval.end
            last_interval.end = upper_bound
    else:
        if last_interval.end + offset_5p <= upper_bound:
            last_interval.end += offset_5p
            gene_offset_5p = offset_5p
        else:
            gene_offset_5p = upper_bound - last_interval.end
            last_interval.end = upper_bound
        if first_interval.start - offset_3p >= lower_bound:
            first_interval.start -= offset_3p
            gene_offset_3p = offset_3p
        else:
            gene_offset_3p = first_interval.start - lower_bound
            first_interval.start = lower_bound

    # Merge overlapping intervals
    to_merge = Interval(chrom, first_interval.start, first_interval.end,
                        strand)
    intervals_combined = []
    for i in intervals:
        if i.start <= to_merge.end:
            to_merge.end = max(to_merge.end, i.end)
        else:
            intervals_combined.append(to_merge)
            to_merge = Interval(chrom, i.start, i.end, strand)
    intervals_combined.append(to_merge)
    return (intervals_combined, gene_offset_5p, gene_offset_3p)
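# A minimal sketch of merging the overlapping annotation from the docstring
# (Interval is the class imported from .interval above; no offsets applied):
#
#   ivs = [Interval('chr1', 310, 320, '+'), Interval('chr1', 319, 324, '+')]
#   merged, off5p, off3p = merge_intervals(ivs)
#   # merged -> one Interval spanning chr1:310-324 (+); off5p == off3p == 0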
def summarize_counters(samplewise_dict):
    """Summarize gene counts for a collection of samples.

    Parameters
    ----------
    samplewise_dict : dict
        A dictionary with key as sample name and value as another dictionary
        of counts for each gene

    Returns
    -------
    totals : dict
        A dictionary with key as sample name and value as total gene count
    """
    totals = {}
    for key, sample_dict in six.iteritems(samplewise_dict):
        totals[key] = np.nansum(
            [np.nansum(d) for d in list(sample_dict.values())])
    return totals
def complementary_strand(strand):
    """Get complementary strand

    Parameters
    ----------
    strand : string
        +/-

    Returns
    -------
    rs : string
        -/+
    """
    if strand == '+':
        return '-'
    elif strand == '-':
        return '+'
    else:
        raise ValueError('Not a valid strand: {}'.format(strand))
def read_refseq_bed(filepath):
    """Read RefSeq bed12 from UCSC.

    Parameters
    ----------
    filepath : string
        Location of bed12

    Returns
    -------
    refseq : dict
        dict with keys as chromosome and values as an intervaltree
        of transcripts
    """
    refseq = defaultdict(IntervalTree)
    with open(filepath, 'r') as fh:
        for line in fh:
            line = line.strip()
            if line.startswith(('#', 'track', 'browser')):
                continue
            fields = line.split('\t')
            chrom, tx_start, tx_end, name, score, strand = fields[:6]
            tx_start = int(tx_start)
            tx_end = int(tx_end)
            refseq[chrom].insert(tx_start, tx_end, strand)
    return refseq
def read_bed_as_intervaltree(filepath):
    """Read bed as interval tree

    Useful for reading start/stop codon beds

    Parameters
    ----------
    filepath : string
        Location of bed

    Returns
    -------
    bedint_tree : dict
        dict with keys as chromosome and values as an intervaltree
        of (start, end, strand)
    """
    bed_df = pybedtools.BedTool(filepath).sort().to_dataframe()
    bed_df = _fix_bed_coltype(bed_df)
    bed_grouped = bed_df.groupby('chrom')

    bedint_tree = defaultdict(IntervalTree)
    for chrom, df in bed_grouped:
        df_list = zip(df['start'], df['end'], df['strand'])
        for start, end, strand in df_list:
            bedint_tree[chrom].insert(start, end, strand)
    return bedint_tree
def read_chrom_sizes(filepath):
    """Read chr.sizes file sorted by chromosome name

    Parameters
    ----------
    filepath : string
        Location of chr.sizes

    Returns
    -------
    chrom_lengths : list of tuple
        A list of tuples with chromosome name and their size
    """
    chrom_lengths = []
    with open(filepath, 'r') as fh:
        for line in fh:
            chrom, size = line.strip().split('\t')
            chrom_lengths.append((chrom, int(size)))
    chrom_lengths = list(sorted(chrom_lengths, key=lambda x: x[0]))
    return chrom_lengths
def create_bam_index(bam):
    """Create bam index.

    Parameters
    ----------
    bam : str
        Path to bam file
    """
    if isinstance(bam, pysam.AlignmentFile):
        bam = bam.filename
    if not os.path.exists('{}.bai'.format(bam)):
        pysam.index(bam)
def is_read_uniq_mapping(read):
    """Check if read is uniquely mappable.

    Parameters
    ----------
    read : pysam.Alignment.fetch object

    Most reliable: ['NH'] tag
    """
    # Filter out secondary alignments
    if read.is_secondary:
        return False
    tags = dict(read.get_tags())
    try:
        nh_count = tags['NH']
    except KeyError:
        # Reliable in case of STAR
        if read.mapping_quality == 255:
            return True
        if read.mapping_quality < 1:
            return False
        # NH tag not set, so rely on flags
        if read.flag in __SAM_NOT_UNIQ_FLAGS__:
            return False
        else:
            raise RuntimeError('Malformed BAM?')
    if nh_count == 1:
        return True
    return False
def find_first_non_none(positions):
    """Given a list of positions, find the index and value of the first
    non-None element.

    This method is specifically designed for pysam, which has a weird way
    of returning reference positions: get_reference_positions reports
    mismatched/soft-clipped positions as None. query_alignment_start and
    query_alignment_end give the indexes of positions in the read that
    technically align, yet a position is still set to None if it does
    not align.

    Parameters
    ----------
    positions : list of int
        Positions as returned by pysam.fetch.get_reference_positions

    Returns
    -------
    index : int
        Index of first non-None value
    position : int
        Value at that index
    """
    for idx, position in enumerate(positions):
        if position is not None:
            return idx, position
def find_last_non_none(positions):
    """Given a list of positions, find the index and value of the last
    non-None element.

    This function is similar to `find_first_non_none`, but operates on the
    reversed list. It is specifically useful for reverse strand cases.

    Parameters
    ----------
    positions : list of int
        Positions as returned by pysam.fetch.get_reference_positions

    Returns
    -------
    index : int
        Index (counted from the end) of the last non-None value
    position : int
        Value at that index
    """
    return find_first_non_none(positions[::-1])
# NOTE: We can in principle do a longer metagene analysis
# using this helper function
def yield_intervals(chrom_size, chunk_size=20000):
    """Yield (start, end) chunks tiling a chromosome of the given size."""
    for start in np.arange(0, chrom_size, chunk_size):
        end = start + chunk_size
        if end > chrom_size:
            yield (start, chrom_size)
        else:
            yield (start, end)
def bwsum(bw, chunk_size=5000, scale_to=1e6):
    """Sum a bigWig's signal chunk-wise and derive a scaling factor.

    Parameters
    ----------
    bw : str or pyBigWig object
        Path to bigWig (or an already open handle)
    chunk_size : int
        Size of chunks to read at a time
    scale_to : float
        Target total signal for the scale factor

    Returns
    -------
    bw_sum : float
        Total (nan-aware) signal in the bigWig
    scale_factor : float
        Factor that rescales the total signal to `scale_to`
    """
    bw_sum = 0
    if isinstance(bw, six.string_types):
        bw = pyBigWig.open(bw)
    chrom_sizes = bw.chroms()
    for chrom, chrom_size in six.iteritems(chrom_sizes):
        for start, end in yield_intervals(chrom_size, chunk_size):
            bw_sum += np.nansum(bw.values(chrom, start, end))
    scale_factor = 1 / (bw_sum / scale_to)
    return bw_sum, scale_factor
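# A minimal usage sketch of bwsum (the path 'sample.bw' is hypothetical):
#
#   total_signal, scale_factor = bwsum('sample.bw')
#   # scale_factor rescales the bigWig so that its summed signal is 1e6;
#   # it can be fed directly to scale_bigwig below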
def scale_bigwig(inbigwig, chrom_sizes, outbigwig, scale_factor=1):
    """Scale a bigwig by a certain factor.

    Parameters
    ----------
    inbigwig : string
        Path to input bigwig
    chrom_sizes : string
        Path to chrom.sizes file
    outbigwig : string
        Path to output bigwig
    scale_factor : float
        Scale by value
    """
    wigfile = os.path.abspath('{}.wig'.format(outbigwig))
    chrom_sizes = os.path.abspath(chrom_sizes)
    inbigwig = os.path.abspath(inbigwig)
    outbigwig = os.path.abspath(outbigwig)
    if os.path.isfile(wigfile):
        # wiggletools errors out if the file already exists
        os.remove(wigfile)
    cmds = [
        'wiggletools', 'write', wigfile, 'scale',
        str(scale_factor), inbigwig
    ]
    try:
        p = subprocess.Popen(
            cmds,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True)
        stdout, stderr = p.communicate()
        rc = p.returncode
        if rc != 0:
            raise RuntimeError(
                'Error running wiggletools.\nstdout : {} \n stderr : {}'.
                format(stdout, stderr))
    except FileNotFoundError:
        raise FileNotFoundError('wiggletools not found on the path. '
                                'Use `conda install wiggletools`')
    cmds = ['wigToBigWig', wigfile, chrom_sizes, outbigwig]
    try:
        p = subprocess.Popen(
            cmds,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True)
        stdout, stderr = p.communicate()
        rc = p.returncode
        if rc != 0:
            raise RuntimeError(
                'Error running wigToBigWig.\nstdout : {} \n stderr : {}'.
                format(stdout, stderr))
        os.remove(wigfile)
    except FileNotFoundError:
        raise FileNotFoundError(
            'wigToBigWig not found on the path. This is an external '
            'tool from UCSC which can be downloaded from '
            'http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatively, use '
            '`conda install ucsc-wigtobigwig`')
def get_region_sizes(region_bed):
    """Get summed-up size of a CDS/UTR region from a bed file

    Parameters
    ----------
    region_bed : string
        Input bed file

    Returns
    -------
    region_sizes : pd.Series
        Series with region name as index and size as value
    """
    if isinstance(region_bed, six.string_types):
        region_bed = pybedtools.BedTool(region_bed).to_dataframe()
    region_bed_grouped = region_bed.groupby('name')
    region_sizes = {}
    for gene_name, gene_group in region_bed_grouped:
        # Get rid of trailing dots (version suffixes such as '.1')
        gene_name = re.sub(r'\.[0-9]+', '', gene_name)
        # Collect all intervals at once
        intervals = zip(gene_group['chrom'], gene_group['start'],
                        gene_group['end'], gene_group['strand'])
        for interval in intervals:
            if gene_name not in region_sizes:
                # End is always 1-based so does not require +1
                region_sizes[gene_name] = interval[2] - interval[1]
            else:
                region_sizes[gene_name] += interval[2] - interval[1]
    return pd.Series(region_sizes)
def htseq_to_tpm(htseq_f, outfile, cds_bed_f):
    """Convert htseq-count output file to TPM

    Parameters
    ----------
    htseq_f : string
        Path to htseq-count output
    outfile : string
        Path to output file with TPM values
    cds_bed_f : string
        Path to CDS/genesize bed file
    """
    cds_bed = pybedtools.BedTool(cds_bed_f).to_dataframe()
    cds_bed_sizes = get_region_sizes(cds_bed)
    htseq = pd.read_table(htseq_f, names=['name', 'counts']).set_index('name')
    # Drop the trailing summary rows that htseq-count appends
    htseq = htseq.iloc[:-5]
    if htseq.shape[0] <= 10:
        print('Empty dataframe for : {}\n'.format(htseq_f))
        return None
    rate = np.log(htseq['counts']).subtract(np.log(cds_bed_sizes))
    denom = np.log(np.sum(np.exp(rate)))
    tpm = np.exp(rate - denom + np.log(1e6))
    tpm = pd.DataFrame(tpm, columns=['tpm'])
    tpm = tpm.sort_values(by=['tpm'], ascending=False)
    tpm.to_csv(outfile, sep='\t', index=True, header=False)
def counts_to_tpm(counts, sizes):
    """Convert counts to TPM

    Parameters
    ----------
    counts : array like
        Series/array of counts
    sizes : array like
        Series/array of region sizes

    Returns
    -------
    tpm : array like
        TPM values
    """
    # Work in log space: rate = log(counts/sizes), then normalize so the
    # rates sum to 1e6
    rate = np.log(counts).subtract(np.log(sizes))
    denom = np.log(np.sum(np.exp(rate)))
    tpm = np.exp(rate - denom + np.log(1e6))
    return tpm
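# A worked toy example of counts_to_tpm (inputs must be pandas Series,
# since the function relies on Series.subtract): two regions with equal
# count/size rates split the 1e6 total evenly.
#
#   >>> counts_to_tpm(pd.Series([10, 20]), pd.Series([100, 200]))
#   0    500000.0
#   1    500000.0
#   dtype: float64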
def featurecounts_to_tpm(fc_f, outfile):
    """Convert featureCounts output file to TPM

    Parameters
    ----------
    fc_f : string
        Path to featureCounts output
    outfile : string
        Path to output file with TPM values
    """
    feature_counts = pd.read_table(fc_f, skiprows=[0])
    feature_counts = feature_counts.drop(
        columns=['Geneid', 'Chr', 'Start', 'End', 'Strand'])
    lengths = feature_counts['Length']
    feature_counts = feature_counts.drop(columns=['Length'])
    tpm = feature_counts.apply(lambda x: counts_to_tpm(x, lengths), axis=0)
    tpm.to_csv(outfile, sep='\t', index=False, header=True)
def read_htseq(htseq_f):
    """Read HTSeq file.

    Parameters
    ----------
    htseq_f : str
        Path to htseq counts file

    Returns
    -------
    htseq_df : dataframe
        HTSeq counts as a dataframe
    """
    htseq = pd.read_table(htseq_f, names=['name', 'counts']).set_index('name')
    # Drop the trailing summary rows that htseq-count appends
    htseq = htseq.iloc[:-5]
    if htseq.shape[0] <= 10:
        sys.stderr.write('Empty dataframe for : {}\n'.format(htseq_f))
        return None
    return htseq
def read_enrichment(read_lengths,
                    enrichment_range=range(28, 33),
                    input_is_stream=False,
                    input_is_file=True):
    """Calculate read enrichment for a certain range of lengths

    Parameters
    ----------
    read_lengths : Counter
        A counter with read lengths and their counts
    enrichment_range : range or str
        Range of reads to concentrate upon (28-32 or range(28,33))
    input_is_stream : bool
        True if input is sent through stdin

    Returns
    -------
    ratio : float
        Enrichment in this range (scale 0-1)
    pvalue : float
        Probability of a read length falling outside the enrichment_range,
        assuming lengths are normally distributed
    """
    if isinstance(read_lengths, pd.Series):
        pass
    elif input_is_file:
        read_lengths = os.path.abspath(read_lengths)
        if not check_file_exists(read_lengths):
            raise RuntimeError('{} does not exist.'.format(read_lengths))
        read_lengths = pd.read_table(read_lengths, sep='\t')
        read_lengths = pd.Series(
            read_lengths['count'].tolist(),
            index=read_lengths['read_length'].tolist()).add(
                pd.Series(
                    [0] * len(enrichment_range),
                    index=list(enrichment_range)),
                fill_value=0)
    elif input_is_stream:
        counter = {}
        for line in read_lengths:
            splitted = list(map(lambda x: int(x), line.strip().split('\t')))
            counter[splitted[0]] = splitted[1]
        read_lengths = Counter(counter)

    if isinstance(read_lengths, Counter):
        read_lengths = pd.Series(read_lengths)
    if isinstance(enrichment_range, six.string_types):
        splitted = list(
            map(lambda x: int(x), enrichment_range.strip().split('-')))
        enrichment_range = range(splitted[0], splitted[1] + 1)
    rpf_signal = read_lengths.get(list(enrichment_range)).sum()
    total_signal = read_lengths.sum()
    read_lengths = read_lengths.sort_index()
    # Expand the length histogram into individual observations and fit
    # a normal distribution
    array = [[x] * int(y)
             for x, y in zip(read_lengths.index, read_lengths.values)]
    mean_length, std_dev_length = stats.norm.fit(
        np.concatenate(array).ravel().tolist())
    # 1 - P(x1 < X < x2) = P(X < x1) + P(X > x2) = cdf(x1) + sf(x2)
    cdf_min = stats.norm.cdf(
        min(enrichment_range), mean_length, std_dev_length)
    sf_max = stats.norm.sf(max(enrichment_range), mean_length, std_dev_length)
    pvalue = cdf_min + sf_max
    ratio = rpf_signal / float(total_signal)
    return ratio, pvalue
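# A minimal usage sketch of read_enrichment (the path is hypothetical; the
# tab-separated file needs 'read_length' and 'count' columns):
#
#   ratio, pvalue = read_enrichment('read_lengths.tsv', range(28, 33))
#   # ratio is the fraction of signal in 28-32 nt reads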
def bwshift(bw, shift_by, out_bw, chunk_size=20000):
    """Given a bigWig, shift all its values by `shift_by`

    Shifting by 10:

    variableStep chrom=chr span=1
    1 1
    2 2
    3 5
    4 6
    5 5
    6 3
    7 3
    8 5
    9 5
    10 5
    11 6
    12 6
    13 0
    14 2
    15 3
    16 3
    17 10
    18 4
    19 4
    20 2
    21 2
    22 2
    23 1

    shifted by 10

    variableStep chrom=chr span=1
    1 6
    2 6
    3 0
    4 2
    5 3
    6 3
    7 10
    8 4
    9 4
    10 2
    11 2
    12 2
    13 1
    """
    if isinstance(bw, six.string_types):
        bw = pyBigWig.open(bw)
    chrom_sizes = bw.chroms()
    out_bw = pyBigWig.open(out_bw, 'w')
    out_bw.addHeader(list(chrom_sizes.items()), maxZooms=0)
    for chrom, chrom_size in six.iteritems(chrom_sizes):
        for start, end in yield_intervals(chrom_size, chunk_size):
            shifted_values = []
            shifted_end = end + shift_by
            shifted_start = start + shift_by
            # Case 0: the entire shifted window lies beyond the chromosome
            # end, so every output position gets 0
            if shifted_start >= chrom_size:
                shifted_values = np.zeros(end - start)
            # Case 1: shifted_start < chrom_start and shifted_end < chrom_end
            # Need to set the first |shifted_start| bases to 0
            elif shifted_start < 0 and shifted_end < chrom_size:
                zero_padding = np.array([0.0] * np.abs(shifted_start))
                shifted_values = np.concatenate(
                    [zero_padding,
                     bw.values(chrom, 0, shifted_end)])
            # Case 2: shifted_start < 0 and shifted_end > chrom_end
            elif shifted_start < 0 and shifted_end >= chrom_size:
                zero_padding_start = np.array([0.0] * np.abs(shifted_start))
                zero_padding_end = np.array([0.0] * (shifted_end - chrom_size))
                shifted_values = np.concatenate([
                    zero_padding_start,
                    bw.values(chrom, 0, chrom_size), zero_padding_end
                ])
            # Case 3: shifted_start >= 0 and shifted_end > chrom_end
            elif shifted_start >= 0 and shifted_end >= chrom_size:
                zero_padding_end = np.array([0.0] * (shifted_end - chrom_size))
                shifted_values = np.concatenate([
                    bw.values(chrom, shifted_start, chrom_size),
                    zero_padding_end
                ])
            # Case 4: both shifted ends fall within the chromosome
            elif shifted_start >= 0 and shifted_end < chrom_size:
                shifted_values = bw.values(chrom, shifted_start, shifted_end)
            else:
                raise NotImplementedError(
                    'Unhandled case: shifted_start = {} | shifted_end = {} | chrom_size = {}'
                    .format(shifted_start, shifted_end, chrom_size))
            assert end - start == len(
                shifted_values
            ), 'end = {} | shifted_end = {} | start = {} | shifted_start = {} | values_len = {}'.format(
                end, shifted_end, start, shifted_start, len(shifted_values))
            starts = np.arange(start, end)
            ends = starts + 1
            out_bw.addEntries([chrom] * (end - start), starts, ends,
                              shifted_values)
    # Close to flush the output bigWig to disk
    out_bw.close()