Source code for riboraptor.utils

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
from tqdm import tqdm
import numpy as np
import re
import pickle
import os

import pandas as pd
from textwrap import dedent
from .helpers import mkdir_p
from .helpers import symlink_force


[docs]def load_tpm(path):
    df = pd.read_table(path, names=['gene_id', 'tpm']).set_index('gene_id')
    return df


[docs]def get_cell_line_or_tissue(row):
    if str(row['cell_line']).strip() and str(
            row['cell_line']).strip() != 'nan':
        return '{}-{}-{}'.format(row['cell_line'], row['study_accession'],
                                 row['experiment_accession'])
    if str(row['tissue']).strip() and str(row['tissue']).strip() != 'nan':
        return '{}-{}-{}'.format(row['tissue'], row['study_accession'],
                                 row['experiment_accession'])
    if str(row['source_name']).strip() and str(
            row['source_name']).strip() != 'nan':
        return '{}-{}-{}'.format(row['source_name'], row['study_accession'],
                                 row['experiment_accession'])
    if row['study_accession'].strip() == 'SRP052229':
        print(row)
    return '{}-{}-{}'.format(row['source_name'], row['study_accession'],
                             row['experiment_accession'])


[docs]def determine_cell_type(sample_attribute):
    sample_attribute = str(sample_attribute)
    if 'cell line:' in sample_attribute:
        x = re.search(r'cell line: \w+', sample_attribute)
        return x.group(0).strip('cell line: ').rstrip(' ').upper()
    if 'cell_line:' in sample_attribute:
        x = re.search(r'cell_line: \w+', sample_attribute)
        return x.group(0).strip('cell_line: ').rstrip(' ').upper()
    if 'cell-line:' in sample_attribute:
        x = re.search(r'cell-line: \w+', sample_attribute)
        return x.group(0).strip('cell-line: ').rstrip(' ').upper()
    if 'cell_type:' in sample_attribute:
        x = re.search(r'cell_type: \w+', sample_attribute)
        return x.group(0).strip('cell_type: ').rstrip(' ').upper()
    if 'source_name:' in sample_attribute:
        x = re.search(r'source_name: \w+', sample_attribute)
        return x.group(0).strip('source_name: ').rstrip(' ').upper()
    else:
        #pass
        print('Found {}'.format(sample_attribute))
        return np.nan


[docs]def get_tissue_type(sample_attribute):
    sample_attribute = str(sample_attribute)
    if 'tissue: ' in sample_attribute:
        x = re.search(r'tissue: \w+', sample_attribute)
        return x.group(0).strip('tissue: ').rstrip(' ').lower()
    else:
        print('Found {}'.format(sample_attribute))
        return np.nan


[docs]def get_strain_type(sample_attribute):
    sample_attribute = str(sample_attribute)
    if 'strain: ' in sample_attribute:
        x = re.search(r'strain: \w+', sample_attribute)
        return x.group(0).strip('strain: ').rstrip(' ').lower()
    else:
        print('Found {}'.format(sample_attribute))
        return np.nan


[docs]def summary_starlogs_over_runs(directory, list_of_srr):
    df = pd.DataFrame()
    files_not_found = []
    for run in list_of_srr:
        if not os.path.isfile(os.path.join(directory, run + '.tsv')):
            files_not_found.append(run)
            continue
        temp_df = pd.read_table(os.path.join(directory, run + '.tsv'))
        df = pd.concat([df, temp_df])
    return df, files_not_found


[docs]def get_enrichment_cds_stats(pickle_file):
    data = pickle.load(open(pickle_file, 'rb'))
    mean = np.nanmean(data.values())
    median = np.nanmedian(data.values())
    stddev = np.nanstd(data.values())
    minx = np.nanmin(data.values())
    maxx = np.nanmax(data.values())
    return minx, maxx, mean, median, stddev


[docs]def get_fragment_enrichment_score(txt_file):
    with open(txt_file) as fh:
        data = fh.read()
    enrichment = data.strip('\(').strip('\)').strip(' ').strip()
    enrichment, pval = enrichment.split(',')
    if 'nan' not in enrichment:
        return float(enrichment.strip('Enrichment: ')), float(
            pval.strip(')').strip('pval: '))
    else:
        return np.nan, 1


re_ribo_root_dir = '/staging/as/skchoudh/SRA_datasets/'
samples_to_process_dir = '/staging/as/skchoudh/re-ribo-datasets/'
re_ribo_config_dir = '/home/cmb-panasas2/skchoudh/github_projects/re-ribo-smk/snakemake/configs'
re_ribo_analysis_dir = '/staging/as/skchoudh/re-ribo-analysis/'
riboraptor_annotation_dir = '/home/cmb-panasas2/skchoudh/github_projects/riboraptor/riboraptor/annotation/'

genome_annotation_map = {
    'hg38': 'v25',
    'mm10': 'vM11',
    'mg1655': '',
    'sacCerR64': 'v91',
    'MG1655': 'ASM584v2.38',
    'BDGP6': 'v91',
    'GRCz10': 'v91',
    'panTro3': 'v94',
    'Mmul8': 'v94'
}
taxon_id_map = {
    10090: 'mm10',
    9606: 'hg38',
    4932: 'sacCerR64',
    511145: 'MG1655',
    7227: 'BDGP6',
    7955: 'GRCz10',
    9598: 'panTro3',
    9544: 'Mmul8'
}

genome_fasta_map = {
    'hg38':
    '/home/cmb-panasas2/skchoudh/genomes/hg38/fasta/hg38.fa',
    'mm10':
    '/home/cmb-panasas2/skchoudh/genomes/mm10/fasta/mm10.fa',
    'sacCerR64':
    '/home/cmb-panasas2/skchoudh/genomes/sacCerR64/fasta/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa',
    'MG1655':
    '/home/cmb-panasas2/skchoudh/genomes/escherichia_coli_str_k_12_substr_mg1655/fasta/Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.fa',
    'BDGP6':
    '/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/fasta/Drosophila_melanogaster.BDGP6.dna.toplevel.fa',
    'GRCz10':
    '/home/cmb-panasas2/skchoudh/genomes/GRCz10/fasta/Danio_rerio.GRCz10.dna.toplevel.fa',
    'panTro3':
    '/home/cmb-panasas2/skchoudh/genomes/panTro3/fasta/Pan_troglodytes.Pan_tro_3.0.dna.toplevel.fa',
    'Mmul8':
    '/home/cmb-panasas2/skchoudh/genomes/Mmul8/fasta/Macaca_mulatta.Mmul_8.0.1.dna.toplevel.fa'
}

chrom_sizes_map = {
    'hg38':
    '/home/cmb-panasas2/skchoudh/genomes/hg38/fasta/hg38.chrom.sizes',
    'mm10':
    '/home/cmb-panasas2/skchoudh/genomes/mm10/fasta/mm10.chrom.sizes',
    'sacCerR64':
    '/home/cmb-panasas2/skchoudh/genomes/sacCerR64/fasta/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.sizes',
    'MG1655':
    '/home/cmb-panasas2/skchoudh/genomes/escherichia_coli_str_k_12_substr_mg1655/fasta/Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.sizes',
    'BDGP6':
    '/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/fasta/Drosophila_melanogaster.BDGP6.dna.toplevel.sizes',
    'GRCz10':
    '/home/cmb-panasas2/skchoudh/genomes/GRCz10/fasta/Danio_rerio.GRCz10.dna.toplevel.sizes',
    'panTro3':
    '/home/cmb-panasas2/skchoudh/genomes/panTro3/fasta/Pan_troglodytes.Pan_tro_3.0.dna.toplevel.sizes',
    'Mmul8':
    '/home/cmb-panasas2/skchoudh/genomes/Mmul8/fasta/Macaca_mulatta.Mmul_8.0.1.dna.toplevel.sizes'
}

star_index_map = {
    'hg38':
    '/home/cmb-panasas2/skchoudh/genomes/hg38/star_annotated',
    'mm10':
    '/home/cmb-panasas2/skchoudh/genomes/mm10/star_annotated',
    'sacCerR64':
    '/home/cmb-panasas2/skchoudh/genomes/sacCerR64/star_annotated',
    'MG1655':
    '/home/cmb-panasas2/skchoudh/genomes/escherichia_coli_str_k_12_substr_mg1655/star_annotated',
    'BDGP6':
    '/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/star_annotated',
    'GRCz10':
    '/home/cmb-panasas2/skchoudh/genomes/GRCz10/star_annotated',
    'panTro3':
    '/home/cmb-panasas2/skchoudh/genomes/panTro3/star_annotated',
    'Mmul8':
    '/home/cmb-panasas2/skchoudh/genomes/Mmul8/star_annotated'
}

gtf_map = {
    'hg38':
    '/home/cmb-panasas2/skchoudh/genomes/hg38/annotation/gencode.v25.annotation.gtf',
    'mm10':
    '/home/cmb-panasas2/skchoudh/genomes/mm10/annotation/gencode.vM11.annotation.gtf',
    'sacCerR64':
    '/home/cmb-panasas2/skchoudh/genomes/sacCerR64/annotation/Saccharomyces_cerevisiae.R64-1-1.91.gtf',
    'MG1655':
    '/home/cmb-panasas2/skchoudh/genomes/escherichia_coli_str_k_12_substr_mg1655/annotation/Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.38.gtf',
    'BDGP6':
    '/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/annotation/Drosophila_melanogaster.BDGP6.91.gtf',
    'GRCz10':
    '/home/cmb-panasas2/skchoudh/genomes/GRCz10/annotation/Danio_rerio.GRCz10.91.gtf',
    'Mmul8':
    '/home/cmb-panasas2/skchoudh/genomes/Mmul8/annotation/Macaca_mulatta.Mmul_8.0.1.94.gtf',
    'panTro3':
    '/home/cmb-panasas2/skchoudh/genomes/panTro3/annotation/Pan_troglodytes.Pan_tro_3.0.94.gtf'
}


[docs]def filter_single_end_samples(df):
    """Filter single end samples from a dataframe

    Parameters
    ----------
    df: DataFrame
        Dataframe as obtained from SRAb.sra_convert()

    Returns
    -------
    df: DataFrame
        DataFrame with only single end samples
    """
    df = df[~df['library_strategy'].str.contains('PAIRED')]
    return df


[docs]def copy_sra_data(df,
                  sra_source_dir='/staging/as/skchoudh/SRA_datasets/',
                  sra_dest_dir='/staging/as/skchoudh/re-ribo-datasets/'):
    """Copy SRA data to a new location retaining only single ended samples."""
    df = filter_single_end_samples(df)
    assert len(df.study_accession.unique()) == 1, 'Multiple SRPs found'
    srp = df.study_accession.unique()[0]
    df_grouped = df.groupby(['taxon_id'])
    srp_source_dir = os.path.join(sra_source_dir, srp)

    for taxon_id, df_group in df_grouped:
        species = taxon_id_map[taxon_id]
        species_dest_dir = os.path.join(sra_dest_dir, species)
        srp_dest_dir = os.path.join(species_dest_dir, srp)
        mkdir_p(os.path.join(species_dest_dir, srp))
        source_loc = srp_source_dir + os.path.sep + df_group[
            'experiment_accession'].str.cat(
                df_group['run_accession'] + '.sra', sep=os.path.sep)
        dest_loc = srp_dest_dir + os.path.sep + df_group[
            'experiment_accession'].str.cat(
                df_group['run_accession'] + '.sra', sep=os.path.sep)
        with tqdm(total=len(source_loc)) as pbar:
            for source, dest in zip(source_loc, dest_loc):
                mkdir_p(os.path.dirname(dest))
                if os.path.isfile(source):
                    symlink_force(source, dest)
                pbar.update()


[docs]def write_config(species, srp):
    rawdata_dir = os.path.join(samples_to_process_dir, species, srp)
    out_dir = os.path.join(re_ribo_analysis_dir, species, srp)
    gene_bed = os.path.join(riboraptor_annotation_dir, species,
                            genome_annotation_map[species], 'gene.bed.gz')
    utr5_bed = os.path.join(riboraptor_annotation_dir, species,
                            genome_annotation_map[species], 'utr5.bed.gz')
    utr3_bed = os.path.join(riboraptor_annotation_dir, species,
                            genome_annotation_map[species], 'utr3.bed.gz')
    intron_bed = os.path.join(riboraptor_annotation_dir, species,
                              genome_annotation_map[species], 'intron.bed.gz')
    start_codon_bed = os.path.join(riboraptor_annotation_dir, species,
                                   genome_annotation_map[species],
                                   'start_codon.bed.gz')
    stop_codon_bed = os.path.join(riboraptor_annotation_dir, species,
                                  genome_annotation_map[species],
                                  'stop_codon.bed.gz')
    cds_bed = os.path.join(riboraptor_annotation_dir, species,
                           genome_annotation_map[species], 'cds.bed.gz')
    genome_fasta = genome_fasta_map[species]
    gtf = gtf_map[species]
    chrom_sizes = chrom_sizes_map[species]
    star_index = star_index_map[species]
    to_write = """
    RAWDATA_DIR = '{}'
    OUT_DIR = '{}'
    GENOME_FASTA = '{}'
    CHROM_SIZES = '{}'
    STAR_INDEX = '{}'
    GTF = '{}'
    GENE_BED = '{}'
    STAR_CODON_BED = '{}'
    STOP_CODON_BED = '{}'
    CDS_BED = '{}'
    UTR5_BED = '{}'
    UTR3_BED = '{}'
    INTRON_BED = '{}'
    ORIENTATIONS = ['5prime', '3prime']
    STRANDS = ['pos', 'neg', 'combined']
    FRAGMENT_LENGTHS =  range(18, 39)
    """.format(rawdata_dir, out_dir, genome_fasta, chrom_sizes, star_index,
               gtf, gene_bed, start_codon_bed, stop_codon_bed, cds_bed,
               utr5_bed, utr3_bed, intron_bed)
    return dedent(to_write)


[docs]def create_config_file(df):
    df_grouped = df.groupby(['taxon_id'])

    for taxon_id, df_group in df_grouped:
        assert len(
            df_group['study_accession'].unique()) == 1, 'Multiple SRPs found'
        species = taxon_id_map[taxon_id]
        srp = df_group['study_accession'].unique()[0]
        with open(
                os.path.join(re_ribo_config_dir, '{}_{}.py'.format(
                    species, srp)), 'w') as fh:

            config = write_config(species, srp)
            fh.write(config)
            print('Wrote {}'.format(
                os.path.join(re_ribo_config_dir, '{}_{}.py'.format(
                    species, srp))))
Contents

Source code for riboraptor.utils