Source code for riboraptor.sradb

"""Helper functions for parsing SRAmetadb.sqlite file"""

import re
import sqlite3
import pandas as pd
"""
Tables

1. Study
> dbListFields(sra_con,"study")
"study_ID"    "study_alias"    "study_accession"    "study_title"    "study_type"
"study_abstract"    "broker_name"    "center_name"    "center_project_name"
"study_description"    "related_studies"    "primary_study"    "sra_link"
"study_url_link"    "xref_link"    "study_entrez_link"    "ddbj_link"    "ena_link"


2. Sample
> dbListFields(sra_con,"sample")
"sample_ID"    "sample_alias"    "sample_accession"    "broker_name"
"center_name"    "taxon_id"    "scientific_name"    "common_name"
"anonymized_name"    "individual_name"    "description"    "sra_link"
"sample_url_link"    "xref_link"    "sample_entrez_link"   "ddbj_link"
"ena_link"    "sample_attribute"    "submission_accession" "sradb_updated"

3. Experiment
> dbListFields(sra_con,"experiment")
"experiment_ID"    "bamFile"    "fastqFTP"    "experiment_alias"
"experiment_accession"    "broker_name"    "center_name"    "title"
"study_name"    "study_accession"    "design_description"    "sample_name"
"sample_accession"    "sample_member"    "library_name"    "library_strategy"
"library_source"    "library_selection"    "library_layout"    "targeted_loci"
"library_construction_protocol" "spot_length"    "adapter_spec"    "read_spec"
"platform"    "instrument_model"    "platform_parameters"    "sequence_space"
"base_caller"    "quality_scorer"    "number_of_levels"    "multiplier"
"qtype"    "sra_link"    "experiment_url_link"    "xref_link"
"experiment_entrez_link"    "ddbj_link"    "ena_link"    "experiment_attribute"
"submission_accession"    "sradb_updated"

4. Run
> dbListFields(sra_con,"run")
"run_ID"    "bamFile"    "run_alias"    "run_accession"
"broker_name"    "instrument_name"    "run_date"    "run_file"
"run_center"    "total_data_blocks"    "experiment_accession"    "experiment_name"
"sra_link"    "run_url_link"    "xref_link"    "run_entrez_link"     "ddbj_link"
"ena_link"    "run_attribute"    "submission_accession" "sradb_updated"


5. Submission

> dbListFields(sra_con,"submission")
"submission_ID"    "submission_alias"    "submission_accession"   "submission_comment"
"files"    "broker_name"    "center_name"    "lab_name"
"submission_date"    "sra_link"    "submission_url_link"    "xref_link"
"submission_entrez_link"   "ddbj_link"    "ena_link"    "submission_attribute"
"sradb_updated"

"""


def _extract_first_field(data):
    """Extract first field from a list of fields"""
    return list(next(zip(*data)))


[docs]class SRAdb(object):
    def __init__(self, sqlite_file):
        self.sqlite_file = sqlite_file
        self.open()
        self.cursor = self.db.cursor()
        self.valid_in_acc_type = [
            'SRA', 'ERA', 'DRA', 'SRP', 'ERP', 'DRP', 'SRS', 'ERS', 'DRS',
            'SRX', 'ERX', 'DRX', 'SRR', 'ERR', 'DRR'
        ]
        self.valid_in_type = {
            'SRA': 'submission',
            'ERA': 'submission',
            'DRA': 'submission',
            'SRP': 'study',
            'ERP': 'study',
            'DRP': 'study',
            'SRS': 'sample',
            'ERS': 'sample',
            'DRS': 'sample',
            'SRX': 'experiment',
            'ERX': 'experiment',
            'DRX': 'experiment',
            'SRR': 'run',
            'ERR': 'run',
            'DRR': 'run'
        }

[docs]    def open(self):
        self.db = sqlite3.connect(self.sqlite_file)
        self.db.text_factory = str

[docs]    def close(self):
        self.db.close()

[docs]    def list_tables(self):
        """List all tables in the sqlite"""
        results = self.cursor.execute(
            'SELECT name FROM sqlite_master WHERE type="table";').fetchall()
        return _extract_first_field(results)

[docs]    def list_fields(self, table):
        "List all fields in a given table"
        results = self.cursor.execute('SELECT * FROM {}'.format(table))
        return _extract_first_field(results.description)

[docs]    def desc_table(self, table):
        results = self.cursor.execute(
            'PRAGMA table_info("{}")'.format(table)).fetchall()
        columns = ['cid', 'name', 'dtype', 'notnull', 'dflt_value', 'pk']
        data = []
        for result in results:
            data.append(list(map(lambda x: str(x), result)))
        df = pd.DataFrame(data, columns=columns)
        return df

[docs]    def get_query(self, query):
        results = self.cursor.execute(query).fetchall()
        column_names = list(map(lambda x: x[0], self.cursor.description))
        results = [dict(zip(column_names, result)) for result in results]
        return pd.DataFrame(results)

[docs]    def get_row_count(self, table):
        """Get row counts for a table"""
        return self.cursor.execute(
            'SELECT max(rowid) FROM {}'.format(table)).fetchone()[0]

[docs]    def get_table_counts(self):
        tables = self.list_tables()
        results = dict(
            [(table, self.get_row_count(table)) for table in tables])
        return pd.DataFrame.from_dict(
            results, orient='index', columns=['count'])

[docs]    def sra_convert(self,
                    acc,
                    out_type=[
                        'study_accession',
                        'experiment_accession',
                        'experiment_title',
                        'run_accession',
                        'taxon_id',
                        'library_selection',
                        'library_layout',
                        'library_strategy',
                        'library_source',
                        'library_name',
                        'bases',
                        'spots',
                        'adapter_spec',
                    ]):
        in_acc_type = re.sub('\\d+$', '', acc).upper()
        if in_acc_type not in self.valid_in_acc_type:
            raise ValueError('{} not a valid input type'.format(in_acc_type))

        in_type = self.valid_in_type[in_acc_type]
        out_type = [x for x in out_type if x != in_type]

        select_type = [in_type + '_accession'] + out_type
        select_type_sql = (',').join(select_type)
        sql = "SELECT DISTINCT " + select_type_sql + " FROM sra_ft WHERE sra_ft MATCH '" + acc + "';"
        df = self.get_query(sql)
        df['avg_read_length'] = df['bases'] / df['spots']
        df['spots'] = df['spots'].astype(int)
        df['bases'] = df['bases'].astype(int)
        df['taxon_id'] = df['taxon_id'].fillna(0).astype(int)
        df = df.sort_values(by=[
            'taxon_id', 'avg_read_length', 'run_accession',
            'experiment_accession', 'library_selection'
        ])
        df = df[out_type + ['avg_read_length']].reset_index(drop=True)
        return df

[docs]    def search_experiment(self, srx):
        """Search for a SRX/GSM id in the experiments"""
        if 'GSM' in srx:
            results = self.cursor.execute(
                'select * from EXPERIMENT where experiment_alias = "{}"'.
                format(srx)).fetchall()
        else:
            results = self.cursor.execute(
                'select * from EXPERIMENT where experiment_accession = "{}"'.
                format(srx)).fetchall()
        assert len(results) == 1, 'Got multiple hits'
        results = results[0]
        column_names = list(map(lambda x: x[0], self.cursor.description))
        results = dict(zip(column_names, results))
        return results

[docs]    def convert_gse_to_srp(self, gse):
        """Convert GSE to SRP id.
        Requires input db to be GEOmetadb.sqlite
        """
        results = self.get_query('SELECT * from gse WHERE gse = "' + gse + '"')
        if results.shape[0] == 1:
            #result = results[0].split(';')[0]
            splitted = results['supplementary_file'][0].split(';')
            if len(splitted):
                match = re.findall('SRP.*', splitted[-1])
                if len(match):
                    srp = match[0].split('/')[-1]
                    return srp
Contents

Source code for riboraptor.sradb