"""Helper functions for parsing SRAmetadb.sqlite file"""
import re
import sqlite3
import pandas as pd
"""
Tables
1. Study
> dbListFields(sra_con,"study")
"study_ID" "study_alias" "study_accession" "study_title" "study_type"
"study_abstract" "broker_name" "center_name" "center_project_name"
"study_description" "related_studies" "primary_study" "sra_link"
"study_url_link" "xref_link" "study_entrez_link" "ddbj_link" "ena_link"
2. Sample
> dbListFields(sra_con,"sample")
"sample_ID" "sample_alias" "sample_accession" "broker_name"
"center_name" "taxon_id" "scientific_name" "common_name"
"anonymized_name" "individual_name" "description" "sra_link"
"sample_url_link" "xref_link" "sample_entrez_link" "ddbj_link"
"ena_link" "sample_attribute" "submission_accession" "sradb_updated"
3. Experiment
> dbListFields(sra_con,"experiment")
"experiment_ID" "bamFile" "fastqFTP" "experiment_alias"
"experiment_accession" "broker_name" "center_name" "title"
"study_name" "study_accession" "design_description" "sample_name"
"sample_accession" "sample_member" "library_name" "library_strategy"
"library_source" "library_selection" "library_layout" "targeted_loci"
"library_construction_protocol" "spot_length" "adapter_spec" "read_spec"
"platform" "instrument_model" "platform_parameters" "sequence_space"
"base_caller" "quality_scorer" "number_of_levels" "multiplier"
"qtype" "sra_link" "experiment_url_link" "xref_link"
"experiment_entrez_link" "ddbj_link" "ena_link" "experiment_attribute"
"submission_accession" "sradb_updated"
4. Run
> dbListFields(sra_con,"run")
"run_ID" "bamFile" "run_alias" "run_accession"
"broker_name" "instrument_name" "run_date" "run_file"
"run_center" "total_data_blocks" "experiment_accession" "experiment_name"
"sra_link" "run_url_link" "xref_link" "run_entrez_link" "ddbj_link"
"ena_link" "run_attribute" "submission_accession" "sradb_updated"
5. Submission
> dbListFields(sra_con,"submission")
"submission_ID" "submission_alias" "submission_accession" "submission_comment"
"files" "broker_name" "center_name" "lab_name"
"submission_date" "sra_link" "submission_url_link" "xref_link"
"submission_entrez_link" "ddbj_link" "ena_link" "submission_attribute"
"sradb_updated"
"""
def _extract_first_field(data):
"""Extract first field from a list of fields"""
return list(next(zip(*data)))
[docs]class SRAdb(object):
def __init__(self, sqlite_file):
self.sqlite_file = sqlite_file
self.open()
self.cursor = self.db.cursor()
self.valid_in_acc_type = [
'SRA', 'ERA', 'DRA', 'SRP', 'ERP', 'DRP', 'SRS', 'ERS', 'DRS',
'SRX', 'ERX', 'DRX', 'SRR', 'ERR', 'DRR'
]
self.valid_in_type = {
'SRA': 'submission',
'ERA': 'submission',
'DRA': 'submission',
'SRP': 'study',
'ERP': 'study',
'DRP': 'study',
'SRS': 'sample',
'ERS': 'sample',
'DRS': 'sample',
'SRX': 'experiment',
'ERX': 'experiment',
'DRX': 'experiment',
'SRR': 'run',
'ERR': 'run',
'DRR': 'run'
}
[docs] def open(self):
self.db = sqlite3.connect(self.sqlite_file)
self.db.text_factory = str
[docs] def close(self):
self.db.close()
[docs] def list_tables(self):
"""List all tables in the sqlite"""
results = self.cursor.execute(
'SELECT name FROM sqlite_master WHERE type="table";').fetchall()
return _extract_first_field(results)
[docs] def list_fields(self, table):
"List all fields in a given table"
results = self.cursor.execute('SELECT * FROM {}'.format(table))
return _extract_first_field(results.description)
[docs] def desc_table(self, table):
results = self.cursor.execute(
'PRAGMA table_info("{}")'.format(table)).fetchall()
columns = ['cid', 'name', 'dtype', 'notnull', 'dflt_value', 'pk']
data = []
for result in results:
data.append(list(map(lambda x: str(x), result)))
df = pd.DataFrame(data, columns=columns)
return df
[docs] def get_query(self, query):
results = self.cursor.execute(query).fetchall()
column_names = list(map(lambda x: x[0], self.cursor.description))
results = [dict(zip(column_names, result)) for result in results]
return pd.DataFrame(results)
[docs] def get_row_count(self, table):
"""Get row counts for a table"""
return self.cursor.execute(
'SELECT max(rowid) FROM {}'.format(table)).fetchone()[0]
[docs] def get_table_counts(self):
tables = self.list_tables()
results = dict(
[(table, self.get_row_count(table)) for table in tables])
return pd.DataFrame.from_dict(
results, orient='index', columns=['count'])
[docs] def sra_convert(self,
acc,
out_type=[
'study_accession',
'experiment_accession',
'experiment_title',
'run_accession',
'taxon_id',
'library_selection',
'library_layout',
'library_strategy',
'library_source',
'library_name',
'bases',
'spots',
'adapter_spec',
]):
in_acc_type = re.sub('\\d+$', '', acc).upper()
if in_acc_type not in self.valid_in_acc_type:
raise ValueError('{} not a valid input type'.format(in_acc_type))
in_type = self.valid_in_type[in_acc_type]
out_type = [x for x in out_type if x != in_type]
select_type = [in_type + '_accession'] + out_type
select_type_sql = (',').join(select_type)
sql = "SELECT DISTINCT " + select_type_sql + " FROM sra_ft WHERE sra_ft MATCH '" + acc + "';"
df = self.get_query(sql)
df['avg_read_length'] = df['bases'] / df['spots']
df['spots'] = df['spots'].astype(int)
df['bases'] = df['bases'].astype(int)
df['taxon_id'] = df['taxon_id'].fillna(0).astype(int)
df = df.sort_values(by=[
'taxon_id', 'avg_read_length', 'run_accession',
'experiment_accession', 'library_selection'
])
df = df[out_type + ['avg_read_length']].reset_index(drop=True)
return df
[docs] def search_experiment(self, srx):
"""Search for a SRX/GSM id in the experiments"""
if 'GSM' in srx:
results = self.cursor.execute(
'select * from EXPERIMENT where experiment_alias = "{}"'.
format(srx)).fetchall()
else:
results = self.cursor.execute(
'select * from EXPERIMENT where experiment_accession = "{}"'.
format(srx)).fetchall()
assert len(results) == 1, 'Got multiple hits'
results = results[0]
column_names = list(map(lambda x: x[0], self.cursor.description))
results = dict(zip(column_names, results))
return results
[docs] def convert_gse_to_srp(self, gse):
"""Convert GSE to SRP id.
Requires input db to be GEOmetadb.sqlite
"""
results = self.get_query('SELECT * from gse WHERE gse = "' + gse + '"')
if results.shape[0] == 1:
#result = results[0].split(';')[0]
splitted = results['supplementary_file'][0].split(';')
if len(splitted):
match = re.findall('SRP.*', splitted[-1])
if len(match):
srp = match[0].split('/')[-1]
return srp