Source code for pysradb.geodb
"""Methods to interact with SRA"""
# This is now defunct and will be removed in a future release.
import gzip
import os
import re
import sys
from io import StringIO
import pandas as pd
from .basedb import BASEdb
from .utils import _get_url, copyfileobj, get_gzip_uncompressed_size
PY3 = True
if sys.version_info[0] < 3:
PY3 = False
GEOmetadb_URL = "http://starbuck1.s3.amazonaws.com/sradb/GEOmetadb.sqlite.gz"
[docs]
def download_geodb_file(download_dir=os.getcwd(), overwrite=True):
"""Download GEOmetadb.sqlite file.
Parameters
----------
download_dir: string
Directory to download SRAmetadb.sqlite
overwrite: bool
overwrite existing file(s).
Set to True by default.
"""
download_location = os.path.join(download_dir, "GEOmetadb.sqlite.gz")
download_location_unzip = download_location.rstrip(".gz")
if os.path.isfile(download_location) and overwrite is False:
raise RuntimeError(
"{} already exists! Set `overwrite=True` to redownload.".format(
download_location
)
)
if os.path.isfile(download_location_unzip) and overwrite is False:
raise RuntimeError(
"{} already exists! Set `overwrite=True` to redownload.".format(
download_location_unzip
)
)
try:
_get_url(GEOmetadb_URL, download_location)
except Exception as e:
raise RuntimeError(
"Could not use {}.\nException: {}.\n".format(GEOmetadb_URL, e)
)
print("Extracting {} ...".format(download_location))
filesize = get_gzip_uncompressed_size(download_location)
with gzip.open(download_location, "rb") as fh_in:
with open(download_location_unzip, "wb") as fh_out:
copyfileobj(
fh_in,
fh_out,
filesize=filesize,
desc="Extracting {}".format("GEOmetadb.sqlite.gz"),
)
print("Done!")
db = GEOdb(download_location_unzip)
metadata = db.query("SELECT * FROM metaInfo")
db.close()
print("Metadata associated with {}:".format(download_location_unzip))
print(metadata)
[docs]
class GEOdb(BASEdb):
def __init__(self, sqlite_file):
"""Initialize SRAdb.
Parameters
----------
sqlite_file: string
Path to unzipped SRAmetadb.sqlite file
"""
super(GEOdb, self).__init__(sqlite_file)
self._db_type = "GEO"
self.valid_in_type = ["GSE", "GPL", "GSM", "GDS"]
[docs]
def geo_convert(self, from_acc):
"""Convert one GEO accession to other.
Parameters
----------
from_acc: string
GPL/GSE/GSM accession ID
Returns
-------
mapping_df: DataFrame
A dataframe with relevant mappings
"""
return self.query(
"SELECT * FROM geoConvert WHERE from_acc='{}';".format(from_acc)
)
[docs]
def gse_to_gsm(self, gse):
"""Fetch GSMs for a GSE.
Parameters
----------
gse: string
GSE ID
Returns
-------
mapping_df: DataFrame
A dataframe with relevant mappings
"""
return self.query("SELECT * FROM gse_gsm WHERE gse='{}'".format(gse))
[docs]
def gsm_to_gse(self, gsm):
"""Fetch GSE for a GSM.
Parameters
----------
gsm: string
GSM ID
Returns
-------
mapping_df: DataFrame
A dataframe with relevant mappings
"""
mapping_df = self.query("SELECT * FROM gse_gsm WHERE gsm='{}'".format(gsm))
return mapping_df.loc[:, ["gsm", "gse"]]
[docs]
def guess_srp_from_gse(self, gse):
"""Convert GSE to SRP id.
Parameters
----------
gse: string
GSE ID
Returns
-------
srp: string
SRP ID
"""
results = self.query('SELECT * FROM gse WHERE gse = "' + gse + '"')
if results.shape[0] == 1:
supp_file = results["supplementary_file"][0]
if supp_file:
splitted = supp_file.split(";")
if len(splitted):
match = re.findall("SRP.*", splitted[-1])
if len(match):
srp = match[0].split("/")[-1]
return srp
return None
[docs]
def download_geo_matrix(accession, output_dir="."):
"""
Download a GEO Matrix file for a given GEO accession ID.
Args:
accession (str): GEO accession ID (e.g., 'GSE234190').
output_dir (str): Directory to save the downloaded file (default: current directory).
Returns:
str: Path to the downloaded file.
Raises:
Exception: If the download fails.
"""
# Construct the URL for the GEO Matrix file
url = f"https://ftp.ncbi.nlm.nih.gov/geo/series/{accession[:-3]}nnn/{accession}/matrix/{accession}_series_matrix.txt.gz"
# Define the output file path
output_file = os.path.join(output_dir, f"{accession}_series_matrix.txt.gz")
# Download the file using _get_url
try:
_get_url(url, output_file)
return output_file
except Exception as e:
raise Exception(
f"Failed to download GEO Matrix file for {accession}. Exception: {str(e)}"
)
[docs]
def parse_geo_matrix_to_tsv(input_file, output_file):
"""
Parse a GEO Matrix file to a TSV file, extracting the expression data.
Args:
input_file (str): Path to the input GEO Matrix file (gzipped).
output_file (str): Path to save the output TSV file.
Returns:
pandas.DataFrame: The parsed expression data.
"""
# Read the gzipped file and extract the data section
data_lines = []
data_section = False
with gzip.open(input_file, "rt", encoding="utf-8") as f:
for line in f:
line = line.strip()
if line == "!series_matrix_table_begin":
data_section = True
continue
elif line == "!series_matrix_table_end":
break
if data_section and line:
data_lines.append(line)
# Use pandas.read_csv to parse the data section
df = pd.read_csv(StringIO("\n".join(data_lines)), sep="\t", comment="!")
# Save to TSV
df.to_csv(output_file, sep="\t", index=False)
return df