DH607 - Introduction to computational multi-omics
  • Home
  • Syllabus
  • FAQs
  • Programming resources
  • Code notebooks
    • Lecture 02 - Relation of G/C content with genome sizes

Lecture 02 - GC content

# install packages
list.of.packages <- c("ggplot2", "scattermore", "ggpubr")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
if (length(new.packages)) install.packages(new.packages)

library(tidyverse)
library(scattermore)
library(ggpubr)
theme_set(theme_pubr())

eukaryotes <- read_tsv("https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt")
prokaryotes <- read_tsv("https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt")
viruses <- read_tsv("https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/viruses.txt")


FilterData <- function(df) {
  df <- df %>% filter(Status %in% c("Chromosome", "Complete Genome", "Scaffold"))
  return(df)
}


prokaryotes$taxon <- "Prokaryote"
viruses$taxon <- "Virus"
viruses$`Size (Mb)` <- viruses$`Size (Kb)`/1000
eukaryotes$taxon <- "Eukaryote"

all.data <- bind_rows(list(prokaryotes, viruses, eukaryotes))

data.filtered <- FilterData(all.data)
data.filtered$`GC%` <- as.numeric(data.filtered$`GC%`)

data.filtered <- data.filtered %>%
  filter(!is.na(`GC%`)) %>%
  filter(!is.na(`Size (Mb)`))


organisms_of_interest <- c(
  "Emiliania huxleyi CCMP1516",
  "Felis catus",
  "Escherichia coli",
  "Homo sapiens",
  "Saccharomyces cerevisiae",
  "SARS coronavirus "
)
ggplot(data.filtered, aes(`Size (Mb)`, `GC%`, colour = taxon)) +

  # geom_scattermore(size = 2) +
  geom_point(size = 0.1, alpha = 0.5) +
  facet_wrap(~taxon, scales = "free_x") +
  theme(legend.position = "none") +
  labs(
    x = "Genome Size (Mb)",
    y = "GC%"
  ) +
  scale_color_brewer(palette = "Set1") +
  scale_x_log10(
    breaks = c(0.1, 1, 10, 100, 1000, 10000),
    labels = as.character(c(0.1, 1, 10, 100, 1000, 10000))
  ) +
  guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))

ggplot(
  data.filtered %>% filter(taxon == "Eukaryote"),
  aes(`Size (Mb)`, `GC%`, colour = Group)
) +
  geom_point(size = 1, alpha = 0.5) +
  facet_wrap(~taxon, scales = "free_x") +
  scale_x_log10() +
  scale_color_brewer(palette = "Set2") +
  scale_x_log10(
    breaks = c(0.1, 1, 10, 100, 1000, 10000),
    labels = as.character(c(0.1, 1, 10, 100, 1000, 10000))
  ) +
  guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))

data.filtered.ecoli <- data.filtered %>%
  filter(grepl(`#Organism/Name`, pattern = "Escherichia coli")) %>%
  filter(taxon == "Prokaryote")
data.filtered.ecoli$organism <- "Ecoli"


data.filtered.yeast <- data.filtered %>%
  filter(grepl(`#Organism/Name`, pattern = "Saccharomyces cerevisiae")) %>%
  filter(Group == "Fungi")
data.filtered.yeast$organism <- "Yeast"


data.filtered.sarscov2 <- data.filtered %>%
  filter(grepl(`#Organism/Name`, pattern = "SARS coronavirus")) %>%
  filter(taxon == "Virus")
data.filtered.sarscov2$organism <- "SARS-CoV-2"

data.filtered.huxleyi <- data.filtered %>%
  filter(grepl(`#Organism/Name`, pattern = "Emiliania huxleyi CCMP1516"))
data.filtered.huxleyi$organism <- "E. huxleyi"


data.filtered.human <- data.filtered %>%
  filter(grepl(`#Organism/Name`, pattern = "Homo sapiens")) %>%
  filter(taxon == "Eukaryote") %>%
  filter(`Size (Mb)` > 500) %>%
  filter(`Size (Mb)` < 5000)

data.filtered.human$organism <- "Human"


data.filtered.cat <- data.filtered %>%
  filter(grepl(`#Organism/Name`, pattern = "Felis catus")) %>%
  filter(taxon == "Eukaryote")
data.filtered.cat$organism <- "Cat"

data.filtered.rabbit <- data.filtered %>%
  filter(grepl(`#Organism/Name`, pattern = "Oryctolagus cuniculus")) %>%
  filter(taxon == "Eukaryote")
data.filtered.rabbit$organism <- "Rabbit"


data.filtered.plasmodium <- data.filtered %>%
  filter(grepl(`#Organism/Name`, pattern = "Plasmodium falciparum")) %>%
  filter(taxon == "Eukaryote")
data.filtered.plasmodium$organism <- "Plasmodium"

data.merged <- bind_rows(list(
  data.filtered.ecoli, data.filtered.yeast,
  data.filtered.sarscov2, data.filtered.huxleyi,
  data.filtered.cat,
  data.filtered.human, data.filtered.rabbit,
  data.filtered.plasmodium
))
ggplot(data.merged, aes(`Size (Mb)`, `GC%`, colour = organism)) +

  # geom_scattermore(size = 2) +
  geom_jitter(size = 2, alpha = 0.5) +
  # facet_wrap(~taxon, scales = "free_x")+
  scale_x_log10(
    breaks = c(0.1, 1, 10, 100, 1000, 5000),
    labels = as.character(c(0.1, 1, 10, 100, 1000, 5000))
  ) +
  labs(
    x = "Genome Size (Mb)",
    y = "GC%"
  ) +
  scale_color_brewer(palette = "Dark2", name = "Species") +
  guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))