Lecture 02 - GC content

Goal

We are interested in characterizing if the “GC” content in different clades of organisms is different. We will use the NCBI genome reports to visualize the GC content and genome size of different organisms.

Setup

# install packages
list.of.packages <- c("ggplot2", "scattermore", "ggpubr")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
if (length(new.packages)) install.packages(new.packages)

library(tidyverse)
library(scattermore)
library(ggpubr)
theme_set(theme_pubr())

eukaryotes <- read_tsv("https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt")
prokaryotes <- read_tsv("https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt")
viruses <- read_tsv("https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/viruses.txt")


FilterData <- function(df) {
  df <- df %>% filter(Status %in% c("Chromosome", "Complete Genome", "Scaffold"))
  return(df)
}


prokaryotes$taxon <- "Prokaryote"
viruses$taxon <- "Virus"
viruses$`Size (Mb)` <- viruses$`Size (Kb)`/1000
eukaryotes$taxon <- "Eukaryote"

all.data <- bind_rows(list(prokaryotes, viruses, eukaryotes))

data.filtered <- FilterData(all.data)
data.filtered$`GC%` <- as.numeric(data.filtered$`GC%`)

data.filtered <- data.filtered %>%
  filter(!is.na(`GC%`)) %>%
  filter(!is.na(`Size (Mb)`))


organisms_of_interest <- c(
  "Emiliania huxleyi CCMP1516",
  "Felis catus",
  "Escherichia coli",
  "Homo sapiens",
  "Saccharomyces cerevisiae",
  "SARS coronavirus "
)
ggplot(data.filtered, aes(`Size (Mb)`, `GC%`, colour = taxon)) +

  # geom_scattermore(size = 2) +
  geom_point(size = 0.1, alpha = 0.5) +
  facet_wrap(~taxon, scales = "free_x") +
  theme(legend.position = "none") +
  labs(
    x = "Genome Size (Mb)",
    y = "GC%"
  ) +
  scale_color_brewer(palette = "Set1") +
  scale_x_log10(
    breaks = c(0.1, 1, 10, 100, 1000, 10000),
    labels = as.character(c(0.1, 1, 10, 100, 1000, 10000))
  ) +
  guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))

Exercise - What explains the blobs in the Eukaryotes?

ggplot(
  data.filtered %>% filter(taxon == "Eukaryote"),
  aes(`Size (Mb)`, `GC%`)
) +
  geom_point(size = 1, alpha = 0.5) +
  facet_wrap(~taxon, scales = "free_x") +
  scale_x_log10() +
  scale_x_log10(
    breaks = c(0.1, 1, 10, 100, 1000, 10000),
    labels = as.character(c(0.1, 1, 10, 100, 1000, 10000))
  ) +
  guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))

Exercise

Take some representative species and try to visualize thei

data.filtered.ecoli <- data.filtered %>%
  filter(grepl(`#Organism/Name`, pattern = "Escherichia coli")) %>%
  filter(taxon == "Prokaryote")
data.filtered.ecoli$organism <- "Ecoli"


data.filtered.yeast <- data.filtered %>%
  filter(grepl(`#Organism/Name`, pattern = "Saccharomyces cerevisiae")) %>%
  filter(Group == "Fungi")
data.filtered.yeast$organism <- "Yeast"


data.filtered.sarscov2 <- data.filtered %>%
  filter(grepl(`#Organism/Name`, pattern = "SARS coronavirus")) %>%
  filter(taxon == "Virus")
data.filtered.sarscov2$organism <- "SARS-CoV-2"

data.filtered.huxleyi <- data.filtered %>%
  filter(grepl(`#Organism/Name`, pattern = "Emiliania huxleyi CCMP1516"))
data.filtered.huxleyi$organism <- "E. huxleyi"


data.filtered.human <- data.filtered %>%
  filter(grepl(`#Organism/Name`, pattern = "Homo sapiens")) %>%
  filter(taxon == "Eukaryote") %>%
  filter(`Size (Mb)` > 500) %>%
  filter(`Size (Mb)` < 5000)

data.filtered.human$organism <- "Human"


data.filtered.cat <- data.filtered %>%
  filter(grepl(`#Organism/Name`, pattern = "Felis catus")) %>%
  filter(taxon == "Eukaryote")
data.filtered.cat$organism <- "Cat"

data.filtered.rabbit <- data.filtered %>%
  filter(grepl(`#Organism/Name`, pattern = "Oryctolagus cuniculus")) %>%
  filter(taxon == "Eukaryote")
data.filtered.rabbit$organism <- "Rabbit"


data.filtered.plasmodium <- data.filtered %>%
  filter(grepl(`#Organism/Name`, pattern = "Plasmodium falciparum")) %>%
  filter(taxon == "Eukaryote")
data.filtered.plasmodium$organism <- "Plasmodium"

data.merged <- bind_rows(list(
  data.filtered.ecoli, data.filtered.yeast,
  data.filtered.sarscov2, data.filtered.huxleyi,
  data.filtered.cat,
  data.filtered.human, data.filtered.rabbit,
  data.filtered.plasmodium
))
ggplot(data.merged, aes(`Size (Mb)`, `GC%`, colour = organism)) +

  geom_jitter(size = 2, alpha = 0.5) +
  scale_x_log10(
    breaks = c(0.1, 1, 10, 100, 1000, 5000),
    labels = as.character(c(0.1, 1, 10, 100, 1000, 5000))
  ) +
  labs(
    x = "Genome Size (Mb)",
    y = "GC%"
  ) +
  guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))