# install packages
list.of.packages <- c("ggplot2", "scattermore", "ggpubr")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
if (length(new.packages)) install.packages(new.packages)
library(tidyverse)
library(scattermore)
library(ggpubr)
theme_set(theme_pubr())
eukaryotes <- read_tsv("https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt")
prokaryotes <- read_tsv("https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt")
viruses <- read_tsv("https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/viruses.txt")
FilterData <- function(df) {
df <- df %>% filter(Status %in% c("Chromosome", "Complete Genome", "Scaffold"))
return(df)
}
prokaryotes$taxon <- "Prokaryote"
viruses$taxon <- "Virus"
viruses$`Size (Mb)` <- viruses$`Size (Kb)`/1000
eukaryotes$taxon <- "Eukaryote"
all.data <- bind_rows(list(prokaryotes, viruses, eukaryotes))
data.filtered <- FilterData(all.data)
data.filtered$`GC%` <- as.numeric(data.filtered$`GC%`)
data.filtered <- data.filtered %>%
filter(!is.na(`GC%`)) %>%
filter(!is.na(`Size (Mb)`))
organisms_of_interest <- c(
"Emiliania huxleyi CCMP1516",
"Felis catus",
"Escherichia coli",
"Homo sapiens",
"Saccharomyces cerevisiae",
"SARS coronavirus "
)Lecture 02 - GC content
Goal
We are interested in characterizing if the “GC” content in different clades of organisms is different. We will use the NCBI genome reports to visualize the GC content and genome size of different organisms.
Setup
ggplot(data.filtered, aes(`Size (Mb)`, `GC%`, colour = taxon)) +
# geom_scattermore(size = 2) +
geom_point(size = 0.1, alpha = 0.5) +
facet_wrap(~taxon, scales = "free_x") +
theme(legend.position = "none") +
labs(
x = "Genome Size (Mb)",
y = "GC%"
) +
scale_color_brewer(palette = "Set1") +
scale_x_log10(
breaks = c(0.1, 1, 10, 100, 1000, 10000),
labels = as.character(c(0.1, 1, 10, 100, 1000, 10000))
) +
guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))Exercise - What explains the blobs in the Eukaryotes?
ggplot(
data.filtered %>% filter(taxon == "Eukaryote"),
aes(`Size (Mb)`, `GC%`)
) +
geom_point(size = 1, alpha = 0.5) +
facet_wrap(~taxon, scales = "free_x") +
scale_x_log10() +
scale_x_log10(
breaks = c(0.1, 1, 10, 100, 1000, 10000),
labels = as.character(c(0.1, 1, 10, 100, 1000, 10000))
) +
guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))Exercise
Take some representative species and try to visualize thei
data.filtered.ecoli <- data.filtered %>%
filter(grepl(`#Organism/Name`, pattern = "Escherichia coli")) %>%
filter(taxon == "Prokaryote")
data.filtered.ecoli$organism <- "Ecoli"
data.filtered.yeast <- data.filtered %>%
filter(grepl(`#Organism/Name`, pattern = "Saccharomyces cerevisiae")) %>%
filter(Group == "Fungi")
data.filtered.yeast$organism <- "Yeast"
data.filtered.sarscov2 <- data.filtered %>%
filter(grepl(`#Organism/Name`, pattern = "SARS coronavirus")) %>%
filter(taxon == "Virus")
data.filtered.sarscov2$organism <- "SARS-CoV-2"
data.filtered.huxleyi <- data.filtered %>%
filter(grepl(`#Organism/Name`, pattern = "Emiliania huxleyi CCMP1516"))
data.filtered.huxleyi$organism <- "E. huxleyi"
data.filtered.human <- data.filtered %>%
filter(grepl(`#Organism/Name`, pattern = "Homo sapiens")) %>%
filter(taxon == "Eukaryote") %>%
filter(`Size (Mb)` > 500) %>%
filter(`Size (Mb)` < 5000)
data.filtered.human$organism <- "Human"
data.filtered.cat <- data.filtered %>%
filter(grepl(`#Organism/Name`, pattern = "Felis catus")) %>%
filter(taxon == "Eukaryote")
data.filtered.cat$organism <- "Cat"
data.filtered.rabbit <- data.filtered %>%
filter(grepl(`#Organism/Name`, pattern = "Oryctolagus cuniculus")) %>%
filter(taxon == "Eukaryote")
data.filtered.rabbit$organism <- "Rabbit"
data.filtered.plasmodium <- data.filtered %>%
filter(grepl(`#Organism/Name`, pattern = "Plasmodium falciparum")) %>%
filter(taxon == "Eukaryote")
data.filtered.plasmodium$organism <- "Plasmodium"
data.merged <- bind_rows(list(
data.filtered.ecoli, data.filtered.yeast,
data.filtered.sarscov2, data.filtered.huxleyi,
data.filtered.cat,
data.filtered.human, data.filtered.rabbit,
data.filtered.plasmodium
))
ggplot(data.merged, aes(`Size (Mb)`, `GC%`, colour = organism)) +
geom_jitter(size = 2, alpha = 0.5) +
scale_x_log10(
breaks = c(0.1, 1, 10, 100, 1000, 5000),
labels = as.character(c(0.1, 1, 10, 100, 1000, 5000))
) +
labs(
x = "Genome Size (Mb)",
y = "GC%"
) +
guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))