# install packages
list.of.packages <- c("ggplot2", "scattermore", "ggpubr")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
if (length(new.packages)) install.packages(new.packages)
library(tidyverse)
library(scattermore)
library(ggpubr)
theme_set(theme_pubr())
eukaryotes <- read_tsv("https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt")
prokaryotes <- read_tsv("https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt")
viruses <- read_tsv("https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/viruses.txt")
FilterData <- function(df) {
df <- df %>% filter(Status %in% c("Chromosome", "Complete Genome", "Scaffold"))
return(df)
}
prokaryotes$taxon <- "Prokaryote"
viruses$taxon <- "Virus"
viruses$`Size (Mb)` <- viruses$`Size (Kb)`/1000
eukaryotes$taxon <- "Eukaryote"
all.data <- bind_rows(list(prokaryotes, viruses, eukaryotes))
data.filtered <- FilterData(all.data)
data.filtered$`GC%` <- as.numeric(data.filtered$`GC%`)
data.filtered <- data.filtered %>%
filter(!is.na(`GC%`)) %>%
filter(!is.na(`Size (Mb)`))
organisms_of_interest <- c(
"Emiliania huxleyi CCMP1516",
"Felis catus",
"Escherichia coli",
"Homo sapiens",
"Saccharomyces cerevisiae",
"SARS coronavirus "
)Lecture 02 - GC content
ggplot(data.filtered, aes(`Size (Mb)`, `GC%`, colour = taxon)) +
# geom_scattermore(size = 2) +
geom_point(size = 0.1, alpha = 0.5) +
facet_wrap(~taxon, scales = "free_x") +
theme(legend.position = "none") +
labs(
x = "Genome Size (Mb)",
y = "GC%"
) +
scale_color_brewer(palette = "Set1") +
scale_x_log10(
breaks = c(0.1, 1, 10, 100, 1000, 10000),
labels = as.character(c(0.1, 1, 10, 100, 1000, 10000))
) +
guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))ggplot(
data.filtered %>% filter(taxon == "Eukaryote"),
aes(`Size (Mb)`, `GC%`, colour = Group)
) +
geom_point(size = 1, alpha = 0.5) +
facet_wrap(~taxon, scales = "free_x") +
scale_x_log10() +
scale_color_brewer(palette = "Set2") +
scale_x_log10(
breaks = c(0.1, 1, 10, 100, 1000, 10000),
labels = as.character(c(0.1, 1, 10, 100, 1000, 10000))
) +
guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))data.filtered.ecoli <- data.filtered %>%
filter(grepl(`#Organism/Name`, pattern = "Escherichia coli")) %>%
filter(taxon == "Prokaryote")
data.filtered.ecoli$organism <- "Ecoli"
data.filtered.yeast <- data.filtered %>%
filter(grepl(`#Organism/Name`, pattern = "Saccharomyces cerevisiae")) %>%
filter(Group == "Fungi")
data.filtered.yeast$organism <- "Yeast"
data.filtered.sarscov2 <- data.filtered %>%
filter(grepl(`#Organism/Name`, pattern = "SARS coronavirus")) %>%
filter(taxon == "Virus")
data.filtered.sarscov2$organism <- "SARS-CoV-2"
data.filtered.huxleyi <- data.filtered %>%
filter(grepl(`#Organism/Name`, pattern = "Emiliania huxleyi CCMP1516"))
data.filtered.huxleyi$organism <- "E. huxleyi"
data.filtered.human <- data.filtered %>%
filter(grepl(`#Organism/Name`, pattern = "Homo sapiens")) %>%
filter(taxon == "Eukaryote") %>%
filter(`Size (Mb)` > 500) %>%
filter(`Size (Mb)` < 5000)
data.filtered.human$organism <- "Human"
data.filtered.cat <- data.filtered %>%
filter(grepl(`#Organism/Name`, pattern = "Felis catus")) %>%
filter(taxon == "Eukaryote")
data.filtered.cat$organism <- "Cat"
data.filtered.rabbit <- data.filtered %>%
filter(grepl(`#Organism/Name`, pattern = "Oryctolagus cuniculus")) %>%
filter(taxon == "Eukaryote")
data.filtered.rabbit$organism <- "Rabbit"
data.filtered.plasmodium <- data.filtered %>%
filter(grepl(`#Organism/Name`, pattern = "Plasmodium falciparum")) %>%
filter(taxon == "Eukaryote")
data.filtered.plasmodium$organism <- "Plasmodium"
data.merged <- bind_rows(list(
data.filtered.ecoli, data.filtered.yeast,
data.filtered.sarscov2, data.filtered.huxleyi,
data.filtered.cat,
data.filtered.human, data.filtered.rabbit,
data.filtered.plasmodium
))
ggplot(data.merged, aes(`Size (Mb)`, `GC%`, colour = organism)) +
# geom_scattermore(size = 2) +
geom_jitter(size = 2, alpha = 0.5) +
# facet_wrap(~taxon, scales = "free_x")+
scale_x_log10(
breaks = c(0.1, 1, 10, 100, 1000, 5000),
labels = as.character(c(0.1, 1, 10, 100, 1000, 5000))
) +
labs(
x = "Genome Size (Mb)",
y = "GC%"
) +
scale_color_brewer(palette = "Dark2", name = "Species") +
guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))