# Install any packages this script needs that are not installed yet.
# "tidyverse" is listed because the script attaches it with library(tidyverse).
list.of.packages <- c("tidyverse", "ggplot2", "scattermore", "ggpubr")
new.packages <- list.of.packages[
  !(list.of.packages %in% installed.packages()[, "Package"])
]
if (length(new.packages) > 0) {
  install.packages(new.packages)
}
library(tidyverse)
library(scattermore)
library(ggpubr)
theme_set(theme_pubr())
# Download the NCBI genome report tables; read_tsv() reads each tab-separated
# file directly from its URL. Each object is named after the file it holds.
eukaryotes <- read_tsv("https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt")
prokaryotes <- read_tsv("https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt")
viruses <- read_tsv("https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/viruses.txt")
# Keep only the rows of `df` whose `Status` is an accepted assembly level.
#
# df: a data frame (or tibble) that has a `Status` column.
# Returns `df` restricted to rows whose Status is "Chromosome",
# "Complete Genome" or "Scaffold"; rows with a missing Status are dropped.
# Stops with an error if `df` has no `Status` column.
FilterData <- function(df) {
  stopifnot("Status" %in% names(df))
  accepted <- c("Chromosome", "Complete Genome", "Scaffold")
  # %in% never returns NA, so the logical index can be used directly.
  df[df$Status %in% accepted, , drop = FALSE]
}
# Tag each table with the group it came from, so rows remain identifiable once
# the three tables are combined.
prokaryotes$taxon <- "Prokaryote"
viruses$taxon <- "Virus"
# The virus table carries its size in `Size (Kb)`; derive `Size (Mb)` from it,
# since the combined data is filtered and plotted by `Size (Mb)`.
viruses$`Size (Mb)` <- viruses$`Size (Kb)` / 1000
eukaryotes$taxon <- "Eukaryote"
# Stack the three tables and keep only the assembly levels FilterData() accepts.
all.data <- bind_rows(list(prokaryotes, viruses, eukaryotes))
data.filtered <- FilterData(all.data)

# Coerce GC% to numeric (entries that cannot be parsed become NA), then drop
# rows that lack either of the two plotted quantities.
data.filtered$`GC%` <- as.numeric(data.filtered$`GC%`)
data.filtered <- data.filtered %>%
  filter(!is.na(`GC%`), !is.na(`Size (Mb)`))
# Organism names singled out for closer inspection.
# NOTE(review): "SARS coronavirus " ends with a space, while the subset built
# later in this script matches "SARS coronavirus" without one - confirm which
# form is intended before using this vector for matching.
organisms_of_interest <- c(
  "Emiliania huxleyi CCMP1516",
  "Felis catus",
  "Escherichia coli",
  "Homo sapiens",
  "Saccharomyces cerevisiae",
  "SARS coronavirus "
)
# Lecture 02 - GC content
## Goal
# We are interested in characterizing whether the GC content differs between clades of organisms. We will use the NCBI genome reports to visualize the GC content and genome size of different organisms.
## Setup
# GC% against genome size on a log10 x axis, one facet per taxon. The x axis
# is free per facet so each taxon's own size range fills its panel.
ggplot(data.filtered, aes(`Size (Mb)`, `GC%`, colour = taxon)) +
  # geom_scattermore(size = 2) +
  geom_point(size = 0.1, alpha = 0.5) +
  facet_wrap(~taxon, scales = "free_x") +
  theme(legend.position = "none") +
  labs(
    x = "Genome Size (Mb)",
    y = "GC%"
  ) +
  scale_color_brewer(palette = "Set1") +
  scale_x_log10(
    breaks = c(0.1, 1, 10, 100, 1000, 10000),
    labels = as.character(c(0.1, 1, 10, 100, 1000, 10000))
  ) +
  guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))
## Exercise - What explains the blobs in the Eukaryotes?
# GC% against genome size for eukaryotes only, on a log10 x axis.
# A single x scale is declared: adding scale_x_log10() twice makes ggplot2
# replace the first scale and print a message about it.
ggplot(
  data.filtered %>% filter(taxon == "Eukaryote"),
  aes(`Size (Mb)`, `GC%`)
) +
  geom_point(size = 1, alpha = 0.5) +
  facet_wrap(~taxon, scales = "free_x") +
  scale_x_log10(
    breaks = c(0.1, 1, 10, 100, 1000, 10000),
    labels = as.character(c(0.1, 1, 10, 100, 1000, 10000))
  )
## Exercise
# Take some representative species and try to visualize their GC content and genome size.
# Rows of `data` whose `#Organism/Name` matches the regular expression
# `pattern`, with an `organism` column set to `label`.
# mutate() is used for the label so the result is well-formed even when no
# row matches.
SelectOrganism <- function(data, pattern, label) {
  data %>%
    filter(grepl(pattern, `#Organism/Name`)) %>%
    mutate(organism = label)
}

# One labelled subset per representative organism. The extra taxon / group /
# size filters narrow each name match further.
data.filtered.ecoli <- data.filtered %>%
  SelectOrganism("Escherichia coli", "Ecoli") %>%
  filter(taxon == "Prokaryote")

data.filtered.yeast <- data.filtered %>%
  SelectOrganism("Saccharomyces cerevisiae", "Yeast") %>%
  filter(Group == "Fungi")

# NOTE(review): the pattern is "SARS coronavirus" but the label says
# "SARS-CoV-2" - confirm the matched records are the intended virus.
data.filtered.sarscov2 <- data.filtered %>%
  SelectOrganism("SARS coronavirus", "SARS-CoV-2") %>%
  filter(taxon == "Virus")

data.filtered.huxleyi <- data.filtered %>%
  SelectOrganism("Emiliania huxleyi CCMP1516", "E. huxleyi")

# Only human eukaryote entries strictly between 500 and 5000 Mb are kept.
data.filtered.human <- data.filtered %>%
  SelectOrganism("Homo sapiens", "Human") %>%
  filter(taxon == "Eukaryote") %>%
  filter(`Size (Mb)` > 500) %>%
  filter(`Size (Mb)` < 5000)

data.filtered.cat <- data.filtered %>%
  SelectOrganism("Felis catus", "Cat") %>%
  filter(taxon == "Eukaryote")

data.filtered.rabbit <- data.filtered %>%
  SelectOrganism("Oryctolagus cuniculus", "Rabbit") %>%
  filter(taxon == "Eukaryote")

data.filtered.plasmodium <- data.filtered %>%
  SelectOrganism("Plasmodium falciparum", "Plasmodium") %>%
  filter(taxon == "Eukaryote")
# Combine the per-organism subsets into one table for plotting.
data.merged <- bind_rows(list(
  data.filtered.ecoli, data.filtered.yeast,
  data.filtered.sarscov2, data.filtered.huxleyi,
  data.filtered.cat,
  data.filtered.human, data.filtered.rabbit,
  data.filtered.plasmodium
))

# GC% against genome size (log10 x axis), coloured by organism. Points are
# jittered; the legend keys are drawn opaque regardless of the point alpha.
ggplot(data.merged, aes(`Size (Mb)`, `GC%`, colour = organism)) +
  geom_jitter(size = 2, alpha = 0.5) +
  scale_x_log10(
    breaks = c(0.1, 1, 10, 100, 1000, 5000),
    labels = as.character(c(0.1, 1, 10, 100, 1000, 5000))
  ) +
  labs(
    x = "Genome Size (Mb)",
    y = "GC%"
  ) +
  guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))