# Install any required packages that are not yet available locally.
# "tidyverse" is listed because the script attaches it with library(tidyverse)
# directly below; without it a fresh machine would fail on that call.
list.of.packages <- c("tidyverse", "ggplot2", "scattermore", "ggpubr")
new.packages <- list.of.packages[
  !(list.of.packages %in% installed.packages()[, "Package"])
]
if (length(new.packages) > 0) {
  install.packages(new.packages)
}
library(tidyverse)
library(scattermore)
library(ggpubr)
# Make the ggpubr theme the default for every plot created afterwards.
theme_set(theme_pubr())

# Download the NCBI genome report tables (tab-separated text), one table each
# for eukaryotes, prokaryotes and viruses.
eukaryotes <- read_tsv(
  "https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt"
)
prokaryotes <- read_tsv(
  "https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt"
)
viruses <- read_tsv(
  "https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/viruses.txt"
)
#' Keep only rows whose `Status` is one of the accepted values
#'
#' @param df A data frame (or tibble) that contains a `Status` column.
#' @return `df` restricted to the rows where `Status` is "Chromosome",
#'   "Complete Genome" or "Scaffold"; all columns are kept.
FilterData <- function(df) {
  df %>%
    filter(Status %in% c("Chromosome", "Complete Genome", "Scaffold"))
}
# Tag every table with its taxon so rows stay identifiable after merging.
prokaryotes$taxon <- "Prokaryote"
viruses$taxon <- "Virus"
# The virus table is converted from `Size (Kb)` so that it also carries the
# `Size (Mb)` column that the rest of the script works with.
viruses$`Size (Mb)` <- viruses$`Size (Kb)` / 1000
eukaryotes$taxon <- "Eukaryote"

# Stack the three tables into one; bind_rows() matches columns by name.
all.data <- bind_rows(list(prokaryotes, viruses, eukaryotes))
# Keep only the records whose Status passes FilterData().
data.filtered <- FilterData(all.data)

# Coerce GC% to numeric; entries that cannot be parsed as numbers become NA.
data.filtered$`GC%` <- as.numeric(data.filtered$`GC%`)

# Drop records that lack either of the two plotted variables.
data.filtered <- data.filtered %>%
  filter(!is.na(`GC%`), !is.na(`Size (Mb)`))
# Names of the organisms of interest.
# NOTE(review): "SARS coronavirus " ends with a trailing space, and this
# vector is not referenced in the visible part of the script - confirm that
# both are intended before relying on it for exact matching.
organisms_of_interest <- c(
  "Emiliania huxleyi CCMP1516",
  "Felis catus",
  "Escherichia coli",
  "Homo sapiens",
  "Saccharomyces cerevisiae",
  "SARS coronavirus "
)
# Lecture 02 - GC content
# GC% against genome size for all filtered records, one panel per taxon.
# The x axis is log10-scaled with fixed breaks; each panel gets its own x
# range (scales = "free_x"). The legend is hidden because the panels already
# name the taxon.
ggplot(data.filtered, aes(`Size (Mb)`, `GC%`, colour = taxon)) +
  # geom_scattermore(size = 2) +
  geom_point(size = 0.1, alpha = 0.5) +
  facet_wrap(~taxon, scales = "free_x") +
  theme(legend.position = "none") +
  labs(
    x = "Genome Size (Mb)",
    y = "GC%"
  ) +
  scale_color_brewer(palette = "Set1") +
  scale_x_log10(
    breaks = c(0.1, 1, 10, 100, 1000, 10000),
    labels = as.character(c(0.1, 1, 10, 100, 1000, 10000))
  ) +
  guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))
# GC% against genome size for eukaryotes only, coloured by the Group column.
# A single scale_x_log10() call with explicit breaks is used: the original
# also added a bare scale_x_log10() first, which ggplot2 replaces with the
# later x scale (emitting a "Scale for x is already present" message).
ggplot(
  data.filtered %>% filter(taxon == "Eukaryote"),
  aes(`Size (Mb)`, `GC%`, colour = Group)
) +
  geom_point(size = 1, alpha = 0.5) +
  facet_wrap(~taxon, scales = "free_x") +
  scale_color_brewer(palette = "Set2") +
  scale_x_log10(
    breaks = c(0.1, 1, 10, 100, 1000, 10000),
    labels = as.character(c(0.1, 1, 10, 100, 1000, 10000))
  ) +
  # Draw legend keys larger and opaque so the colours are readable.
  guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))
# Per-organism subsets of data.filtered. Each subset keeps the rows whose
# `#Organism/Name` matches the given pattern (grepl(), regular-expression
# matching), applies any extra taxon/group/size restriction, and adds an
# `organism` column holding the label used in the combined plot.

data.filtered.ecoli <- data.filtered %>%
  filter(
    grepl("Escherichia coli", `#Organism/Name`),
    taxon == "Prokaryote"
  ) %>%
  mutate(organism = "Ecoli")

data.filtered.yeast <- data.filtered %>%
  filter(
    grepl("Saccharomyces cerevisiae", `#Organism/Name`),
    Group == "Fungi"
  ) %>%
  mutate(organism = "Yeast")

# NOTE(review): the pattern matches names containing "SARS coronavirus" while
# the label says "SARS-CoV-2" - confirm the label describes the matched rows.
data.filtered.sarscov2 <- data.filtered %>%
  filter(
    grepl("SARS coronavirus", `#Organism/Name`),
    taxon == "Virus"
  ) %>%
  mutate(organism = "SARS-CoV-2")

data.filtered.huxleyi <- data.filtered %>%
  filter(grepl("Emiliania huxleyi CCMP1516", `#Organism/Name`)) %>%
  mutate(organism = "E. huxleyi")

# Human records are additionally limited to sizes strictly between 500 and
# 5000 Mb.
data.filtered.human <- data.filtered %>%
  filter(
    grepl("Homo sapiens", `#Organism/Name`),
    taxon == "Eukaryote",
    `Size (Mb)` > 500,
    `Size (Mb)` < 5000
  ) %>%
  mutate(organism = "Human")

data.filtered.cat <- data.filtered %>%
  filter(
    grepl("Felis catus", `#Organism/Name`),
    taxon == "Eukaryote"
  ) %>%
  mutate(organism = "Cat")

data.filtered.rabbit <- data.filtered %>%
  filter(
    grepl("Oryctolagus cuniculus", `#Organism/Name`),
    taxon == "Eukaryote"
  ) %>%
  mutate(organism = "Rabbit")

data.filtered.plasmodium <- data.filtered %>%
  filter(
    grepl("Plasmodium falciparum", `#Organism/Name`),
    taxon == "Eukaryote"
  ) %>%
  mutate(organism = "Plasmodium")
# Combine the per-organism subsets into one table for plotting.
data.merged <- bind_rows(list(
  data.filtered.ecoli, data.filtered.yeast,
  data.filtered.sarscov2, data.filtered.huxleyi,
  data.filtered.cat,
  data.filtered.human, data.filtered.rabbit,
  data.filtered.plasmodium
))

# GC% against genome size for the selected organisms, coloured by organism.
# Points are jittered and the x axis is log10-scaled with fixed breaks.
ggplot(data.merged, aes(`Size (Mb)`, `GC%`, colour = organism)) +
  # geom_scattermore(size = 2) +
  geom_jitter(size = 2, alpha = 0.5) +
  # facet_wrap(~taxon, scales = "free_x")+
  scale_x_log10(
    breaks = c(0.1, 1, 10, 100, 1000, 5000),
    labels = as.character(c(0.1, 1, 10, 100, 1000, 5000))
  ) +
  labs(
    x = "Genome Size (Mb)",
    y = "GC%"
  ) +
  scale_color_brewer(palette = "Dark2", name = "Species") +
  # Draw legend keys opaque so the colours are readable.
  guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))