Chapter 4 MAG catalogue
4.1 Genome phylogeny
Circular tree with genome size and completeness
# Generate the phylum color heatmap
phylum_heatmap <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
arrange(match(genome, genome_tree$tip.label)) %>%
dplyr::select(genome,phylum) %>%
mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
column_to_rownames(var = "genome")
# Generate basal tree
circular_tree <- force.ultrametric(genome_tree, method="extend") %>% # extend to ultrametric for the sake of visualisation
ggtree(., layout="fan", open.angle=10, size=0.5)
***************************************************************
* Note: *
* force.ultrametric does not include a formal method to *
* ultrametricize a tree & should only be used to coerce *
* a phylogeny that fails is.ultrametric due to rounding -- *
* not as a substitute for formal rate-smoothing methods. *
***************************************************************
# Add phylum ring
circular_tree <- gheatmap(circular_tree, phylum_heatmap, offset=0.05, width=0.05, colnames=FALSE) +
scale_fill_manual(values=phylum_colors) +
labs(fill="Phylum")
#theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0))
# Flush color scale to enable a new color scheme in the next ring
circular_tree <- circular_tree + new_scale_fill()
# Add completeness ring
circular_tree <- circular_tree +
new_scale_fill() +
scale_fill_gradient(low = "#d1f4ba", high = "#f4baba") +
geom_fruit(
data=genome_metadata,
geom=geom_bar,
mapping = aes(x=completeness, y=genome, fill=contamination),
offset = 0.24,
pwidth = 0.1,
orientation="y",
stat="identity")+
labs(fill="Contamination")
# Add genome-size ring
circular_tree <- circular_tree + new_scale_fill()
circular_tree <- circular_tree +
geom_fruit(
data=genome_metadata,
geom=geom_bar,
mapping = aes(x=length, y=genome),
fill = "#1e6e55",
offset = 0.05,
orientation="y",
stat="identity")
# Add text
circular_tree <- circular_tree +
annotate('text', x=2.15, y=0, label=' Phylum', family='arial', size=3.5) +
annotate('text', x=2.6, y=0, label=' Genome quality', family='arial', size=3.5) +
annotate('text', x=2.8, y=0, label=' Genome size', family='arial', size=3.5)
#Plot circular tree
circular_tree %>% open_tree(30) %>% rotate_tree(90)
Investigating large genome MAG:
large_genome_mag <- genome_metadata %>%
arrange(desc(size)) %>%
mutate(size_Mb = size /1000000)%>%
dplyr::select(genus, size_Mb ,msa_percent, completeness, contamination, score, contig_count)
head(large_genome_mag)
# A tibble: 6 × 7
genus size_Mb msa_percent completeness contamination score contig_count
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Paraclostridium 15.4 68.1 93.9 1.61 90.7 1960
2 Escherichia 4.45 98.1 100.0 0.12 99.7 196
3 Bacteroides 3.85 93.2 100.0 0.38 99.2 548
4 Parabacteroides 3.72 81.6 88.2 1.16 85.9 594
5 Anaerobiospirillum 3.66 90.4 89.4 2.27 84.8 768
6 Plesiomonas 3.59 97.6 100.0 0.29 99.4 25
Paraclostridium species tend to have genome sizes of 2.9-3.6Mb. This MAG seems to have lower msa_percent, is this relevant? It also has the highest contig count.
Circular tree with genome presence/absence in different sample locations and fox behaviour
# Generate the phylum color heatmap
phylum_heatmap <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
mutate(phylum=str_remove_all(phylum, "p__")) %>%
right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
arrange(match(genome, genome_tree$tip.label)) %>%
dplyr::select(genome,phylum) %>%
mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
column_to_rownames(var = "genome")
# Generate prevalence data
prevalence_data <- genome_counts_filt %>%
pivot_longer(!genome,names_to="sample",values_to="abundance") %>%
left_join(sample_metadata,by="sample") %>%
group_by(genome,gut_location) %>%
summarise(presence=ifelse(sum(abundance)>0,1,0)) %>%
group_by(genome) %>%
summarise(prevalence=sum(presence))
# Generate F_colon heatmap
f_colon_heatmap <- genome_counts_filt %>%
pivot_longer(!genome,names_to="sample",values_to="abundance") %>%
left_join(sample_metadata,by="sample") %>%
filter(gut_location =="F_colon") %>%
group_by(genome) %>%
summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>%
column_to_rownames(var="genome")
# Generate M_colon heatmap
m_colon_heatmap <- genome_counts_filt %>%
pivot_longer(!genome,names_to="sample",values_to="abundance") %>%
left_join(sample_metadata,by="sample") %>%
filter(gut_location =="M_colon") %>%
group_by(genome) %>%
summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>%
column_to_rownames(var="genome")
# Generate D_ileum heatmap
d_ileum_heatmap <- genome_counts_filt %>%
pivot_longer(!genome,names_to="sample",values_to="abundance") %>%
left_join(sample_metadata,by="sample") %>%
filter(gut_location =="D_ileum") %>%
group_by(genome) %>%
summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>%
column_to_rownames(var="genome")
# Generate K_ileum heatmap
k_ileum_heatmap <- genome_counts_filt %>%
pivot_longer(!genome,names_to="sample",values_to="abundance") %>%
left_join(sample_metadata,by="sample") %>%
filter(gut_location =="K_ileum") %>%
group_by(genome) %>%
summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>%
column_to_rownames(var="genome")
# Generate Gut content heatmap
gut_content_heatmap <- genome_counts_filt %>%
pivot_longer(!genome,names_to="sample",values_to="abundance") %>%
left_join(sample_metadata,by="sample") %>%
filter(sample_type =="Gut content") %>%
group_by(genome) %>%
summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>%
column_to_rownames(var="genome")
# Generate Gut tissue heatmap
gut_tissue_heatmap <- genome_counts_filt %>%
pivot_longer(!genome,names_to="sample",values_to="abundance") %>%
left_join(sample_metadata,by="sample") %>%
filter(sample_type =="Gut tissue") %>%
group_by(genome) %>%
summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>%
column_to_rownames(var="genome")
# Generate tame heatmap
tame_heatmap <- genome_counts_filt %>%
pivot_longer(!genome,names_to="sample",values_to="abundance") %>%
left_join(sample_metadata,by="sample") %>%
filter(fox_behaviour=="tame") %>%
group_by(genome) %>%
summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>%
column_to_rownames(var="genome")
# Generate aggressive heatmap
aggressive_heatmap <- genome_counts_filt %>%
pivot_longer(!genome,names_to="sample",values_to="abundance") %>%
left_join(sample_metadata,by="sample") %>%
filter(fox_behaviour=="aggr") %>%
group_by(genome) %>%
summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>%
column_to_rownames(var="genome")
# Generate unselected heatmap
unsel_heatmap <- genome_counts_filt %>%
pivot_longer(!genome,names_to="sample",values_to="abundance") %>%
left_join(sample_metadata,by="sample") %>%
filter(fox_behaviour=="unsel") %>%
group_by(genome) %>%
summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>%
column_to_rownames(var="genome")
# Generate basal tree
circular_tree <- force.ultrametric(genome_tree, method="extend") %>% # extend to ultrametric for the sake of visualisation
ggtree(., layout="fan", open.angle=10, size=0.3)
***************************************************************
* Note: *
* force.ultrametric does not include a formal method to *
* ultrametricize a tree & should only be used to coerce *
* a phylogeny that fails is.ultrametric due to rounding -- *
* not as a substitute for formal rate-smoothing methods. *
***************************************************************
# Add phylum ring
circular_tree <- gheatmap(circular_tree, phylum_heatmap, offset=0, width=0.05, colnames=FALSE) +
scale_fill_manual(values=phylum_colors) +
theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
new_scale_fill()
# Add F_colon ring
circular_tree <- gheatmap(circular_tree, f_colon_heatmap, offset=0.2, width=0.05, colnames=FALSE) +
scale_fill_manual(values = c("absent" = "#ffffff", "present" = "#d6604d")) +
theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
new_scale_fill()
# Add M_colon ring
circular_tree <- gheatmap(circular_tree, m_colon_heatmap, offset=0.4, width=0.05, colnames=FALSE) +
scale_fill_manual(values=c("#ffffff","#542788")) +
theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
new_scale_fill()
# Add D_ileum ring
circular_tree <- gheatmap(circular_tree, d_ileum_heatmap, offset=0.3, width=0.05, colnames=FALSE) +
scale_fill_manual(values=c("#ffffff","#fdb863")) +
theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
new_scale_fill()
# Add K_ileum ring
circular_tree <- gheatmap(circular_tree, k_ileum_heatmap, offset=0.5, width=0.05, colnames=FALSE) +
scale_fill_manual(values=c("absent" = "#ffffff", "present" = "#bb99d8")) +
theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
new_scale_fill()
# Add prevalence ring
circular_tree <- circular_tree +
new_scale_fill() +
scale_fill_manual(values = "#cccccc") +
geom_fruit(
data=prevalence_data,
geom=geom_bar,
mapping = aes(x=prevalence, y=genome),
offset = 0.4,
orientation="y",
stat="identity")
# Add tame ring
circular_tree <- gheatmap(circular_tree, tame_heatmap, offset=1.2, width=0.05, colnames=FALSE) +
scale_fill_manual(values=c("#ffffff","#1f77b4")) +
theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
new_scale_fill()
# Add aggressive ring
circular_tree <- gheatmap(circular_tree, aggressive_heatmap, offset=1.3, width=0.05, colnames=FALSE) +
scale_fill_manual(values=c("#ffffff","#d62728")) +
theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
new_scale_fill()
# Add unselected ring
circular_tree <- gheatmap(circular_tree, unsel_heatmap, offset=1.4, width=0.05, colnames=FALSE) +
scale_fill_manual(values=c("#ffffff","#2ca02c")) +
theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
new_scale_fill()
# Add text
circular_tree <- circular_tree +
annotate('text', x=2.05, y= 0, label=' Phylum', family='arial', size=3.5) +
annotate('text', x=2.4, y=0, label=' Gut location', family='arial', size=3.5) +
annotate('text', x=2.9, y=0, label=' Prevalence', family='arial', size=3.5) +
annotate('text', x= 3.4,y=0, label=' Fox behaviour', family='arial', size=3.5)
#Plot circular tree
circular_tree %>% open_tree(30) %>% rotate_tree(90)
4.1.1 Taxonomy overview
tax_mag <-genome_metadata %>%
group_by(phylum) %>%
summarise(mag_n=n())
tax_mag %>%
mutate(percetage_mag=round(mag_n*100/sum(mag_n), 2)) %>%
arrange(-percetage_mag) %>%
tt()
phylum | mag_n | percetage_mag |
---|---|---|
Bacillota_A | 55 | 38.46 |
Bacteroidota | 21 | 14.69 |
Bacillota | 18 | 12.59 |
Bacillota_I | 14 | 9.79 |
Pseudomonadota | 10 | 6.99 |
Campylobacterota | 6 | 4.20 |
Fusobacteriota | 6 | 4.20 |
Actinomycetota | 5 | 3.50 |
Bacillota_B | 2 | 1.40 |
Desulfobacterota | 2 | 1.40 |
Spirochaetota | 2 | 1.40 |
Bacillota_C | 1 | 0.70 |
Deferribacterota | 1 | 0.70 |
4.2 Genome quality
tibble(Completeness=
paste0(round(genome_metadata$completeness %>% mean(),2),
"±",
round(genome_metadata$completeness %>% sd(),2)),
Contamination=
paste0(round(genome_metadata$contamination %>% mean(),2),
"±",
round(genome_metadata$contamination %>% sd(),2))) %>%
tt()
Completeness | Contamination |
---|---|
92.6±6.75 | 1.92±2.11 |
#Generate quality biplot
genome_biplot <- genome_metadata %>%
dplyr::select(c(genome,phylum,completeness,contamination,length)) %>%
arrange(match(genome, rev(genome_tree$tip.label))) %>% #sort MAGs according to phylogenetic tree
ggplot(aes(x=completeness,y=contamination,size=length,color=phylum)) +
geom_point(alpha=0.7) +
xlim(c(70,100)) +
ylim(c(10,0)) +
scale_color_manual(values=phylum_colors) +
labs(y= "Contamination", x = "Completeness") +
theme_classic() +
theme(legend.position = "none")
#Generate contamination boxplot
genome_contamination <- genome_metadata %>%
ggplot(aes(y=contamination)) +
ylim(c(10,0)) +
geom_boxplot(colour = "#999999", fill="#cccccc") +
theme_classic() +
theme(legend.position = "none",
axis.line = element_blank(),
axis.title = element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank(),
plot.margin = unit(c(0, 0, 0.40, 0),"inches")) #add bottom-margin (top, right, bottom, left)
#Generate completeness boxplot
genome_completeness <- genome_metadata %>%
ggplot(aes(x=completeness)) +
xlim(c(70,100)) +
geom_boxplot(colour = "#999999", fill="#cccccc") +
theme_classic() +
theme(legend.position = "none",
axis.line = element_blank(),
axis.title = element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank(),
plot.margin = unit(c(0, 0, 0, 0.50),"inches")) #add left-margin (top, right, bottom, left)
#Render composite figure
#pdf("figures/completeness_contamination.pdf",width=10, height=5)
grid.arrange(grobs = list(genome_completeness,genome_biplot,genome_contamination),
layout_matrix = rbind(c(1,1,1,1,1,1,1,1,1,1,1,4),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3)))
4.3 Functional overview
4.3.0.2 Number of annotated genes and percentages
#How many genes have at least 1 annotation
genome_annota <- genome_annotations %>%
filter(if_any(c(kegg, pfam, cazy), ~ !is.na(.))) %>%
nrow()
cat(genome_annota)
257893
[1] 80.64096
4.3.0.3 Number of KEGG annotatated genes and percentages
# KEGG annotation
kegg_annota <- genome_annotations %>%
filter(!is.na(kegg)) %>%
nrow()
cat(kegg_annota)
219314
[1] 85.0407
4.3.0.4 Function tree
# Aggregate basal GIFT into elements
function_table <- genome_gifts %>%
to.elements(., GIFT_db)
# Generate basal tree
function_tree <- force.ultrametric(genome_tree, method="extend") %>%
ggtree(., size = 0.3)
***************************************************************
* Note: *
* force.ultrametric does not include a formal method to *
* ultrametricize a tree & should only be used to coerce *
* a phylogeny that fails is.ultrametric due to rounding -- *
* not as a substitute for formal rate-smoothing methods. *
***************************************************************
#Add phylum colors next to the tree tips
function_tree <- gheatmap(function_tree, phylum_heatmap, offset=0, width=0.1, colnames=FALSE) +
scale_fill_manual(values=phylum_colors) +
labs(fill="Phylum")
#Reset fill scale to use a different colour profile in the heatmap
function_tree <- function_tree + new_scale_fill()
#Add functions heatmap
function_tree <- gheatmap(function_tree, function_table, offset=0.5, width=3.5, colnames=FALSE) +
vexpand(.08) +
coord_cartesian(clip = "off") +
scale_fill_gradient(low = "#f4f4f4", high = "steelblue", na.value="white") +
labs(fill="GIFT")
#Reset fill scale to use a different colour profile in the heatmap
function_tree <- function_tree + new_scale_fill()
# Add completeness barplots
function_tree <- function_tree +
geom_fruit(data=genome_metadata,
geom=geom_bar,
grid.params=list(axis="x", text.size=2, nbreak = 1),
axis.params=list(vline=TRUE),
mapping = aes(x=length, y=genome, fill=completeness),
offset = 3.8,
orientation="y",
stat="identity") +
scale_fill_gradient(low = "#cf8888", high = "#a2cc87") +
labs(fill="Genome\ncompleteness")
function_tree
4.4 Functional ordination
PCoA with Euclidean distances:
gift_pcoa <- genome_gifts %>%
to.elements(., GIFT_db) %>%
as.data.frame() %>%
vegdist(method="euclidean") %>%
pcoa()
gift_pcoa_rel_eigen <- gift_pcoa$values$Relative_eig[1:10]
# Get genome positions
gift_pcoa_vectors <- gift_pcoa$vectors %>% #extract vectors
as.data.frame() %>%
dplyr::select(Axis.1,Axis.2) # keep the first 2 axes
gift_pcoa_eigenvalues <- gift_pcoa$values$Eigenvalues[c(1,2)]
#For the black arrows: Functional group loadings
gift_pcoa_gifts <- cov(genome_gifts, scale(gift_pcoa_vectors)) %*% diag((gift_pcoa_eigenvalues/(nrow(genome_gifts)-1))^(-0.5)) %>%
as.data.frame() %>%
rename(Axis.1=1,Axis.2=2) %>%
rownames_to_column(var="label") %>%
#get function summary vectors
mutate(func=substr(label,1,3)) %>%
group_by(func) %>%
summarise(Axis.1=mean(Axis.1),
Axis.2=mean(Axis.2)) %>%
rename(label=func) %>%
filter(!label %in% c("S01","S02","S03"))
set.seed(101)
scale <- 20 # scale for vector loadings (to make arrows visible)
gift_pcoa_vectors %>%
rownames_to_column(var="genome") %>%
left_join(genome_metadata, by="genome") %>%
ggplot() +
#genome positions
scale_color_manual(values=phylum_colors)+
geom_point(aes(x=Axis.1,y=Axis.2, color=phylum, size=length),
alpha=0.9, shape=16) +
#scale_color_manual(values=phylum_colors) +
scale_size_continuous(range = c(0.1,5)) +
#loading positions
geom_segment(data=gift_pcoa_gifts,
aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
arrow = arrow(length = unit(0.3, "cm"),
type = "open",
angle = 25),
linewidth = 0.5,
color = "black") +
#Primary and secondary scale adjustments
scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
) +
scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
) +
geom_label_repel(data = gift_pcoa_gifts,
aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
segment.color = 'transparent') +
xlim(-3,3) +
ylim(-2.5,2.5) +
theme_minimal() +
theme(legend.position = "none")+
labs(
x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
)
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_point()`).
Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider increasing max.overlaps
arrow_functions <- gift_pcoa_gifts %>%
left_join(GIFT_db, by = c("label" = "Code_function")) %>%
group_by(label) %>%
summarise(
Axis.1 = mean(Axis.1),
Axis.2 = mean(Axis.2),
Function = first(Function),
Domain = first(Domain),
Length = sqrt(Axis.1^2 + Axis.2^2), # vector length
.groups = "drop"
) %>%
arrange(desc(Length)) # sort so most important arrows are first
arrow_functions
# A tibble: 18 × 6
label Axis.1 Axis.2 Function Domain Length
<chr> <dbl> <dbl> <chr> <chr> <dbl>
1 B02 0.119 -0.0345 Amino acid biosynthesis Biosynthesis 0.124
2 B07 0.0905 0.0708 Vitamin biosynthesis Biosynthesis 0.115
3 B01 0.0805 0.0477 Nucleic acid biosynthesis Biosynthesis 0.0936
4 D09 0.00454 -0.0913 Antibiotic degradation Degradation 0.0914
5 B06 0.0774 0.0195 Organic anion biosynthesis Biosynthesis 0.0798
6 B03 0.0302 -0.0148 Amino acid derivative biosynthesis Biosynthesis 0.0336
7 D01 0.00244 -0.0100 Lipid degradation Degradation 0.0103
8 D03 0.00248 -0.00530 Sugar degradation Degradation 0.00586
9 D05 0.00466 0.00234 Amino acid degradation Degradation 0.00521
10 D06 0.00255 0.00355 Nitrogen compound degradation Degradation 0.00437
11 D08 0.00261 -0.00107 Xenobiotic degradation Degradation 0.00282
12 B04 -0.000742 -0.00219 SCFA biosynthesis Biosynthesis 0.00231
13 B09 0.000491 -0.000896 Metallophore biosynthesis Biosynthesis 0.00102
14 B10 0.0000905 -0.0000140 Antibiotic biosynthesis Biosynthesis 0.0000916
15 B08 0 0 Aromatic compound biosynthesis Biosynthesis 0
16 D02 0 0 Polysaccharide degradation Degradation 0
17 D04 0 0 Protein degradation Degradation 0
18 D07 0 0 Alcohol degradation Degradation 0
PCoA with Bray-Curtis (relative composition)
gift_pcoa_bray <- genome_gifts %>%
to.elements(., GIFT_db) %>%
as.data.frame() %>%
vegdist(method="bray") %>%
pcoa()
Warning in vegdist(., method = "bray"): you have empty rows: their dissimilarities may be
meaningless in method "bray"
gift_pcoa_rel_eigen_bray <- gift_pcoa_bray$values$Relative_eig[1:10]
# Get genome positions
gift_pcoa_vectors_bray <- gift_pcoa_bray$vectors %>% #extract vectors
as.data.frame() %>%
dplyr::select(Axis.1,Axis.2) # keep the first 2 axes
gift_pcoa_eigenvalues_bray <- gift_pcoa_bray$values$Eigenvalues[c(1,2)]
gift_pcoa_gifts_bray <- cov(genome_gifts, scale(gift_pcoa_vectors_bray)) %*% diag((gift_pcoa_eigenvalues_bray/(nrow(genome_gifts)-1))^(-0.5)) %>%
as.data.frame() %>%
rename(Axis.1=1,Axis.2=2) %>%
rownames_to_column(var="label") %>%
#get function summary vectors
mutate(func=substr(label,1,3)) %>%
group_by(func) %>%
summarise(Axis.1=mean(Axis.1),
Axis.2=mean(Axis.2)) %>%
rename(label=func) %>%
filter(!label %in% c("S01","S02","S03"))
set.seed(101)
scale <- 20 # scale for vector loadings
gift_pcoa_vectors_bray %>%
rownames_to_column(var="genome") %>%
left_join(genome_metadata, by="genome") %>%
ggplot() +
#genome positions
scale_color_manual(values=phylum_colors)+
geom_point(aes(x=Axis.1,y=Axis.2, color=phylum, size=length),
alpha=0.9, shape=16) +
#scale_color_manual(values=phylum_colors) +
scale_size_continuous(range = c(0.1,5)) +
#loading positions
geom_segment(data=gift_pcoa_gifts_bray,
aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
arrow = arrow(length = unit(0.3, "cm"),
type = "open",
angle = 25),
linewidth = 0.5,
color = "black") +
#Primary and secondary scale adjustments
scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen_bray[1]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
) +
scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen_bray[2]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
) +
geom_label_repel(data = gift_pcoa_gifts_bray,
aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
segment.color = 'transparent') +
xlim(-0.8,0.8) +
ylim(-0.5,0.5) +
theme_minimal() +
theme(legend.position = "none")+
labs(
x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
)
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_point()`).
Warning: Removed 9 rows containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 9 rows containing missing values or values outside the scale range (`geom_label_repel()`).
arrow_functions_bray <- gift_pcoa_gifts_bray %>%
left_join(GIFT_db, by = c("label" = "Code_function")) %>%
group_by(label) %>%
summarise(
Axis.1 = mean(Axis.1),
Axis.2 = mean(Axis.2),
Function = first(Function),
Domain = first(Domain),
Length = sqrt(Axis.1^2 + Axis.2^2),
.groups = "drop"
) %>%
arrange(desc(Length))
arrow_functions_bray
# A tibble: 18 × 6
label Axis.1 Axis.2 Function Domain Length
<chr> <dbl> <dbl> <chr> <chr> <dbl>
1 B02 0.912 0.444 Amino acid biosynthesis Biosynthesis 1.01
2 B07 0.794 -0.455 Vitamin biosynthesis Biosynthesis 0.915
3 D09 0.00697 0.864 Antibiotic degradation Degradation 0.864
4 B01 0.736 -0.152 Nucleic acid biosynthesis Biosynthesis 0.752
5 B06 0.605 -0.0718 Organic anion biosynthesis Biosynthesis 0.609
6 B03 0.236 0.170 Amino acid derivative biosynthesis Biosynthesis 0.291
7 D01 -0.0146 0.0836 Lipid degradation Degradation 0.0849
8 D03 0.0182 0.0481 Sugar degradation Degradation 0.0514
9 D05 0.0428 -0.0105 Amino acid degradation Degradation 0.0441
10 D06 0.0212 -0.0183 Nitrogen compound degradation Degradation 0.0280
11 D08 0.0199 0.0195 Xenobiotic degradation Degradation 0.0279
12 B04 -0.0124 0.0191 SCFA biosynthesis Biosynthesis 0.0227
13 B09 0.00328 0.00980 Metallophore biosynthesis Biosynthesis 0.0103
14 B10 0.000699 0.000633 Antibiotic biosynthesis Biosynthesis 0.000943
15 B08 0 0 Aromatic compound biosynthesis Biosynthesis 0
16 D02 0 0 Polysaccharide degradation Degradation 0
17 D04 0 0 Protein degradation Degradation 0
18 D07 0 0 Alcohol degradation Degradation 0
UMAP
set.seed(1001)
#parameters to test
neighbors_values <- c(5, 15, 30)
min_dist_values <- c(0.1, 0.3, 0.5)
umap_plots <- list()
# Loop over combinations
for (nn in neighbors_values) {
for (md in min_dist_values) {
umap_result <- umap(function_table,
n_neighbors = nn,
min_dist = md,
metric = "euclidean")
df_umap <- as.data.frame(umap_result) %>%
mutate(genome = rownames(function_table)) %>%
inner_join(genome_metadata, by="genome") %>%
rename(UMAP1 = V1, UMAP2 = V2)
p <- ggplot(df_umap, aes(x = UMAP1, y = UMAP2,
color = phylum, size = length)) +
geom_point(alpha = 0.7, shape=16) +
scale_color_manual(values = phylum_colors) +
theme_minimal() +
labs(title = paste0("UMAP (neighbors=", nn,
", min_dist=", md, ")"),
color="Phylum", size="Genome size") +
guides(color = guide_legend(override.aes = list(size = 5)))
p <- p + theme(legend.position = "none")
umap_plots[[paste(nn, md, sep="_")]] <- p
}
}
ggpubr::ggarrange(plotlist = umap_plots, ncol = 3, nrow = 3)
set.seed(1001)
# Trying different perplexity values
perplexities <- c(5, 7,8, 9, 10, 11, 12, 13, 15)
plots <- lapply(perplexities, function(p) {
tsne_result <- Rtsne(
X = function_table,
dims = 2,
perplexity = p,
check_duplicates = FALSE
)
tsne_df <- as.data.frame(tsne_result$Y) %>%
mutate(genome = rownames(function_table)) %>%
inner_join(genome_metadata, by = "genome") %>%
rename(tSNE1 = "V1", tSNE2 = "V2")
ggplot(tsne_df, aes(x = tSNE1, y = tSNE2, color = phylum)) +
geom_point(alpha = 0.7, size = 2) +
scale_color_manual(values=phylum_colors)+
theme_minimal(base_size = 5) +
theme(legend.position = "none") + # remove legend
labs(title = paste("Perplexity =", p))
})
ggpubr::ggarrange(plotlist = plots, ncol = 3, nrow = 3)
[1] 144
# Generate the tSNE ordination
set.seed(1001)
tSNE_function <- Rtsne(X=function_table, dims = 2, perplexity = 10, check_duplicates = FALSE)
# Plot the ordination
function_ordination <- tSNE_function$Y %>%
as.data.frame() %>%
mutate(genome=rownames(function_table)) %>%
inner_join(genome_metadata, by="genome") %>%
rename(tSNE1="V1", tSNE2="V2") %>%
dplyr::select(genome,phylum,tSNE1,tSNE2, length) %>%
ggplot(aes(x = tSNE1, y = tSNE2, color = phylum, size=length))+
geom_point(shape=16, alpha=0.7) +
scale_color_manual(values=phylum_colors) +
theme_minimal() +
labs(color="Phylum", size="Genome size") +
guides(color = guide_legend(override.aes = list(size = 5))) # enlarge Phylum dots in legend
function_ordination