Chapter 3 MAG catalogue

3.1 Genome phylogeny

# Generate the phylum color heatmap
phylum_heatmap <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    mutate(phylum=str_remove_all(phylum, "p__")) %>%
    right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree$tip.label)) %>%
    dplyr::select(genome,phylum) %>%
    mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
    column_to_rownames(var = "genome")

# Generate prevalence data
prevalence_data <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  group_by(genome,gut_location) %>% 
  summarise(presence=ifelse(sum(abundance)>0,1,0)) %>% 
  group_by(genome) %>%
  summarise(prevalence=sum(presence))

# Generate F_colon heatmap
f_colon_heatmap <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  filter(gut_location =="F_colon") %>% 
  group_by(genome) %>% 
  summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>% 
  column_to_rownames(var="genome")

# Generate M_colon heatmap
m_colon_heatmap <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  filter(gut_location =="M_colon") %>% 
  group_by(genome) %>% 
  summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>% 
  column_to_rownames(var="genome")

# Generate D_ileum heatmap
d_ileum_heatmap <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  filter(gut_location =="D_ileum") %>% 
  group_by(genome) %>% 
  summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>% 
  column_to_rownames(var="genome")

# Generate K_ileum heatmap
k_ileum_heatmap <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  filter(gut_location =="K_ileum") %>% 
  group_by(genome) %>% 
  summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>% 
  column_to_rownames(var="genome")

# Generate Gut content heatmap
gut_content_heatmap <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  filter(sample_type =="Gut content") %>% 
  group_by(genome) %>% 
  summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>% 
  column_to_rownames(var="genome")


# Generate Gut tissue heatmap
gut_tissue_heatmap <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  filter(sample_type =="Gut tissue") %>% 
  group_by(genome) %>% 
  summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>% 
  column_to_rownames(var="genome")

# Generate tame heatmap
tame_heatmap <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  filter(fox_behaviour=="tame") %>% 
  group_by(genome) %>% 
  summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>% 
  column_to_rownames(var="genome")

# Generate aggressive heatmap
aggressive_heatmap <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  filter(fox_behaviour=="aggr") %>% 
  group_by(genome) %>% 
  summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>% 
  column_to_rownames(var="genome")
# Generate  basal tree
circular_tree <- force.ultrametric(genome_tree, method="extend") %>% # extend to ultrametric for the sake of visualisation
    ggtree(., layout="fan", open.angle=10, size=0.2)
***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************
# Add phylum ring
circular_tree <- gheatmap(circular_tree, phylum_heatmap, offset=0, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=phylum_colors) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
        new_scale_fill()

# Add F_colon ring
circular_tree <- gheatmap(circular_tree, f_colon_heatmap, offset=0.2, width=0.05, colnames=FALSE) +
        scale_fill_manual(values = c("absent" = "#ffffff", "present" = "#3D5C61")) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
        new_scale_fill()

# Add M_colon ring
circular_tree <- gheatmap(circular_tree, m_colon_heatmap, offset=0.3, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=c("#ffffff","#41B6C0")) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
        new_scale_fill()

# Add D_ileum ring
circular_tree <- gheatmap(circular_tree, d_ileum_heatmap, offset=0.4, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=c("#ffffff","#90C8C5")) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
        new_scale_fill()

# Add K_ileum ring
circular_tree <- gheatmap(circular_tree, k_ileum_heatmap, offset=0.5, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=c("absent" =  "#ffffff", "present" = "#E5D388")) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
        new_scale_fill()

# Add Gut content ring
circular_tree <- gheatmap(circular_tree, gut_content_heatmap, offset=0.7, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=c("absent" ="#ffffff","present" = "#BFA366")) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
        new_scale_fill()

# Add Gut tissue ring
circular_tree <- gheatmap(circular_tree, gut_tissue_heatmap, offset=0.8, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=c("#ffffff","#6E5244")) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
        new_scale_fill()



# Add prevalence ring
circular_tree <-  circular_tree +
        new_scale_fill() +
        scale_fill_manual(values = "#cccccc") +
        geom_fruit(
             data=prevalence_data,
             geom=geom_bar,
             mapping = aes(x=prevalence, y=genome),
                 offset = 0.5,
                 orientation="y",
         stat="identity")

# Add tame ring
circular_tree <- gheatmap(circular_tree, tame_heatmap, offset=1.4, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=c("#ffffff","#bd70ae")) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
        new_scale_fill()

# Add aggressive ring
circular_tree <- gheatmap(circular_tree, aggressive_heatmap, offset=1.5, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=c("#ffffff","#949293")) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
        new_scale_fill()

#Plot circular tree
circular_tree %>% open_tree(30) %>% rotate_tree(90)

## Genome quality

tibble(Completeness=
         paste0(round(genome_metadata$completeness %>% mean(),2),
                "±",
                round(genome_metadata$completeness %>% sd(),2)),
       Contamination=
           paste0(round(genome_metadata$contamination %>% mean(),2),
                "±",
                round(genome_metadata$contamination %>% sd(),2))) %>%
  tt()
Completeness Contamination
92.6±6.75 1.92±2.11
#Generate quality biplot
genome_biplot <- genome_metadata %>%
  dplyr::select(c(genome,phylum,completeness,contamination,length)) %>%
  arrange(match(genome, rev(genome_tree$tip.label))) %>% #sort MAGs according to phylogenetic tree
  ggplot(aes(x=completeness,y=contamination,size=length,color=phylum)) +
              geom_point(alpha=0.7) +
                    xlim(c(70,100)) +
                    ylim(c(10,0)) +
                    scale_color_manual(values=phylum_colors) +
                    labs(y= "Contamination", x = "Completeness") +
                    theme_classic() +
                    theme(legend.position = "none")

#Generate contamination boxplot
genome_contamination <- genome_metadata %>%
            ggplot(aes(y=contamination)) +
                    ylim(c(10,0)) +
                    geom_boxplot(colour = "#999999", fill="#cccccc") +
                    theme_classic() +
                    theme(legend.position = "none",
                    axis.line = element_blank(),
                    axis.title = element_blank(),
                    axis.text=element_blank(),
                    axis.ticks=element_blank(),
                        plot.margin = unit(c(0, 0, 0.40, 0),"inches")) #add bottom-margin (top, right, bottom, left)

#Generate completeness boxplot
genome_completeness <- genome_metadata %>%
        ggplot(aes(x=completeness)) +
                xlim(c(70,100)) +
                geom_boxplot(colour = "#999999", fill="#cccccc") +
                theme_classic() +
                theme(legend.position = "none",
                    axis.line = element_blank(),
                    axis.title = element_blank(),
                    axis.text=element_blank(),
                    axis.ticks=element_blank(),
                    plot.margin = unit(c(0, 0, 0, 0.50),"inches")) #add left-margin (top, right, bottom, left)

#Render composite figure
#pdf("figures/completeness_contamination.pdf",width=10, height=5)
grid.arrange(grobs = list(genome_completeness,genome_biplot,genome_contamination),
        layout_matrix = rbind(c(1,1,1,1,1,1,1,1,1,1,1,4),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3)))

#dev.off()

3.2 Functional overview

# Aggregate basal GIFT into elements
function_table <- genome_gifts %>%
    to.elements(., GIFT_db)

# Generate  basal tree
function_tree <- force.ultrametric(genome_tree, method="extend") %>%
                ggtree(., size = 0.3) 
***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************
#Add phylum colors next to the tree tips
function_tree <- gheatmap(function_tree, phylum_heatmap, offset=0, width=0.1, colnames=FALSE) +
            scale_fill_manual(values=phylum_colors) +
            labs(fill="Phylum")

#Reset fill scale to use a different colour profile in the heatmap
function_tree <- function_tree + new_scale_fill()

#Add functions heatmap
function_tree <- gheatmap(function_tree, function_table, offset=0.5, width=3.5, colnames=FALSE) +
            vexpand(.08) +
            coord_cartesian(clip = "off") +
            scale_fill_gradient(low = "#f4f4f4", high = "steelblue", na.value="white") +
            labs(fill="GIFT")

#Reset fill scale to use a different colour profile in the heatmap
function_tree <- function_tree + new_scale_fill()

# Add completeness barplots
function_tree <- function_tree +
            geom_fruit(data=genome_metadata,
            geom=geom_bar,
            grid.params=list(axis="x", text.size=2, nbreak = 1),
            axis.params=list(vline=TRUE),
            mapping = aes(x=length, y=genome, fill=completeness),
                 offset = 3.8,
                 orientation="y",
                 stat="identity") +
            scale_fill_gradient(low = "#cf8888", high = "#a2cc87") +
            labs(fill="Genome\ncompleteness")

function_tree

3.3 Functional ordination

# Generate the tSNE ordination
tSNE_function <- Rtsne(X=function_table, dims = 2, check_duplicates = FALSE)

# Plot the ordination
function_ordination <- tSNE_function$Y %>%
                as.data.frame() %>%
                mutate(genome=rownames(function_table)) %>%
                inner_join(genome_metadata, by="genome") %>%
                rename(tSNE1="V1", tSNE2="V2") %>%
                dplyr::select(genome,phylum,tSNE1,tSNE2, length) %>%
                ggplot(aes(x = tSNE1, y = tSNE2, color = phylum, size=length))+
                            geom_point(shape=16, alpha=0.7) +
                            scale_color_manual(values=phylum_colors) +
                            theme_minimal() +
                labs(color="Phylum", size="Genome size") +
                guides(color = guide_legend(override.aes = list(size = 5))) # enlarge Phylum dots in legend

function_ordination