Chapter 8 Compositional analysis

load("data/data.Rdata")

8.0.1 Genome phylogeny

#Get phylum colors from the EHI standard
phylum_colors <- genome_metadata %>%
    left_join(read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv"), by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree$tip.label)) %>%
    select(phylum, colors) %>%
    unique() %>%
    arrange(phylum) %>%
    select(colors) %>%
    pull()

# Generate the phylum color heatmap
phylum_heatmap <- genome_metadata %>%
    left_join(read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv"), by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree$tip.label)) %>%
    select(genome,phylum) %>%
    mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
    column_to_rownames(var = "genome")

# Generate  basal tree
circular_tree <- force.ultrametric(genome_tree, method="extend") %>% # extend to ultrametric for the sake of visualisation
    ggtree(., layout="fan", open.angle=10, size=0.5)
***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************
# Add phylum ring
circular_tree <- gheatmap(circular_tree, phylum_heatmap, offset=0.40, width=0.1, colnames=FALSE) +
        scale_fill_manual(values=phylum_colors) +
        geom_tiplab2(size=1, hjust=-0.1) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0))

# Flush color scale to enable a new color scheme in the next ring
circular_tree <- circular_tree + new_scale_fill()

# Add completeness ring
circular_tree <- circular_tree +
        new_scale_fill() +
        scale_fill_gradient(low = "#d1f4ba", high = "#f4baba") +
        geom_fruit(
                data=genome_metadata,
                geom=geom_bar,
                mapping = aes(x=completeness, y=genome, fill=contamination),
                offset = 0.40,
                orientation="y",
              stat="identity")

# Add genome-size ring
circular_tree <-  circular_tree +
        new_scale_fill() +
        scale_fill_manual(values = "#cccccc") +
        geom_fruit(
             data=genome_metadata,
             geom=geom_bar,
             mapping = aes(x=length, y=genome),
                 offset = 0.05,
                 orientation="y",
         stat="identity")

# Add text
circular_tree <-  circular_tree +
        annotate('text', x=2.7, y=0, label='            Phylum', family='arial', size=3.5) +
        annotate('text', x=3.1, y=0, label='                         Genome quality', family='arial', size=3.5) +
        annotate('text', x=3.5, y=0, label='                     Genome size', family='arial', size=3.5)

#Plot circular tree
circular_tree %>% open_tree(30) %>% rotate_tree(90)

8.0.2 Genome quality

#Completeness
genome_metadata %>% summarise(mean=mean(completeness),sd=sd(completeness))
# A tibble: 1 × 2
   mean    sd
  <dbl> <dbl>
1  90.9  9.12
#Contamination
genome_metadata %>% summarise(mean=mean(contamination),sd=sd(contamination))
# A tibble: 1 × 2
   mean    sd
  <dbl> <dbl>
1  1.49  1.77
#Generate quality biplot
genome_biplot <- genome_metadata %>%
  select(c(genome,domain,phylum,completeness,contamination,length)) %>%
  arrange(match(genome, rev(genome_tree$tip.label))) %>% #sort MAGs according to phylogenetic tree
  ggplot(aes(x=completeness,y=contamination,size=length,color=phylum)) +
              geom_point(alpha=0.7) +
                    ylim(c(10,0)) +
                    scale_color_manual(values=phylum_colors) +
                labs(y= "Contamination", x = "Completeness") +
                    theme_classic() +
                  theme(legend.position = "none")

#Generate contamination boxplot
genome_contamination <- genome_metadata %>%
            ggplot(aes(y=contamination)) +
                    ylim(c(10,0)) +
                    geom_boxplot(colour = "#999999", fill="#cccccc") +
                    theme_void() +
                    theme(legend.position = "none",
                        axis.title.x = element_blank(),
                        axis.title.y = element_blank(),
                        axis.text.y=element_blank(),
                        axis.ticks.y=element_blank(),
                        axis.text.x=element_blank(),
                        axis.ticks.x=element_blank(),
                        plot.margin = unit(c(0, 0, 0.40, 0),"inches")) #add bottom-margin (top, right, bottom, left)

#Generate completeness boxplot
genome_completeness <- genome_metadata %>%
        ggplot(aes(x=completeness)) +
                xlim(c(50,100)) +
                geom_boxplot(colour = "#999999", fill="#cccccc") +
                theme_void() +
                theme(legend.position = "none",
                    axis.title.x = element_blank(),
                    axis.title.y = element_blank(),
                    axis.text.y=element_blank(),
                    axis.ticks.y=element_blank(),
                    axis.text.x=element_blank(),
                    axis.ticks.x=element_blank(),
                    plot.margin = unit(c(0, 0, 0, 0.50),"inches")) #add left-margin (top, right, bottom, left)

#Render composite figure
grid.arrange(grobs = list(genome_completeness,genome_biplot,genome_contamination),
        layout_matrix = rbind(c(1,1,1,1,1,1,1,1,1,1,1,4),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3)))

8.0.3 Functional ordination

# Aggregate basal GIFT into elements
function_table <- genome_gifts %>%
    to.elements(., GIFT_db)

# Generate the tSNE ordination
tSNE_function <- Rtsne(X=function_table, dims = 2, check_duplicates = FALSE)

# Plot the ordination
function_ordination <- tSNE_function$Y %>%
                as.data.frame() %>%
                mutate(genome=rownames(function_table)) %>%
                inner_join(genome_metadata, by="genome") %>%
                rename(tSNE1="V1", tSNE2="V2") %>%
                select(genome,phylum,tSNE1,tSNE2, length) %>%
                ggplot(aes(x = tSNE1, y = tSNE2, color = phylum, size=length))+
                            geom_point(shape=16, alpha=0.7) +
                            scale_color_manual(values=phylum_colors) +
                            theme_minimal() +
                labs(color="Phylum", size="Genome size") +
                guides(color = guide_legend(override.aes = list(size = 5))) # enlarge Phylum dots in legend

function_ordination

8.0.4 Taxonomy barplot per treatment

genome_counts %>%
  mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS nornalisation
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
  left_join(., genome_metadata, by = join_by(genome == genome)) %>% #append genome metadata
  left_join(., sample_metadata, by = join_by(sample == sample)) %>% #append sample metadata
  filter(!is.na(count)) %>%
  filter(!is.na(animal)) %>%
  filter(cage %in% c("C03","C04","C05","C06","C10","C11","C12","C13")) %>%
  mutate(treatment = factor(treatment, levels = c("OP","HT","HR","CD","CR","DT","DR","AN","T1","T2","T3"))) %>%
  ggplot(., aes(x=count,y=sample, fill=phylum, group=phylum)) + #grouping enables keeping the same sorting of taxonomic units
    geom_bar(stat="identity", colour="white", linewidth=0.1) + #plot stacked bars with white borders
    scale_fill_manual(values=phylum_colors) +
    labs(y = "Relative abundance") +
    facet_nested(cage + treatment ~ .,  scales="free") + #facet per day and treatment
    guides(fill = guide_legend(ncol = 1)) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
          strip.text.y = element_text(angle = 0),
          axis.title.x = element_blank(),
          panel.background = element_blank(),
          panel.border = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          axis.line = element_line(linewidth = 0.5, linetype = "solid", colour = "black"),
          panel.spacing = unit(0, "lines")) +
   labs(fill="Phylum")

8.0.5 Taxonomy barplot per individual

# Facet groups for coloring
facet_boxes <- sample_metadata %>%
  filter(!is.na(count)) %>%
  filter(!is.na(animal)) %>%
  filter(cage %in% c("C03","C04","C05","C06","C10","C11","C12","C13")) %>%
    distinct(group,cage,animal) %>%
    arrange(group,cage)
Warning: There was 1 warning in `filter()`.
ℹ In argument: `!is.na(count)`.
Caused by warning in `is.na()`:
! is.na() applied to non-(list or vector) of type 'closure'
strip_background <- strip_nested(background_y = elem_list_rect(fill = c(
        # level1 colors
        case_match(
          unique(facet_boxes$group),
          "invariable" ~ "#ccd47b",
          "variable" ~ "#7dacc9",
          .default = "grey"
        ),
        # level2 colors
        case_match(
          unique(facet_boxes$cage),
          "C04" ~ "#dee3aa",
          "C13" ~ "#a7ab7e",
          "C03" ~ "#adcde0",
          "C05" ~ "#89a0ad",
          "C06" ~ "#adcde0",
          "C10" ~ "#89a0ad",
          "C11" ~ "#adcde0",
          "C12" ~ "#89a0ad",
          .default = "grey"
        ),
    # level3 colors
        case_match(
          unique(facet_boxes$animal),
          "X" ~ "green",
          .default = "#f4f4f4"
        ) 
    )))


genome_counts %>%
  mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS nornalisation
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
  left_join(., genome_metadata, by = join_by(genome == genome)) %>% #append genome metadata
  left_join(., sample_metadata, by = join_by(sample == sample)) %>% #append sample metadata
  filter(!is.na(count)) %>%
  filter(!is.na(animal)) %>%
  filter(cage %in% c("C03","C04","C05","C06","C10","C11","C12","C13")) %>%
  filter(treatment %in% c("OP","HT","HR","CT","CR","DT","DR","AN","T1","T2","T3")) %>%
  mutate(sample = factor(sample, levels = sample_metadata %>% 
  arrange(animal,match(treatment,rev(c("OP","HT","HR","CT","CR","DT","DR","AN","T1","T2","T3")))) %>% 
  select(sample) %>% 
  filter(!is.na(sample)) %>% 
  pull() ))  %>% # sort samples
  ggplot(., aes(x=count,y=sample, fill=phylum, group=phylum)) + #grouping enables keeping the same sorting of taxonomic units
    geom_bar(stat="identity", colour="white", linewidth=0.1) + #plot stacked bars with white borders
    scale_fill_manual(values=phylum_colors) +
    labs(x = "Relative abundance", y ="Samples") +
    facet_nested(group + cage + animal ~ .,  scales="free", strip = strip_background) + #facet per day and treatment
    guides(fill = guide_legend(ncol = 1)) +
    scale_x_continuous(expand = c(0.001, 0.001)) +
    theme(axis.text.y = element_blank(),
          axis.title.x = element_blank(),
          strip.text.y = element_text(angle = 0),
          panel.background = element_blank(),
          panel.border = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          axis.line = element_line(linewidth = 0.5, linetype = "solid", colour = "black"),
          panel.spacing = unit(0, "lines"),
          legend.position = "none")