Chapter 4 MAG catalogue

4.1 Genome phylogeny

Circular tree with genome size and completeness

# Generate the phylum color heatmap
phylum_heatmap <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree$tip.label)) %>%
   dplyr::select(genome,phylum) %>%
    mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
    column_to_rownames(var = "genome")


# Generate  basal tree
circular_tree <- force.ultrametric(genome_tree, method="extend") %>% # extend to ultrametric for the sake of visualisation
    ggtree(., layout="fan", open.angle=10, size=0.5)
***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************
# Add phylum ring
circular_tree <- gheatmap(circular_tree, phylum_heatmap, offset=0.05, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=phylum_colors) +
  labs(fill="Phylum")
        #theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0))

# Flush color scale to enable a new color scheme in the next ring
circular_tree <- circular_tree + new_scale_fill()

# Add completeness ring
circular_tree <- circular_tree +
        new_scale_fill() +
        scale_fill_gradient(low = "#d1f4ba", high = "#f4baba") +
        geom_fruit(
                data=genome_metadata,
                geom=geom_bar,
                mapping = aes(x=completeness, y=genome, fill=contamination),
                offset = 0.24,
                pwidth = 0.1,
                orientation="y",
              stat="identity")+
  labs(fill="Contamination")

# Add genome-size ring
circular_tree <-  circular_tree + new_scale_fill()

circular_tree <-  circular_tree +
  geom_fruit(
    data=genome_metadata,
    geom=geom_bar,
    mapping = aes(x=length, y=genome),
    fill = "#1e6e55",
    offset = 0.05,
    orientation="y",
    stat="identity")


# Add text
circular_tree <-  circular_tree +
        annotate('text', x=2.15, y=0, label='            Phylum', family='arial', size=3.5) +
        annotate('text', x=2.6, y=0, label='                         Genome quality', family='arial', size=3.5) +
        annotate('text', x=2.8, y=0, label='                     Genome size', family='arial', size=3.5) 

#Plot circular tree
circular_tree %>% open_tree(30) %>% rotate_tree(90)

Investigating large genome MAG:

large_genome_mag <- genome_metadata %>% 
  arrange(desc(size)) %>% 
  mutate(size_Mb = size /1000000)%>%
 dplyr::select(genus, size_Mb ,msa_percent, completeness, contamination, score, contig_count) 
  

head(large_genome_mag)
# A tibble: 6 × 7
  genus              size_Mb msa_percent completeness contamination score contig_count
  <chr>                <dbl>       <dbl>        <dbl>         <dbl> <dbl>        <dbl>
1 Paraclostridium      15.4         68.1         93.9          1.61  90.7         1960
2 Escherichia           4.45        98.1        100.0          0.12  99.7          196
3 Bacteroides           3.85        93.2        100.0          0.38  99.2          548
4 Parabacteroides       3.72        81.6         88.2          1.16  85.9          594
5 Anaerobiospirillum    3.66        90.4         89.4          2.27  84.8          768
6 Plesiomonas           3.59        97.6        100.0          0.29  99.4           25

Paraclostridium species tend to have genome sizes of 2.9-3.6Mb. This MAG seems to have lower msa_percent, is this relevant? It also has the highest contig count.

Circular tree with genome presence/absence in different sample locations and fox behaviour

# Generate the phylum color heatmap
phylum_heatmap <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    mutate(phylum=str_remove_all(phylum, "p__")) %>%
    right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree$tip.label)) %>%
    dplyr::select(genome,phylum) %>%
    mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
    column_to_rownames(var = "genome")

# Generate prevalence data
prevalence_data <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  group_by(genome,gut_location) %>% 
  summarise(presence=ifelse(sum(abundance)>0,1,0)) %>% 
  group_by(genome) %>%
  summarise(prevalence=sum(presence))

# Generate F_colon heatmap
f_colon_heatmap <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  filter(gut_location =="F_colon") %>% 
  group_by(genome) %>% 
  summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>% 
  column_to_rownames(var="genome")

# Generate M_colon heatmap
m_colon_heatmap <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  filter(gut_location =="M_colon") %>% 
  group_by(genome) %>% 
  summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>% 
  column_to_rownames(var="genome")

# Generate D_ileum heatmap
d_ileum_heatmap <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  filter(gut_location =="D_ileum") %>% 
  group_by(genome) %>% 
  summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>% 
  column_to_rownames(var="genome")

# Generate K_ileum heatmap
k_ileum_heatmap <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  filter(gut_location =="K_ileum") %>% 
  group_by(genome) %>% 
  summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>% 
  column_to_rownames(var="genome")

# Generate Gut content heatmap
gut_content_heatmap <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  filter(sample_type =="Gut content") %>% 
  group_by(genome) %>% 
  summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>% 
  column_to_rownames(var="genome")


# Generate Gut tissue heatmap
gut_tissue_heatmap <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  filter(sample_type =="Gut tissue") %>% 
  group_by(genome) %>% 
  summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>% 
  column_to_rownames(var="genome")

# Generate tame heatmap
tame_heatmap <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  filter(fox_behaviour=="tame") %>% 
  group_by(genome) %>% 
  summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>% 
  column_to_rownames(var="genome")

# Generate aggressive heatmap
aggressive_heatmap <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  filter(fox_behaviour=="aggr") %>% 
  group_by(genome) %>% 
  summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>% 
  column_to_rownames(var="genome")

# Generate unselected heatmap
unsel_heatmap <- genome_counts_filt  %>%
  pivot_longer(!genome,names_to="sample",values_to="abundance") %>% 
  left_join(sample_metadata,by="sample") %>% 
  filter(fox_behaviour=="unsel") %>% 
  group_by(genome) %>% 
  summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>% 
  column_to_rownames(var="genome")
# Generate  basal tree
circular_tree <- force.ultrametric(genome_tree, method="extend") %>% # extend to ultrametric for the sake of visualisation
    ggtree(., layout="fan", open.angle=10, size=0.3)
***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************
# Add phylum ring
circular_tree <- gheatmap(circular_tree, phylum_heatmap, offset=0, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=phylum_colors) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
        new_scale_fill()

# Add F_colon ring
circular_tree <- gheatmap(circular_tree, f_colon_heatmap, offset=0.2, width=0.05, colnames=FALSE) +
        scale_fill_manual(values = c("absent" = "#ffffff", "present" = "#d6604d")) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
        new_scale_fill()

# Add M_colon ring
circular_tree <- gheatmap(circular_tree, m_colon_heatmap, offset=0.4, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=c("#ffffff","#542788")) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
        new_scale_fill()

# Add D_ileum ring
circular_tree <- gheatmap(circular_tree, d_ileum_heatmap, offset=0.3, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=c("#ffffff","#fdb863")) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
        new_scale_fill()

# Add K_ileum ring
circular_tree <- gheatmap(circular_tree, k_ileum_heatmap, offset=0.5, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=c("absent" =  "#ffffff", "present" = "#bb99d8")) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
        new_scale_fill()



# Add prevalence ring
circular_tree <-  circular_tree +
        new_scale_fill() +
        scale_fill_manual(values = "#cccccc") +
        geom_fruit(
             data=prevalence_data,
             geom=geom_bar,
             mapping = aes(x=prevalence, y=genome),
                 offset = 0.4,
                 orientation="y",
         stat="identity")

# Add tame ring
circular_tree <- gheatmap(circular_tree, tame_heatmap, offset=1.2, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=c("#ffffff","#1f77b4")) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
        new_scale_fill()

# Add aggressive ring
circular_tree <- gheatmap(circular_tree, aggressive_heatmap, offset=1.3, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=c("#ffffff","#d62728")) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
        new_scale_fill()

# Add unselected ring
circular_tree <- gheatmap(circular_tree, unsel_heatmap, offset=1.4, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=c("#ffffff","#2ca02c")) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
        new_scale_fill()


# Add text
circular_tree <-  circular_tree +
        annotate('text', x=2.05, y= 0, label='             Phylum', family='arial', size=3.5) +
        annotate('text', x=2.4, y=0, label='               Gut location', family='arial', size=3.5) +
        annotate('text', x=2.9, y=0, label='             Prevalence', family='arial', size=3.5) +
        annotate('text', x= 3.4,y=0, label='                Fox behaviour', family='arial', size=3.5) 


#Plot circular tree
circular_tree %>% open_tree(30) %>% rotate_tree(90)

4.1.1 Taxonomy overview

tax_mag <-genome_metadata %>%
  group_by(phylum) %>%
  summarise(mag_n=n())

tax_mag %>%
  mutate(percetage_mag=round(mag_n*100/sum(mag_n), 2)) %>%
  arrange(-percetage_mag) %>%
  tt()
phylum mag_n percetage_mag
Bacillota_A 55 38.46
Bacteroidota 21 14.69
Bacillota 18 12.59
Bacillota_I 14 9.79
Pseudomonadota 10 6.99
Campylobacterota 6 4.20
Fusobacteriota 6 4.20
Actinomycetota 5 3.50
Bacillota_B 2 1.40
Desulfobacterota 2 1.40
Spirochaetota 2 1.40
Bacillota_C 1 0.70
Deferribacterota 1 0.70

4.1.2 Mag size (MB)

genome_metadata <- genome_metadata %>%
  mutate(corrected_size=100*length/completeness) %>%
  arrange(completeness)
genome_metadata %>%
  summarise(Average_corrected_size=mean(corrected_size))
# A tibble: 1 × 1
  Average_corrected_size
                   <dbl>
1               2515112.

The average corrected size is 2.52 Mb.

4.2 Genome quality

tibble(Completeness=
         paste0(round(genome_metadata$completeness %>% mean(),2),
                "±",
                round(genome_metadata$completeness %>% sd(),2)),
       Contamination=
           paste0(round(genome_metadata$contamination %>% mean(),2),
                "±",
                round(genome_metadata$contamination %>% sd(),2))) %>%
  tt()
Completeness Contamination
92.6±6.75 1.92±2.11
#Generate quality biplot
genome_biplot <- genome_metadata %>%
  dplyr::select(c(genome,phylum,completeness,contamination,length)) %>%
  arrange(match(genome, rev(genome_tree$tip.label))) %>% #sort MAGs according to phylogenetic tree
  ggplot(aes(x=completeness,y=contamination,size=length,color=phylum)) +
              geom_point(alpha=0.7) +
                    xlim(c(70,100)) +
                    ylim(c(10,0)) +
                    scale_color_manual(values=phylum_colors) +
                    labs(y= "Contamination", x = "Completeness") +
                    theme_classic() +
                    theme(legend.position = "none")

#Generate contamination boxplot
genome_contamination <- genome_metadata %>%
            ggplot(aes(y=contamination)) +
                    ylim(c(10,0)) +
                    geom_boxplot(colour = "#999999", fill="#cccccc") +
                    theme_classic() +
                    theme(legend.position = "none",
                    axis.line = element_blank(),
                    axis.title = element_blank(),
                    axis.text=element_blank(),
                    axis.ticks=element_blank(),
                        plot.margin = unit(c(0, 0, 0.40, 0),"inches")) #add bottom-margin (top, right, bottom, left)

#Generate completeness boxplot
genome_completeness <- genome_metadata %>%
        ggplot(aes(x=completeness)) +
                xlim(c(70,100)) +
                geom_boxplot(colour = "#999999", fill="#cccccc") +
                theme_classic() +
                theme(legend.position = "none",
                    axis.line = element_blank(),
                    axis.title = element_blank(),
                    axis.text=element_blank(),
                    axis.ticks=element_blank(),
                    plot.margin = unit(c(0, 0, 0, 0.50),"inches")) #add left-margin (top, right, bottom, left)

#Render composite figure
#pdf("figures/completeness_contamination.pdf",width=10, height=5)
grid.arrange(grobs = list(genome_completeness,genome_biplot,genome_contamination),
        layout_matrix = rbind(c(1,1,1,1,1,1,1,1,1,1,1,4),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3)))

#dev.off()

4.3 Functional overview

4.3.0.1 Predicted genes

pred_genes <- genome_annotations %>%
  nrow()

cat(pred_genes)
319804

4.3.0.2 Number of annotated genes and percentages

#How many genes have at least 1 annotation
genome_annota <- genome_annotations %>%
  filter(if_any(c(kegg, pfam, cazy), ~ !is.na(.))) %>%
  nrow()

cat(genome_annota)
257893
#Percentage of predicted genes with at least 1 annotation
genome_annota*100/pred_genes
[1] 80.64096

4.3.0.3 Number of KEGG annotatated genes and percentages

# KEGG annotation
kegg_annota <- genome_annotations %>%
  filter(!is.na(kegg)) %>%
  nrow()
cat(kegg_annota)
219314
# KEGG annotation percentage
kegg_annota*100/genome_annota
[1] 85.0407

4.3.0.4 Function tree

# Aggregate basal GIFT into elements
function_table <- genome_gifts %>%
    to.elements(., GIFT_db)

# Generate  basal tree
function_tree <- force.ultrametric(genome_tree, method="extend") %>%
                ggtree(., size = 0.3) 
***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************
#Add phylum colors next to the tree tips
function_tree <- gheatmap(function_tree, phylum_heatmap, offset=0, width=0.1, colnames=FALSE) +
            scale_fill_manual(values=phylum_colors) +
            labs(fill="Phylum")

#Reset fill scale to use a different colour profile in the heatmap
function_tree <- function_tree + new_scale_fill()

#Add functions heatmap
function_tree <- gheatmap(function_tree, function_table, offset=0.5, width=3.5, colnames=FALSE) +
            vexpand(.08) +
            coord_cartesian(clip = "off") +
            scale_fill_gradient(low = "#f4f4f4", high = "steelblue", na.value="white") +
            labs(fill="GIFT")

#Reset fill scale to use a different colour profile in the heatmap
function_tree <- function_tree + new_scale_fill()

# Add completeness barplots
function_tree <- function_tree +
            geom_fruit(data=genome_metadata,
            geom=geom_bar,
            grid.params=list(axis="x", text.size=2, nbreak = 1),
            axis.params=list(vline=TRUE),
            mapping = aes(x=length, y=genome, fill=completeness),
                 offset = 3.8,
                 orientation="y",
                 stat="identity") +
            scale_fill_gradient(low = "#cf8888", high = "#a2cc87") +
            labs(fill="Genome\ncompleteness")

function_tree

4.4 Functional ordination

PCoA with Euclidean distances:

gift_pcoa <- genome_gifts %>%
    to.elements(., GIFT_db) %>%
    as.data.frame() %>%
    vegdist(method="euclidean") %>%
    pcoa()

gift_pcoa_rel_eigen <- gift_pcoa$values$Relative_eig[1:10]


# Get genome positions
gift_pcoa_vectors <- gift_pcoa$vectors %>% #extract vectors
  as.data.frame() %>% 
 dplyr::select(Axis.1,Axis.2) # keep the first 2 axes

gift_pcoa_eigenvalues <- gift_pcoa$values$Eigenvalues[c(1,2)]


#For the black arrows: Functional group loadings
gift_pcoa_gifts <- cov(genome_gifts, scale(gift_pcoa_vectors)) %*%   diag((gift_pcoa_eigenvalues/(nrow(genome_gifts)-1))^(-0.5)) %>%
  as.data.frame() %>% 
  rename(Axis.1=1,Axis.2=2) %>% 
  rownames_to_column(var="label") %>% 
  #get function summary vectors
  mutate(func=substr(label,1,3)) %>% 
  group_by(func) %>% 
  summarise(Axis.1=mean(Axis.1),
            Axis.2=mean(Axis.2)) %>% 
  rename(label=func) %>% 
  filter(!label %in% c("S01","S02","S03"))
set.seed(101)
scale <- 20 # scale for vector loadings (to make arrows visible)
gift_pcoa_vectors %>% 
  rownames_to_column(var="genome") %>% 
  left_join(genome_metadata, by="genome") %>%
  ggplot() +
      #genome positions
      scale_color_manual(values=phylum_colors)+
      geom_point(aes(x=Axis.1,y=Axis.2, color=phylum, size=length), 
                 alpha=0.9, shape=16) +
      #scale_color_manual(values=phylum_colors) +
      scale_size_continuous(range = c(0.1,5)) +
      #loading positions
      geom_segment(data=gift_pcoa_gifts, 
                   aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
                    arrow = arrow(length = unit(0.3, "cm"), 
                    type = "open", 
                    angle = 25),
                    linewidth = 0.5, 
                    color = "black") +
     #Primary and secondary scale adjustments
     scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
            ) +
     scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
            ) +
    geom_label_repel(data = gift_pcoa_gifts,
                     aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
                     segment.color = 'transparent') +
    xlim(-3,3) + 
    ylim(-2.5,2.5) +
    theme_minimal() + 
    theme(legend.position = "none")+ 
  labs(
    x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
    y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
  )
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_point()`).
Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider increasing max.overlaps

arrow_functions <- gift_pcoa_gifts %>%
  left_join(GIFT_db, by = c("label" = "Code_function")) %>%
  group_by(label) %>%
  summarise(
    Axis.1 = mean(Axis.1),
    Axis.2 = mean(Axis.2),
    Function = first(Function),
    Domain  = first(Domain),
    Length  = sqrt(Axis.1^2 + Axis.2^2), # vector length
    .groups = "drop"
  ) %>%
  arrange(desc(Length))  # sort so most important arrows are first

arrow_functions
# A tibble: 18 × 6
   label     Axis.1     Axis.2 Function                           Domain          Length
   <chr>      <dbl>      <dbl> <chr>                              <chr>            <dbl>
 1 B02    0.119     -0.0345    Amino acid biosynthesis            Biosynthesis 0.124    
 2 B07    0.0905     0.0708    Vitamin biosynthesis               Biosynthesis 0.115    
 3 B01    0.0805     0.0477    Nucleic acid biosynthesis          Biosynthesis 0.0936   
 4 D09    0.00454   -0.0913    Antibiotic degradation             Degradation  0.0914   
 5 B06    0.0774     0.0195    Organic anion biosynthesis         Biosynthesis 0.0798   
 6 B03    0.0302    -0.0148    Amino acid derivative biosynthesis Biosynthesis 0.0336   
 7 D01    0.00244   -0.0100    Lipid degradation                  Degradation  0.0103   
 8 D03    0.00248   -0.00530   Sugar degradation                  Degradation  0.00586  
 9 D05    0.00466    0.00234   Amino acid degradation             Degradation  0.00521  
10 D06    0.00255    0.00355   Nitrogen compound degradation      Degradation  0.00437  
11 D08    0.00261   -0.00107   Xenobiotic degradation             Degradation  0.00282  
12 B04   -0.000742  -0.00219   SCFA biosynthesis                  Biosynthesis 0.00231  
13 B09    0.000491  -0.000896  Metallophore biosynthesis          Biosynthesis 0.00102  
14 B10    0.0000905 -0.0000140 Antibiotic biosynthesis            Biosynthesis 0.0000916
15 B08    0          0         Aromatic compound biosynthesis     Biosynthesis 0        
16 D02    0          0         Polysaccharide degradation         Degradation  0        
17 D04    0          0         Protein degradation                Degradation  0        
18 D07    0          0         Alcohol degradation                Degradation  0        

PCoA with Bray-Curtis (relative composition)

gift_pcoa_bray <- genome_gifts %>%
    to.elements(., GIFT_db) %>%
    as.data.frame() %>%
    vegdist(method="bray") %>%
    pcoa()
Warning in vegdist(., method = "bray"): you have empty rows: their dissimilarities may be
                 meaningless in method "bray"
gift_pcoa_rel_eigen_bray <- gift_pcoa_bray$values$Relative_eig[1:10]


# Get genome positions
gift_pcoa_vectors_bray <- gift_pcoa_bray$vectors %>% #extract vectors
  as.data.frame() %>% 
 dplyr::select(Axis.1,Axis.2) # keep the first 2 axes

gift_pcoa_eigenvalues_bray <- gift_pcoa_bray$values$Eigenvalues[c(1,2)]

gift_pcoa_gifts_bray <- cov(genome_gifts, scale(gift_pcoa_vectors_bray)) %*% diag((gift_pcoa_eigenvalues_bray/(nrow(genome_gifts)-1))^(-0.5)) %>%
  as.data.frame() %>% 
  rename(Axis.1=1,Axis.2=2) %>% 
  rownames_to_column(var="label") %>% 
  #get function summary vectors
  mutate(func=substr(label,1,3)) %>% 
  group_by(func) %>% 
  summarise(Axis.1=mean(Axis.1),
            Axis.2=mean(Axis.2)) %>% 
  rename(label=func) %>% 
  filter(!label %in% c("S01","S02","S03"))
set.seed(101)
scale <- 20 # scale for vector loadings
gift_pcoa_vectors_bray %>% 
  rownames_to_column(var="genome") %>% 
  left_join(genome_metadata, by="genome") %>%
  ggplot() +
      #genome positions
      scale_color_manual(values=phylum_colors)+
      geom_point(aes(x=Axis.1,y=Axis.2, color=phylum, size=length), 
                 alpha=0.9, shape=16) +
      #scale_color_manual(values=phylum_colors) +
      scale_size_continuous(range = c(0.1,5)) +
      #loading positions
      geom_segment(data=gift_pcoa_gifts_bray, 
                   aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
                    arrow = arrow(length = unit(0.3, "cm"), 
                    type = "open", 
                    angle = 25),
                    linewidth = 0.5, 
                    color = "black") +
     #Primary and secondary scale adjustments
     scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen_bray[1]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
            ) +
     scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen_bray[2]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
            ) +
    geom_label_repel(data = gift_pcoa_gifts_bray,
                     aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
                     segment.color = 'transparent') +
   xlim(-0.8,0.8) + 
    ylim(-0.5,0.5) +
    theme_minimal() + 
    theme(legend.position = "none")+ 
  labs(
    x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
    y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
  )
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_point()`).
Warning: Removed 9 rows containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 9 rows containing missing values or values outside the scale range (`geom_label_repel()`).

arrow_functions_bray <- gift_pcoa_gifts_bray %>%
  left_join(GIFT_db, by = c("label" = "Code_function")) %>%
  group_by(label) %>%
  summarise(
    Axis.1 = mean(Axis.1),
    Axis.2 = mean(Axis.2),
    Function = first(Function),
    Domain  = first(Domain),
    Length  = sqrt(Axis.1^2 + Axis.2^2),
    .groups = "drop"
  ) %>%
  arrange(desc(Length))

arrow_functions_bray
# A tibble: 18 × 6
   label    Axis.1    Axis.2 Function                           Domain         Length
   <chr>     <dbl>     <dbl> <chr>                              <chr>           <dbl>
 1 B02    0.912     0.444    Amino acid biosynthesis            Biosynthesis 1.01    
 2 B07    0.794    -0.455    Vitamin biosynthesis               Biosynthesis 0.915   
 3 D09    0.00697   0.864    Antibiotic degradation             Degradation  0.864   
 4 B01    0.736    -0.152    Nucleic acid biosynthesis          Biosynthesis 0.752   
 5 B06    0.605    -0.0718   Organic anion biosynthesis         Biosynthesis 0.609   
 6 B03    0.236     0.170    Amino acid derivative biosynthesis Biosynthesis 0.291   
 7 D01   -0.0146    0.0836   Lipid degradation                  Degradation  0.0849  
 8 D03    0.0182    0.0481   Sugar degradation                  Degradation  0.0514  
 9 D05    0.0428   -0.0105   Amino acid degradation             Degradation  0.0441  
10 D06    0.0212   -0.0183   Nitrogen compound degradation      Degradation  0.0280  
11 D08    0.0199    0.0195   Xenobiotic degradation             Degradation  0.0279  
12 B04   -0.0124    0.0191   SCFA biosynthesis                  Biosynthesis 0.0227  
13 B09    0.00328   0.00980  Metallophore biosynthesis          Biosynthesis 0.0103  
14 B10    0.000699  0.000633 Antibiotic biosynthesis            Biosynthesis 0.000943
15 B08    0         0        Aromatic compound biosynthesis     Biosynthesis 0       
16 D02    0         0        Polysaccharide degradation         Degradation  0       
17 D04    0         0        Protein degradation                Degradation  0       
18 D07    0         0        Alcohol degradation                Degradation  0       

UMAP

set.seed(1001)

#parameters to test
neighbors_values <- c(5, 15, 30)
min_dist_values <- c(0.1, 0.3, 0.5)

umap_plots <- list()

# Loop over combinations
for (nn in neighbors_values) {
  for (md in min_dist_values) {
    
    umap_result <- umap(function_table,
                        n_neighbors = nn,
                        min_dist = md,
                        metric = "euclidean")
    
    df_umap <- as.data.frame(umap_result) %>%
      mutate(genome = rownames(function_table)) %>%
      inner_join(genome_metadata, by="genome") %>%
      rename(UMAP1 = V1, UMAP2 = V2)
    
    p <- ggplot(df_umap, aes(x = UMAP1, y = UMAP2,
                             color = phylum, size = length)) +
      geom_point(alpha = 0.7, shape=16) +
      scale_color_manual(values = phylum_colors) +
      theme_minimal() +
      labs(title = paste0("UMAP (neighbors=", nn,
                          ", min_dist=", md, ")"),
           color="Phylum", size="Genome size") +
      guides(color = guide_legend(override.aes = list(size = 5)))
    
    p <- p + theme(legend.position = "none")
    umap_plots[[paste(nn, md, sep="_")]] <- p
    
  }
}
ggpubr::ggarrange(plotlist = umap_plots, ncol = 3, nrow = 3)

set.seed(1001)

# Trying different perplexity values
perplexities <- c(5, 7,8, 9, 10, 11, 12, 13, 15)

plots <- lapply(perplexities, function(p) {
  tsne_result <- Rtsne(
    X = function_table, 
    dims = 2, 
    perplexity = p, 
    check_duplicates = FALSE
  )
  
  tsne_df <- as.data.frame(tsne_result$Y) %>%
    mutate(genome = rownames(function_table)) %>%
    inner_join(genome_metadata, by = "genome") %>%
    rename(tSNE1 = "V1", tSNE2 = "V2")
  
  ggplot(tsne_df, aes(x = tSNE1, y = tSNE2, color = phylum)) +
    geom_point(alpha = 0.7, size = 2) +
    scale_color_manual(values=phylum_colors)+
    theme_minimal(base_size = 5) +  
    theme(legend.position = "none") +  # remove legend
    labs(title = paste("Perplexity =", p))
})


ggpubr::ggarrange(plotlist = plots, ncol = 3, nrow = 3) 

nrow(function_table)
[1] 144
# Generate the tSNE ordination
set.seed(1001)
tSNE_function <- Rtsne(X=function_table, dims = 2, perplexity = 10, check_duplicates = FALSE)

# Plot the ordination
function_ordination <- tSNE_function$Y %>%
                as.data.frame() %>%
                mutate(genome=rownames(function_table)) %>%
                inner_join(genome_metadata, by="genome") %>%
                rename(tSNE1="V1", tSNE2="V2") %>%
               dplyr::select(genome,phylum,tSNE1,tSNE2, length) %>%
                ggplot(aes(x = tSNE1, y = tSNE2, color = phylum, size=length))+
                            geom_point(shape=16, alpha=0.7) +
                            scale_color_manual(values=phylum_colors) +
                            theme_minimal() +
                labs(color="Phylum", size="Genome size") +
                guides(color = guide_legend(override.aes = list(size = 5))) # enlarge Phylum dots in legend

function_ordination