Chapter 8 HMSC analysis

8.1 Load data

load("data/data.Rdata")
if(!file.exists("hmsc_origin/fit_model1_250_10.Rdata")){
  download.file(url='https://sid.erda.dk/share_redirect/A6rm945NbY/hmsc_origin/fit_model1_250_10.Rdata',
              destfile='hmsc_origin/fit_model1_250_10.Rdata', method='curl')
}
load("hmsc_origin/fit_model1_250_10.Rdata")

# Select desired support threshold
support_threshold=0.9
negsupport_threshold=1-support_threshold

8.2 Variance partitioning

# Compute variance partitioning
varpart=computeVariancePartitioning(m)

varpart$vals %>%
   as.data.frame() %>%
   rownames_to_column(var="variable") %>%
   pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
   mutate(variable=factor(variable, levels=rev(c("origin","sex","logseqdepth","Random: location")))) %>%
   group_by(variable) %>%
   summarise(mean=mean(value)*100,sd=sd(value)*100) %>%
   tt()

variable	mean	sd
Random: location	40.1111261	24.825232
logseqdepth	54.0718559	25.120545
sex	4.8180020	5.744219
origin	0.9990159	1.223566

# Basal tree
varpart_tree <- genome_tree

#Varpart table
varpart_table <- varpart$vals %>%
   as.data.frame() %>%
   rownames_to_column(var="variable") %>%
   pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
   mutate(genome=factor(genome, levels=rev(varpart_tree$tip.label))) %>%
   mutate(variable=factor(variable, levels=rev(c("origin","sex","logseqdepth","Random: location"))))

#Phyla
phylum_groups <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    mutate(phylum=str_remove_all(phylum, "p__"))%>%
   right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    filter(genome %in% varpart_tree$tip.label) %>%
    arrange(match(genome, varpart_tree$tip.label)) %>%
    mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
    column_to_rownames(var = "genome") %>%
    dplyr::select(phylum)

colors_alphabetic <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    mutate(phylum=str_remove_all(phylum, "p__"))%>%
  right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    filter(genome %in% varpart_tree$tip.label) %>%
    arrange(match(genome, varpart_tree$tip.label)) %>%
     dplyr::select(phylum, colors) %>%
    unique() %>%
    arrange(phylum) %>%
    dplyr::select(colors) %>%
    pull()

# Basal ggtree
varpart_tree <- varpart_tree %>%
        force.ultrametric(.,method="extend") %>%
        ggtree(., size = 0.3)

***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************

# Add phylum colors next to the tree tips
varpart_tree <- gheatmap(varpart_tree, phylum_groups, offset=-0.2, width=0.1, colnames=FALSE) +
   scale_fill_manual(values=colors_alphabetic)+
      labs(fill="Phylum")

#Reset fill scale to use a different colour profile in the heatmap
varpart_tree <- varpart_tree + new_scale_fill()

# Add variance stacked barplot
vertical_tree <-  varpart_tree +
       scale_fill_manual(values=c("#506a96","#cccccc","#be3e2b","#f6de6c"))+
        geom_fruit(
             data=varpart_table,
             geom=geom_bar,
             mapping = aes(x=value, y=genome, fill=variable, group=variable),
             pwidth = 2,
             offset = 0.05,
             width= 1,
             orientation="y",
             stat="identity")+
      labs(fill="Variable")

vertical_tree

8.3 Model fit

MFCV <- evaluateModelFit(hM=m, predY=cv)

mean(MFCV$R2, na.rm = TRUE)

[1] 0.524663

genome_fit <- tibble(genome=m$spNames, r2 = MFCV[[2]])

genome_counts_filt %>% 
mutate_if(is.numeric, ~ . / sum(.)) %>% 
left_join(genome_fit, by="genome") %>% 
filter(r2>0.5) %>% 
select(-c(genome,r2)) %>% 
colSums() %>%  
hist(main="Predictive capacity")

var_pred_table <- tibble(mag=m$spNames,
       pred=MFCV$R2,
       var_pred=MFCV$R2 * varpart$vals[1,],
       support=getPostEstimate(hM=m, parName="Beta")$support %>% .[2,],
       estimate=getPostEstimate(hM=m, parName="Beta")$mean %>% .[2,]) %>%
  mutate(enrichment=ifelse(support>=support_threshold,"Feral","Neutral")) %>% 
  mutate(enrichment=ifelse(support<=negsupport_threshold,"Domestic",enrichment))

var_pred_table %>% 
  ggplot(aes(x=estimate,y=var_pred, color=enrichment))+
    geom_point()+
    scale_color_manual(values=c("#bd70ae","#949293","#ebe8e8"))+
    geom_hline(yintercept=0.005, linetype="dashed")+
    theme_minimal()

predictive_mags <- var_pred_table %>% 
  filter(var_pred>=0.005) %>% 
  pull(mag)

8.4 Posterior estimates

# Basal tree
postestimates_tree <- genome_tree

# Posterior estimate table
post_beta <- getPostEstimate(hM=m, parName="Beta")$support %>%
    as.data.frame() %>%
    mutate(variable=m$covNames) %>%
    pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
    mutate(genome=factor(genome, levels=rev(postestimates_tree$tip.label))) %>%
    mutate(value = case_when(
          value >= support_threshold ~ "Positive",
          value <= negsupport_threshold ~ "Negative",
          TRUE ~ "Neutral")) %>%
    mutate(value=factor(value, levels=c("Positive","Neutral","Negative"))) %>%
    pivot_wider(names_from = variable, values_from = value) %>%
    #select(genome,sp_vulgaris,area_semi,area_urban,sp_vulgarisxarea_semi,sp_vulgarisxarea_urban,season_spring,season_winter,sp_vulgarisxseason_spring,sp_vulgarisxseason_winter) %>%
    column_to_rownames(var="genome")

# Genome-specific attributes
genome_depth <- genome_counts_filt %>%
    mutate_if(is.numeric, ~ . / sum(.)) %>%
    rowwise() %>%
    mutate(depth = mean(c_across(where(is.numeric)), na.rm = TRUE)) %>%
    ungroup() %>%
    select(genome,depth)

#Phylums
phylum_groups <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    mutate(phylum=str_remove_all(phylum, "p__")) %>%
    right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    filter(genome %in% postestimates_tree$tip.label) %>%
    arrange(match(genome, postestimates_tree$tip.label)) %>%
    mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
    column_to_rownames(var = "genome") %>%
    dplyr::select(phylum)

colors_alphabetic <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    mutate(phylum=str_remove_all(phylum, "p__")) %>%
    right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    filter(genome %in% postestimates_tree$tip.label) %>%
    arrange(match(genome, postestimates_tree$tip.label)) %>%
     dplyr::select(phylum, colors) %>%
    unique() %>%
    arrange(phylum) %>%
    dplyr::select(colors) %>%
    pull()

# Basal ggtree
postestimates_tree <- postestimates_tree %>%
        force.ultrametric(.,method="extend") %>%
        ggtree(., size = 0.3)

***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************

#Add phylum colors next to the tree tips
postestimates_tree <- gheatmap(postestimates_tree, phylum_groups, offset=-0.2, width=0.1, colnames=FALSE) +
      scale_fill_manual(values=colors_alphabetic)+
      labs(fill="Phylum")

#Reset fill scale to use a different colour profile in the heatmap
postestimates_tree <- postestimates_tree + new_scale_fill()

# Add posterior significant heatmap

postestimates_tree <- gheatmap(postestimates_tree, post_beta, offset=0, width=0.5, colnames=TRUE, colnames_position="top",colnames_angle=90, colnames_offset_y=1, hjust=0) +
        scale_fill_manual(values=c("#be3e2b","#f4f4f4","#b2b530"))+
        labs(fill="Trend")


postestimates_tree +
        vexpand(.25, 1) # expand top

### Association with cat origin

estimate <- getPostEstimate(hM=m, parName="Beta")$mean %>%
    as.data.frame() %>%
    mutate(variable=m$covNames) %>%
    filter(variable=="originFeral") %>%
    pivot_longer(!variable, names_to = "genome", values_to = "mean") %>%
    dplyr::select(genome,mean)

support <- getPostEstimate(hM=m, parName="Beta")$support %>%
    as.data.frame() %>%
    mutate(variable=m$covNames) %>%
    filter(variable=="originFeral") %>%
    pivot_longer(!variable, names_to = "genome", values_to = "support") %>%
    dplyr::select(genome,support)

#pdf("figures/sig_Mags_violin.pdf",width=10, height=8)
inner_join(estimate,support,by=join_by(genome==genome)) %>%
    mutate(significance=ifelse(support >= 0.9 | support <= 0.1,1,0)) %>%
    mutate(support=ifelse(mean<0,1-support,support)) %>%
    left_join(genome_metadata, by = join_by(genome == genome)) %>%
    mutate(phylum = ifelse(support > 0.9, phylum, NA)) %>%
    ggplot(aes(x=mean,y=support,color=phylum))+
      geom_point(alpha=0.7, shape=16, size=3)+
      scale_color_manual(values = phylum_colors) +
      geom_vline(xintercept = 0) +
      xlim(c(-0.4,0.4)) +
      labs(x = "Beta regression coefficient", y = "Posterior probability") +
      theme_minimal()

estimate_quantiles <- getPostEstimate(hM=m, parName="Beta", q=c(0.1,0.9))$q

map_dfr(1:229, ~ {
  tibble(
    q10 = estimate_quantiles[1, 2, .x], # Extract 10% quantile
    q90 = estimate_quantiles[2, 2, .x]  # Extract 90% quantile
  )
}) %>% mutate(genome=estimate$genome) %>% 
  right_join(estimate, by="genome") %>% 
  left_join(genome_metadata, by="genome") %>% 
  filter(q10>0 | q90<0) %>% 
  ggplot(aes(x=mean, y=fct_reorder(genome, mean), xmin=q10, xmax=q90, color=phylum)) +
      geom_point() +
      geom_errorbar() +
      #xlim(c(-0.03,0.04)) +
      geom_vline(xintercept=0) +
      scale_color_manual(values = phylum_colors) +
      theme_minimal() +
      labs(x="Regression coefficient",y="Bacterial genome") +
      theme(legend.position="none")

estimate_top <- estimate %>% 
  arrange(desc(mean)) %>%                
  slice_head(n = 15) %>%                      
  bind_rows(                                  
    estimate %>%
      arrange(mean) %>%                      
      slice_head(n = 15))

estimate_top_significant <-map_dfr(1:229, ~ {
  tibble(
    q10 = estimate_quantiles[1, 2, .x], # Extract 10% quantile
    q90 = estimate_quantiles[2, 2, .x]  # Extract 90% quantile
  )
}) %>% mutate(genome=estimate$genome) %>% 
  right_join(estimate_top, by="genome") %>% 
  left_join(genome_metadata, by="genome") %>% 
  filter(q10>0 | q90<0) %>%
  arrange(mean)

estimate_top_significant %>% 
  ggplot(aes(x=mean, y=fct_reorder(genome, mean), xmin=q10, xmax=q90, color=phylum)) +
      geom_point() +
      geom_errorbar() +
      scale_y_discrete(labels = estimate_top_significant$genus) +
      geom_vline(xintercept=0) +
      scale_color_manual(values = phylum_colors[-c(5,7,9)]) +
      theme_minimal() +
      labs(x="Regression coefficient",y="Functional trait") +
      theme(legend.position="none")

8.4.0.1 Associated with domestic cats

origin_domestic <- getPostEstimate(hM=m, parName="Beta")$support %>%
    as.data.frame() %>%
    mutate(variable=m$covNames) %>%
    pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
    mutate(trend = case_when(
          value >= support_threshold ~ "Positive",
          value <= negsupport_threshold ~ "Negative",
          TRUE ~ "Neutral")) %>%
    filter(variable=="originFeral", trend=="Negative") %>%
    arrange(-value) %>%
    left_join(genome_metadata,by=join_by(genome==genome)) %>%
    dplyr::select(genome,phylum,class,order,family,species,value) %>%
    arrange(phylum, class, family, species)

origin_domestic %>%
  paged_table()

8.4.0.2 Associated with feral cats

origin_feral <- getPostEstimate(hM=m, parName="Beta")$support %>%
    as.data.frame() %>%
    mutate(variable=m$covNames) %>%
    pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
    mutate(trend = case_when(
          value >= support_threshold ~ "Positive",
          value <= negsupport_threshold ~ "Negative",
          TRUE ~ "Neutral")) %>%
    filter(variable=="originFeral", trend=="Positive") %>%
    arrange(value) %>%
    left_join(genome_metadata,by=join_by(genome==genome)) %>%
    dplyr::select(genome,phylum,class,order,family,species,value) %>%
    arrange(phylum,class,family,species)

origin_feral %>%
  paged_table()

8.5 Phylogenetic signal

mpost <- convertToCodaObject(m)
quantile(unlist(mpost$Rho), probs = c(.05,.5,.95))

  5%  50%  95% 
0.96 0.98 0.99

8.6 Microbiome composition predictions

Support values of the cat origin variable per genome are interpreted as enrichment for domestic or feral cats.

The most likely microbiome compositions are predicted for domestic and feral cats based on the hmsc models

Log-abundance differences between domestic and feral cats are plotted for genomes found enriched for either origin.

8.7 Functional predictions

8.7.1 Element level

elements_table <- genome_gifts %>%
    to.elements(., GIFT_db) %>%
    as.data.frame() %>%
    .[predictive_mags,] #keep only predictive mags

#Calculate community-level functional traits for predicted communities
community_elements <- prediction %>%
  filter(genome %in% predictive_mags) %>% #keep only predictive mags
  group_by(origin, genome) %>%
  mutate(row_id = row_number()) %>%
  pivot_wider(names_from = genome, values_from = value) %>%
  ungroup() %>%
  group_split(row_id) %>%
  as.list() %>%
  lapply(., FUN = function(x){x %>%
    dplyr::select(-row_id) %>%
    column_to_rownames(var = "origin") %>%
    as.data.frame() %>%
    exp() %>%
    t() %>%
    tss() %>%
    to.community(elements_table,.,GIFT_db) %>% 
    as.data.frame() %>%
    rownames_to_column(var="origin")
   })

element_predictions <- map_dfr(community_elements, function(df) {
  domestic_values <- df %>% filter(origin == "domestic") %>% select(-origin)
  feral_values <- df %>% filter(origin == "feral") %>% select(-origin)
  domestic_values - feral_values
}) %>%
  mutate(iteration=c(1:1000)) %>% 
  pivot_longer(!iteration,names_to="trait",values_to="value") %>%
      group_by(trait) %>%
      summarise(mean=mean(value),
        p1 = quantile(value, probs = 0.1),
        p9 = quantile(value, probs = 0.9),
        positive_support = sum(value > 0)/1000,
        negative_support = sum(value < 0)/1000) %>%
      arrange(-positive_support)

8.7.1.1 Positively associated functions at element level

# Positively associated

unique_funct_db<- GIFT_db[c(2,4,5,6)] %>% 
  distinct(Code_element, .keep_all = TRUE)

element_predictions %>%
  filter(mean >0) %>%
  arrange(-positive_support) %>%
  filter(positive_support>=0.9) %>%
  left_join(unique_funct_db, by = join_by(trait == Code_element))%>%
  arrange(Domain,Function)%>%
  paged_table()

8.7.1.2 Negatively associated functions at element level

element_predictions %>%
  filter(mean <0) %>%
  arrange(-negative_support) %>%
  filter(negative_support>=0.9) %>%
  left_join(unique_funct_db, by = join_by(trait == Code_element))%>%
  arrange(Domain,Function)%>%
  paged_table()

positive_origin <- element_predictions %>%
  filter(mean >0) %>%
  arrange(mean) %>%
  filter(positive_support>=0.9) %>%
  dplyr::select(-negative_support) %>%
  rename(support=positive_support)

negative_origin <- element_predictions %>%
  filter(mean <0) %>%
  arrange(mean) %>%
  filter(negative_support>=0.9) %>%
  dplyr::select(-positive_support) %>%
  rename(support=negative_support)

all_elements <- bind_rows(positive_origin,negative_origin) %>%
  left_join(GIFT_db,by=join_by(trait==Code_element)) %>%
  mutate(trait=factor(trait,levels=c(rev(positive_origin$trait),rev(negative_origin$trait)))) %>%
  mutate(Code_function=factor(Code_function)) %>%
  mutate(element_legend=str_c(trait," - ",Element)) %>%
  mutate(function_legend=str_c(Code_function," - ",Function)) %>%
  select(trait,mean,p1,p9,element_legend,function_legend) %>% 
  unique()

gift_colors <- read_tsv("data/gift_colors.tsv") %>% 
  mutate(legend=str_c(Code_function," - ",Function))  %>% 
  filter(legend %in% all_elements$function_legend)

all_elements %>%
  ggplot(aes(x=-mean, y=fct_reorder(element_legend, -mean), xmin=-p1, xmax=-p9, color=function_legend)) +
      geom_point() +
      geom_errorbar() +
      xlim(c(-0.04,0.12)) +
      geom_vline(xintercept=0) +
      scale_color_manual(values = gift_colors$Color) +
      theme_minimal() +
      labs(x="Regression coefficient",y="Functional trait") +
    theme(legend.position="none")

8.7.2 Function level

functions_table <- elements_table %>%
    to.functions(., GIFT_db) %>%
    as.data.frame()

community_functions <- prediction %>%
  filter(genome %in% predictive_mags) %>% #keep only predictive mags
  group_by(origin, genome) %>%
  mutate(row_id = row_number()) %>%
  pivot_wider(names_from = genome, values_from = value) %>%
  ungroup() %>%
  group_split(row_id) %>%
  as.list() %>%
  lapply(., FUN = function(x){x %>%
    dplyr::select(-row_id) %>%
    column_to_rownames(var = "origin") %>%
    as.data.frame() %>%
    exp() %>%
    t() %>%
    tss() %>%
    to.community(functions_table,.,GIFT_db) %>% 
    as.data.frame() %>%
    rownames_to_column(var="origin")
   })

function_predictions <- map_dfr(community_functions, function(df) {
  domestic_values <- df %>% filter(origin == "domestic") %>% select(-origin)
  feral_values <- df %>% filter(origin == "feral") %>% select(-origin)
  domestic_values - feral_values
}) %>%
  mutate(iteration=c(1:1000)) %>% 
  pivot_longer(!iteration,names_to="trait",values_to="value") %>%
      group_by(trait) %>%
      summarise(mean=mean(value),
        p1 = quantile(value, probs = 0.1),
        p9 = quantile(value, probs = 0.9),
        positive_support = sum(value > 0)/1000,
        negative_support = sum(value < 0)/1000) %>%
      arrange(-positive_support)

# Positively associated
function_predictions %>%
  filter(mean >0) %>%
  arrange(-positive_support) %>%
  paged_table()

# Negatively associated
function_predictions %>%
  filter(mean <0) %>%
  arrange(-negative_support) %>%
  paged_table()

positive <- function_predictions %>%
  filter(mean >0) %>%
  arrange(mean) %>%
  filter(positive_support>=0.9) %>%
  dplyr::select(-negative_support) %>%
  rename(support=positive_support)

negative <- function_predictions %>%
  filter(mean <0) %>%
  arrange(mean) %>%
  filter(negative_support>=0.9) %>%
  dplyr::select(-positive_support) %>%
  rename(support=negative_support)

all_functions <- function_predictions %>%
  left_join(GIFT_db,by=join_by(trait==Code_function)) %>%
  mutate(trait=factor(trait)) %>%
  mutate(function_legend=str_c(trait," - ",Function)) %>%
  select(trait,mean,p1,p9,function_legend) %>% 
  unique()

gift_colors <- read_tsv("data/gift_colors.tsv") %>% 
  mutate(legend=str_c(Code_function," - ",Function))  %>% 
  filter(legend %in% all_functions$function_legend)

all_functions %>%
  ggplot(aes(x=-mean, y=fct_reorder(function_legend, -mean), xmin=-p1, xmax=-p9, color=function_legend)) +
      geom_point() +
      geom_errorbar() +
      xlim(c(-0.05,0.12)) +
      geom_vline(xintercept=0) +
      scale_color_manual(values = gift_colors$Color) +
      theme_minimal() +
      labs(x="Regression coefficient",y="Functional trait") +
      guides(col = guide_legend(ncol = 1))