Chapter 11 HMSC analysis

11.1 Load data

load("data/data.Rdata")
if(!file.exists("hmsc_behaviour/fit_model1_250_10.Rdata")){
  download.file(url='https://sid.erda.dk/share_redirect/A6rm945NbY/hmsc_behaviour/fit_model1_250_10.Rdata',
              destfile='hmsc/fit_model1_250_10.Rdata', method='curl')
}
load("hmsc_behaviour/fit_model1_250_10.Rdata")

# Select desired support threshold
support_threshold=0.9
negsupport_threshold=1-support_threshold

11.2 Variance partitioning

# Compute variance partitioning
varpart=computeVariancePartitioning(m)

varpart$vals %>%
   as.data.frame() %>%
   rownames_to_column(var="variable") %>%
   pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
   mutate(variable=factor(variable, levels=rev(c("behaviour","sex","logseqdepth","Random: location")))) %>%
   group_by(variable) %>%
   summarise(mean=mean(value)*100,sd=sd(value)*100) %>%
   tt()

variable	mean	sd
Random: location	40.1370903	24.4507737
logseqdepth	54.0327314	25.1272735
sex	4.9385216	5.6346956
behaviour	0.8916567	0.9764987

# Basal tree
varpart_tree <- genome_tree

#Varpart table
varpart_table <- varpart$vals %>%
   as.data.frame() %>%
   rownames_to_column(var="variable") %>%
   pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
   mutate(genome=factor(genome, levels=rev(varpart_tree$tip.label))) %>%
   mutate(variable=factor(variable, levels=rev(c("behaviour","sex","logseqdepth","Random: location"))))

#Phyla
phylum_groups <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    mutate(phylum=str_remove_all(phylum, "p__"))%>%
   right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    filter(genome %in% varpart_tree$tip.label) %>%
    arrange(match(genome, varpart_tree$tip.label)) %>%
    mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
    column_to_rownames(var = "genome") %>%
    dplyr::select(phylum)

colors_alphabetic <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    mutate(phylum=str_remove_all(phylum, "p__"))%>%
  right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    filter(genome %in% varpart_tree$tip.label) %>%
    arrange(match(genome, varpart_tree$tip.label)) %>%
     dplyr::select(phylum, colors) %>%
    unique() %>%
    arrange(phylum) %>%
    dplyr::select(colors) %>%
    pull()

# Basal ggtree
varpart_tree <- varpart_tree %>%
        force.ultrametric(.,method="extend") %>%
        ggtree(., size = 0.3)

***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************

# Add phylum colors next to the tree tips
varpart_tree <- gheatmap(varpart_tree, phylum_groups, offset=-0.2, width=0.1, colnames=FALSE) +
   scale_fill_manual(values=colors_alphabetic)+
      labs(fill="Phylum")

#Reset fill scale to use a different colour profile in the heatmap
varpart_tree <- varpart_tree + new_scale_fill()

# Add variance stacked barplot
vertical_tree <-  varpart_tree +
       scale_fill_manual(values=c("#506a96","#cccccc","#be3e2b","#f6de6c"))+
        geom_fruit(
             data=varpart_table,
             geom=geom_bar,
             mapping = aes(x=value, y=genome, fill=variable, group=variable),
             pwidth = 2,
             offset = 0.05,
             width= 1,
             orientation="y",
             stat="identity")+
      labs(fill="Variable")

vertical_tree

11.3 Model fit

MFCV <- evaluateModelFit(hM=m, predY=cv)

mean(MFCV$R2, na.rm = TRUE)

[1] 0.5184004

genome_fit <- tibble(genome=m$spNames, r2 = MFCV[[2]])

genome_counts_filt %>% 
mutate_if(is.numeric, ~ . / sum(.)) %>% 
left_join(genome_fit, by="genome") %>% 
filter(r2>0.5) %>% 
select(-c(genome,r2)) %>% 
colSums() %>%  
hist(main="Predictive capacity")

var_pred_table <- tibble(mag=m$spNames,
       pred=MFCV$R2,
       var_pred=MFCV$R2 * varpart$vals[1,],
       support=getPostEstimate(hM=m, parName="Beta")$support %>% .[2,],
       estimate=getPostEstimate(hM=m, parName="Beta")$mean %>% .[2,]) %>%
  mutate(enrichment=ifelse(support>=support_threshold,"Feral","Neutral")) %>% 
  mutate(enrichment=ifelse(support<=negsupport_threshold,"Domestic",enrichment))

var_pred_table %>% 
  ggplot(aes(x=estimate,y=var_pred, color=enrichment))+
    geom_point()+
    scale_color_manual(values=c("#bd70ae","#949293","#ebe8e8"))+
    geom_hline(yintercept=0.005, linetype="dashed")+
    theme_minimal()

predictive_mags <- var_pred_table %>% 
  filter(var_pred>=0.005) %>% 
  pull(mag)

11.4 Posterior estimates

# Basal tree
postestimates_tree <- genome_tree

# Posterior estimate table
post_beta <- getPostEstimate(hM=m, parName="Beta")$support %>%
    as.data.frame() %>%
    mutate(variable=m$covNames) %>%
    pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
    mutate(genome=factor(genome, levels=rev(postestimates_tree$tip.label))) %>%
    mutate(value = case_when(
          value >= support_threshold ~ "Positive",
          value <= negsupport_threshold ~ "Negative",
          TRUE ~ "Neutral")) %>%
    mutate(value=factor(value, levels=c("Positive","Neutral","Negative"))) %>%
    pivot_wider(names_from = variable, values_from = value) %>%
    #select(genome,sp_vulgaris,area_semi,area_urban,sp_vulgarisxarea_semi,sp_vulgarisxarea_urban,season_spring,season_winter,sp_vulgarisxseason_spring,sp_vulgarisxseason_winter) %>%
    column_to_rownames(var="genome")

# Genome-specific attributes
genome_depth <- genome_counts_filt %>%
    mutate_if(is.numeric, ~ . / sum(.)) %>%
    rowwise() %>%
    mutate(depth = mean(c_across(where(is.numeric)), na.rm = TRUE)) %>%
    ungroup() %>%
    select(genome,depth)

#Phylums
phylum_groups <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    mutate(phylum=str_remove_all(phylum, "p__")) %>%
    right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    filter(genome %in% postestimates_tree$tip.label) %>%
    arrange(match(genome, postestimates_tree$tip.label)) %>%
    mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
    column_to_rownames(var = "genome") %>%
    dplyr::select(phylum)

colors_alphabetic <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    mutate(phylum=str_remove_all(phylum, "p__")) %>%
    right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    filter(genome %in% postestimates_tree$tip.label) %>%
    arrange(match(genome, postestimates_tree$tip.label)) %>%
     dplyr::select(phylum, colors) %>%
    unique() %>%
    arrange(phylum) %>%
    dplyr::select(colors) %>%
    pull()

# Basal ggtree
postestimates_tree <- postestimates_tree %>%
        force.ultrametric(.,method="extend") %>%
        ggtree(., size = 0.3)

***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************

#Add phylum colors next to the tree tips
postestimates_tree <- gheatmap(postestimates_tree, phylum_groups, offset=-0.2, width=0.1, colnames=FALSE) +
      scale_fill_manual(values=colors_alphabetic)+
      labs(fill="Phylum")

#Reset fill scale to use a different colour profile in the heatmap
postestimates_tree <- postestimates_tree + new_scale_fill()

# Add posterior significant heatmap

postestimates_tree <- gheatmap(postestimates_tree, post_beta, offset=0, width=0.5, colnames=TRUE, colnames_position="top",colnames_angle=90, colnames_offset_y=1, hjust=0) +
        scale_fill_manual(values=c("#be3e2b","#f4f4f4","#b2b530"))+
        labs(fill="Trend")


postestimates_tree +
        vexpand(.25, 1) # expand top

11.4.1 Association with cat behaviour

estimate <- getPostEstimate(hM=m, parName="Beta")$mean %>%
    as.data.frame() %>%
    mutate(variable=m$covNames) %>%
    filter(variable=="behaviour") %>%
    pivot_longer(!variable, names_to = "genome", values_to = "mean") %>%
    dplyr::select(genome,mean)

support <- getPostEstimate(hM=m, parName="Beta")$support %>%
    as.data.frame() %>%
    mutate(variable=m$covNames) %>%
    filter(variable=="behaviour") %>%
    pivot_longer(!variable, names_to = "genome", values_to = "support") %>%
    dplyr::select(genome,support)

#pdf("figures/sig_Mags_violin.pdf",width=10, height=8)
inner_join(estimate,support,by=join_by(genome==genome)) %>%
    mutate(significance=ifelse(support >= 0.9 | support <= 0.1,1,0)) %>%
    mutate(support=ifelse(mean<0,1-support,support)) %>%
    left_join(genome_metadata, by = join_by(genome == genome)) %>%
    mutate(phylum = ifelse(support > 0.9, phylum, NA)) %>%
    ggplot(aes(x=mean,y=support,color=phylum))+
      geom_point(alpha=0.7, shape=16, size=3)+
      scale_color_manual(values = phylum_colors) +
      geom_vline(xintercept = 0) +
      xlim(c(-0.4,0.4)) +
      labs(x = "Beta regression coefficient", y = "Posterior probability") +
      theme_minimal()

estimate_quantiles <- getPostEstimate(hM=m, parName="Beta", q=c(0.1,0.9))$q

map_dfr(1:229, ~ {
  tibble(
    q10 = estimate_quantiles[1, 2, .x], # Extract 10% quantile
    q90 = estimate_quantiles[2, 2, .x]  # Extract 90% quantile
  )
}) %>% mutate(genome=estimate$genome) %>% 
  right_join(estimate, by="genome") %>% 
  left_join(genome_metadata, by="genome") %>% 
  filter(q10>0 | q90<0) %>% 
  ggplot(aes(x=mean, y=fct_reorder(genome, mean), xmin=q10, xmax=q90, color=phylum)) +
      geom_point() +
      geom_errorbar() +
      #xlim(c(-0.03,0.04)) +
      geom_vline(xintercept=0) +
      scale_color_manual(values = phylum_colors) +
      theme_minimal() +
      labs(x="Regression coefficient",y="Bacterial genomes") +
      theme(legend.position="none")

estimate_top <- estimate %>% 
  arrange(desc(mean)) %>%                
  slice_head(n = 15) %>%                      
  bind_rows(                                  
    estimate %>%
      arrange(mean) %>%                      
      slice_head(n = 15))

estimate_top_significant <-map_dfr(1:229, ~ {
  tibble(
    q10 = estimate_quantiles[1, 2, .x], # Extract 10% quantile
    q90 = estimate_quantiles[2, 2, .x]  # Extract 90% quantile
  )
}) %>% mutate(genome=estimate$genome) %>% 
  right_join(estimate_top, by="genome") %>% 
  left_join(genome_metadata, by="genome") %>% 
  filter(q10>0 | q90<0) %>%
  arrange(mean)

estimate_top_significant %>% 
  ggplot(aes(x=mean, y=fct_reorder(genome, mean), xmin=q10, xmax=q90, color=phylum)) +
      geom_point() +
      geom_errorbar() +
      scale_y_discrete(labels = estimate_top_significant$genus) +
      geom_vline(xintercept=0) +
      scale_color_manual(values = phylum_colors[-c(6,7)]) +
      theme_minimal() +
      labs(x="Regression coefficient",y="Functional trait") +
      theme(legend.position="none")

11.4.1.1 Associated with tame cats

behaviour_tame <- getPostEstimate(hM=m, parName="Beta")$support %>%
    as.data.frame() %>%
    mutate(variable=m$covNames) %>%
    pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
    mutate(trend = case_when(
          value >= support_threshold ~ "Positive",
          value <= negsupport_threshold ~ "Negative",
          TRUE ~ "Neutral")) %>%
    filter(variable=="behaviour", trend=="Negative") %>%
    arrange(-value) %>%
    left_join(genome_metadata,by=join_by(genome==genome)) %>%
    dplyr::select(genome,phylum,class,order,family,genus, species,value) %>%
    arrange(phylum, class, family, genus, species)

behaviour_tame %>%
  arrange(value) %>% 
  paged_table()

11.4.1.2 Associated with aggresive cats

behaviour_aggresive <- getPostEstimate(hM=m, parName="Beta")$support %>%
    as.data.frame() %>%
    mutate(variable=m$covNames) %>%
    pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
    mutate(trend = case_when(
          value >= support_threshold ~ "Positive",
          value <= negsupport_threshold ~ "Negative",
          TRUE ~ "Neutral")) %>%
    filter(variable=="behaviour", trend=="Positive") %>%
    arrange(value) %>%
    left_join(genome_metadata,by=join_by(genome==genome)) %>%
    dplyr::select(genome,phylum,class,order,family,genus,species,value) %>%
    arrange(phylum,class,family,genus,species)

behaviour_aggresive %>%
  arrange(-value) %>% 
  paged_table()

11.5 Phylogenetic signal

mpost <- convertToCodaObject(m)
quantile(unlist(mpost$Rho), probs = c(.05,.5,.95))

  5%  50%  95% 
0.97 0.99 0.99

11.6 Microbiome composition predictions

Support values of the cat behaviour variable per genome are interpreted as enrichment for aggresuve or tame cats.

The most likely microbiome compositions are predicted for the gradient from tameness to aggressiveness based on the hmsc models

11.7 Functional predictions

11.7.1 Element level

elements_table <- genome_gifts %>%
    to.elements(., GIFT_db) %>%
    as.data.frame() %>%
    .[predictive_mags,] #keep only predictive mags

#Calculate community-level functional traits for predicted communities
community_elements <- prediction %>%
  filter(genome %in% predictive_mags) %>% #keep only predictive mags
  group_by(behaviour, genome) %>%
  mutate(row_id = row_number()) %>%
  pivot_wider(names_from = genome, values_from = value) %>%
  ungroup() %>%
  group_split(row_id) %>%
  as.list() %>%
  lapply(., FUN = function(x){x %>%
    dplyr::select(-row_id) %>%
    column_to_rownames(var = "behaviour") %>%
    as.data.frame() %>%
    exp() %>%
    t() %>%
    tss() %>%
    to.community(elements_table,.,GIFT_db) %>% 
    as.data.frame() %>%
    rownames_to_column(var="behaviour")
   })

calculate_slope <- function(x) {
  lm_fit <- lm(unlist(x) ~ seq_along(unlist(x)))
  coef(lm_fit)[2]
}

element_predictions <- map_dfc(community_elements, function(df) {
  df %>%
        column_to_rownames(var = "behaviour") %>%
        t() %>%
        as.data.frame() %>%
        rowwise() %>%
        mutate(slope = calculate_slope(c_across(everything()))) %>%
        select(slope) }) %>%
      t() %>%
      as.data.frame() %>%
      set_names(colnames(community_elements[[1]])[-1]) %>%
      rownames_to_column(var="iteration") %>%
      pivot_longer(!iteration, names_to="trait",values_to="value") %>%
      group_by(trait) %>%
      summarise(mean=mean(value),
        p1 = quantile(value, probs = 0.1),
        p9 = quantile(value, probs = 0.9),
        positive_support = sum(value > 0)/1000,
        negative_support = sum(value < 0)/1000) %>%
      arrange(-positive_support)

11.7.1.1 Positive associated functions at element level

# Positively associated

unique_funct_db<- GIFT_db[c(2,4,5,6)] %>% 
  distinct(Code_element, .keep_all = TRUE)

element_predictions %>%
  filter(mean >0) %>%
  arrange(-positive_support) %>%
  filter(positive_support>=0.9) %>%
  left_join(unique_funct_db, by = join_by(trait == Code_element))%>%
  arrange(Domain,Function)%>%
  paged_table()

11.7.1.2 Negatively associated functions at element level

element_predictions %>%
  filter(mean <0) %>%
  arrange(-negative_support) %>%
  filter(negative_support>=0.9) %>%
  left_join(unique_funct_db, by = join_by(trait == Code_element))%>%
  arrange(Domain,Function)%>%
  paged_table()

positive_behaviour <- element_predictions %>%
  filter(mean >0) %>%
  arrange(mean) %>%
  filter(positive_support>=0.9) %>%
  dplyr::select(-negative_support) %>%
  rename(support=positive_support)

negative_behaviour <- element_predictions %>%
  filter(mean <0) %>%
  arrange(mean) %>%
  filter(negative_support>=0.9) %>%
  dplyr::select(-positive_support) %>%
  rename(support=negative_support)

all_elements <- bind_rows(positive_behaviour,negative_behaviour) %>%
  left_join(GIFT_db,by=join_by(trait==Code_element)) %>%
  mutate(trait=factor(trait,levels=c(rev(positive_behaviour$trait),rev(negative_behaviour$trait)))) %>%
  mutate(Code_function=factor(Code_function)) %>%
  mutate(element_legend=str_c(trait," - ",Element)) %>%
  mutate(function_legend=str_c(Code_function," - ",Function)) %>%
  select(trait,mean,p1,p9,element_legend,function_legend) %>% 
  unique()

gift_colors <- read_tsv("data/gift_colors.tsv") %>% 
  mutate(legend=str_c(Code_function," - ",Function))  %>% 
  filter(legend %in% all_elements$function_legend)

all_elements %>%
  ggplot(aes(x=mean, y=fct_reorder(element_legend, mean), xmin=p1, xmax=p9, color=function_legend)) +
      geom_point() +
      geom_errorbar() +
      xlim(c(-0.03,0.04)) +
      geom_vline(xintercept=0) +
      scale_color_manual(values = gift_colors$Color) +
      theme_minimal() +
      labs(x="Regression coefficient",y="Functional trait") +
      theme(legend.position="none")

11.7.2 Function level

functions_table <- elements_table %>%
    to.functions(., GIFT_db) %>%
    as.data.frame()

community_functions <- prediction %>%
  filter(genome %in% predictive_mags) %>% #keep only predictive mags
  group_by(behaviour, genome) %>%
  mutate(row_id = row_number()) %>%
  pivot_wider(names_from = genome, values_from = value) %>%
  ungroup() %>%
  group_split(row_id) %>%
  as.list() %>%
  lapply(., FUN = function(x){x %>%
    dplyr::select(-row_id) %>%
    column_to_rownames(var = "behaviour") %>%
    as.data.frame() %>%
    exp() %>%
    t() %>%
    tss() %>%
    to.community(functions_table,.,GIFT_db) %>% 
    as.data.frame() %>%
    rownames_to_column(var="behaviour")
   })

calculate_slope <- function(x) {
  lm_fit <- lm(unlist(x) ~ seq_along(unlist(x)))
  coef(lm_fit)[2]
}

function_predictions <- map_dfc(community_functions, function(df) {
      df %>%
        column_to_rownames(var = "behaviour") %>%
        t() %>%
        as.data.frame() %>%
        rowwise() %>%
        mutate(slope = calculate_slope(c_across(everything()))) %>%
        select(slope) }) %>%
      t() %>%
      as.data.frame() %>%
      set_names(colnames(community_functions[[1]])[-1]) %>%
      rownames_to_column(var="iteration") %>%
      pivot_longer(!iteration, names_to="trait",values_to="value") %>%
      group_by(trait) %>%
      summarise(mean=mean(value),
        p1 = quantile(value, probs = 0.1),
        p9 = quantile(value, probs = 0.9),
        positive_support = sum(value > 0)/1000,
        negative_support = sum(value < 0)/1000) %>%
      arrange(-positive_support)

positive <- function_predictions %>%
  filter(mean >0) %>%
  arrange(mean) %>%
  filter(positive_support>=0.9) %>%
  dplyr::select(-negative_support) %>%
  rename(support=positive_support)

negative <- function_predictions %>%
  filter(mean <0) %>%
  arrange(mean) %>%
  filter(negative_support>=0.9) %>%
  dplyr::select(-positive_support) %>%
  rename(support=negative_support)

all_functions <- function_predictions %>%
  left_join(GIFT_db,by=join_by(trait==Code_function)) %>%
  mutate(trait=factor(trait)) %>%
  mutate(function_legend=str_c(trait," - ",Function)) %>%
  select(trait,mean,p1,p9,function_legend) %>% 
  unique()

gift_colors <- read_tsv("data/gift_colors.tsv") %>% 
  mutate(legend=str_c(Code_function," - ",Function))  %>% 
  filter(legend %in% all_functions$function_legend)

all_functions %>%
  ggplot(aes(x=mean, y=fct_reorder(function_legend, mean), xmin=p1, xmax=p9, color=function_legend)) +
      geom_point() +
      geom_errorbar() +
      geom_vline(xintercept=0) +
      scale_color_manual(values = gift_colors$Color) +
      theme_minimal() +
      labs(x="Regression coefficient",y="Functional trait") +
      guides(col = guide_legend(ncol = 1))