Chapter 8 HMSC set-up

8.1 Load data

load("data/data.Rdata")

8.2 Subsetting

# Subset by prevalence (present in more than 5 samples)
selected_genomes1 <- genome_counts %>%
    filter(rowSums(across(starts_with("EHI")) != 0) >= 5) %>%
    select(genome) %>% pull()

# Subset by minimum representation of 1% relative abundance in 5 samples
selected_genomes2 <- genome_counts %>%
    filter(genome %in% selected_genomes1) %>%
    column_to_rownames(var="genome") %>%
    tss() %>%
    as.data.frame() %>%
    filter(rowSums(across(starts_with("EHI")) >= 0.01) >= 5) %>%
    rownames()

# Subset genome metadata
genome_metadata_subset <- genome_metadata %>%
    filter(genome %in% selected_genomes2)

# Random effects data (study design)
StudyDesign <- sample_metadata %>%
                    select(EHI_number,Sampling_point) %>%
                    mutate(Sampling_point = factor(Sampling_point)) %>%
                    #mutate(Transect = factor(Transect)) %>%
                    #mutate(Elevation = factor(Elevation)) %>%
                    column_to_rownames("EHI_number")

#Calculate normalisation factor to account for genome length
normalisation_factor <- genome_metadata %>% 
  filter(genome %in% selected_genomes2) %>%
  mutate(factor=median(length)/length) %>%
  pull(factor)

# Genome count table (quantitative community data)
YData <- read_counts  %>%
                    filter(genome %in% selected_genomes2) %>% #subset genomes
                    mutate(across(where(is.numeric), ~ round(. * normalisation_factor,0) )) %>% 
                    mutate(across(where(is.numeric), ~ . +1 )) %>% #add +1 pseudocount to remove zeros
                    mutate(across(where(is.numeric), ~  log(.) )) %>% #log-transform
                    column_to_rownames("genome") %>%
                    select(all_of(row.names(StudyDesign))) %>%
                    as.data.frame() %>%
                    t() # transpose

# Fixed effects data (explanatory variables)
XData <- sample_metadata %>%
                    select(EHI_number,Elevation,Transect) %>%
                    mutate(Transect = factor(Transect)) %>%
                    mutate(logseqdepth=read_counts %>% #total log-sequencing depth
                        select(all_of(row.names(StudyDesign))) %>%
                        colSums() %>%
                        log(),
                        Elevation2 = Elevation^2  # Adding the quadratic effect of Elevation
                    ) %>%
                    column_to_rownames("EHI_number")


# Genome trait data
elements_table_hmsc <- genome_gifts_filt %>%
    to.elements(., GIFT_db) %>%
    as.data.frame()

elements_table_hmsc<-rownames_to_column(elements_table_hmsc,var = "genome")

TrData <- elements_table_hmsc %>%
  filter(genome %in% selected_genomes2) %>% #subset genomes
  arrange(match(genome, colnames(YData))) %>%
  column_to_rownames(var="genome") %>%
  to.functions(.,GIFT_db) %>%
  as.data.frame()

TrFormula=~B01+B02+B03+B04+B06+B07+B08+B09+B10+D01+D02+D03+D05+D06+D07+D08+D09+S01+S02+S03

# Genome phylogeny
PData <- genome_tree

8.3 Define formulas of the Hmsc model

# Fixed effects formula
XFormula1 = ~Elevation + I(Elevation^2) + Transect + logseqdepth

# Study design
rL.sampling_point = HmscRandomLevel(units = levels(StudyDesign$Sampling_point))
#rL.transect = HmscRandomLevel(units = levels(StudyDesign$Transect))

8.4 Define and Hmsc models

#Define models
model1 = Hmsc(Y=YData,
         XData = XData,
         XFormula = XFormula1,
         studyDesign = StudyDesign,
         TrData = TrData,
         TrFormula=TrFormula,
         phyloTree = PData,
         ranLevels = list("Sampling_point"=rL.sampling_point),
         distr = "normal",
         YScale = TRUE)

#model2 = Hmsc(Y=YData,
         #XData = XData,
         #XFormula = XFormula2,
         #studyDesign = StudyDesign,
         #TrData = TrData,
         #TrFormula=TrFormula,
         #phyloTree = PData,
         #ranLevels = list("Sampling_point"=rL.sampling_point),
         #distr = "normal",
         #YScale = TRUE)

#Save list of models as an R object.
model_list = list(model1=model1)
if (!dir.exists("hmsc")){dir.create("hmsc")}
save(model_list, file = "hmsc/hmsc.Rdata")

Upload hmsc/hmsc.Rdata to the HPC respecting the directory structure.

8.5 Define MCMC

# How often to sample the MCMC
MCMC_samples_list = 250

# The number of MCMC steps between each recording sample
MCMC_thin_list = c(1, 10)

# The number of MCMC chains to use
nChains = 4

8.6 Generate Hmsc executables

The next chunk generates shell files for every combination of model, MCMC samples and MCMM thinning, ready to be launched as SLURM jobs.

modelchains <- expand.grid(model = names(model_list), sample = MCMC_samples_list, thin = MCMC_thin_list)

if (!dir.exists("hmsc")){dir.create("hmsc")}
for(i in c(1:nrow(modelchains))){
      modelname=as.character(modelchains[i,1])
      sample=modelchains[i,2]
      thin=modelchains[i,3]
      executablename <- paste0("hmsc/exe_",modelname,"_",sample,"_",thin,".sh")
      fitname <- paste0("hmsc/fit_",modelname,"_",sample,"_",thin,".Rdata")
      convname <- paste0("hmsc/conv_",modelname,"_",sample,"_",thin,".Rdata")
      model <- paste0('model_list$',modelname)
      psrf.beta.name <-  paste0("psrf.beta.",modelname,"_",sample,"_",thin)
      psrf.gamma.name <-  paste0("psrf.gamma.",modelname,"_",sample,"_",thin)
      psrf.rho.name <-  paste0("psrf.rho.",modelname,"_",sample,"_",thin)
      jobname <- paste0("hmsc_",modelname,"_",sample,"_",thin)
      minutes <- round(sample * thin * (ncol(YData)/50), 0)
      code <- sprintf("#!/bin/bash
#SBATCH --job-name=%s                   # Job name
#SBATCH --nodes=1
#SBATCH --ntasks=4                      # Run on 4 CPUs
#SBATCH --mail-user=garazi.bideguren@sund.ku.dk
#SBATCH --mem=200gb                      # Job memory request
#SBATCH --time=%d                       # In minutes

# Activate conda environment
module load mamba/1.3.1
if ! conda info --envs | grep -q hmsc; then
  mamba create -p ./hmsc/hmsc_env -y r-essentials r-base r-tidyverse r-Hmsc
fi
source activate /maps/projects/mjolnir1/people/dlz554/hmsc_env

# Run R script
Rscript -e '
library(tidyverse)
library(Hmsc)
# Load formulas and data
load(\"hmsc/hmsc.Rdata\")

# Declare placeholders
modelname = \"%s\"
model = %s
fitname = \"%s\"
convname = \"%s\"
sample = %d
thin = %d
nchains = %d

# Run model fitting
m = sampleMcmc(hM = model,
         samples = sample,
         thin = thin,
         adaptNf=rep(ceiling(0.4*sample*thin),model$nr),
         transient = ceiling(0.5*sample*thin),
         nChains = nchains,
         nParallel = nchains)

# Run model cross-validation, each fold will include the samples from the whole elevation gradient of each transect, randomly selected
#partition <- createPartition(m, nfolds = 4)

StudyDesign<-m$studyDesign

partition<-numeric(length = nrow(StudyDesign))
folds<-1:4
n_sampling_points<-length(unique(StudyDesign$Sampling_point))

set.seed(1)
for(i in 1:n_sampling_points){
  location_tmp<-which(StudyDesign$Sampling_point==unique(StudyDesign$Sampling_point)[i])
  if(length(location_tmp)>4){
    random_points<-sample(location_tmp,4)
    partition[random_points]<-sample(folds,4)
    remaining_points<-location_tmp[is.na(match(location_tmp, random_points))]
    partition[remaining_points]<-sample(folds,length(remaining_points))
  }else{
    partition[location_tmp]<-sample(folds,length(location_tmp))
  }
}

cv <- computePredictedValues(m, partition=partition, nChains = 4)         
         
# Assess chain convergence
mpost = convertToCodaObject(m,
      spNamesNumbers = c(T,F),
      covNamesNumbers = c(T,F),
      Beta = TRUE,
      Gamma = TRUE,
      V = FALSE,
      Sigma = FALSE,
      Rho = TRUE,
      Eta = FALSE,
      Lambda = FALSE,
      Alpha = FALSE,
      Omega = FALSE,
      Psi = FALSE,
      Delta = FALSE) # Convert to CODA object

# Fixed effects
assign(paste0(\"psrf.beta.\", modelname,\"_\",sample,\"_\",thin), gelman.diag(mpost$Beta,multivariate=FALSE)$psrf)

# Traits
assign(paste0(\"psrf.gamma.\", modelname,\"_\",sample,\"_\",thin), gelman.diag(mpost$Gamma,multivariate=FALSE)$psrf)

# Phylogeny
assign(paste0(\"psrf.rho.\", modelname,\"_\",sample,\"_\",thin), gelman.diag(mpost$Rho,multivariate=FALSE)$psrf)

# Write convergence data
save(%s, %s, %s, file=convname)

# Save model fit object
save(m,cv, file=fitname)
'
", jobname, minutes, modelname, model, fitname, convname, sample, thin, nChains, psrf.beta.name, psrf.gamma.name, psrf.rho.name)
      writeLines(code, executablename)
    }

Upload the produced hmsc/exe_XXXXX.sh files to the HPC respecting the directory structure.

8.7 Fit Hmsc models (in Mjolnir HPC)

Launch the SLURM jobs by using:

# Submit all .sh files in the hmsc folder
for jobfile in hmsc/exe_*.sh; do
    sbatch "$jobfile"
done

#Or launch them one by one only the ones you want to launch
sbatch hmsc/exe_model1_250_1.sh
sbatch hmsc/exe_model2_250_10.sh

8.8 Assess chain convergence

Convergence diagnostic values substantially above 1 indicate lack of convergence. Values below 1.1 are considered good enough

# Load all conv file available in the hmsc folder
file_paths <-list.files(path = "hmsc_elevation", pattern = "^conv_", full.names = TRUE, include.dirs = TRUE)
for (file_path in file_paths) {
    load(file_path, verbose = TRUE)  # Remove .GlobalEnv argument and specify verbose for each load operation
}

Loading objects:
  psrf.beta.model1_250_1
  psrf.gamma.model1_250_1
  psrf.rho.model1_250_1
Loading objects:
  psrf.beta.model1_250_10
  psrf.gamma.model1_250_10
  psrf.rho.model1_250_10

# Create a merged psrf.beta (genome) plot
ls() %>%
        grep("^psrf\\.beta", ., value = TRUE) %>%
        map_dfr(~ {
         mat <- get(.x)
          data.frame(modelchain = .x, as.data.frame(mat, , stringsAsFactors = FALSE)) %>%
              rownames_to_column(var="parameter") %>%
              mutate(model = str_split(modelchain, "_") %>% map_chr(1) %>% gsub("psrf.beta.","",.)) %>%
              mutate(sample = str_split(modelchain, "_")[[1]][2]) %>% #extract sample info from model name
              mutate(thin = str_split(modelchain, "_")[[1]][3]) #extract thin info from model name
      }) %>%
      ggplot(.,aes(x=reorder(modelchain,-Point.est.,fun=function(x) {quantile(x, probs = 0.9)}),y=Point.est.)) +
        geom_violin(fill="#b8d9e3", color="#328da8") +
        geom_jitter(alpha=0.3,size=0.2, color="#a8babf") +
        stat_summary(fun=function(x) {quantile(x, probs = 0.9)}, geom="crossbar", width=0.2, color="orange") +
        geom_hline(yintercept=1.1, linetype="dashed", color = "red") +
        ylim(0.9,2)+
        labs(x="Model chains",y="Parameter estimates")+
        theme_classic()+
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))