Chapter 8 HMSC set-up
8.2 Subsetting
# Subset by prevalence (present in more than 5 samples)
selected_genomes1 <- genome_counts %>%
filter(rowSums(across(starts_with("EHI")) != 0) >= 5) %>%
select(genome) %>% pull()
# Subset by minimum representation of 1% relative abundance in 5 samples
selected_genomes2 <- genome_counts %>%
filter(genome %in% selected_genomes1) %>%
column_to_rownames(var="genome") %>%
tss() %>%
as.data.frame() %>%
filter(rowSums(across(starts_with("EHI")) >= 0.01) >= 5) %>%
rownames()
# Subset genome metadata
genome_metadata_subset <- genome_metadata %>%
filter(genome %in% selected_genomes2)
# Random effects data (study design)
StudyDesign <- sample_metadata %>%
select(EHI_number,Sampling_point) %>%
mutate(Sampling_point = factor(Sampling_point)) %>%
#mutate(Transect = factor(Transect)) %>%
#mutate(Elevation = factor(Elevation)) %>%
column_to_rownames("EHI_number")
#Calculate normalisation factor to account for genome length
normalisation_factor <- genome_metadata %>%
filter(genome %in% selected_genomes2) %>%
mutate(factor=median(length)/length) %>%
pull(factor)
# Genome count table (quantitative community data)
YData <- read_counts %>%
filter(genome %in% selected_genomes2) %>% #subset genomes
mutate(across(where(is.numeric), ~ round(. * normalisation_factor,0) )) %>%
mutate(across(where(is.numeric), ~ . +1 )) %>% #add +1 pseudocount to remove zeros
mutate(across(where(is.numeric), ~ log(.) )) %>% #log-transform
column_to_rownames("genome") %>%
select(all_of(row.names(StudyDesign))) %>%
as.data.frame() %>%
t() # transpose
# Fixed effects data (explanatory variables)
XData <- sample_metadata %>%
select(EHI_number,Elevation,Transect) %>%
mutate(Transect = factor(Transect)) %>%
mutate(logseqdepth=read_counts %>% #total log-sequencing depth
select(all_of(row.names(StudyDesign))) %>%
colSums() %>%
log(),
Elevation2 = Elevation^2 # Adding the quadratic effect of Elevation
) %>%
column_to_rownames("EHI_number")
# Genome trait data
elements_table_hmsc <- genome_gifts_filt %>%
to.elements(., GIFT_db) %>%
as.data.frame()
elements_table_hmsc<-rownames_to_column(elements_table_hmsc,var = "genome")
TrData <- elements_table_hmsc %>%
filter(genome %in% selected_genomes2) %>% #subset genomes
arrange(match(genome, colnames(YData))) %>%
column_to_rownames(var="genome") %>%
to.functions(.,GIFT_db) %>%
as.data.frame()
TrFormula=~B01+B02+B03+B04+B06+B07+B08+B09+B10+D01+D02+D03+D05+D06+D07+D08+D09+S01+S02+S03
# Genome phylogeny
PData <- genome_tree
8.4 Define and Hmsc models
#Define models
model1 = Hmsc(Y=YData,
XData = XData,
XFormula = XFormula1,
studyDesign = StudyDesign,
TrData = TrData,
TrFormula=TrFormula,
phyloTree = PData,
ranLevels = list("Sampling_point"=rL.sampling_point),
distr = "normal",
YScale = TRUE)
#model2 = Hmsc(Y=YData,
#XData = XData,
#XFormula = XFormula2,
#studyDesign = StudyDesign,
#TrData = TrData,
#TrFormula=TrFormula,
#phyloTree = PData,
#ranLevels = list("Sampling_point"=rL.sampling_point),
#distr = "normal",
#YScale = TRUE)
#Save list of models as an R object.
model_list = list(model1=model1)
if (!dir.exists("hmsc")){dir.create("hmsc")}
save(model_list, file = "hmsc/hmsc.Rdata")
Upload hmsc/hmsc.Rdata to the HPC respecting the directory structure.
8.6 Generate Hmsc executables
The next chunk generates shell files for every combination of model, MCMC samples and MCMM thinning, ready to be launched as SLURM jobs.
modelchains <- expand.grid(model = names(model_list), sample = MCMC_samples_list, thin = MCMC_thin_list)
if (!dir.exists("hmsc")){dir.create("hmsc")}
for(i in c(1:nrow(modelchains))){
modelname=as.character(modelchains[i,1])
sample=modelchains[i,2]
thin=modelchains[i,3]
executablename <- paste0("hmsc/exe_",modelname,"_",sample,"_",thin,".sh")
fitname <- paste0("hmsc/fit_",modelname,"_",sample,"_",thin,".Rdata")
convname <- paste0("hmsc/conv_",modelname,"_",sample,"_",thin,".Rdata")
model <- paste0('model_list$',modelname)
psrf.beta.name <- paste0("psrf.beta.",modelname,"_",sample,"_",thin)
psrf.gamma.name <- paste0("psrf.gamma.",modelname,"_",sample,"_",thin)
psrf.rho.name <- paste0("psrf.rho.",modelname,"_",sample,"_",thin)
jobname <- paste0("hmsc_",modelname,"_",sample,"_",thin)
minutes <- round(sample * thin * (ncol(YData)/50), 0)
code <- sprintf("#!/bin/bash
#SBATCH --job-name=%s # Job name
#SBATCH --nodes=1
#SBATCH --ntasks=4 # Run on 4 CPUs
#SBATCH --mail-user=garazi.bideguren@sund.ku.dk
#SBATCH --mem=200gb # Job memory request
#SBATCH --time=%d # In minutes
# Activate conda environment
module load mamba/1.3.1
if ! conda info --envs | grep -q hmsc; then
mamba create -p ./hmsc/hmsc_env -y r-essentials r-base r-tidyverse r-Hmsc
fi
source activate /maps/projects/mjolnir1/people/dlz554/hmsc_env
# Run R script
Rscript -e '
library(tidyverse)
library(Hmsc)
# Load formulas and data
load(\"hmsc/hmsc.Rdata\")
# Declare placeholders
modelname = \"%s\"
model = %s
fitname = \"%s\"
convname = \"%s\"
sample = %d
thin = %d
nchains = %d
# Run model fitting
m = sampleMcmc(hM = model,
samples = sample,
thin = thin,
adaptNf=rep(ceiling(0.4*sample*thin),model$nr),
transient = ceiling(0.5*sample*thin),
nChains = nchains,
nParallel = nchains)
# Run model cross-validation, each fold will include the samples from the whole elevation gradient of each transect, randomly selected
#partition <- createPartition(m, nfolds = 4)
StudyDesign<-m$studyDesign
partition<-numeric(length = nrow(StudyDesign))
folds<-1:4
n_sampling_points<-length(unique(StudyDesign$Sampling_point))
set.seed(1)
for(i in 1:n_sampling_points){
location_tmp<-which(StudyDesign$Sampling_point==unique(StudyDesign$Sampling_point)[i])
if(length(location_tmp)>4){
random_points<-sample(location_tmp,4)
partition[random_points]<-sample(folds,4)
remaining_points<-location_tmp[is.na(match(location_tmp, random_points))]
partition[remaining_points]<-sample(folds,length(remaining_points))
}else{
partition[location_tmp]<-sample(folds,length(location_tmp))
}
}
cv <- computePredictedValues(m, partition=partition, nChains = 4)
# Assess chain convergence
mpost = convertToCodaObject(m,
spNamesNumbers = c(T,F),
covNamesNumbers = c(T,F),
Beta = TRUE,
Gamma = TRUE,
V = FALSE,
Sigma = FALSE,
Rho = TRUE,
Eta = FALSE,
Lambda = FALSE,
Alpha = FALSE,
Omega = FALSE,
Psi = FALSE,
Delta = FALSE) # Convert to CODA object
# Fixed effects
assign(paste0(\"psrf.beta.\", modelname,\"_\",sample,\"_\",thin), gelman.diag(mpost$Beta,multivariate=FALSE)$psrf)
# Traits
assign(paste0(\"psrf.gamma.\", modelname,\"_\",sample,\"_\",thin), gelman.diag(mpost$Gamma,multivariate=FALSE)$psrf)
# Phylogeny
assign(paste0(\"psrf.rho.\", modelname,\"_\",sample,\"_\",thin), gelman.diag(mpost$Rho,multivariate=FALSE)$psrf)
# Write convergence data
save(%s, %s, %s, file=convname)
# Save model fit object
save(m,cv, file=fitname)
'
", jobname, minutes, modelname, model, fitname, convname, sample, thin, nChains, psrf.beta.name, psrf.gamma.name, psrf.rho.name)
writeLines(code, executablename)
}
Upload the produced hmsc/exe_XXXXX.sh files to the HPC respecting the directory structure.
8.8 Assess chain convergence
Convergence diagnostic values substantially above 1 indicate lack of convergence. Values below 1.1 are considered good enough
# Load all conv file available in the hmsc folder
file_paths <-list.files(path = "hmsc_elevation", pattern = "^conv_", full.names = TRUE, include.dirs = TRUE)
for (file_path in file_paths) {
load(file_path, verbose = TRUE) # Remove .GlobalEnv argument and specify verbose for each load operation
}
Loading objects:
psrf.beta.model1_250_1
psrf.gamma.model1_250_1
psrf.rho.model1_250_1
Loading objects:
psrf.beta.model1_250_10
psrf.gamma.model1_250_10
psrf.rho.model1_250_10
# Create a merged psrf.beta (genome) plot
ls() %>%
grep("^psrf\\.beta", ., value = TRUE) %>%
map_dfr(~ {
mat <- get(.x)
data.frame(modelchain = .x, as.data.frame(mat, , stringsAsFactors = FALSE)) %>%
rownames_to_column(var="parameter") %>%
mutate(model = str_split(modelchain, "_") %>% map_chr(1) %>% gsub("psrf.beta.","",.)) %>%
mutate(sample = str_split(modelchain, "_")[[1]][2]) %>% #extract sample info from model name
mutate(thin = str_split(modelchain, "_")[[1]][3]) #extract thin info from model name
}) %>%
ggplot(.,aes(x=reorder(modelchain,-Point.est.,fun=function(x) {quantile(x, probs = 0.9)}),y=Point.est.)) +
geom_violin(fill="#b8d9e3", color="#328da8") +
geom_jitter(alpha=0.3,size=0.2, color="#a8babf") +
stat_summary(fun=function(x) {quantile(x, probs = 0.9)}, geom="crossbar", width=0.2, color="orange") +
geom_hline(yintercept=1.1, linetype="dashed", color = "red") +
ylim(0.9,2)+
labs(x="Model chains",y="Parameter estimates")+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))