Chapter 5 Alpha diversity

5.1 Hill numbers

# Calculate Hill numbers
richness <- genome_counts_filt %>%
  column_to_rownames(var = "genome") %>%
  dplyr::select(where(~ !all(. == 0))) %>%
  hilldiv(., q = 0) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(richness = 1) %>%
  rownames_to_column(var = "sample")

neutral <- genome_counts_filt %>%
  column_to_rownames(var = "genome") %>%
  dplyr::select(where(~ !all(. == 0))) %>%
  hilldiv(., q = 1) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(neutral = 1) %>%
  rownames_to_column(var = "sample")

phylogenetic <- genome_counts_filt %>%
  column_to_rownames(var = "genome") %>%
  dplyr::select(where(~ !all(. == 0))) %>%
  hilldiv(., q = 1, tree = genome_tree) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(phylogenetic = 1) %>%
  rownames_to_column(var = "sample")

# Aggregate basal GIFT into elements
dist <- genome_gifts %>%
  to.elements(., GIFT_db) %>%
  traits2dist(., method = "gower")

functional <- genome_counts_filt %>%
  column_to_rownames(var = "genome") %>%
  dplyr::select(where(~ !all(. == 0))) %>%
  hilldiv(., q = 1, dist = dist) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(functional = 1) %>%
  rownames_to_column(var = "sample") %>%
  mutate(functional = if_else(is.nan(functional), 1, functional))

# Merge all metrics
alpha_div <- richness %>%
  full_join(neutral, by = join_by(sample == sample)) %>%
  full_join(phylogenetic, by = join_by(sample == sample)) %>%
  full_join(functional, by = join_by(sample == sample))

5.2 By location

5.2.1 Plots

richness_mean <- alpha_div %>%
  left_join(sample_metadata, by = join_by(sample == sample))%>%
  group_by(location) %>%
  dplyr::summarise_at(.vars = names(.)[2], .funs = c("Richness mean" = "mean", "Richness sd" = "sd"))

neutral_mean <- alpha_div %>%
  left_join(sample_metadata, by = join_by(sample == sample))%>%
  group_by(location) %>%
  dplyr::summarise_at(.vars = names(.)[3], .funs = c("Neutral mean" = "mean", "Neutral sd" = "sd"))

phylogenetic_mean <- alpha_div %>%
  left_join(sample_metadata, by = join_by(sample == sample))%>%
  group_by(location) %>%
  dplyr::summarise_at(.vars = names(.)[4], .funs = c("Phylogenetic mean" = "mean", "Phylogenetic sd" = "sd"))

cbind(richness_mean, neutral_mean[, 2:3], phylogenetic_mean[, 2:3])%>%
  tt()

location	Richness mean	Richness sd	Neutral mean	Neutral sd	Phylogenetic mean	Phylogenetic sd
Aruba	22.12500	20.75532	11.82228	9.034244	5.522053	1.715380
Brazil	74.81250	23.70434	25.03024	11.453472	6.319856	1.456310
CaboVerde	31.93333	14.60659	15.22392	4.306461	6.984720	1.414464
Spain	70.93333	15.76373	24.75182	11.327483	6.440501	1.526864
Denmark	61.80000	18.60952	20.50564	9.170413	5.594599	1.142487
Malaysia	46.46667	25.00248	24.35444	11.129897	6.509729	1.569361

group_n <- alpha_div %>%
  left_join(., sample_metadata, by = join_by(sample == sample))%>%
  select(location) %>%
  pull() %>%
  unique() %>%
  length()

#pdf("figures/diversity_location.pdf",width=20, height=9)
alpha_div %>%
  pivot_longer(-sample, names_to = "metric", values_to = "value") %>%
  left_join(sample_metadata, by = "sample") %>%
  mutate(metric=factor(metric,levels=c("richness","neutral","phylogenetic","functional"))) %>%
  ggplot(aes(y = value, x = location, group=location, color=location, fill=location)) +
  geom_boxplot(outlier.shape = NA, show.legend = FALSE) +
  geom_jitter(alpha=0.5) +
  scale_color_manual(values = location_colors) +
  scale_fill_manual(values = str_c(location_colors, "50")) +
  facet_wrap(. ~ metric, scales = "free", ncol=4) +
  coord_cartesian(xlim = c(1, NA)) +
  theme_classic() +
  theme(
    axis.ticks.x = element_blank(),
        strip.text.x = element_text(size = 12, color="black",face="bold"),
        strip.background = element_blank(),
        panel.grid.minor.x = element_line(size = .1, color = "grey"),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_blank(),
        legend.text=element_text(size=10),
        legend.title = element_text(size=12))+
    guides(fill = guide_legend(override.aes = list(size=3)))

#dev.off()

5.2.2 Mixed models

5.2.2.1 Richness

Analysis of Deviance Table

Model: Negative Binomial(4.3858), link: log

Response: richness

Terms added sequentially (first to last)

         Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
NULL                        91    165.145              
location  5   66.294        86     98.851 6.039e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                R2m       R2c
delta     0.4502528 0.4502528
lognormal 0.4782539 0.4782539
trigamma  0.4193938 0.4193938

$emmeans
 location  emmean    SE  df asymp.LCL asymp.UCL
 Aruba       3.10 0.131 Inf      2.84      3.35
 Brazil      4.31 0.123 Inf      4.07      4.56
 CaboVerde   3.46 0.131 Inf      3.21      3.72
 Spain       4.26 0.127 Inf      4.01      4.51
 Denmark     4.12 0.128 Inf      3.87      4.37
 Malaysia    3.84 0.129 Inf      3.59      4.09

Results are given on the log (not the response) scale. 
Confidence level used: 0.95 

$contrasts
 contrast             estimate    SE  df z.ratio p.value
 Aruba - Brazil        -1.2183 0.179 Inf  -6.793  <.0001
 Aruba - CaboVerde     -0.3669 0.185 Inf  -1.979  0.3541
 Aruba - Spain         -1.1650 0.182 Inf  -6.392  <.0001
 Aruba - Denmark       -1.0272 0.183 Inf  -5.624  <.0001
 Aruba - Malaysia      -0.7420 0.184 Inf  -4.041  0.0008
 Brazil - CaboVerde     0.8513 0.180 Inf   4.732  <.0001
 Brazil - Spain         0.0532 0.177 Inf   0.301  0.9997
 Brazil - Denmark       0.1911 0.177 Inf   1.079  0.8899
 Brazil - Malaysia      0.4762 0.178 Inf   2.674  0.0805
 CaboVerde - Spain     -0.7981 0.183 Inf  -4.365  0.0002
 CaboVerde - Denmark   -0.6603 0.183 Inf  -3.604  0.0042
 CaboVerde - Malaysia  -0.3751 0.184 Inf  -2.036  0.3215
 Spain - Denmark        0.1378 0.180 Inf   0.766  0.9732
 Spain - Malaysia       0.4230 0.181 Inf   2.337  0.1794
 Denmark - Malaysia     0.2852 0.181 Inf   1.572  0.6172

Results are given on the log (not the response) scale. 
P value adjustment: tukey method for comparing a family of 6 estimates

5.2.2.2 Neutral

Modelq1_Loca <- lm(formula = neutral ~ location, data = alpha_div_meta) 
anova(Modelq1_Loca)

Analysis of Variance Table

Response: neutral
          Df Sum Sq Mean Sq F value    Pr(>F)    
location   5 2438.6  487.72  5.1405 0.0003575 ***
Residuals 86 8159.6   94.88                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

r.squaredGLMM(Modelq1_Loca)

           R2m       R2c
[1,] 0.2202391 0.2202391

emmeans(Modelq1_Loca, pairwise ~ location)

$emmeans
 location  emmean   SE df lower.CL upper.CL
 Aruba       11.8 2.44 86     6.98     16.7
 Brazil      25.0 2.44 86    20.19     29.9
 CaboVerde   15.2 2.52 86    10.22     20.2
 Spain       24.8 2.52 86    19.75     29.8
 Denmark     20.5 2.52 86    15.51     25.5
 Malaysia    24.4 2.52 86    19.35     29.4

Confidence level used: 0.95 

$contrasts
 contrast             estimate   SE df t.ratio p.value
 Aruba - Brazil        -13.208 3.44 86  -3.835  0.0032
 Aruba - CaboVerde      -3.402 3.50 86  -0.972  0.9257
 Aruba - Spain         -12.930 3.50 86  -3.693  0.0051
 Aruba - Denmark        -8.683 3.50 86  -2.480  0.1415
 Aruba - Malaysia      -12.532 3.50 86  -3.580  0.0073
 Brazil - CaboVerde      9.806 3.50 86   2.801  0.0669
 Brazil - Spain          0.278 3.50 86   0.080  1.0000
 Brazil - Denmark        4.525 3.50 86   1.292  0.7884
 Brazil - Malaysia       0.676 3.50 86   0.193  1.0000
 CaboVerde - Spain      -9.528 3.56 86  -2.679  0.0901
 CaboVerde - Denmark    -5.282 3.56 86  -1.485  0.6747
 CaboVerde - Malaysia   -9.131 3.56 86  -2.567  0.1168
 Spain - Denmark         4.246 3.56 86   1.194  0.8387
 Spain - Malaysia        0.397 3.56 86   0.112  1.0000
 Denmark - Malaysia     -3.849 3.56 86  -1.082  0.8872

P value adjustment: tukey method for comparing a family of 6 estimates

5.2.2.3 Phylogenetic

Modelq1p_Loca <- lm(formula = phylogenetic ~ location, data = alpha_div_meta) 
anova(Modelq1p_Loca)

Analysis of Variance Table

Response: phylogenetic
          Df  Sum Sq Mean Sq F value  Pr(>F)  
location   5  24.581  4.9161  2.2328 0.05818 .
Residuals 86 189.353  2.2018                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

r.squaredGLMM(Modelq1p_Loca)

          R2m      R2c
[1,] 0.109275 0.109275

emmeans(Modelq1p_Loca, pairwise ~ location)

$emmeans
 location  emmean    SE df lower.CL upper.CL
 Aruba       5.52 0.371 86     4.78     6.26
 Brazil      6.32 0.371 86     5.58     7.06
 CaboVerde   6.98 0.383 86     6.22     7.75
 Spain       6.44 0.383 86     5.68     7.20
 Denmark     5.59 0.383 86     4.83     6.36
 Malaysia    6.51 0.383 86     5.75     7.27

Confidence level used: 0.95 

$contrasts
 contrast             estimate    SE df t.ratio p.value
 Aruba - Brazil        -0.7978 0.525 86  -1.521  0.6520
 Aruba - CaboVerde     -1.4627 0.533 86  -2.743  0.0773
 Aruba - Spain         -0.9184 0.533 86  -1.722  0.5211
 Aruba - Denmark       -0.0725 0.533 86  -0.136  1.0000
 Aruba - Malaysia      -0.9877 0.533 86  -1.852  0.4386
 Brazil - CaboVerde    -0.6649 0.533 86  -1.247  0.8125
 Brazil - Spain        -0.1206 0.533 86  -0.226  0.9999
 Brazil - Denmark       0.7253 0.533 86   1.360  0.7505
 Brazil - Malaysia     -0.1899 0.533 86  -0.356  0.9992
 CaboVerde - Spain      0.5442 0.542 86   1.004  0.9153
 CaboVerde - Denmark    1.3901 0.542 86   2.566  0.1172
 CaboVerde - Malaysia   0.4750 0.542 86   0.877  0.9511
 Spain - Denmark        0.8459 0.542 86   1.561  0.6260
 Spain - Malaysia      -0.0692 0.542 86  -0.128  1.0000
 Denmark - Malaysia    -0.9151 0.542 86  -1.689  0.5428

P value adjustment: tukey method for comparing a family of 6 estimates

5.2.2.4 Functional

Modelq1F_Loca <- lm(formula = functional ~ location, data = alpha_div_meta) 
anova(Modelq1F_Loca)

Analysis of Variance Table

Response: functional
          Df  Sum Sq   Mean Sq F value  Pr(>F)  
location   5 0.06044 0.0120886  2.3531 0.04726 *
Residuals 86 0.44181 0.0051373                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

r.squaredGLMM(Modelq1F_Loca)

           R2m       R2c
[1,] 0.1144882 0.1144882

emmeans(Modelq1F_Loca, pairwise ~ location)

$emmeans
 location  emmean     SE df lower.CL upper.CL
 Aruba       1.41 0.0179 86     1.37     1.45
 Brazil      1.43 0.0179 86     1.39     1.47
 CaboVerde   1.46 0.0185 86     1.42     1.50
 Spain       1.45 0.0185 86     1.41     1.49
 Denmark     1.47 0.0185 86     1.43     1.51
 Malaysia    1.49 0.0185 86     1.45     1.52

Confidence level used: 0.95 

$contrasts
 contrast             estimate     SE df t.ratio p.value
 Aruba - Brazil       -0.02104 0.0253 86  -0.830  0.9611
 Aruba - CaboVerde    -0.05095 0.0258 86  -1.978  0.3635
 Aruba - Spain        -0.04197 0.0258 86  -1.629  0.5818
 Aruba - Denmark      -0.05914 0.0258 86  -2.296  0.2071
 Aruba - Malaysia     -0.07819 0.0258 86  -3.035  0.0363
 Brazil - CaboVerde   -0.02991 0.0258 86  -1.161  0.8539
 Brazil - Spain       -0.02093 0.0258 86  -0.812  0.9645
 Brazil - Denmark     -0.03810 0.0258 86  -1.479  0.6784
 Brazil - Malaysia    -0.05714 0.0258 86  -2.218  0.2402
 CaboVerde - Spain     0.00898 0.0262 86   0.343  0.9994
 CaboVerde - Denmark  -0.00820 0.0262 86  -0.313  0.9996
 CaboVerde - Malaysia -0.02724 0.0262 86  -1.041  0.9027
 Spain - Denmark      -0.01717 0.0262 86  -0.656  0.9861
 Spain - Malaysia     -0.03622 0.0262 86  -1.384  0.7365
 Denmark - Malaysia   -0.01904 0.0262 86  -0.728  0.9780

P value adjustment: tukey method for comparing a family of 6 estimates

5.3 By behaviour and location

5.3.1 Plots

5.3.1.1 Richness

alpha_div %>%
  pivot_longer(-sample, names_to = "metric", values_to = "value") %>%
  left_join(sample_metadata, by = "sample") %>%
  filter(!is.na(location)) %>%
  filter(metric=="richness") %>%
      ggplot(aes(y = value, x = origin, group=origin, color=origin, fill=origin)) +
      geom_boxplot(outlier.shape = NA) +
      geom_jitter(width = 0.1, alpha=0.5) +
      scale_color_manual(values = origin_colors) +
      scale_fill_manual(values = str_c(origin_colors, "50")) +
      facet_wrap(. ~ location, scales = "fixed", ncol=6) +
      coord_cartesian(xlim = c(1, NA)) +
      theme_classic() +
      theme(
        strip.background = element_blank(),
        panel.grid.minor.x = element_line(size = .1, color = "grey"),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1)
      )

5.3.1.2 Neutral

#pdf("figures/alpha_q1n_behaviour.pdf",width=9, height=5)
alpha_div %>%
  pivot_longer(-sample, names_to = "metric", values_to = "value") %>%
  left_join(sample_metadata, by = "sample") %>%
  filter(!is.na(location)) %>%
  filter(metric=="neutral") %>%
      ggplot(aes(y = value, x = origin, group=origin, color=origin, fill=origin)) +
      geom_boxplot(outlier.shape = NA) +
      geom_jitter(width = 0.1, alpha=0.5) +
      scale_color_manual(values = origin_colors) +
      scale_fill_manual(values = str_c(origin_colors, "50")) +
      facet_wrap(. ~ location, scales = "fixed", ncol=6) +
      coord_cartesian(xlim = c(1, NA)) +
      theme_classic() +
      theme(
        strip.background = element_blank(),
        panel.grid.minor.x = element_line(size = .1, color = "grey"),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1)
      )

#dev.off()

5.3.1.3 Phylogenetic

#pdf("figures/alpha_q1p_behaviour.pdf",width=9, height=5)
alpha_div %>%
  pivot_longer(-sample, names_to = "metric", values_to = "value") %>%
  left_join(sample_metadata, by = "sample") %>%
  filter(!is.na(location)) %>%
  filter(metric=="phylogenetic") %>%
      ggplot(aes(y = value, x = origin, group=origin, color=origin, fill=origin)) +
      geom_boxplot(outlier.shape = NA) +
      geom_jitter(width = 0.1, alpha=0.5) +
      scale_color_manual(values = origin_colors) +
      scale_fill_manual(values = str_c(origin_colors, "50")) +
      facet_wrap(. ~ location, scales = "fixed", ncol=6) +
      coord_cartesian(xlim = c(1, NA)) +
      theme_classic() +
      theme(
        strip.background = element_blank(),
        panel.grid.minor.x = element_line(size = .1, color = "grey"),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1)
      )

#dev.off()

5.3.1.4 Functional

alpha_div %>%
  pivot_longer(-sample, names_to = "metric", values_to = "value") %>%
  left_join(sample_metadata, by = "sample") %>%
  filter(!is.na(location)) %>%
  filter(metric=="functional") %>%
      ggplot(aes(y = value, x = origin, group=origin, color=origin, fill=origin)) +
      geom_boxplot(outlier.shape = NA) +
      geom_jitter(width = 0.1, alpha=0.5) +
      scale_color_manual(values = origin_colors) +
      scale_fill_manual(values = str_c(origin_colors, "50")) +
      facet_wrap(. ~ location, scales = "fixed", ncol=6) +
      coord_cartesian(xlim = c(1, NA)) +
      theme_classic() +
      theme(
        strip.background = element_blank(),
        panel.grid.minor.x = element_line(size = .1, color = "grey"),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1)
      )

5.3.2 Mixed models

5.3.2.1 Richness

Model_richness <- glmer.nb(richness ~ origin+sex+(1|location), data = alpha_div_meta)
summary(Model_richness)

Generalized linear mixed model fit by maximum likelihood (Laplace Approximation) ['glmerMod']
 Family: Negative Binomial(4.2906)  ( log )
Formula: richness ~ origin + sex + (1 | location)
   Data: alpha_div_meta

     AIC      BIC   logLik deviance df.resid 
   857.1    872.2   -422.5    845.1       86 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-1.6859 -0.6041 -0.0643  0.5858  3.2671 

Random effects:
 Groups   Name        Variance Std.Dev.
 location (Intercept) 0.1653   0.4065  
Number of obs: 92, groups:  location, 6

Fixed effects:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  3.90133    0.19455  20.053   <2e-16 ***
originFeral  0.00238    0.10976   0.022    0.983    
sexMale     -0.25024    0.11805  -2.120    0.034 *  
sexUnknown   0.23928    0.36462   0.656    0.512    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
            (Intr) orgnFr sexMal
originFeral -0.228              
sexMale     -0.260  0.044       
sexUnknown  -0.281 -0.113  0.076

5.3.2.2 Neutral

Model_neutral <- lme(fixed = neutral ~ origin+sex, data = alpha_div_meta,
               random = ~ 1 | location)#log(seq_depth)+
summary(Model_neutral)

Linear mixed-effects model fit by REML
  Data: alpha_div_meta 
       AIC      BIC    logLik
  680.5938 695.4578 -334.2969

Random effects:
 Formula: ~1 | location
        (Intercept) Residual
StdDev:    5.311173 9.630345

Fixed effects:  neutral ~ origin + sex 
                Value Std.Error DF   t-value p-value
(Intercept) 20.948125  2.856806 83  7.332709  0.0000
originFeral  1.703197  2.090004 83  0.814925  0.4174
sexMale     -4.147994  2.232360 83 -1.858120  0.0667
sexUnknown   0.883327  5.749323 83  0.153640  0.8783
 Correlation: 
            (Intr) orgnFr sexMal
originFeral -0.306              
sexMale     -0.352  0.058       
sexUnknown  -0.309 -0.117  0.121

Standardized Within-Group Residuals:
       Min         Q1        Med         Q3        Max 
-1.9366916 -0.6727309 -0.1549049  0.6419273  2.8015816 

Number of Observations: 92
Number of Groups: 6

5.3.2.3 Phylogenetic

Model_phylo <- lme(fixed = phylogenetic ~ origin+sex, data = alpha_div_meta,
               random = ~ 1 | location)
summary(Model_phylo)

Linear mixed-effects model fit by REML
  Data: alpha_div_meta 
       AIC      BIC    logLik
  345.8789 360.7429 -166.9394

Random effects:
 Formula: ~1 | location
        (Intercept) Residual
StdDev:   0.3796989 1.473687

Fixed effects:  phylogenetic ~ origin + sex 
                Value Std.Error DF   t-value p-value
(Intercept)  6.599600 0.3116529 83 21.176119  0.0000
originFeral  0.019084 0.3174909 83  0.060108  0.9522
sexMale     -0.621959 0.3392944 83 -1.833097  0.0704
sexUnknown  -0.955378 0.6039183 83 -1.581965  0.1175
 Correlation: 
            (Intr) orgnFr sexMal
originFeral -0.442              
sexMale     -0.507  0.060       
sexUnknown  -0.335 -0.124  0.225

Standardized Within-Group Residuals:
        Min          Q1         Med          Q3         Max 
-1.86150185 -0.62279155  0.02054232  0.74108402  2.09713515 

Number of Observations: 92
Number of Groups: 6

5.3.2.4 Functional

Model_func <- lme(fixed = functional ~ origin+sex, data = alpha_div_meta,
               random = ~ 1 | location)
summary(Model_func)

Linear mixed-effects model fit by REML
  Data: alpha_div_meta 
        AIC       BIC   logLik
  -186.0933 -171.2292 99.04663

Random effects:
 Formula: ~1 | location
        (Intercept)  Residual
StdDev:  0.02356286 0.0712589

Fixed effects:  functional ~ origin + sex 
                 Value  Std.Error DF  t-value p-value
(Intercept)  1.4453479 0.01635760 83 88.35944  0.0000
originFeral  0.0194761 0.01539254 83  1.26530  0.2093
sexMale     -0.0148974 0.01644558 83 -0.90586  0.3676
sexUnknown   0.0178121 0.03256538 83  0.54696  0.5859
 Correlation: 
            (Intr) orgnFr sexMal
originFeral -0.404              
sexMale     -0.465  0.059       
sexUnknown  -0.330 -0.121  0.192

Standardized Within-Group Residuals:
       Min         Q1        Med         Q3        Max 
-2.0213571 -0.6393834  0.0797046  0.5702719  3.7232590 

Number of Observations: 92
Number of Groups: 6