Chapter 4 Alpha diversity

4.1 Metagenomics

load("resources/metagenomics/data_filtered.Rdata")
species_color <- c("#e5bd5b", "#6b7398", "#76b183")

sample_metadata <- sample_metadata %>%
  select(1,2,7,8) %>%
  mutate(method="Metagenomics")

4.1.1 MAG level

richness <- genome_counts_filt %>%
  column_to_rownames(var = "genome") %>%
  dplyr::select(where( ~ !all(. == 0))) %>%
  hilldiv(., q = 0) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(richness = 1) %>%
  rownames_to_column(var = "sample")

neutral <- genome_counts_filt %>%
  column_to_rownames(var = "genome") %>%
  dplyr::select(where( ~ !all(. == 0))) %>%
  hilldiv(., q = 1) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(neutral = 1) %>%
  rownames_to_column(var = "sample")

# Merge all metrics
alpha_div_meta <- richness %>%
  full_join(neutral, by = join_by(sample == sample))
microbial_fraction_mapped <- microbial_fraction %>% 
  select(sample, mapped_microbial_fraction) %>% 
  right_join(alpha_div_meta, by="sample") %>% 
  left_join(sample_metadata, by="sample")

microbial_fraction_mapped %>% 
  summarise(
    Total_mean = mean(mapped_microbial_fraction, na.rm = T),
    Total_sd = sd(mapped_microbial_fraction, na.rm = T),
    Eb_mean = mean(mapped_microbial_fraction[Species == "Eb"], na.rm = T),
    Eb_sd = sd(mapped_microbial_fraction[Species == "Eb"], na.rm = T),
    Ha_mean = mean(mapped_microbial_fraction[Species == "Ha"], na.rm = T),
    Ha_sd = sd(mapped_microbial_fraction[Species == "Ha"], na.rm = T),
    Pk_mean = mean(mapped_microbial_fraction[Species == "Pk"], na.rm = T),
    Pk_sd = sd(mapped_microbial_fraction[Species == "Pk"], na.rm = T)
  ) %>%
  mutate(
    Total = str_c(round(Total_mean, 3), "±", round(Total_sd, 3)),
    Cnephaeus = str_c(round(Eb_mean, 3), "±", round(Eb_sd, 3)),
    Hypsugo = str_c(round(Ha_mean, 3), "±", round(Ha_sd, 3)),
    Pipistrellus = str_c(round(Pk_mean, 3), "±", round(Pk_sd, 3))
  ) %>%
  arrange(-Eb_mean) %>%
  dplyr::select(
                Total,
                Cnephaeus,
                Hypsugo,
                Pipistrellus) %>%
  tt()
Total Cnephaeus Hypsugo Pipistrellus
24.104±24.375 33.35±26.484 9.277±11.338 35.238±26.185
  kruskal.test(mapped_microbial_fraction ~ Species, data = microbial_fraction_mapped)

    Kruskal-Wallis rank sum test

data:  mapped_microbial_fraction by Species
Kruskal-Wallis chi-squared = 14.513, df = 2, p-value = 0.0007055

Correlation between diversity and estimate microbial fraction

cor.test(microbial_fraction_mapped$mapped_microbial_fraction, microbial_fraction_mapped$richness, method = "spearman")

    Spearman's rank correlation rho

data:  microbial_fraction_mapped$mapped_microbial_fraction and microbial_fraction_mapped$richness
S = 8375.6, p-value = 0.0006649
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.4834688 
cor.test(microbial_fraction_mapped$mapped_microbial_fraction, microbial_fraction_mapped$neutral, method = "spearman")

    Spearman's rank correlation rho

data:  microbial_fraction_mapped$mapped_microbial_fraction and microbial_fraction_mapped$neutral
S = 10011, p-value = 0.008683
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.3826143 
# Histogram
hist(microbial_fraction_mapped$mapped_microbial_fraction, main = "Read Fraction", xlab = "read_fraction")

hist(microbial_fraction_mapped$richness, main = "Richness", xlab = "richness")

# QQ plots
qqnorm(microbial_fraction_mapped$mapped_microbial_fraction); qqline(microbial_fraction_mapped$mapped_microbial_fraction)

qqnorm(microbial_fraction_mapped$richness); qqline(microbial_fraction_mapped$richness)

shapiro.test(microbial_fraction_mapped$mapped_microbial_fraction)

    Shapiro-Wilk normality test

data:  microbial_fraction_mapped$mapped_microbial_fraction
W = 0.85774, p-value = 4.915e-05
shapiro.test(microbial_fraction_mapped$richness)

    Shapiro-Wilk normality test

data:  microbial_fraction_mapped$richness
W = 0.68491, p-value = 3.533e-09
p1 <- ggplot(microbial_fraction_mapped, aes(x = mapped_microbial_fraction, y = richness)) +
  geom_point(size = 3, color = "#2E86AB") +
  geom_smooth(method = "lm", se = TRUE, color = "black") +
  theme_classic() +
  labs(x = "Read Fraction (%)", y = "Richness")

p2 <- ggplot(microbial_fraction_mapped, aes(x = mapped_microbial_fraction, y = neutral)) +
  geom_point(size = 3, color = "#F39C12") +
  geom_smooth(method = "lm", se = TRUE, color = "black") +
  theme_classic() +
  labs(x = "Read Fraction (%)", y = "Neutral Alpha Diversity")

p1 + p2

metagenome_alpha_div <- alpha_div_meta %>%
  select(1:3) %>% 
  left_join(sample_metadata, by = join_by(sample == sample))
saveRDS(metagenome_alpha_div, "resources/metagenomics/metagenome_alpha_div.rds")

Diversity values at MAG level

alpha_div_meta %>%
  pivot_longer(-sample, names_to = "alpha", values_to = "value") %>%
  left_join(sample_metadata, by = join_by(sample == sample)) %>%
  group_by(alpha) %>%
  summarise(
    total_mean = mean(value, na.rm = T),
    total_sd = sd(value, na.rm = T),
    Eb_mean = mean(value[Species == "Eb"], na.rm = T),
    Eb_sd = sd(value[Species == "Eb"], na.rm = T),
    Ha_mean = mean(value[Species == "Ha"], na.rm = T),
    Ha_sd = sd(value[Species == "Ha"], na.rm = T),
    Pk_mean = mean(value[Species == "Pk"], na.rm = T),
    Pk_sd = sd(value[Species == "Pk"], na.rm = T)
  ) %>%
  mutate(
    Total = str_c(round(total_mean, 2), "±", round(total_sd, 2)),
    Cnephaeus = str_c(round(Eb_mean, 2), "±", round(Eb_sd, 2)),
    Hypsugo = str_c(round(Ha_mean, 2), "±", round(Ha_sd, 2)),
    Pipistrellus = str_c(round(Pk_mean, 2), "±", round(Pk_sd, 2))
  ) %>%
  arrange(-Eb_mean) %>%
  dplyr::select(alpha, Total, Cnephaeus, Pipistrellus, Hypsugo) %>%
  tt()
alpha Total Cnephaeus Pipistrellus Hypsugo
richness 9±11.45 19.7±18.6 8.82±8.73 3.58±2.73
neutral 4.42±5.06 7.33±6.87 5.05±5.48 2.15±1.42

Richness

Model_richness <- MASS::glm.nb(richness ~ Species, data = metagenome_alpha_div,trace=TRUE)
summary(Model_richness)

Call:
MASS::glm.nb(formula = richness ~ Species, data = metagenome_alpha_div, 
    trace = TRUE, init.theta = 1.617425835, link = log)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)   2.9806     0.2587  11.523  < 2e-16 ***
SpeciesHa    -1.7055     0.3379  -5.048 4.46e-07 ***
SpeciesPk    -0.8038     0.3165  -2.540   0.0111 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for Negative Binomial(1.6174) family taken to be 1)

    Null deviance: 78.633  on 50  degrees of freedom
Residual deviance: 51.459  on 48  degrees of freedom
AIC: 317.29

Number of Fisher Scoring iterations: 1

              Theta:  1.617 
          Std. Err.:  0.370 

 2 x log-likelihood:  -309.287 
anova(Model_richness)
Analysis of Deviance Table

Model: Negative Binomial(1.6174), link: log

Response: richness

Terms added sequentially (first to last)

        Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
NULL                       50     78.633              
Species  2   27.173        48     51.459 1.257e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(Model_richness, pairwise ~ Species)
$emmeans
 Species emmean    SE  df asymp.LCL asymp.UCL
 Eb        2.98 0.259 Inf     2.474      3.49
 Ha        1.28 0.217 Inf     0.849      1.70
 Pk        2.18 0.182 Inf     1.819      2.53

Results are given on the log (not the response) scale. 
Confidence level used: 0.95 

$contrasts
 contrast estimate    SE  df z.ratio p.value
 Eb - Ha     1.706 0.338 Inf   5.048  <.0001
 Eb - Pk     0.804 0.316 Inf   2.540  0.0298
 Ha - Pk    -0.902 0.284 Inf  -3.178  0.0042

Results are given on the log (not the response) scale. 
P value adjustment: tukey method for comparing a family of 3 estimates 

Neutral

Model_neutral <- lm(formula = neutral ~ Species, data = metagenome_alpha_div)
anova(Model_neutral)
Analysis of Variance Table

Response: neutral
          Df  Sum Sq Mean Sq F value  Pr(>F)  
Species    2  190.82  95.411  4.1954 0.02093 *
Residuals 48 1091.60  22.742                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(Model_neutral, pairwise ~ Species)
$emmeans
 Species emmean   SE df lower.CL upper.CL
 Eb        7.33 1.51 48   4.2937    10.36
 Ha        2.15 1.09 48  -0.0467     4.35
 Pk        5.05 1.02 48   3.0060     7.09

Confidence level used: 0.95 

$contrasts
 contrast estimate   SE df t.ratio p.value
 Eb - Ha      5.17 1.86 48   2.776  0.0210
 Eb - Pk      2.28 1.82 48   1.251  0.4295
 Ha - Pk     -2.90 1.49 48  -1.940  0.1386

P value adjustment: tukey method for comparing a family of 3 estimates 

4.1.2 At different taxonomic levels

load("resources/metagenomics/data_filtered.Rdata")

genome_metadata <- genome_metadata %>%
  mutate(
    class  = if_else(str_trim(class) == "", paste0(phylum, "_Unclassified"), class),
    order  = if_else(str_trim(order) == "",
                     if_else(str_detect(class, "Unclassified"), class, paste0(class, "_Unclassified")),
                     order),
    family = if_else(str_trim(family) == "",
                     if_else(str_detect(order, "Unclassified"), order, paste0(order, "_Unclassified")),
                     family),
    genus  = if_else(str_trim(genus) == "",
                     if_else(str_detect(family, "Unclassified"), family, paste0(family, "_Unclassified")),
                     genus)
  )

genome_tax <- genome_counts_filt %>% 
  left_join(genome_metadata[c(1,3,4,6)], by = "genome")

# Aggregate counts at the Family level
family_level_agg <- genome_tax %>%
  select(-genome) %>%
  group_by(phylum, class, family) %>%
  summarise(across(where(is.numeric), \(x) sum(x, na.rm = TRUE)), .groups = "drop")%>% 
  as.data.frame() %>% 
  select(-c(phylum,class))

# Aggregate counts at the Phylum level
phylum_level_agg <- genome_tax %>%
  select(-genome) %>%
  group_by(phylum) %>%
  summarise(across(where(is.numeric), \(x) sum(x, na.rm = TRUE)), .groups = "drop")%>% 
  as.data.frame() 

4.1.2.1 Family level

richness_f_metag <- family_level_agg %>%
  column_to_rownames(var = "family") %>%
  dplyr::select(where( ~ !all(. == 0))) %>%
  hilldiv(., q = 0) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(richness = 1) %>%
  rownames_to_column(var = "sample")

neutral_f_metag <- family_level_agg %>%
  column_to_rownames(var = "family") %>%
  dplyr::select(where( ~ !all(. == 0))) %>%
  hilldiv(., q = 1) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(neutral = 1) %>%
  rownames_to_column(var = "sample")

alpha_div_f_metag <- richness_f_metag %>%
  full_join(neutral_f_metag, by = join_by(sample == sample))

alpha_div_f_metag_data <- alpha_div_f_metag %>%
  left_join(sample_metadata, by = join_by(sample == sample))

Diversity values

alpha_div_f_metag %>%
  pivot_longer(-sample, names_to = "alpha", values_to = "value") %>%
  left_join(sample_metadata, by = join_by(sample == sample)) %>%
  group_by(alpha) %>%
  summarise(
    total_mean = mean(value, na.rm = T),
    total_sd = sd(value, na.rm = T),
    Eb_mean = mean(value[Species == "Eb"], na.rm = T),
    Eb_sd = sd(value[Species == "Eb"], na.rm = T),
    Ha_mean = mean(value[Species == "Ha"], na.rm = T),
    Ha_sd = sd(value[Species == "Ha"], na.rm = T),
    Pk_mean = mean(value[Species == "Pk"], na.rm = T),
    Pk_sd = sd(value[Species == "Pk"], na.rm = T)
  ) %>%
  mutate(
    Total = str_c(round(total_mean, 2), "±", round(total_sd, 2)),
    Cnephaeus = str_c(round(Eb_mean, 2), "±", round(Eb_sd, 2)),
    Hypsugo = str_c(round(Ha_mean, 2), "±", round(Ha_sd, 2)),
    Pipistrellus = str_c(round(Pk_mean, 2), "±", round(Pk_sd, 2))
  ) %>%
  arrange(-Eb_mean) %>%
  dplyr::select(alpha, Total, Cnephaeus, Pipistrellus, Hypsugo) %>%
  tt()
alpha Total Cnephaeus Pipistrellus Hypsugo
richness 6.47±6.67 12.2±10.51 6.73±5.35 3.16±2.27
neutral 3.2±2.79 4.63±4.12 3.64±2.75 1.94±1.12

Richness

Model_richness <- MASS::glm.nb(richness ~ Species, data = alpha_div_f_metag_data,trace=TRUE)
anova(Model_richness)
Analysis of Deviance Table

Model: Negative Binomial(2.2583), link: log

Response: richness

Terms added sequentially (first to last)

        Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
NULL                       50     71.305              
Species  2   21.013        48     50.292 2.736e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(Model_richness, pairwise ~ Species)
$emmeans
 Species emmean    SE  df asymp.LCL asymp.UCL
 Eb        2.50 0.229 Inf     2.052      2.95
 Ha        1.15 0.200 Inf     0.758      1.54
 Pk        1.91 0.164 Inf     1.585      2.23

Results are given on the log (not the response) scale. 
Confidence level used: 0.95 

$contrasts
 contrast estimate    SE  df z.ratio p.value
 Eb - Ha     1.352 0.304 Inf   4.445  <.0001
 Eb - Pk     0.595 0.282 Inf   2.113  0.0872
 Ha - Pk    -0.756 0.259 Inf  -2.925  0.0097

Results are given on the log (not the response) scale. 
P value adjustment: tukey method for comparing a family of 3 estimates 

Neutral

Model_neutral <- lm(formula = neutral ~ Species, data = alpha_div_f_metag_data)
anova(Model_neutral)
Analysis of Variance Table

Response: neutral
          Df Sum Sq Mean Sq F value Pr(>F)  
Species    2  55.07 27.5358  3.9508 0.0258 *
Residuals 48 334.54  6.9696                 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(Model_neutral, pairwise ~ Species)
$emmeans
 Species emmean    SE df lower.CL upper.CL
 Eb        4.63 0.835 48     2.95     6.31
 Ha        1.94 0.606 48     0.72     3.16
 Pk        3.64 0.563 48     2.51     4.77

Confidence level used: 0.95 

$contrasts
 contrast estimate    SE df t.ratio p.value
 Eb - Ha     2.693 1.030 48   2.611  0.0316
 Eb - Pk     0.988 1.010 48   0.981  0.5923
 Ha - Pk    -1.705 0.827 48  -2.063  0.1085

P value adjustment: tukey method for comparing a family of 3 estimates 

4.1.2.2 Phylum level

richness_p_metag <- phylum_level_agg %>%
  column_to_rownames(var = "phylum") %>%
  dplyr::select(where( ~ !all(. == 0))) %>%
  hilldiv(., q = 0) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(richness = 1) %>%
  rownames_to_column(var = "sample")

neutral_p_metag <- phylum_level_agg %>%
  column_to_rownames(var = "phylum") %>%
  dplyr::select(where( ~ !all(. == 0))) %>%
  hilldiv(., q = 1) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(neutral = 1) %>%
  rownames_to_column(var = "sample")

alpha_div_phylum_metag <- richness_p_metag %>%
  full_join(neutral_p_metag, by = join_by(sample == sample))

alpha_div_phylum_metag_data <- alpha_div_phylum_metag %>%
  left_join(sample_metadata, by = join_by(sample == sample))

Diversity values

alpha_div_phylum_metag %>%
  pivot_longer(-sample, names_to = "alpha", values_to = "value") %>%
  left_join(sample_metadata, by = join_by(sample == sample)) %>%
  group_by(alpha) %>%
  summarise(
    total_mean = mean(value, na.rm = T),
    total_sd = sd(value, na.rm = T),
    Eb_mean = mean(value[Species == "Eb"], na.rm = T),
    Eb_sd = sd(value[Species == "Eb"], na.rm = T),
    Ha_mean = mean(value[Species == "Ha"], na.rm = T),
    Ha_sd = sd(value[Species == "Ha"], na.rm = T),
    Pk_mean = mean(value[Species == "Pk"], na.rm = T),
    Pk_sd = sd(value[Species == "Pk"], na.rm = T)
  ) %>%
  mutate(
    Total = str_c(round(total_mean, 2), "±", round(total_sd, 2)),
    Cnephaeus = str_c(round(Eb_mean, 2), "±", round(Eb_sd, 2)),
    Hypsugo = str_c(round(Ha_mean, 2), "±", round(Ha_sd, 2)),
    Pipistrellus = str_c(round(Pk_mean, 2), "±", round(Pk_sd, 2))
  ) %>%
  arrange(-Eb_mean) %>%
  dplyr::select(alpha, Total, Cnephaeus, Pipistrellus, Hypsugo) %>%
  tt()
alpha Total Cnephaeus Pipistrellus Hypsugo
richness 2.78±2.21 5.1±2.96 3.09±1.66 1.21±0.54
neutral 1.77±1.11 2.37±1.43 2.09±1.14 1.07±0.2

Richness

Model_richness_p <- MASS::glm.nb(richness ~ Species, data = alpha_div_phylum_metag_data,trace=TRUE)
anova(Model_richness_p)
Analysis of Deviance Table

Model: Negative Binomial(7888.097), link: log

Response: richness

Terms added sequentially (first to last)

        Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
NULL                       50     76.955              
Species  2   37.613        48     39.342 6.799e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(Model_richness_p, pairwise ~ Species)
$emmeans
 Species emmean    SE  df asymp.LCL asymp.UCL
 Eb       1.629 0.140 Inf     1.355      1.90
 Ha       0.191 0.209 Inf    -0.218      0.60
 Pk       1.128 0.121 Inf     0.891      1.37

Results are given on the log (not the response) scale. 
Confidence level used: 0.95 

$contrasts
 contrast estimate    SE  df z.ratio p.value
 Eb - Ha     1.438 0.251 Inf   5.725  <.0001
 Eb - Pk     0.501 0.185 Inf   2.703  0.0189
 Ha - Pk    -0.937 0.241 Inf  -3.886  0.0003

Results are given on the log (not the response) scale. 
P value adjustment: tukey method for comparing a family of 3 estimates 

Neutral

Model_neutral <- lm(formula = neutral ~ Species, data = alpha_div_phylum_metag_data)
anova(Model_neutral)
Analysis of Variance Table

Response: neutral
          Df Sum Sq Mean Sq F value   Pr(>F)   
Species    2 15.015  7.5073  7.8056 0.001161 **
Residuals 48 46.166  0.9618                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(Model_neutral, pairwise ~ Species)
$emmeans
 Species emmean    SE df lower.CL upper.CL
 Eb        2.37 0.310 48    1.747     2.99
 Ha        1.07 0.225 48    0.622     1.53
 Pk        2.09 0.209 48    1.666     2.51

Confidence level used: 0.95 

$contrasts
 contrast estimate    SE df t.ratio p.value
 Eb - Ha     1.297 0.383 48   3.385  0.0040
 Eb - Pk     0.285 0.374 48   0.761  0.7284
 Ha - Pk    -1.012 0.307 48  -3.296  0.0052

P value adjustment: tukey method for comparing a family of 3 estimates 

4.2 Amplicon

load("resources/amplicon/data_standard_filt.Rdata")
sample_metadata <- sample_metadata %>%
  select(1,9,10,13) %>%
  mutate(method="Metabarcoding_standard")

no_archaea <- genome_counts_filt %>% 
  left_join(genome_metadata[c(1,2)], by="genome") %>% 
  filter(domain!="Archaea") %>% 
  select(-domain)

4.2.1 ASV level

richness <- genome_counts_filt %>%
  column_to_rownames(var = "genome") %>%
  dplyr::select(where( ~ !all(. == 0))) %>%
  hilldiv(., q = 0) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(richness = 1) %>%
  rownames_to_column(var = "sample")

neutral <- genome_counts_filt %>%
  column_to_rownames(var = "genome") %>%
  dplyr::select(where( ~ !all(. == 0))) %>%
  hilldiv(., q = 1) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(neutral = 1) %>%
  rownames_to_column(var = "sample")

amplicon_alpha_div <- richness %>%
  full_join(neutral, by = join_by(sample == sample))

Correlation between diversity and estimate microbial fraction

fraction_ampli <- read_csv("resources/metagenomics/microbial_fraction_estimation.csv") %>% 
  mutate(sample = str_remove(sample, "_M_1")) %>% 
  select(sample, read_fraction) %>% 
  right_join(amplicon_alpha_div, by="sample") %>% 
  left_join(sample_metadata, by="sample")

cor.test(fraction_ampli$read_fraction, fraction_ampli$richness, method = "spearman")

    Spearman's rank correlation rho

data:  fraction_ampli$read_fraction and fraction_ampli$richness
S = 17781, p-value = 0.5231
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
-0.09659512 
cor.test(fraction_ampli$read_fraction, fraction_ampli$neutral, method = "spearman")

    Spearman's rank correlation rho

data:  fraction_ampli$read_fraction and fraction_ampli$neutral
S = 16198, p-value = 0.9948
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
0.001048412 
p1_ampli <- ggplot(fraction_ampli, aes(x = read_fraction, y = richness)) +
  geom_point(size = 3, color = "#2E86AB") +
  geom_smooth(method = "lm", se = TRUE, color = "black") +
  theme_classic() +
  labs(x = "Read Fraction (%)", y = "Richness", title= "Correlation between estimated microbial fraction and diversity")

p2_ampli <- ggplot(fraction_ampli, aes(x = read_fraction, y = neutral)) +
  geom_point(size = 3, color = "#F39C12") +
  geom_smooth(method = "lm", se = TRUE, color = "black") +
  theme_classic() +
  labs(x = "Read Fraction (%)", y = "Neutral Alpha Diversity")

p1_ampli + p2_ampli 

Diversity values

alpha Total Cnephaeus Pipistrellus Hypsugo
richness 103.63±92.06 155.4±140.37 87.55±89.59 95±49.42
neutral 20.43±20.03 30.59±33.95 17.86±17.52 18.06±10.46
amplicon_alpha_div_data <- amplicon_alpha_div %>%
  left_join(sample_metadata, by = join_by(sample == sample))
saveRDS(amplicon_alpha_div_data, "resources/amplicon/amplicon_alpha_div.rds")

Richness

Model_richness_ampli <- MASS::glm.nb(richness ~ Species, data = amplicon_alpha_div_data,trace=TRUE)
anova(Model_richness_ampli)
Analysis of Deviance Table

Model: Negative Binomial(1.8863), link: log

Response: richness

Terms added sequentially (first to last)

        Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
NULL                       50     60.068           
Species  2    4.859        48     55.209  0.08808 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Neutral

Model_neutral <- lm(formula = neutral ~ Species, data = amplicon_alpha_div_data)
anova(Model_neutral)
Analysis of Variance Table

Response: neutral
          Df  Sum Sq Mean Sq F value Pr(>F)
Species    2  1284.1  642.07  1.6408 0.2045
Residuals 48 18783.4  391.32               

4.2.2 At different taxonomic level

load("resources/amplicon/data_standard_filt.Rdata")

sample_metadata_amplicon <- sample_metadata[c(1,9,10,13)] %>%
  mutate(sample = paste0(sample, "_ampli")) %>%
  mutate(method="Metabarcoding_standard")

genome_metadata_ampli <- genome_metadata %>%
  mutate(
    class  = if_else(is.na(class), paste0(phylum, "_Unclassified"), class),
    order  = if_else(is.na(order),
                     if_else(str_detect(class, "Unclassified"), class, paste0(class, "_Unclassified")),
                     order),
    family = if_else(is.na(family),
                     if_else(str_detect(order, "Unclassified"), order, paste0(order, "_Unclassified")),
                     family),
    genus  = if_else(is.na(genus),
                     if_else(str_detect(family, "Unclassified"), family, paste0(family, "_Unclassified")),
                     genus)
  )

genome_tax_ampli <- genome_counts_filt %>% 
  column_to_rownames(., "genome")%>%
  rename_with(~ paste0(., "_ampli")) %>%
  rownames_to_column(., "genome")%>%
  left_join(genome_metadata_ampli, by = "genome")

family_level_agg_ampli <- genome_tax_ampli %>%
  select(-genome) %>%
  group_by(phylum, class, family) %>%
  summarise(across(where(is.numeric), \(x) sum(x, na.rm = TRUE)), .groups = "drop")%>% 
    as.data.frame() %>% 
  select(-c(phylum,class))

phylum_level_agg_ampli <- genome_tax_ampli %>%
  select(-genome) %>%
  group_by(phylum) %>%
  summarise(across(where(is.numeric), \(x) sum(x, na.rm = TRUE)), .groups = "drop")%>% 
    as.data.frame()

4.2.2.1 Family level

richness_f_ampli <- family_level_agg_ampli %>%
  column_to_rownames(var = "family") %>%
  dplyr::select(where( ~ !all(. == 0))) %>%
  hilldiv(., q = 0) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(richness = 1) %>%
  rownames_to_column(var = "sample")

neutral_f_ampli <- family_level_agg_ampli %>%
  column_to_rownames(var = "family") %>%
  dplyr::select(where( ~ !all(. == 0))) %>%
  hilldiv(., q = 1) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(neutral = 1) %>%
  rownames_to_column(var = "sample")

alpha_div_f_ampli <- richness_f_ampli %>%
  full_join(neutral_f_ampli, by = join_by(sample == sample))

alpha_div_f_ampli_data <- alpha_div_f_ampli %>%
  left_join(sample_metadata_amplicon, by = join_by(sample == sample))

Diversity values

alpha_div_f_ampli %>%
  pivot_longer(-sample, names_to = "alpha", values_to = "value") %>%
  left_join(sample_metadata_amplicon, by = join_by(sample == sample)) %>%
  group_by(alpha) %>%
  summarise(
    total_mean = mean(value, na.rm = T),
    total_sd = sd(value, na.rm = T),
    Eb_mean = mean(value[Species == "Eb"], na.rm = T),
    Eb_sd = sd(value[Species == "Eb"], na.rm = T),
    Ha_mean = mean(value[Species == "Ha"], na.rm = T),
    Ha_sd = sd(value[Species == "Ha"], na.rm = T),
    Pk_mean = mean(value[Species == "Pk"], na.rm = T),
    Pk_sd = sd(value[Species == "Pk"], na.rm = T)
  ) %>%
  mutate(
    Total = str_c(round(total_mean, 2), "±", round(total_sd, 2)),
    Cnephaeus = str_c(round(Eb_mean, 2), "±", round(Eb_sd, 2)),
    Hypsugo = str_c(round(Ha_mean, 2), "±", round(Ha_sd, 2)),
    Pipistrellus = str_c(round(Pk_mean, 2), "±", round(Pk_sd, 2))
  ) %>%
  arrange(-Eb_mean) %>%
  dplyr::select(alpha, Total, Cnephaeus, Pipistrellus, Hypsugo) %>%
  tt()
alpha Total Cnephaeus Pipistrellus Hypsugo
richness 38.61±20.88 40.4±20.86 33.73±22.76 43.32±18.28
neutral 7.93±5.03 8.65±6.76 6.74±5.19 8.94±3.61

Richness

Model_richness_ampli_f <- MASS::glm.nb(richness ~ Species, data = alpha_div_f_ampli_data,trace=TRUE)
anova(Model_richness_ampli_f)
Analysis of Deviance Table

Model: Negative Binomial(3.6305), link: log

Response: richness

Terms added sequentially (first to last)

        Df Deviance Resid. Df Resid. Dev Pr(>Chi)
NULL                       50      55.61         
Species  2   2.2101        48      53.40   0.3312

Neutral

Model_neutral_ampli_f <- lm(formula = neutral ~ Species, data = alpha_div_f_ampli_data)
anova(Model_neutral_ampli_f)
Analysis of Variance Table

Response: neutral
          Df  Sum Sq Mean Sq F value Pr(>F)
Species    2   55.89  27.944  1.1072 0.3388
Residuals 48 1211.41  25.238               

4.2.2.2 Phylum level

richness_p_ampli <- phylum_level_agg_ampli %>%
  column_to_rownames(var = "phylum") %>%
  dplyr::select(where( ~ !all(. == 0))) %>%
  hilldiv(., q = 0) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(richness = 1) %>%
  rownames_to_column(var = "sample")

neutral_p_ampli <- phylum_level_agg_ampli %>%
  column_to_rownames(var = "phylum") %>%
  dplyr::select(where( ~ !all(. == 0))) %>%
  hilldiv(., q = 1) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(neutral = 1) %>%
  rownames_to_column(var = "sample")

alpha_div_phylum_ampli <- richness_p_ampli %>%
  full_join(neutral_p_ampli, by = join_by(sample == sample))

alpha_div_phylum_ampli_data <- alpha_div_phylum_ampli %>%
  left_join(sample_metadata_amplicon, by = join_by(sample == sample))

Diversity values

alpha_div_phylum_ampli %>%
  pivot_longer(-sample, names_to = "alpha", values_to = "value") %>%
  left_join(sample_metadata_amplicon, by = join_by(sample == sample)) %>%
  group_by(alpha) %>%
  summarise(
    total_mean = mean(value, na.rm = T),
    total_sd = sd(value, na.rm = T),
    Eb_mean = mean(value[Species == "Eb"], na.rm = T),
    Eb_sd = sd(value[Species == "Eb"], na.rm = T),
    Ha_mean = mean(value[Species == "Ha"], na.rm = T),
    Ha_sd = sd(value[Species == "Ha"], na.rm = T),
    Pk_mean = mean(value[Species == "Pk"], na.rm = T),
    Pk_sd = sd(value[Species == "Pk"], na.rm = T)
  ) %>%
  mutate(
    Total = str_c(round(total_mean, 2), "±", round(total_sd, 2)),
    Cnephaeus = str_c(round(Eb_mean, 2), "±", round(Eb_sd, 2)),
    Hypsugo = str_c(round(Ha_mean, 2), "±", round(Ha_sd, 2)),
    Pipistrellus = str_c(round(Pk_mean, 2), "±", round(Pk_sd, 2))
  ) %>%
  arrange(-Eb_mean) %>%
  dplyr::select(alpha, Total, Cnephaeus, Pipistrellus, Hypsugo) %>%
  tt()
alpha Total Cnephaeus Pipistrellus Hypsugo
richness 8.04±3.65 10±4.16 7.27±3.83 7.89±2.9
neutral 2.55±1.08 2.98±1.37 2.58±1.19 2.28±0.69

Richness

Model_richness_ampli_p <- MASS::glm.nb(richness ~ Species, data = alpha_div_phylum_ampli_data,trace=TRUE)
anova(Model_richness_ampli_p)
Analysis of Deviance Table

Model: Negative Binomial(18.5682), link: log

Response: richness

Terms added sequentially (first to last)

        Df Deviance Resid. Df Resid. Dev Pr(>Chi)
NULL                       50     53.958         
Species  2   4.2299        48     49.728   0.1206

Neutral

Model_neutral_ampli_p <- lm(formula = neutral ~ Species, data = alpha_div_phylum_ampli_data)
anova(Model_neutral_ampli_p)
Analysis of Variance Table

Response: neutral
          Df Sum Sq Mean Sq F value Pr(>F)
Species    2  3.311  1.6557  1.4477 0.2452
Residuals 48 54.898  1.1437               

4.3 Comparison

4.3.1 At family and phylum level

load("resources/amplicon/data_standard_filt.Rdata")

sample_metadata_amplicon <- sample_metadata[c(1,9,10,13)] %>%
  mutate(sample = paste0(sample, "_ampli")) %>%
  mutate(method="Metabarcoding_standard")

genome_metadata_ampli <- genome_metadata %>%
  mutate(
    class  = if_else(is.na(class), paste0(phylum, "_Unclassified"), class),
    order  = if_else(is.na(order),
                     if_else(str_detect(class, "Unclassified"), class, paste0(class, "_Unclassified")),
                     order),
    family = if_else(is.na(family),
                     if_else(str_detect(order, "Unclassified"), order, paste0(order, "_Unclassified")),
                     family),
    genus  = if_else(is.na(genus),
                     if_else(str_detect(family, "Unclassified"), family, paste0(family, "_Unclassified")),
                     genus)
  )

genome_tax_ampli <- genome_counts_filt %>% 
  column_to_rownames(., "genome")%>%
  rename_with(~ paste0(., "_ampli")) %>%
  rownames_to_column(., "genome")%>%
  left_join(genome_metadata_ampli, by = "genome")

family_level_agg_ampli <- genome_tax_ampli %>%
  select(-genome) %>%
  group_by(phylum, class, family) %>%
  summarise(across(where(is.numeric), \(x) sum(x, na.rm = TRUE)), .groups = "drop")%>% 
    as.data.frame() %>% 
  select(-c(phylum,class))

phylum_level_agg_ampli <- genome_tax_ampli %>%
  select(-genome) %>%
  group_by(phylum) %>%
  summarise(across(where(is.numeric), \(x) sum(x, na.rm = TRUE)), .groups = "drop")%>% 
    as.data.frame()
load("resources/metagenomics/data_filtered.Rdata")

genome_metadata <- genome_metadata %>%
  mutate(
    class  = if_else(str_trim(class) == "", paste0(phylum, "_Unclassified"), class),
    order  = if_else(str_trim(order) == "",
                     if_else(str_detect(class, "Unclassified"), class, paste0(class, "_Unclassified")),
                     order),
    family = if_else(str_trim(family) == "",
                     if_else(str_detect(order, "Unclassified"), order, paste0(order, "_Unclassified")),
                     family),
    genus  = if_else(str_trim(genus) == "",
                     if_else(str_detect(family, "Unclassified"), family, paste0(family, "_Unclassified")),
                     genus)
  )

genome_tax <- genome_counts_filt %>% 
  left_join(genome_metadata[c(1,3,4,6)], by = "genome")

# Aggregate counts at the Family level
family_level_agg <- genome_tax %>%
  select(-genome) %>%
  group_by(phylum, class, family) %>%
  summarise(across(where(is.numeric), \(x) sum(x, na.rm = TRUE)), .groups = "drop")%>% 
  as.data.frame() %>% 
  select(-c(phylum,class))

# Aggregate counts at the Phylum level
phylum_level_agg <- genome_tax %>%
  select(-genome) %>%
  group_by(phylum) %>%
  summarise(across(where(is.numeric), \(x) sum(x, na.rm = TRUE)), .groups = "drop")%>% 
  as.data.frame() 
merged_table <- full_join(family_level_agg_ampli,family_level_agg, 
          by = c("family"))%>%
  mutate(across(where(is.numeric), ~replace_na(., 0)))

merged_table_phylum <- full_join(phylum_level_agg_ampli,phylum_level_agg, 
          by = c("phylum"))%>%
  mutate(across(where(is.numeric), ~replace_na(., 0)))

sample_shot <- sample_metadata %>% 
  select(-c(Habitat,Longitude, Latitude, Region))%>%
  mutate(method="Metagenomics")

all_samples <- rbind(sample_shot, sample_metadata_amplicon)
richness <- merged_table %>%
  column_to_rownames(var = "family") %>%
  dplyr::select(where( ~ !all(. == 0))) %>%
  hilldiv(., q = 0) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(richness = 1) %>%
  rownames_to_column(var = "sample")

neutral <- merged_table %>%
  column_to_rownames(var = "family") %>%
  dplyr::select(where( ~ !all(. == 0))) %>%
  hilldiv(., q = 1) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(neutral = 1) %>%
  rownames_to_column(var = "sample")

alpha_div_family <- richness %>%
  full_join(neutral, by = join_by(sample == sample))

alpha_div_family_data <- alpha_div_family %>%
  left_join(all_samples, by = join_by(sample == sample)) %>% 
  mutate(sampleid = str_remove(sample, "_ampli"))

4.3.1.1 Family level

alpha_div_family_data %>%
  select(-sample,-sampleid, -Life_stage, -Sex) %>%
  pivot_longer(
    cols = -c(method, Species),
    names_to = "metric",
    values_to = "value"
  ) %>%
  mutate(
    metric = factor(
      metric,
      levels = c("richness", "neutral"),
      labels = c("Richness", "Neutral")
    ),
    Species = factor(
      Species,
      levels = c("Eb", "Pk", "Ha"),
      labels = c("C. bottae", "P. kuhlii", "H. ariel")
      )
  ) %>%  
  ggplot(aes(y = value, x = method, color = Species, fill = Species)) +
  geom_boxplot(
    width = 0.4,
    alpha = 0.5,
    outlier.shape = NA,
    position = position_dodge(width = 0.75)
  ) +
  geom_jitter(
    position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.75),
    size = 1,
    alpha = 0.8,
  ) +
  facet_wrap(~metric, scales = "free_y") +
  scale_color_manual(values=species_color,  labels = c(expression(italic("C. bottae")),
             expression(italic("H. ariel")),
             expression(italic("P. kuhlii"))))+
  scale_fill_manual(values=species_color,  labels = c(expression(italic("C. bottae")),
             expression(italic("H. ariel")),
             expression(italic("P. kuhlii")))) +
  theme_classic() +
  theme(
    strip.background = element_blank(),
    panel.grid.minor.x = element_line(size = 0.1, color = "grey"),
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(size = 12),
    axis.title.y = element_text(margin = margin(t = 0, r = 20, b = 0, l = 0)),
    strip.text = element_text(size = 14, lineheight = 0.6)
  ) +
  labs(
    x = "Methodological approach",
    y = "Alpha diversity at family level",
    color = "Species"
  )

Richness

Model_richness_stand <- glmer.nb(richness ~ Species*method+(1|sampleid), data = alpha_div_family_data)
summary(Model_richness_stand)
Generalized linear mixed model fit by maximum likelihood (Laplace Approximation) ['glmerMod']
 Family: Negative Binomial(3.0526)  ( log )
Formula: richness ~ Species * method + (1 | sampleid)
   Data: alpha_div_family_data

     AIC      BIC   logLik deviance df.resid 
   742.7    763.7   -363.3    726.7       94 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-1.4298 -0.7920 -0.2740  0.5214  2.3835 

Random effects:
 Groups   Name        Variance Std.Dev.
 sampleid (Intercept) 0.003989 0.06316 
Number of obs: 102, groups:  sampleid, 51

Fixed effects:
                             Estimate Std. Error z value Pr(>|z|)    
(Intercept)                   3.69577    0.19136  19.313  < 2e-16 ***
SpeciesHa                     0.07113    0.23368   0.304    0.761    
SpeciesPk                    -0.18218    0.22911  -0.795    0.427    
methodMetagenomics           -1.20029    0.27796  -4.318 1.57e-05 ***
SpeciesHa:methodMetagenomics -1.41893    0.35967  -3.945 7.98e-05 ***
SpeciesPk:methodMetagenomics -0.41133    0.33999  -1.210    0.226    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
            (Intr) SpecsH SpcsPk mthdMt SpcH:M
SpeciesHa   -0.808                            
SpeciesPk   -0.803  0.663                     
mthdMtgnmcs -0.647  0.537  0.562              
SpcsH:mthdM  0.503 -0.636 -0.433 -0.771       
SpcsPk:mthM  0.527 -0.438 -0.670 -0.819  0.631
anova(Model_richness_stand)
Analysis of Variance Table
               npar  Sum Sq Mean Sq  F value
Species           2   3.879   1.939   1.9394
method            1 199.977 199.977 199.9766
Species:method    2  18.326   9.163   9.1628
emmeans(Model_richness_stand, pairwise ~ Species)
$emmeans
 Species emmean    SE  df asymp.LCL asymp.UCL
 Eb        3.10 0.147 Inf      2.81      3.38
 Ha        2.46 0.117 Inf      2.23      2.69
 Pk        2.71 0.108 Inf      2.50      2.92

Results are averaged over the levels of: method 
Results are given on the log (not the response) scale. 
Confidence level used: 0.95 

$contrasts
 contrast estimate    SE  df z.ratio p.value
 Eb - Ha     0.638 0.183 Inf   3.489  0.0014
 Eb - Pk     0.388 0.171 Inf   2.270  0.0601
 Ha - Pk    -0.250 0.154 Inf  -1.629  0.2334

Results are averaged over the levels of: method 
Results are given on the log (not the response) scale. 
P value adjustment: tukey method for comparing a family of 3 estimates 
emmeans(Model_richness_stand, pairwise ~ method | Species)
$emmeans
Species = Eb:
 method                 emmean    SE  df asymp.LCL asymp.UCL
 Metabarcoding_standard   3.70 0.191 Inf     3.321      4.07
 Metagenomics             2.50 0.212 Inf     2.079      2.91

Species = Ha:
 method                 emmean    SE  df asymp.LCL asymp.UCL
 Metabarcoding_standard   3.77 0.138 Inf     3.497      4.04
 Metagenomics             1.15 0.186 Inf     0.783      1.51

Species = Pk:
 method                 emmean    SE  df asymp.LCL asymp.UCL
 Metabarcoding_standard   3.51 0.137 Inf     3.245      3.78
 Metagenomics             1.90 0.154 Inf     1.601      2.20

Results are given on the log (not the response) scale. 
Confidence level used: 0.95 

$contrasts
Species = Eb:
 contrast                              estimate    SE  df z.ratio p.value
 Metabarcoding_standard - Metagenomics     1.20 0.278 Inf   4.318  <.0001

Species = Ha:
 contrast                              estimate    SE  df z.ratio p.value
 Metabarcoding_standard - Metagenomics     2.62 0.229 Inf  11.434  <.0001

Species = Pk:
 contrast                              estimate    SE  df z.ratio p.value
 Metabarcoding_standard - Metagenomics     1.61 0.195 Inf   8.265  <.0001

Results are given on the log (not the response) scale. 

Neutral

Modelq1 <- lme(fixed = neutral ~ Species*method, data = alpha_div_family_data,
               random = ~ 1 | sampleid)
summary(Modelq1)
Linear mixed-effects model fit by REML
  Data: alpha_div_family_data 
       AIC      BIC    logLik
  565.6677 586.1824 -274.8338

Random effects:
 Formula: ~1 | sampleid
        (Intercept) Residual
StdDev:    2.370619 3.237882

Fixed effects:  neutral ~ Species * method 
                                 Value Std.Error DF   t-value p-value
(Intercept)                   8.648785  1.269004 48  6.815411  0.0000
SpeciesHa                     0.289247  1.567781 48  0.184495  0.8544
SpeciesPk                    -1.913748  1.530477 48 -1.250426  0.2172
methodMetagenomics           -4.018137  1.448025 48 -2.774909  0.0078
SpeciesHa:methodMetagenomics -2.982507  1.788950 48 -1.667183  0.1020
SpeciesPk:methodMetagenomics  0.925954  1.746384 48  0.530212  0.5984
 Correlation: 
                             (Intr) SpecsH SpcsPk mthdMt SpcH:M
SpeciesHa                    -0.809                            
SpeciesPk                    -0.829  0.671                     
methodMetagenomics           -0.571  0.462  0.473              
SpeciesHa:methodMetagenomics  0.462 -0.571 -0.383 -0.809       
SpeciesPk:methodMetagenomics  0.473 -0.383 -0.571 -0.829  0.671

Standardized Within-Group Residuals:
        Min          Q1         Med          Q3         Max 
-1.62037711 -0.57512732 -0.04722462  0.37902692  3.12052590 

Number of Observations: 102
Number of Groups: 51 
anova(Modelq1)
               numDF denDF   F-value p-value
(Intercept)        1    48 145.46863  <.0001
Species            2    48   0.68907  0.5069
method             1    48  54.41395  <.0001
Species:method     2    48   3.86407  0.0278
emmeans(Modelq1, pairwise ~ Species)
$emmeans
 Species emmean    SE df lower.CL upper.CL
 Eb        6.64 1.040 48     4.54     8.74
 Ha        5.44 0.756 48     3.92     6.96
 Pk        5.19 0.703 48     3.78     6.60

Results are averaged over the levels of: method 
Degrees-of-freedom method: containment 
Confidence level used: 0.95 

$contrasts
 contrast estimate   SE df t.ratio p.value
 Eb - Ha     1.202 1.29 48   0.934  0.6220
 Eb - Pk     1.451 1.26 48   1.154  0.4860
 Ha - Pk     0.249 1.03 48   0.241  0.9685

Results are averaged over the levels of: method 
Degrees-of-freedom method: containment 
P value adjustment: tukey method for comparing a family of 3 estimates 
emmeans(Modelq1, pairwise ~ method | Species)
$emmeans
Species = Eb:
 method                 emmean    SE df lower.CL upper.CL
 Metabarcoding_standard   8.65 1.270 50   6.0999    11.20
 Metagenomics             4.63 1.270 48   2.0791     7.18

Species = Ha:
 method                 emmean    SE df lower.CL upper.CL
 Metabarcoding_standard   8.94 0.921 48   7.0870    10.79
 Metagenomics             1.94 0.921 48   0.0863     3.79

Species = Pk:
 method                 emmean    SE df lower.CL upper.CL
 Metabarcoding_standard   6.74 0.856 48   5.0148     8.46
 Metagenomics             3.64 0.856 48   1.9226     5.36

Degrees-of-freedom method: containment 
Confidence level used: 0.95 

$contrasts
Species = Eb:
 contrast                              estimate    SE df t.ratio p.value
 Metabarcoding_standard - Metagenomics     4.02 1.450 48   2.775  0.0078

Species = Ha:
 contrast                              estimate    SE df t.ratio p.value
 Metabarcoding_standard - Metagenomics     7.00 1.050 48   6.664  <.0001

Species = Pk:
 contrast                              estimate    SE df t.ratio p.value
 Metabarcoding_standard - Metagenomics     3.09 0.976 48   3.167  0.0027

Degrees-of-freedom method: containment 

4.3.1.2 Phylum level

richness_phylum <- merged_table_phylum %>%
  column_to_rownames(var = "phylum") %>%
  dplyr::select(where( ~ !all(. == 0))) %>%
  hilldiv(., q = 0) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(richness = 1) %>%
  rownames_to_column(var = "sample")

neutral_phylum <- merged_table_phylum %>%
  column_to_rownames(var = "phylum") %>%
  dplyr::select(where( ~ !all(. == 0))) %>%
  hilldiv(., q = 1) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(neutral = 1) %>%
  rownames_to_column(var = "sample")

alpha_div_phylum <- richness_phylum %>%
  full_join(neutral_phylum, by = join_by(sample == sample))
alpha_div_phylum_data <- alpha_div_phylum %>%
  left_join(all_samples, by = join_by(sample == sample)) %>% 
  mutate(sampleid = str_remove(sample, "_ampli"))
alpha_div_phylum_data %>%
  select(-sample, -sampleid, -Life_stage, -Sex) %>%
  pivot_longer(
    cols = -c(method, Species),
    names_to = "metric",
    values_to = "value"
  ) %>%
  mutate(
    metric = factor(
      metric,
      levels = c("richness", "neutral"),
      labels = c("Richness", "Neutral")
    ),
    Species = factor(
      Species,
      levels = c("Eb", "Pk", "Ha"),
      labels = c("C. bottae", "P. kuhlii", "H. ariel")
      )
  ) %>%   
  ggplot(aes(y = value, x = method, color = Species, fill = Species)) +
  geom_boxplot(
    width = 0.4,
    alpha = 0.5,
    outlier.shape = NA,
    position = position_dodge(width = 0.75)
  ) +
  geom_jitter(
    position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.75),
    size = 1,
    alpha = 0.8,
    show.legend = FALSE
  ) +
  facet_wrap(~metric, scales = "free_y") +
  scale_color_manual(values=species_color,  labels = c(expression(italic("C. bottae")),
             expression(italic("H. ariel")),
             expression(italic("P. kuhlii"))))+
  scale_fill_manual(values=species_color,  labels = c(expression(italic("C. bottae")),
             expression(italic("H. ariel")),
             expression(italic("P. kuhlii")))) +
  theme_classic() +
  theme(
    strip.background = element_blank(),
    panel.grid.minor.x = element_line(size = 0.1, color = "grey"),
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(size = 12),
    axis.title.y = element_text(margin = margin(t = 0, r = 20, b = 0, l = 0)),
    strip.text = element_text(size = 14, lineheight = 0.6)
  ) +
  labs(
    x = "Methodological approach",
    y = "Alpha diversity at phylum level",
    color = "Species"
  )

Richness

Model_richness_random <- glmer.nb(richness ~ Species*method+(1|sampleid), data = alpha_div_phylum_data)
summary(Model_richness_random)
Generalized linear mixed model fit by maximum likelihood (Laplace Approximation) ['glmerMod']
 Family: Negative Binomial(608.3082)  ( log )
Formula: richness ~ Species * method + (1 | sampleid)
   Data: alpha_div_phylum_data

     AIC      BIC   logLik deviance df.resid 
   455.4    476.4   -219.7    439.4       94 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-1.5919 -0.4741 -0.1426  0.3313  2.6884 

Random effects:
 Groups   Name        Variance Std.Dev.
 sampleid (Intercept) 0.04368  0.209   
Number of obs: 102, groups:  sampleid, 51

Fixed effects:
                             Estimate Std. Error z value Pr(>|z|)    
(Intercept)                    2.2784     0.1216  18.737  < 2e-16 ***
SpeciesHa                     -0.2317     0.1540  -1.504   0.1326    
SpeciesPk                     -0.3177     0.1516  -2.096   0.0361 *  
methodMetagenomics            -0.6733     0.1730  -3.891 9.99e-05 ***
SpeciesHa:methodMetagenomics  -1.2017     0.2833  -4.242 2.22e-05 ***
SpeciesPk:methodMetagenomics  -0.1820     0.2259  -0.806   0.4204    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
            (Intr) SpecsH SpcsPk mthdMt SpcH:M
SpeciesHa   -0.783                            
SpeciesPk   -0.793  0.626                     
mthdMtgnmcs -0.483  0.381  0.388              
SpcsH:mthdM  0.295 -0.388 -0.237 -0.611       
SpcsPk:mthM  0.370 -0.292 -0.482 -0.766  0.468
anova(Model_richness_random)
Analysis of Variance Table
               npar Sum Sq Mean Sq F value
Species           2  6.856   3.428  3.4279
method            1 98.840  98.840 98.8404
Species:method    2 19.561   9.780  9.7804
emmeans(Model_richness_random, pairwise ~ Species)
$emmeans
 Species emmean     SE  df asymp.LCL asymp.UCL
 Eb        1.94 0.1100 Inf     1.726      2.16
 Ha        1.11 0.1230 Inf     0.869      1.35
 Pk        1.53 0.0864 Inf     1.364      1.70

Results are averaged over the levels of: method 
Results are given on the log (not the response) scale. 
Confidence level used: 0.95 

$contrasts
 contrast estimate    SE  df z.ratio p.value
 Eb - Ha     0.833 0.164 Inf   5.079  <.0001
 Eb - Pk     0.409 0.139 Inf   2.946  0.0090
 Ha - Pk    -0.424 0.149 Inf  -2.842  0.0124

Results are averaged over the levels of: method 
Results are given on the log (not the response) scale. 
P value adjustment: tukey method for comparing a family of 3 estimates 
emmeans(Model_richness_random, pairwise ~ method | Species)
$emmeans
Species = Eb:
 method                 emmean     SE  df asymp.LCL asymp.UCL
 Metabarcoding_standard  2.278 0.1220 Inf     2.040     2.517
 Metagenomics            1.605 0.1560 Inf     1.299     1.911

Species = Ha:
 method                 emmean     SE  df asymp.LCL asymp.UCL
 Metabarcoding_standard  2.047 0.0959 Inf     1.859     2.235
 Metagenomics            0.172 0.2140 Inf    -0.249     0.592

Species = Pk:
 method                 emmean     SE  df asymp.LCL asymp.UCL
 Metabarcoding_standard  1.961 0.0923 Inf     1.780     2.142
 Metagenomics            1.105 0.1300 Inf     0.850     1.361

Results are given on the log (not the response) scale. 
Confidence level used: 0.95 

$contrasts
Species = Eb:
 contrast                              estimate    SE  df z.ratio p.value
 Metabarcoding_standard - Metagenomics    0.673 0.173 Inf   3.891  0.0001

Species = Ha:
 contrast                              estimate    SE  df z.ratio p.value
 Metabarcoding_standard - Metagenomics    1.875 0.224 Inf   8.359  <.0001

Species = Pk:
 contrast                              estimate    SE  df z.ratio p.value
 Metabarcoding_standard - Metagenomics    0.855 0.145 Inf   5.887  <.0001

Results are given on the log (not the response) scale. 

Neutral

Modelq1_random <- lme(fixed = neutral ~ Species*method, data = alpha_div_phylum_data,
               random = ~ 1 | sampleid)
summary(Modelq1_random)
Linear mixed-effects model fit by REML
  Data: alpha_div_phylum_data 
       AIC      BIC    logLik
  292.4413 312.9561 -138.2206

Random effects:
 Formula: ~1 | sampleid
        (Intercept)  Residual
StdDev:   0.7637683 0.6851316

Fixed effects:  neutral ~ Species * method 
                                  Value Std.Error DF   t-value p-value
(Intercept)                   2.9831154 0.3244607 48  9.194074  0.0000
SpeciesHa                    -0.7063902 0.4008522 48 -1.762221  0.0844
SpeciesPk                    -0.4047138 0.3913143 48 -1.034242  0.3062
methodMetagenomics           -0.6121400 0.3064002 48 -1.997845  0.0514
SpeciesHa:methodMetagenomics -0.5905534 0.3785395 48 -1.560084  0.1253
SpeciesPk:methodMetagenomics  0.1200440 0.3695325 48  0.324854  0.7467
 Correlation: 
                             (Intr) SpecsH SpcsPk mthdMt SpcH:M
SpeciesHa                    -0.809                            
SpeciesPk                    -0.829  0.671                     
methodMetagenomics           -0.472  0.382  0.392              
SpeciesHa:methodMetagenomics  0.382 -0.472 -0.317 -0.809       
SpeciesPk:methodMetagenomics  0.392 -0.317 -0.472 -0.829  0.671

Standardized Within-Group Residuals:
       Min         Q1        Med         Q3        Max 
-2.7220701 -0.3730800 -0.1714426  0.3351791  3.6366141 

Number of Observations: 102
Number of Groups: 51 
anova(Modelq1_random)
               numDF denDF   F-value p-value
(Intercept)        1    48 289.57575  <.0001
Species            2    48   4.76013   0.013
method             1    48  33.08181  <.0001
Species:method     2    48   2.92926   0.063
emmeans(Modelq1_random, pairwise ~ Species)
$emmeans
 Species emmean    SE df lower.CL upper.CL
 Eb        2.68 0.286 48     2.10     3.25
 Ha        1.68 0.207 48     1.26     2.09
 Pk        2.33 0.193 48     1.94     2.72

Results are averaged over the levels of: method 
Degrees-of-freedom method: containment 
Confidence level used: 0.95 

$contrasts
 contrast estimate    SE df t.ratio p.value
 Eb - Ha     1.002 0.353 48   2.835  0.0180
 Eb - Pk     0.345 0.345 48   0.999  0.5809
 Ha - Pk    -0.657 0.283 48  -2.319  0.0626

Results are averaged over the levels of: method 
Degrees-of-freedom method: containment 
P value adjustment: tukey method for comparing a family of 3 estimates 
emmeans(Modelq1_random, pairwise ~ method | Species)
$emmeans
Species = Eb:
 method                 emmean    SE df lower.CL upper.CL
 Metabarcoding_standard   2.98 0.324 50    2.331     3.63
 Metagenomics             2.37 0.324 48    1.719     3.02

Species = Ha:
 method                 emmean    SE df lower.CL upper.CL
 Metabarcoding_standard   2.28 0.235 48    1.803     2.75
 Metagenomics             1.07 0.235 48    0.601     1.55

Species = Pk:
 method                 emmean    SE df lower.CL upper.CL
 Metabarcoding_standard   2.58 0.219 48    2.139     3.02
 Metagenomics             2.09 0.219 48    1.646     2.53

Degrees-of-freedom method: containment 
Confidence level used: 0.95 

$contrasts
Species = Eb:
 contrast                              estimate    SE df t.ratio p.value
 Metabarcoding_standard - Metagenomics    0.612 0.306 48   1.998  0.0514

Species = Ha:
 contrast                              estimate    SE df t.ratio p.value
 Metabarcoding_standard - Metagenomics    1.203 0.222 48   5.411  <.0001

Species = Pk:
 contrast                              estimate    SE df t.ratio p.value
 Metabarcoding_standard - Metagenomics    0.492 0.207 48   2.382  0.0212

Degrees-of-freedom method: containment