Chapter 7 Copy filtering & prevalence

load("resources/amplicon/data_nocopyfilt.Rdata")
load("resources/amplicon/master_table.Rdata")
species_color <- c("#000000","#e5bd5b", "#6b7398", "#76b183")

7.1 Total ASVs

asv_total <- master_table %>%
  group_by(copy_threshold, prevalence, genome) %>%
  summarise(asv_count = sum(value != 0, na.rm = TRUE)) %>%
  group_by(copy_threshold, prevalence) %>%
  summarise(Total = sum(asv_count != 0, na.rm = TRUE))
p <- master_table %>%
  group_by(Species, copy_threshold, prevalence) %>%
  summarise(asv_count = sum(value != 0, na.rm = TRUE)) %>%
  pivot_wider(names_from = Species, values_from = c(asv_count)) %>%
  left_join(.,
            asv_total,
            by = join_by(
              "prevalence" == "prevalence",
              "copy_threshold" == "copy_threshold"
            )) %>%
  pivot_longer(!c(copy_threshold, prevalence),
               names_to = "Species",
               values_to = "ASVs") %>%  
  mutate(
    Species = factor(
      Species,
      levels = c("Total", "Eb", "Pk", "Ha"),
      labels = c("All bat species", "C. bottae", "P. kuhlii", "H. ariel")
      ) ) %>% 
  ggplot(aes(
    x = prevalence,
    y = ASVs,
    group = Species,
    color = Species
  )) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  scale_color_manual(values = species_color,
                     labels = c(
                       expression(italic("All bat species")),
                       expression(italic("C. bottae")),
                       expression(italic("H. ariel")),
                       expression(italic("P. kuhlii"))
                     ))+
  facet_wrap( ~ factor(
    copy_threshold,
    labels = c(
      "t0" = "No cut-off",
      "t1" = "Cut-off 0.0001",
      "t2" = "Cut-off 0.001",
      "t3" = "Cut-off 0.01",
      "t4" = "Cut-off 0.1"
    )
  ), scales = "free_y") +
  labs(x = "Prevalence threshold (%)", y = "Total ASVs", color = "Bat species") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 12,face = "bold"),
    axis.text.y = element_text(size = 12,face = "bold"),
    axis.title = element_text(size = 14, face = "bold"),
    panel.background = element_blank(),
    axis.line = element_line(
      size = 0.5,
      linetype = "solid",
      colour = "black"
    ),
    strip.text = element_text(size = 14,face = "bold"),
    legend.text = element_text(size = 14),
    legend.title = element_text(size = 16),
    legend.position = "bottom",
    legend.box = "horizontal"
  )
ggsave("filtering_thresholds_general.png", plot = p, width = 10, height = 8)

7.2 Total archaeal ASVs

archaea_total <- master_table %>%
  left_join(., genome_metadata, by="genome")   %>% 
  filter(domain=="Archaea") %>%
  group_by(copy_threshold, prevalence, genome) %>%
  summarise(asv_count = sum(value != 0, na.rm = TRUE)) %>%
  group_by(copy_threshold, prevalence) %>%
  summarise(Total = sum(asv_count != 0, na.rm = TRUE))
master_table %>%
  left_join(., genome_metadata, by="genome")   %>% 
  filter(domain=="Archaea") %>%
  group_by(Species, copy_threshold, prevalence) %>% 
  summarise(asv_count = sum(value != 0, na.rm = TRUE)) %>%
  pivot_wider(names_from = Species, values_from = c(asv_count)) %>%
  left_join(.,
            archaea_total,
            by = join_by(
              "prevalence" == "prevalence",
              "copy_threshold" == "copy_threshold"
            )) %>%
  pivot_longer(!c(copy_threshold, prevalence),
               names_to = "Species",
               values_to = "ASVs") %>%
  ggplot(aes(
    x = prevalence,
    y = ASVs,
    group = Species,
    color = Species
  )) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  facet_wrap( ~ factor(
    copy_threshold,
    labels = c(
      "t0" = "No cut-off",
      "t1" = "Cut-off 0.0001",
      "t2" = "Cut-off 0.001",
      "t3" = "Cut-off 0.01",
      "t4" = "Cut-off 0.1"
    )
  ), scales = "free_y") +
  labs(x = "Prevalence threshold (%)", y = "Total ASVs", color = "Bat species") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(face = "bold", size = 18),
    panel.background = element_blank(),
    axis.line = element_line(
      size = 0.5,
      linetype = "solid",
      colour = "black"
    ),
    strip.text = element_text(size = 20, face = "bold"),
    legend.text = element_text(size = 16),
    legend.title = element_text(size = 18),
    legend.position = "bottom",
    legend.box = "horizontal"
  )

7.3 Single to individuals

total_single <- master_table %>%
  group_by(prevalence, copy_threshold, genome) %>%
  summarise(freq = sum(pa, na.rm = TRUE), .groups = "drop") %>%
  filter(freq == 1) %>%
  group_by(prevalence, copy_threshold) %>%
  summarise(total = n())

master_table %>%
  group_by(Species, copy_threshold, prevalence, genome) %>%
  summarise(freq = sum(pa, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(names_from = Species, values_from = c(freq)) %>%
  filter(rowSums(across(Eb:Pk)) == 1)  %>%
  pivot_longer(!c(copy_threshold, prevalence, genome),
               names_to = "Species",
               values_to = "single") %>%
  filter(single == 1) %>%
  group_by(Species, copy_threshold, prevalence) %>%
  summarise(single = n()) %>%
  pivot_wider(names_from = Species, values_from = c(single)) %>%
  left_join(
    .,
    total_single,
    by = join_by(
      "prevalence" == "prevalence",
      "copy_threshold" == "copy_threshold"
    )
  ) %>%
  mutate(across(everything(), ~ replace_na(., 0))) %>%
  pivot_longer(!c(copy_threshold, prevalence),
               names_to = "Species",
               values_to = "single") %>%
  pivot_wider(names_from = prevalence, values_from = c(single)) %>%
  select(Species, everything())  %>%
  mutate(
    Species = case_when(
      Species == "Eb" ~ "Eptesicus",
      Species == "Ha" ~ "Hypsugo",
      Species == "Pk" ~ "Pipistrellus",
      Species == "total" ~ "All species",
      TRUE ~ Species  # Keep other values unchanged
    )
  ) %>%
  mutate(
    copy_threshold = case_when(
      copy_threshold == "t0" ~ " no cut-off",
      copy_threshold == "t1" ~ "cut-off 0.0001",
      copy_threshold == "t2" ~ "cut-off 0.001",
      copy_threshold == "t3" ~ "cut-off 0.01",
      copy_threshold == "t4" ~ "cut-off 0.1",
      TRUE ~ copy_threshold
    )
  ) %>%
  rename(
    `Copy threshold` = copy_threshold,
    `No prevalence` = p0,
    `Prevalence 10%` = p10,
    
  ) %>%
  arrange(Species) %>%
  tt()

Species	Copy threshold	No prevalence	Prevalence 10%
All species	no cut-off	2996	1013
All species	cut-off 0.0001	2477	795
All species	cut-off 0.001	1051	346
All species	cut-off 0.01	284	96
All species	cut-off 0.1	60	8
Eptesicus	no cut-off	908	1013
Eptesicus	cut-off 0.0001	693	795
Eptesicus	cut-off 0.001	282	346
Eptesicus	cut-off 0.01	71	96
Eptesicus	cut-off 0.1	5	8
Hypsugo	no cut-off	1059	0
Hypsugo	cut-off 0.0001	893	0
Hypsugo	cut-off 0.001	391	0
Hypsugo	cut-off 0.01	113	0
Hypsugo	cut-off 0.1	27	0
Pipistrellus	no cut-off	1029	0
Pipistrellus	cut-off 0.0001	891	0
Pipistrellus	cut-off 0.001	378	0
Pipistrellus	cut-off 0.01	100	0
Pipistrellus	cut-off 0.1	28	0

7.4 Single to species

master_table %>%
  group_by(Species, copy_threshold, prevalence, genome) %>%
  summarise(freq = sum(pa, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(names_from = Species, values_from = c(freq)) %>%
  mutate(across(c("Eb", "Pk", "Ha"), ~ as.integer(. > 0))) %>%
  filter(rowSums(across(Eb:Pk)) == 1)  %>%
  pivot_longer(!c(copy_threshold, prevalence, genome),
               names_to = "Species",
               values_to = "unique") %>%
  filter(unique == 1) %>%
  group_by(Species, copy_threshold, prevalence) %>%
  summarise(unique = n()) %>%
  ggplot(aes(
    x = prevalence,
    y = unique,
    group = Species,
    color = Species
  )) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  facet_wrap( ~ factor(
    copy_threshold,
    labels = c(
      "t0" = "No cut-off",
      "t1" = "Cut-off 0.0001",
      "t2" = "Cut-off 0.001",
      "t3" = "Cut-off 0.01",
      "t4" = "Cut-off 0.1"
    )
  ), scales = "free_y") +
  labs(x = "Prevalence threshold (%)", y = " Single to species", color = "Bat species") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(face = "bold", size = 18),
    panel.background = element_blank(),
    axis.line = element_line(
      size = 0.5,
      linetype = "solid",
      colour = "black"
    ),
    strip.text = element_text(size = 20),
    legend.text = element_text(size = 16),
    legend.title = element_text(size = 18),
    legend.position = "bottom",
    legend.box = "horizontal"
  )

7.5 Alpha Diversity

prev_threshold <- c("p0", "p10", "p20", "p30", "p40")
copy_threshold <- c("t0", "t1", "t2", "t3", "t4")

bats <- c("Eb", "Pk", "Ha")

results <- data.frame(
  alpha = character(),
  Total = numeric(),
  Eptesicus = character(),
  Pipistrellus = character(),
  Hypsugo = character(),
  Copy_threshold = character(),
  Prevalence = character()
)

for (copy in copy_threshold) {
  genome_table <- master_table %>%
    filter(copy_threshold == copy) %>%
    select(-copy_threshold, -pa) %>%
    pivot_wider(names_from = genome, values_from = value)
  for (prev in prev_threshold) {
    prev_table <- genome_table %>%
      filter(prevalence == prev) %>%
      select(-Species, -prevalence) %>%
      column_to_rownames(., "sample") %>%
      filter(rowSums(across(everything())) != 0) %>%
      t() %>%
      as.data.frame()
    richness <- prev_table %>%
      dplyr::select(where(~ !all(. == 0))) %>%
      hilldiv(., q = 0) %>%
      t() %>%
      as.data.frame() %>%
      dplyr::rename(richness = 1) %>%
      rownames_to_column(var = "sample")
    neutral <- prev_table %>%
      dplyr::select(where(~ !all(. == 0))) %>%
      hilldiv(., q = 1) %>%
      t() %>%
      as.data.frame() %>%
      dplyr::rename(neutral = 1) %>%
      rownames_to_column(var = "sample")
    
    alpha_div <- richness %>%
      full_join(neutral, by = join_by(sample == sample))
    
    div <- alpha_div %>%
      pivot_longer(-sample, names_to = "alpha", values_to = "value") %>%
      left_join(sample_metadata, by = join_by(sample == sample)) %>%
      group_by(alpha) %>%
      summarise(
        total_mean = mean(value, na.rm = T),
        total_sd = sd(value, na.rm = T),
        Eb_mean = mean(value[Species == "Eb"], na.rm = T),
        Eb_sd = sd(value[Species == "Eb"], na.rm = T),
        Ha_mean = mean(value[Species == "Ha"], na.rm = T),
        Ha_sd = sd(value[Species == "Ha"], na.rm = T),
        Pk_mean = mean(value[Species == "Pk"], na.rm = T),
        Pk_sd = sd(value[Species == "Pk"], na.rm = T)
      ) %>%
      mutate(
        Total = str_c(round(total_mean, 3), "±", round(total_sd, 3)),
        Eptesicus = str_c(round(Eb_mean, 3), "±", round(Eb_sd, 3)),
        Hypsugo = str_c(round(Ha_mean, 3), "±", round(Ha_sd, 3)),
        Pipistrellus = str_c(round(Pk_mean, 3), "±", round(Pk_sd, 3))
      ) %>%
      dplyr::select(alpha, Total, Eptesicus, Pipistrellus, Hypsugo) %>%
      mutate(Prevalence = prev) %>%
      mutate(Copy_threshold = copy) %>%
      as.data.frame()
    
    results <- rbind(results, div)
  }
} 

results <- results %>%
  mutate(across(everything(), ~ ifelse(is.na(.), 0, .)))

results %>%
  pivot_longer(!c(alpha, Prevalence, Copy_threshold),
               names_to = "Species",
               values_to = "value") %>%
  filter(alpha == "richness") %>%  mutate(
    Species = factor(
      Species,
      levels = c("Total", "Eptesicus", "Pipistrellus", "Hypsugo"),
      labels = c("All bat species", "C. bottae", "P. kuhlii", "H. ariel")
      ) ) %>% 
  ggplot(.,
         aes(
           x = Prevalence,
           y = as.numeric(gsub("±.*", "", value)),
           group = Species,
           color = Species
         )) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  scale_color_manual(values = species_color,
                     labels = c(
                       expression(italic("All bat species")),
                       expression(italic("C. bottae")),
                       expression(italic("H. ariel")),
                       expression(italic("P. kuhlii"))
                     ))+
  facet_wrap( ~ factor(
    Copy_threshold,
    labels = c(
      "t0" = "No cut-off",
      "t1" = "Cut-off 0.0001",
      "t2" = "Cut-off 0.001",
      "t3" = "Cut-off 0.01",
      "t4" = "Cut-off 0.1"
    )), scales = "free_y") +
  labs(
    title = "Alpha Diversity Across Copy filtering and Prevalence Thresholds",
    x = "Prevalence Threshold",
    y = "Richness",
    color = "Species"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 12, face = "bold"),
    axis.text.y = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 16, face = "bold"),
    panel.background = element_blank(),
    axis.line = element_line(
      size = 0.5,
      linetype = "solid",
      colour = "black"
    ),
    strip.text = element_text(size = 16, face = "bold"),
    legend.text = element_text(size = 16),
    legend.title = element_text(size = 18),
    legend.position = "bottom",
    legend.box = "horizontal"
  )

results %>%
  pivot_longer(!c(alpha, Prevalence, Copy_threshold),
               names_to = "Species",
               values_to = "value") %>%
  filter(alpha == "neutral") %>%
  mutate(
    Species = factor(
      Species,
      levels = c("Total", "Eptesicus", "Pipistrellus", "Hypsugo"),
      labels = c("All bat species", "C. bottae", "P. kuhlii", "H. ariel")
      ) ) %>% 
  ggplot(.,
         aes(
           x = Prevalence,
           y = as.numeric(gsub("±.*", "", value)),
           group = Species,
           color = Species
         )) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  scale_color_manual(values = species_color,
                     labels = c(
                       expression(italic("All bat species")),
                       expression(italic("C. bottae")),
                       expression(italic("H. ariel")),
                       expression(italic("P. kuhlii"))
                     ))+
  facet_wrap( ~ factor(
    Copy_threshold,
    labels = c(
      "t0" = "No cut-off",
      "t1" = "Cut-off 0.0001",
      "t2" = "Cut-off 0.001",
      "t3" = "Cut-off 0.01",
      "t4" = "Cut-off 0.1"
    )
  ), scales = "free_y") +
  labs(
    x = "Prevalence Threshold",
    y = "Neutral",
    color = "Species"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 12, face = "bold"),
    axis.text.y = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 14, face = "bold"),
    panel.background = element_blank(),
    axis.line = element_line(
      size = 0.5,
      linetype = "solid",
      colour = "black"
    ),
    strip.text = element_text(size = 14, face = "bold"),
    legend.text = element_text(size = 14),
    legend.title = element_text(size = 16),
    legend.position = "bottom",
    legend.box = "horizontal"
  )

7.5.1 Comparison alpha diversity: amplicom different threshold vs metagenomics

prev_threshold <- c("p0", "p10", "p20", "p30", "p40")
copy_threshold <- c("t0", "t1", "t2", "t3", "t4")

bats <- c("Eb", "Pk", "Ha")

alpha_both <- data.frame(
  alpha = character(),
  Total = numeric(),
  Eptesicus = character(),
  Pipistrellus = character(),
  Hypsugo = character(),
  Copy_threshold = character(),
  Prevalence = character()
)

for (copy in copy_threshold) {
  genome_table <- master_table %>%
    filter(copy_threshold == copy) %>%
    select(-copy_threshold, -pa) %>%
    pivot_wider(names_from = genome, values_from = value)
  for (prev in prev_threshold) {
    prev_table <- genome_table %>%
      filter(prevalence == prev) %>%
      select(-Species, -prevalence) %>%
      column_to_rownames(., "sample") %>%
      filter(rowSums(across(everything())) != 0) %>%
      t() %>%
      as.data.frame()
    richness <- prev_table %>%
      dplyr::select(where( ~ !all(. == 0))) %>%
      hilldiv(., q = 0) %>%
      t() %>%
      as.data.frame() %>%
      dplyr::rename(richness = 1) %>%
      rownames_to_column(var = "sample")
    neutral <- prev_table %>%
      dplyr::select(where( ~ !all(. == 0))) %>%
      hilldiv(., q = 1) %>%
      t() %>%
      as.data.frame() %>%
      dplyr::rename(neutral = 1) %>%
      rownames_to_column(var = "sample")
    
    alpha_div <- richness %>%
      full_join(neutral, by = join_by(sample == sample))
    
    div <- alpha_div %>%
      mutate(prevalence = prev) %>%
      mutate(copy_threshold = copy) %>%
      as.data.frame()
    
    alpha_both <- rbind(alpha_both, div)
  }
} 

alpha_both <- alpha_both %>%
  mutate(method = "16S")

write_csv(alpha_both,"resources/amplicon/alpha_twothreshold_table.csv")

*** Statistical differences in richness with different thresholds between methods ***

metagenome_alpha_div <- readRDS("resources/metagenomics/metagenome_alpha_div.rds") %>%
  select(sample, richness, neutral, method)

prev_threshold <- c("p0", "p10", "p20", "p30", "p40")
copy_threshold <- c("t0", "t1", "t2", "t3", "t4")

wilcox_results_table <- data.frame(
  Copy_threshold = character(),
  Prevalence = character(),
  W_statistic = numeric(),
  P_value = numeric(),
  stringsAsFactors = FALSE
)

for (copy in copy_threshold) {
  alpha_Filt <- alpha_both %>%
    filter(copy_threshold == copy) %>%
    select(-copy_threshold)
  for (prev in prev_threshold) {
    all_result <- alpha_Filt %>%
      filter(prevalence == prev) %>%
      select(-prevalence) %>%
      rbind(., metagenome_alpha_div) %>%
        pivot_longer(!c(sample, method),
               names_to = "metric",
               values_to = "diversity") %>% 
      filter(metric=="richness")
    
      wilcox_result <- wilcox.test(diversity ~ method, data = all_result,
                   exact = FALSE)
      W <- wilcox_result[[1]]
      p_value <- wilcox_result[[3]]
    
    wilcox_results_table <- wilcox_results_table %>%
      add_row(
        Copy_threshold = copy,
        Prevalence = prev,
        W_statistic = W,
        P_value = p_value
      )
  }
}

All p values

wilcox_results_table %>%
  mutate(
    Copy_threshold = case_when(
      Copy_threshold == "t0" ~ " no cut-off",
      Copy_threshold == "t1" ~ "cut-off 0.0001",
      Copy_threshold == "t2" ~ "cut-off 0.001",
      Copy_threshold == "t3" ~ "cut-off 0.01",
      Copy_threshold == "t4" ~ "cut-off 0.1",
      TRUE ~ Copy_threshold
    )
  ) %>%
  mutate(
    Prevalence = case_when(
      Prevalence == "p0" ~ "No prevalence",
      Prevalence == "p10" ~ "Prevalence 10%",
      Prevalence == "p20" ~ "Prevalence 20%",
      Prevalence == "p30" ~ "Prevalence 30%",
      Prevalence == "p40" ~ "Prevalence 40%",
      TRUE ~ Prevalence
    )
  ) %>%
  rename(
    `Copy threshold` = Copy_threshold,
    `Prevalence` = Prevalence,
    `W statistic` = W_statistic,
    `P_value` = P_value
  ) %>%
  tt()

Significant p values

wilcox_results_table %>%
  mutate(
    Copy_threshold = case_when(
      Copy_threshold == "t0" ~ " no cut-off",
      Copy_threshold == "t1" ~ "cut-off 0.0001",
      Copy_threshold == "t2" ~ "cut-off 0.001",
      Copy_threshold == "t3" ~ "cut-off 0.01",
      Copy_threshold == "t4" ~ "cut-off 0.1",
      TRUE ~ Copy_threshold
    )
  ) %>%
  mutate(
    Prevalence = case_when(
      Prevalence == "p0" ~ "No prevalence",
      Prevalence == "p10" ~ "Prevalence 10%",
      Prevalence == "p20" ~ "Prevalence 20%",
      Prevalence == "p30" ~ "Prevalence 30%",
      Prevalence == "p40" ~ "Prevalence 40%",
      TRUE ~ Prevalence
    )
  ) %>%
  filter(P_value<0.05) %>% 
  rename(
    `Copy threshold` = Copy_threshold,
    `Prevalence` = Prevalence,
    `W statistic` = W_statistic,
    `P value` = P_value
  ) %>%
  tt()

Non-Significant p values

wilcox_results_table %>%
  mutate(
    Copy_threshold = case_when(
      Copy_threshold == "t0" ~ " no cut-off",
      Copy_threshold == "t1" ~ "cut-off 0.0001",
      Copy_threshold == "t2" ~ "cut-off 0.001",
      Copy_threshold == "t3" ~ "cut-off 0.01",
      Copy_threshold == "t4" ~ "cut-off 0.1",
      TRUE ~ Copy_threshold
    )
  ) %>%
  mutate(
    Prevalence = case_when(
      Prevalence == "p0" ~ "No prevalence",
      Prevalence == "p10" ~ "Prevalence 10%",
      Prevalence == "p20" ~ "Prevalence 20%",
      Prevalence == "p30" ~ "Prevalence 30%",
      Prevalence == "p40" ~ "Prevalence 40%",
      TRUE ~ Prevalence
    )
  ) %>%
  filter(P_value>0.05) %>% 
  rename(
    `Copy threshold` = Copy_threshold,
    `Prevalence` = Prevalence,
    `W statistic` = W_statistic,
    `P value` = P_value
  ) %>%
  tt()

7.5.2 Finding the threshold that gives results most similar to metagenomics

load("resources/metagenomics/data_filtered.Rdata")

metagenomics_diversity <- genome_counts_filt %>% 
  column_to_rownames("genome") %>% 
  hilldiv(.,q=1) %>% 
  as.data.frame() %>% 
  rownames_to_column("filter")

metagenomics_diversity[1,1] <- "metagenomics_metagenomics"

diversity_master <- master_table %>%
  mutate(filter=str_c(copy_threshold,"_",prevalence)) %>%
  select(-copy_threshold,-prevalence) %>% 
  group_by(filter,sample) %>% 
  summarise(diversity=hilldiv(value,q=1)) %>%
  pivot_wider(names_from=sample,values_from=diversity)

diversity_master <- bind_rows(metagenomics_diversity,diversity_master)
diversity_average <- diversity_master %>%
  mutate(mean_diversity = rowMeans(select(., -filter), na.rm = TRUE)) %>% 
  select(filter,mean_diversity)

numeric_data <- diversity_master %>% select(-filter)
metadata <- diversity_master$filter

pca_result <- prcomp(numeric_data, scale. = TRUE)

pca_df <- as.data.frame(pca_result$x) %>%
  mutate(filter = metadata) %>% 
  separate(filter, into = c("copy", "prevalence"), sep = "_", fill = "right", remove = FALSE) %>% 
  left_join(diversity_average,by="filter") %>% 
  mutate(
    copy = case_when(
      copy == "metagenomics" ~ "Metagenomics",
      copy == "t0" ~ "No filtering",
      copy == "t1" ~ "< 0.0001",
      copy == "t2" ~ "< 0.001",
      copy == "t3" ~ "< 0.01",
      copy == "t4" ~ "< 0.1",
      TRUE ~ copy
    )
  ) %>%
  mutate(
    prevalence = case_when(
      prevalence == "metagenomics" ~ NA_character_,
      prevalence == "p0" ~ "No filtering",
      prevalence == "p10" ~ "< 10%",
      prevalence == "p20" ~ "< 20%",
      prevalence == "p30" ~ "< 30%",
      prevalence == "p40" ~ "< 40%",
      TRUE ~ prevalence
    )
  ) %>%
  mutate(
    copy = factor(
      copy,
      levels = c("Metagenomics", "No filtering", "< 0.0001", "< 0.001", "< 0.01", "< 0.1"),
      ) )%>%
  mutate(
    prevalence = factor(
      prevalence,
      levels = c("No filtering", "< 10%", "< 20%", "< 30%", "< 40%"),
      ) ) %>%
  mutate(is_meta = copy == "Metagenomics")


ggplot(pca_df, aes(x = PC1, y = PC2, color = copy, shape=prevalence, label=filter)) +
  geom_point(
    data = subset(pca_df, !is_meta),
    aes(color = copy, shape = prevalence),
    alpha = 0.6, size=5
  ) +
  geom_point(
    data = subset(pca_df, is_meta),
    shape = 21,
    fill = "red",
    color = "black", size=7
  ) +
  geom_label_repel(
    data = subset(pca_df, is_meta),
    aes(label = "Metagenomics"),
    color = "black",
    fontface = "bold",
    size = 4,
    label.size = 0.4,
    nudge_x = 0.05, 
  nudge_y = 0.18 
  )+
  scale_shape_discrete(na.translate = FALSE) +
  theme_minimal() +
    theme(
      axis.text.x = element_text(size = 12),
      axis.text.y = element_text(size = 12),
      axis.title = element_text(size = 18, face = "bold"),
      panel.background = element_blank(),
      axis.line = element_line(size = 0.5, linetype = "solid", colour = "black"),
      legend.text = element_text(size = 16),
      legend.title = element_text(size = 18)
    )+
  labs(
       x = paste0("PC1 (", round(summary(pca_result)$importance[2,1] * 100, 2), "% variance)"),
       y = paste0("PC2 (", round(summary(pca_result)$importance[2,2] * 100, 2), "% variance)"),
    color = "Read fraction",
    shape = "Prevalence"
  )

pca_scores <- as.data.frame(pca_result$x)
reference_row <- pca_scores[diversity_master$filter == "metagenomics_metagenomics", ]
variance_explained <- summary(pca_result)$importance[2, ]

weighted_distances <- apply(pca_result$x, 1, function(row) {
  sum(((row - reference_row)^2) * variance_explained)
})

tibble(filter=diversity_master$filter,distance=weighted_distances) %>% 
    mutate(distance = round(distance, 2)) %>%
  separate(filter, into = c("copy", "prevalence"), sep = "_", fill = "right", remove = FALSE) %>% 
  mutate(
    copy = case_when(
      copy == "metagenomics" ~ "Metagenomics",
      copy == "t0" ~ "No filtering",
      copy == "t1" ~ "< 0.0001",
      copy == "t2" ~ "< 0.001",
      copy == "t3" ~ "< 0.01",
      copy == "t4" ~ "< 0.1",
      TRUE ~ copy
    )
  )%>%
  mutate(
    prevalence = case_when(
      prevalence == "metagenomics" ~ NA_character_,
      prevalence == "p0" ~ "No filtering",
      prevalence == "p10" ~ "< 10%",
      prevalence == "p20" ~ "< 20%",
      prevalence == "p30" ~ "< 30%",
      prevalence == "p40" ~ "< 40%",
      TRUE ~ prevalence
    )
  ) %>% 
  select(-filter) %>% 
  arrange(distance) %>% 
  tt()

copy	prevalence	distance
Metagenomics	NA	0.00
< 0.01	< 10%	0.77
No filtering	< 30%	1.10
< 0.0001	< 30%	1.20
< 0.001	< 20%	1.43
No filtering	< 40%	2.17
< 0.0001	< 40%	2.33
< 0.1	No filtering	2.81
< 0.001	< 30%	4.25
< 0.0001	< 20%	6.74
No filtering	< 20%	7.28
< 0.001	< 40%	8.31
< 0.01	< 20%	8.94
< 0.01	< 30%	15.49
< 0.1	< 10%	15.69
< 0.1	< 20%	17.67
< 0.01	< 40%	18.25
< 0.1	< 30%	18.25
< 0.1	< 40%	18.25
< 0.001	< 10%	24.79
< 0.01	No filtering	37.70
< 0.0001	< 10%	48.48
No filtering	< 10%	50.24
< 0.001	No filtering	172.93
< 0.0001	No filtering	243.76
No filtering	No filtering	248.13

diversity_master %>% 
  filter(filter %in% c("metagenomics_metagenomics","t3_p10")) %>%
  pivot_longer(!filter, names_to = "sample", values_to = "diversity") %>% 
  ggplot(aes(x=filter,y=diversity, group=filter)) +
    geom_boxplot() + 
    geom_jitter() +
    geom_line(aes(group = sample), linetype = "dashed", size = 1)+
  theme_minimal()

Save the table with selected filtering thresholds

load("resources/amplicon/data_nocopyfilt.Rdata")
load("resources/amplicon/master_table.Rdata")

genome_counts_filt_amplicon <- master_table %>%
  mutate(filter = str_c(copy_threshold, "_", prevalence)) %>%
  filter(filter == "t3_p10") %>%
  select(-copy_threshold, -prevalence, -filter, -pa, -Species) %>%
  pivot_wider(names_from = sample, values_from = value) %>%
  mutate(across(everything(), ~ replace_na(., 0))) %>%
  select(genome, where( ~ is.numeric(.) && sum(.) > 0)) %>%
  filter(rowSums(across(-genome, ~ . != 0)) > 0)

sample_metadata_amplicon <- sample_metadata %>%
  filter(sample %in% colnames(genome_counts_filt_amplicon))

genome_metadata_amplicon <- genome_metadata %>%
  filter(genome %in% genome_counts_filt_amplicon$genome)

save(sample_metadata_amplicon, genome_metadata_amplicon, genome_counts_filt_amplicon, file = "resources/amplicon/selected_filtering_rel.Rdata")

load("resources/amplicon/data_nocopyfilt.Rdata")
load("resources/amplicon/selected_filtering.Rdata")

genome_counts_filt_amplicon <- genome_counts_filt %>% 
  filter(genome %in% genome_counts_filt_amplicon$genome) %>% 
  filter(rowSums(across(-genome)) != 0) %>% 
  select(genome, where(~ !all(. == 0))) %>% 
  select(all_of(colnames(genome_counts_filt_amplicon)))

# genome_counts_filt_amplicon_pa <- genome_counts_filt_amplicon %>% 
#   mutate(across(-genome, ~ if_else(. > 0, 1, 0)))
# 
# genome_counts_filt_amplicon <- genome_counts_filt %>%
#   column_to_rownames("genome") %>%
#   `*`(genome_counts_filt_amplicon_pa %>%
#         column_to_rownames("genome")) %>%
#   rownames_to_column("genome") %>% 
#   filter(genome %in% genome_metadata_amplicon$genome) %>% 
#   filter(rowSums(across(-genome)) != 0)

sample_metadata_amplicon <- sample_metadata %>%
  filter(sample %in% colnames(genome_counts_filt_amplicon))

genome_metadata_amplicon <- genome_metadata %>%
  filter(genome %in% genome_counts_filt_amplicon$genome)

phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
  mutate_at(vars(phylum), ~ str_replace(., "[dpcofgs]__", ""))%>%
    right_join(genome_metadata_amplicon, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_metadata_amplicon$genome)) %>%
    select(phylum, colors) %>% 
    unique() %>%
    arrange(phylum) %>%
    pull(colors, name=phylum)

save(sample_metadata_amplicon, genome_metadata_amplicon, genome_counts_filt_amplicon, phylum_colors, file = "resources/amplicon/selected_filtering.Rdata")

load("resources/amplicon/data_nocopyfilt.Rdata")

genome_counts_filt_amplicon <- master_table %>%
  mutate(filter = str_c(copy_threshold, "_", prevalence)) %>%
  filter(filter == "t0_p30") %>%
  select(-copy_threshold, -prevalence, -filter, -pa, -Species) %>%
  pivot_wider(names_from = sample, values_from = value) %>%
  mutate(across(everything(), ~ replace_na(., 0))) %>%
  select(genome, where( ~ is.numeric(.) && sum(.) > 0)) %>%
  filter(rowSums(across(-genome, ~ . != 0)) > 0)

sample_metadata_amplicon <- sample_metadata %>%
  filter(sample %in% colnames(genome_counts_filt_amplicon_t0_p30))

genome_metadata_amplicon <- genome_metadata %>%
  filter(genome %in% genome_counts_filt_amplicon_t0_p30$genome)

save(sample_metadata_amplicon, genome_metadata_amplicon, genome_counts_filt_amplicon,file = "resources/amplicon/second_filtering_t0_p30.Rdata")

7.6 Based on beta diversity

Aggregate counts at family or phylum level in both datasets

load("resources/metagenomics/data_filtered.Rdata")

genome_metadata <- genome_metadata %>%
  mutate(
    class  = if_else(str_trim(class) == "", paste0(phylum, "_Unclassified"), class),
    order  = if_else(str_trim(order) == "",
                     if_else(str_detect(class, "Unclassified"), class, paste0(class, "_Unclassified")),
                     order),
    family = if_else(str_trim(family) == "",
                     if_else(str_detect(order, "Unclassified"), order, paste0(order, "_Unclassified")),
                     family),
    genus  = if_else(str_trim(genus) == "",
                     if_else(str_detect(family, "Unclassified"), family, paste0(family, "_Unclassified")),
                     genus)
  )

genome_tax <- genome_counts_filt %>% 
  left_join(genome_metadata[c(1,3,4,6)], by = "genome")

# Aggregate counts at the Family level
family_level_agg <- genome_tax %>%
  select(-genome) %>%
  group_by(phylum, class, family) %>%
  summarise(across(where(is.numeric), \(x) sum(x, na.rm = TRUE)), .groups = "drop")%>% 
  as.data.frame() %>% 
  select(-c(phylum,class))

meta_dist <- family_level_agg %>%
  column_to_rownames("family") %>%
  hillpair(q = 1)

load("resources/amplicon/data_nocopyfilt.Rdata")
load("resources/amplicon/master_table.Rdata")

filters <- master_table  %>%
  mutate(filter = str_c(copy_threshold, "_", prevalence)) %>%
  distinct(filter) %>%
  pull(filter)

genome_metadata_ampli <- genome_metadata %>%
  mutate(
    class  = if_else(is.na(class), paste0(phylum, "_Unclassified"), class),
    order  = if_else(is.na(order),
                     if_else(str_detect(class, "Unclassified"), class, paste0(class, "_Unclassified")),
                     order),
    family = if_else(is.na(family),
                     if_else(str_detect(order, "Unclassified"), order, paste0(order, "_Unclassified")),
                     family),
    genus  = if_else(is.na(genus),
                     if_else(str_detect(family, "Unclassified"), family, paste0(family, "_Unclassified")),
                     genus)
  )

beta_list <- list()
filter_stats <- tibble()

min_taxa <- 5
min_samples <- 51

for (f in filters) {
  
  cat("Processing:", f, "\n")  
  
  df_subset <- master_table %>%
    mutate(value = ifelse(is.na(value), 0, value)) %>%
    mutate(filter = str_c(copy_threshold, "_", prevalence)) %>%
    filter(filter == f) 
  
  df <- df_subset %>%
    select(sample, genome, value) %>%  
    group_by(genome, sample) %>%
    summarise(value = sum(value, na.rm = TRUE), .groups = "drop") %>%
    pivot_wider(
      names_from = sample,
      values_from = value,
      values_fill = list(value = 0)
    ) %>% 
    filter(rowSums(across(-genome)) != 0) %>% 
    select(genome, where(~ !all(. == 0)))
  
  mat <- df %>%
    left_join(genome_metadata_ampli[c(1, 3, 4, 6)], by = "genome") %>%
    select(-genome) %>%
    group_by(phylum, class, family) %>%
    summarise(across(where(is.numeric), \(x) sum(x, na.rm = TRUE)), .groups = "drop") %>%
    as.data.frame() %>%
    select(-c(phylum, class)) %>%
    select(family, everything()) %>%
    column_to_rownames("family") 
  
    # Count stats BEFORE filtering out
  taxa_count <- nrow(mat)
  sample_count <- ncol(mat)
  
  # QC filters
  if (taxa_count < min_taxa) next
  if (sample_count < min_samples) next
  
  # # 1. Minimum number of families
  # if (nrow(mat) < min_taxa) {
  #   cat("Skipping", f, "- too few taxa:", nrow(mat), "\n")
  #   next
  # }
  # # 2. Require all samples (51)
  # min_samples <- 50
  # if (ncol(mat) < min_samples) {
  #   cat("Skipping", f, "- missing samples:", ncol(mat), "/", expected_samples, "\n")
  #   next
  # }
  # # 3. Remove empty samples (safety)
  # mat <- mat[, colSums(mat) > 0, drop = FALSE]
  
  # 4. Compute beta diversity
  beta <- hillpair(mat, q = 1)
    
  # 5. Check variance in distance matrix
  dist_mat <- as.matrix(beta$S)
  if (sd(dist_mat[upper.tri(dist_mat)]) == 0) {
    cat("Skipping", f, "- no variation in beta diversity\n")
    next
  }
  
  beta_list[[f]] <- beta
  
  filter_stats <- bind_rows(
    filter_stats,
    tibble(
      filter = f,
      taxa = taxa_count,
      samples = sample_count)
  )
}
save(beta_list,filter_stats, file="resources/amplicon/distance_thresholds_beta_family.Rdata")#distance_thresholds_beta_family50.Rdata

load("resources/amplicon/distance_thresholds_beta_family.Rdata")

# Make sure your metagenomics similarity matrix is a proper matrix
meta_S <- as.matrix(meta_dist$S)

mantel_results <- map_dfr(seq_along(beta_list), function(i) {
  
  # Extract similarity matrix for this filter
  S_amp <- as.matrix(beta_list[[i]]$S)
  
  # Find common samples
  common_samples <- intersect(colnames(meta_S), colnames(S_amp))
  
  # Skip if fewer than 2 shared samples (Mantel requires >=2)
  if(length(common_samples) < 2) {
    return(tibble(
      filter = names(beta_list)[i],
      correlation = NA_real_,
      p_value = NA_real_,
      n_samples = length(common_samples)
    ))
  }
  
  # Align both matrices in the same order
  meta_aligned <- meta_S[common_samples, common_samples, drop = FALSE]
  amp_aligned  <- S_amp[common_samples, common_samples, drop = FALSE]
  
# print(names(beta_list)[i])
# print(common_samples)
# print(all(rownames(meta_aligned) == rownames(amp_aligned)))  # should be TRUE
# print(all(colnames(meta_aligned) == colnames(amp_aligned)))  # should be TRUE
  
  # Convert similarity to distance
  meta_dist_mat <- meta_aligned
  amp_dist_mat  <- amp_aligned
  
  # Run Mantel test
  test <- vegan::mantel(
    as.dist(meta_dist_mat),
    as.dist(amp_dist_mat),
    method = "spearman"
  )
  
  # Return results
  tibble(
    filter = names(beta_list)[i],
    correlation = test$statistic,
    p_value = test$signif
  )
})

genome_counts_row <- family_level_agg %>% 
  column_to_rownames("family")

# Metagenomics similarity matrix
meta_S_matrix <- as.matrix(meta_dist$S)
colnames(meta_S_matrix) <- rownames(meta_S_matrix) <- colnames(genome_counts_row)  # assign your sample names

# Create aligned list for each amplicon filter
aligned_list <- purrr::map(names(beta_list), function(filter_name) {
  
  # Amplicon similarity matrix
  S_amp <- as.matrix(beta_list[[filter_name]]$S)
  
  # Assign sample names from master_table
  samples_for_filter <- master_table %>%
    mutate(filter = str_c(copy_threshold, "_", prevalence)) %>%
    filter(filter == filter_name) %>%
    pull(sample) %>% 
    unique()
  
  rownames(S_amp) <- colnames(S_amp) <- samples_for_filter
  
  # Subset metagenomics matrix to same samples
  meta_S_aligned <- meta_S_matrix[samples_for_filter, samples_for_filter, drop = FALSE]
  
  list(meta = meta_S_aligned, amp = S_amp)
})
names(aligned_list) <- names(beta_list)

pca_data <- map_dfr(names(aligned_list), function(filter_name) {
  
  mat_amp  <- aligned_list[[filter_name]]$amp
  mat_meta <- aligned_list[[filter_name]]$meta
  
  # Convert similarity to distance
  dist_amp  <- mat_amp
  dist_meta <- mat_meta
  
  # Flatten upper triangle to a vector
  vec_amp  <- dist_amp[upper.tri(dist_amp)]
  vec_meta <- dist_meta[upper.tri(dist_meta)]
  
  # Return separate rows for amplicon and metagenomics
  tibble(
    filter = filter_name,
    type = "amplicon",
    distances = list(vec_amp)
  ) %>% 
    bind_rows(
      tibble(
        filter = filter_name,
        type = "metagenomics",
        distances = list(vec_meta)
      )
    )
})

# Convert distances column into numeric vectors
dist_list <- lapply(pca_data$distances, function(x) {
  as.numeric(unlist(x))
})

# Combine into a data frame / matrix for PCA
pca_matrix <- do.call(rbind, dist_list)

# Assign rownames
rownames(pca_matrix) <- paste(pca_data$filter, pca_data$type, sep = "_")

pca_result <- prcomp(pca_matrix, scale. = TRUE)

pca_df <- as.data.frame(pca_result$x) %>%
  rownames_to_column("filter_type") %>%
  separate(filter_type, into = c("copy", "prevalence", "type"), sep = "_", fill = "right") %>%
  mutate(
    copy = factor(copy, levels = c("t0","t1","t2","t3","t4")),
    prevalence = factor(prevalence, levels = c("p0","p10","p20","p30","p40")),
    type = factor(type, levels = c("amplicon","metagenomics"))
  )

ggplot(pca_df, aes(x = PC1, y = PC2, color = copy, shape = type)) +
  geom_point(size = 4, alpha = 0.7) +
  geom_text_repel(aes(label = paste(copy, prevalence, sep="_")), size = 3) +
  theme_minimal(base_size = 14) +
  labs(
    title = "PCA of Beta Diversity: Amplicon vs Metagenomics",
    x = paste0("PC1 (", round(summary(pca_result)$importance[2,1]*100,2), "% variance)"),
    y = paste0("PC2 (", round(summary(pca_result)$importance[2,2]*100,2), "% variance)"),
    color = "Copy threshold",
    shape = "Data type"
  ) +
  scale_shape_manual(values = c(16, 17)) +  # circle for amp, triangle for meta
  theme(
    legend.position = "right",
    panel.grid.major = element_line(color = "gray90"),
    panel.grid.minor = element_blank()
  )

pca_scores <- as.data.frame(pca_result$x)
pca_scores_unique <- pca_scores[
  !grepl("metagenomics", rownames(pca_scores)) | 
  rownames(pca_scores) == rownames(pca_scores)[grepl("metagenomics", rownames(pca_scores))][1],
]

reference_row <- pca_scores_unique[grepl("_metagenomics", rownames(pca_scores_unique)), ]
#reference_row <- reference_rows[1, ]

variance_explained <- summary(pca_result)$importance[2, ]

weighted_distances <- apply(pca_scores_unique, 1, function(row) {
  sum(((row - reference_row)^2) * variance_explained)
})

pca_table <- tibble(
  filter_type = rownames(pca_scores_unique),
  distance = weighted_distances
) %>% 
  as.data.frame() %>% 
  rename(filter=filter_type) %>% 
  filter(filter != "t0_p0_metagenomics") %>% 
  mutate(filter = gsub("_amplicon", "", filter))

Summary_table

filter_stats %>%
  left_join(mantel_results, by = "filter") %>%
  left_join(pca_table, by = "filter") %>%
  arrange(distance,desc(correlation)) %>% 
  separate(filter, into = c("copy", "prevalence"), sep = "_", fill = "right") %>%
  mutate(
    copy = case_when(
      copy == "t0" ~ "No filtering",
      copy == "t1" ~ "< 0.0001",
      copy == "t2" ~ "< 0.001",
      copy == "t3" ~ "< 0.01",
      copy == "t4" ~ "< 0.1",
      TRUE ~ copy
    )
  ) %>%
  mutate(
    prevalence = case_when(
      prevalence == "p0" ~ "No filtering",
      prevalence == "p10" ~ "< 10%",
      prevalence == "p20" ~ "< 20%",
      prevalence == "p30" ~ "< 30%",
      prevalence == "p40" ~ "< 40%",
      TRUE ~ prevalence
    )
  )  %>% 
  tt()

copy	prevalence	taxa	samples	correlation	p_value	distance
< 0.1	No filtering	36	51	0.16909669	0.001	708.4683
< 0.01	No filtering	83	51	0.11128305	0.016	1256.9702
< 0.001	< 20%	48	51	0.03270130	0.220	1670.4436
< 0.001	< 10%	99	51	0.07761010	0.069	1670.7044
< 0.0001	< 30%	38	51	0.01610890	0.390	1823.4039
No filtering	< 30%	42	51	0.02399734	0.345	1827.1078
< 0.0001	< 20%	81	51	0.07318950	0.068	1829.7276
< 0.0001	< 10%	155	51	0.08384267	0.047	1830.6092
No filtering	< 10%	178	51	0.08596459	0.030	1830.8377
No filtering	< 20%	88	51	0.06833255	0.088	1851.8249
< 0.001	No filtering	179	51	0.10180009	0.026	2092.3438
< 0.0001	No filtering	299	51	0.10544208	0.024	2374.4954
No filtering	No filtering	329	51	0.10569201	0.020	2388.2505

Save the table with selected filtering thresholds*

load("resources/amplicon/data_nocopyfilt.Rdata")
load("resources/amplicon/master_table.Rdata")

genome_counts_filt_amplicon <- master_table %>%
  mutate(filter = str_c(copy_threshold, "_", prevalence)) %>%
  filter(filter == "t4_p0") %>%
  select(-copy_threshold, -prevalence, -filter, -pa, -Species) %>%
  pivot_wider(names_from = sample, values_from = value) %>%
  mutate(across(everything(), ~ replace_na(., 0))) %>%
  select(genome, where( ~ is.numeric(.) && sum(.) > 0)) %>%
  filter(rowSums(across(-genome, ~ . != 0)) > 0)

sample_metadata_amplicon <- sample_metadata %>%
  filter(sample %in% colnames(genome_counts_filt_amplicon))

genome_metadata_amplicon <- genome_metadata %>%
  filter(genome %in% genome_counts_filt_amplicon$genome)

save(sample_metadata_amplicon, genome_metadata_amplicon, genome_counts_filt_amplicon, file = "resources/amplicon/selected_filtering_beta_rel.Rdata")

load("resources/amplicon/data_nocopyfilt.Rdata")
load("resources/amplicon/selected_filtering_beta_rel.Rdata")

genome_counts_filt_amplicon <- genome_counts_filt %>% 
  filter(genome %in% genome_counts_filt_amplicon$genome) %>% 
  filter(rowSums(across(-genome)) != 0) %>% 
  select(genome, where(~ !all(. == 0))) %>% 
  select(all_of(colnames(genome_counts_filt_amplicon)))

sample_metadata_amplicon <- sample_metadata %>%
  filter(sample %in% colnames(genome_counts_filt_amplicon))

genome_metadata_amplicon <- genome_metadata %>%
  filter(genome %in% genome_counts_filt_amplicon$genome)
phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
  mutate_at(vars(phylum), ~ str_replace(., "[dpcofgs]__", ""))%>%
    right_join(genome_metadata_amplicon, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_metadata_amplicon$genome)) %>%
    select(phylum, colors) %>% 
    unique() %>%
    arrange(phylum) %>%
    pull(colors, name=phylum)

save(sample_metadata_amplicon, genome_metadata_amplicon, genome_counts_filt_amplicon, phylum_colors, file = "resources/amplicon/selected_filtering_beta.Rdata")