Chapter 7 Copy filtering & prevalence
load("resources/amplicon/data_nocopyfilt.Rdata")
load("resources/amplicon/master_table.Rdata")
species_color <- c("#000000","#e5bd5b", "#6b7398", "#76b183")7.1 Total ASVs
asv_total <- master_table %>%
group_by(copy_threshold, prevalence, genome) %>%
summarise(asv_count = sum(value != 0, na.rm = TRUE)) %>%
group_by(copy_threshold, prevalence) %>%
summarise(Total = sum(asv_count != 0, na.rm = TRUE))
p <- master_table %>%
group_by(Species, copy_threshold, prevalence) %>%
summarise(asv_count = sum(value != 0, na.rm = TRUE)) %>%
pivot_wider(names_from = Species, values_from = c(asv_count)) %>%
left_join(.,
asv_total,
by = join_by(
"prevalence" == "prevalence",
"copy_threshold" == "copy_threshold"
)) %>%
pivot_longer(!c(copy_threshold, prevalence),
names_to = "Species",
values_to = "ASVs") %>%
mutate(
Species = factor(
Species,
levels = c("Total", "Eb", "Pk", "Ha"),
labels = c("All bat species", "C. bottae", "P. kuhlii", "H. ariel")
) ) %>%
ggplot(aes(
x = prevalence,
y = ASVs,
group = Species,
color = Species
)) +
geom_line(size = 1) +
geom_point(size = 2) +
scale_color_manual(values = species_color,
labels = c(
expression(italic("All bat species")),
expression(italic("C. bottae")),
expression(italic("H. ariel")),
expression(italic("P. kuhlii"))
))+
facet_wrap( ~ factor(
copy_threshold,
labels = c(
"t0" = "No cut-off",
"t1" = "Cut-off 0.0001",
"t2" = "Cut-off 0.001",
"t3" = "Cut-off 0.01",
"t4" = "Cut-off 0.1"
)
), scales = "free_y") +
labs(x = "Prevalence threshold (%)", y = "Total ASVs", color = "Bat species") +
theme_minimal() +
theme(
axis.text.x = element_text(size = 12,face = "bold"),
axis.text.y = element_text(size = 12,face = "bold"),
axis.title = element_text(size = 14, face = "bold"),
panel.background = element_blank(),
axis.line = element_line(
size = 0.5,
linetype = "solid",
colour = "black"
),
strip.text = element_text(size = 14,face = "bold"),
legend.text = element_text(size = 14),
legend.title = element_text(size = 16),
legend.position = "bottom",
legend.box = "horizontal"
)
ggsave("filtering_thresholds_general.png", plot = p, width = 10, height = 8)7.2 Total archaeal ASVs
archaea_total <- master_table %>%
left_join(., genome_metadata, by="genome") %>%
filter(domain=="Archaea") %>%
group_by(copy_threshold, prevalence, genome) %>%
summarise(asv_count = sum(value != 0, na.rm = TRUE)) %>%
group_by(copy_threshold, prevalence) %>%
summarise(Total = sum(asv_count != 0, na.rm = TRUE))
master_table %>%
left_join(., genome_metadata, by="genome") %>%
filter(domain=="Archaea") %>%
group_by(Species, copy_threshold, prevalence) %>%
summarise(asv_count = sum(value != 0, na.rm = TRUE)) %>%
pivot_wider(names_from = Species, values_from = c(asv_count)) %>%
left_join(.,
archaea_total,
by = join_by(
"prevalence" == "prevalence",
"copy_threshold" == "copy_threshold"
)) %>%
pivot_longer(!c(copy_threshold, prevalence),
names_to = "Species",
values_to = "ASVs") %>%
ggplot(aes(
x = prevalence,
y = ASVs,
group = Species,
color = Species
)) +
geom_line(size = 1) +
geom_point(size = 2) +
facet_wrap( ~ factor(
copy_threshold,
labels = c(
"t0" = "No cut-off",
"t1" = "Cut-off 0.0001",
"t2" = "Cut-off 0.001",
"t3" = "Cut-off 0.01",
"t4" = "Cut-off 0.1"
)
), scales = "free_y") +
labs(x = "Prevalence threshold (%)", y = "Total ASVs", color = "Bat species") +
theme_minimal() +
theme(
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 20, face = "bold"),
axis.text = element_text(face = "bold", size = 18),
panel.background = element_blank(),
axis.line = element_line(
size = 0.5,
linetype = "solid",
colour = "black"
),
strip.text = element_text(size = 20, face = "bold"),
legend.text = element_text(size = 16),
legend.title = element_text(size = 18),
legend.position = "bottom",
legend.box = "horizontal"
)7.3 Single to individuals
total_single <- master_table %>%
group_by(prevalence, copy_threshold, genome) %>%
summarise(freq = sum(pa, na.rm = TRUE), .groups = "drop") %>%
filter(freq == 1) %>%
group_by(prevalence, copy_threshold) %>%
summarise(total = n())
master_table %>%
group_by(Species, copy_threshold, prevalence, genome) %>%
summarise(freq = sum(pa, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(names_from = Species, values_from = c(freq)) %>%
filter(rowSums(across(Eb:Pk)) == 1) %>%
pivot_longer(!c(copy_threshold, prevalence, genome),
names_to = "Species",
values_to = "single") %>%
filter(single == 1) %>%
group_by(Species, copy_threshold, prevalence) %>%
summarise(single = n()) %>%
pivot_wider(names_from = Species, values_from = c(single)) %>%
left_join(
.,
total_single,
by = join_by(
"prevalence" == "prevalence",
"copy_threshold" == "copy_threshold"
)
) %>%
mutate(across(everything(), ~ replace_na(., 0))) %>%
pivot_longer(!c(copy_threshold, prevalence),
names_to = "Species",
values_to = "single") %>%
pivot_wider(names_from = prevalence, values_from = c(single)) %>%
select(Species, everything()) %>%
mutate(
Species = case_when(
Species == "Eb" ~ "Eptesicus",
Species == "Ha" ~ "Hypsugo",
Species == "Pk" ~ "Pipistrellus",
Species == "total" ~ "All species",
TRUE ~ Species # Keep other values unchanged
)
) %>%
mutate(
copy_threshold = case_when(
copy_threshold == "t0" ~ " no cut-off",
copy_threshold == "t1" ~ "cut-off 0.0001",
copy_threshold == "t2" ~ "cut-off 0.001",
copy_threshold == "t3" ~ "cut-off 0.01",
copy_threshold == "t4" ~ "cut-off 0.1",
TRUE ~ copy_threshold
)
) %>%
rename(
`Copy threshold` = copy_threshold,
`No prevalence` = p0,
`Prevalence 10%` = p10,
) %>%
arrange(Species) %>%
tt()| Species | Copy threshold | No prevalence | Prevalence 10% |
|---|---|---|---|
| All species | no cut-off | 2996 | 1013 |
| All species | cut-off 0.0001 | 2477 | 795 |
| All species | cut-off 0.001 | 1051 | 346 |
| All species | cut-off 0.01 | 284 | 96 |
| All species | cut-off 0.1 | 60 | 8 |
| Eptesicus | no cut-off | 908 | 1013 |
| Eptesicus | cut-off 0.0001 | 693 | 795 |
| Eptesicus | cut-off 0.001 | 282 | 346 |
| Eptesicus | cut-off 0.01 | 71 | 96 |
| Eptesicus | cut-off 0.1 | 5 | 8 |
| Hypsugo | no cut-off | 1059 | 0 |
| Hypsugo | cut-off 0.0001 | 893 | 0 |
| Hypsugo | cut-off 0.001 | 391 | 0 |
| Hypsugo | cut-off 0.01 | 113 | 0 |
| Hypsugo | cut-off 0.1 | 27 | 0 |
| Pipistrellus | no cut-off | 1029 | 0 |
| Pipistrellus | cut-off 0.0001 | 891 | 0 |
| Pipistrellus | cut-off 0.001 | 378 | 0 |
| Pipistrellus | cut-off 0.01 | 100 | 0 |
| Pipistrellus | cut-off 0.1 | 28 | 0 |
7.4 Single to species
master_table %>%
group_by(Species, copy_threshold, prevalence, genome) %>%
summarise(freq = sum(pa, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(names_from = Species, values_from = c(freq)) %>%
mutate(across(c("Eb", "Pk", "Ha"), ~ as.integer(. > 0))) %>%
filter(rowSums(across(Eb:Pk)) == 1) %>%
pivot_longer(!c(copy_threshold, prevalence, genome),
names_to = "Species",
values_to = "unique") %>%
filter(unique == 1) %>%
group_by(Species, copy_threshold, prevalence) %>%
summarise(unique = n()) %>%
ggplot(aes(
x = prevalence,
y = unique,
group = Species,
color = Species
)) +
geom_line(size = 1) +
geom_point(size = 2) +
facet_wrap( ~ factor(
copy_threshold,
labels = c(
"t0" = "No cut-off",
"t1" = "Cut-off 0.0001",
"t2" = "Cut-off 0.001",
"t3" = "Cut-off 0.01",
"t4" = "Cut-off 0.1"
)
), scales = "free_y") +
labs(x = "Prevalence threshold (%)", y = " Single to species", color = "Bat species") +
theme_minimal() +
theme(
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 20, face = "bold"),
axis.text = element_text(face = "bold", size = 18),
panel.background = element_blank(),
axis.line = element_line(
size = 0.5,
linetype = "solid",
colour = "black"
),
strip.text = element_text(size = 20),
legend.text = element_text(size = 16),
legend.title = element_text(size = 18),
legend.position = "bottom",
legend.box = "horizontal"
)7.5 Alpha Diversity
prev_threshold <- c("p0", "p10", "p20", "p30", "p40")
copy_threshold <- c("t0", "t1", "t2", "t3", "t4")
bats <- c("Eb", "Pk", "Ha")
results <- data.frame(
alpha = character(),
Total = numeric(),
Eptesicus = character(),
Pipistrellus = character(),
Hypsugo = character(),
Copy_threshold = character(),
Prevalence = character()
)
for (copy in copy_threshold) {
genome_table <- master_table %>%
filter(copy_threshold == copy) %>%
select(-copy_threshold, -pa) %>%
pivot_wider(names_from = genome, values_from = value)
for (prev in prev_threshold) {
prev_table <- genome_table %>%
filter(prevalence == prev) %>%
select(-Species, -prevalence) %>%
column_to_rownames(., "sample") %>%
filter(rowSums(across(everything())) != 0) %>%
t() %>%
as.data.frame()
richness <- prev_table %>%
dplyr::select(where(~ !all(. == 0))) %>%
hilldiv(., q = 0) %>%
t() %>%
as.data.frame() %>%
dplyr::rename(richness = 1) %>%
rownames_to_column(var = "sample")
neutral <- prev_table %>%
dplyr::select(where(~ !all(. == 0))) %>%
hilldiv(., q = 1) %>%
t() %>%
as.data.frame() %>%
dplyr::rename(neutral = 1) %>%
rownames_to_column(var = "sample")
alpha_div <- richness %>%
full_join(neutral, by = join_by(sample == sample))
div <- alpha_div %>%
pivot_longer(-sample, names_to = "alpha", values_to = "value") %>%
left_join(sample_metadata, by = join_by(sample == sample)) %>%
group_by(alpha) %>%
summarise(
total_mean = mean(value, na.rm = T),
total_sd = sd(value, na.rm = T),
Eb_mean = mean(value[Species == "Eb"], na.rm = T),
Eb_sd = sd(value[Species == "Eb"], na.rm = T),
Ha_mean = mean(value[Species == "Ha"], na.rm = T),
Ha_sd = sd(value[Species == "Ha"], na.rm = T),
Pk_mean = mean(value[Species == "Pk"], na.rm = T),
Pk_sd = sd(value[Species == "Pk"], na.rm = T)
) %>%
mutate(
Total = str_c(round(total_mean, 3), "±", round(total_sd, 3)),
Eptesicus = str_c(round(Eb_mean, 3), "±", round(Eb_sd, 3)),
Hypsugo = str_c(round(Ha_mean, 3), "±", round(Ha_sd, 3)),
Pipistrellus = str_c(round(Pk_mean, 3), "±", round(Pk_sd, 3))
) %>%
dplyr::select(alpha, Total, Eptesicus, Pipistrellus, Hypsugo) %>%
mutate(Prevalence = prev) %>%
mutate(Copy_threshold = copy) %>%
as.data.frame()
results <- rbind(results, div)
}
}
results <- results %>%
mutate(across(everything(), ~ ifelse(is.na(.), 0, .)))results %>%
pivot_longer(!c(alpha, Prevalence, Copy_threshold),
names_to = "Species",
values_to = "value") %>%
filter(alpha == "richness") %>% mutate(
Species = factor(
Species,
levels = c("Total", "Eptesicus", "Pipistrellus", "Hypsugo"),
labels = c("All bat species", "C. bottae", "P. kuhlii", "H. ariel")
) ) %>%
ggplot(.,
aes(
x = Prevalence,
y = as.numeric(gsub("±.*", "", value)),
group = Species,
color = Species
)) +
geom_line(size = 1) +
geom_point(size = 2) +
scale_color_manual(values = species_color,
labels = c(
expression(italic("All bat species")),
expression(italic("C. bottae")),
expression(italic("H. ariel")),
expression(italic("P. kuhlii"))
))+
facet_wrap( ~ factor(
Copy_threshold,
labels = c(
"t0" = "No cut-off",
"t1" = "Cut-off 0.0001",
"t2" = "Cut-off 0.001",
"t3" = "Cut-off 0.01",
"t4" = "Cut-off 0.1"
)), scales = "free_y") +
labs(
title = "Alpha Diversity Across Copy filtering and Prevalence Thresholds",
x = "Prevalence Threshold",
y = "Richness",
color = "Species"
) +
theme_minimal() +
theme(
axis.text.x = element_text(size = 12, face = "bold"),
axis.text.y = element_text(size = 12, face = "bold"),
axis.title = element_text(size = 16, face = "bold"),
panel.background = element_blank(),
axis.line = element_line(
size = 0.5,
linetype = "solid",
colour = "black"
),
strip.text = element_text(size = 16, face = "bold"),
legend.text = element_text(size = 16),
legend.title = element_text(size = 18),
legend.position = "bottom",
legend.box = "horizontal"
)results %>%
pivot_longer(!c(alpha, Prevalence, Copy_threshold),
names_to = "Species",
values_to = "value") %>%
filter(alpha == "neutral") %>%
mutate(
Species = factor(
Species,
levels = c("Total", "Eptesicus", "Pipistrellus", "Hypsugo"),
labels = c("All bat species", "C. bottae", "P. kuhlii", "H. ariel")
) ) %>%
ggplot(.,
aes(
x = Prevalence,
y = as.numeric(gsub("±.*", "", value)),
group = Species,
color = Species
)) +
geom_line(size = 1) +
geom_point(size = 2) +
scale_color_manual(values = species_color,
labels = c(
expression(italic("All bat species")),
expression(italic("C. bottae")),
expression(italic("H. ariel")),
expression(italic("P. kuhlii"))
))+
facet_wrap( ~ factor(
Copy_threshold,
labels = c(
"t0" = "No cut-off",
"t1" = "Cut-off 0.0001",
"t2" = "Cut-off 0.001",
"t3" = "Cut-off 0.01",
"t4" = "Cut-off 0.1"
)
), scales = "free_y") +
labs(
x = "Prevalence Threshold",
y = "Neutral",
color = "Species"
) +
theme_minimal() +
theme(
axis.text.x = element_text(size = 12, face = "bold"),
axis.text.y = element_text(size = 12, face = "bold"),
axis.title = element_text(size = 14, face = "bold"),
panel.background = element_blank(),
axis.line = element_line(
size = 0.5,
linetype = "solid",
colour = "black"
),
strip.text = element_text(size = 14, face = "bold"),
legend.text = element_text(size = 14),
legend.title = element_text(size = 16),
legend.position = "bottom",
legend.box = "horizontal"
)7.5.1 Comparison alpha diversity: amplicom different threshold vs metagenomics
prev_threshold <- c("p0", "p10", "p20", "p30", "p40")
copy_threshold <- c("t0", "t1", "t2", "t3", "t4")
bats <- c("Eb", "Pk", "Ha")
alpha_both <- data.frame(
alpha = character(),
Total = numeric(),
Eptesicus = character(),
Pipistrellus = character(),
Hypsugo = character(),
Copy_threshold = character(),
Prevalence = character()
)
for (copy in copy_threshold) {
genome_table <- master_table %>%
filter(copy_threshold == copy) %>%
select(-copy_threshold, -pa) %>%
pivot_wider(names_from = genome, values_from = value)
for (prev in prev_threshold) {
prev_table <- genome_table %>%
filter(prevalence == prev) %>%
select(-Species, -prevalence) %>%
column_to_rownames(., "sample") %>%
filter(rowSums(across(everything())) != 0) %>%
t() %>%
as.data.frame()
richness <- prev_table %>%
dplyr::select(where( ~ !all(. == 0))) %>%
hilldiv(., q = 0) %>%
t() %>%
as.data.frame() %>%
dplyr::rename(richness = 1) %>%
rownames_to_column(var = "sample")
neutral <- prev_table %>%
dplyr::select(where( ~ !all(. == 0))) %>%
hilldiv(., q = 1) %>%
t() %>%
as.data.frame() %>%
dplyr::rename(neutral = 1) %>%
rownames_to_column(var = "sample")
alpha_div <- richness %>%
full_join(neutral, by = join_by(sample == sample))
div <- alpha_div %>%
mutate(prevalence = prev) %>%
mutate(copy_threshold = copy) %>%
as.data.frame()
alpha_both <- rbind(alpha_both, div)
}
}
alpha_both <- alpha_both %>%
mutate(method = "16S")*** Statistical differences in richness with different thresholds between methods ***
metagenome_alpha_div <- readRDS("resources/metagenomics/metagenome_alpha_div.rds") %>%
select(sample, richness, neutral, method)
prev_threshold <- c("p0", "p10", "p20", "p30", "p40")
copy_threshold <- c("t0", "t1", "t2", "t3", "t4")
wilcox_results_table <- data.frame(
Copy_threshold = character(),
Prevalence = character(),
W_statistic = numeric(),
P_value = numeric(),
stringsAsFactors = FALSE
)
for (copy in copy_threshold) {
alpha_Filt <- alpha_both %>%
filter(copy_threshold == copy) %>%
select(-copy_threshold)
for (prev in prev_threshold) {
all_result <- alpha_Filt %>%
filter(prevalence == prev) %>%
select(-prevalence) %>%
rbind(., metagenome_alpha_div) %>%
pivot_longer(!c(sample, method),
names_to = "metric",
values_to = "diversity") %>%
filter(metric=="richness")
wilcox_result <- wilcox.test(diversity ~ method, data = all_result,
exact = FALSE)
W <- wilcox_result[[1]]
p_value <- wilcox_result[[3]]
wilcox_results_table <- wilcox_results_table %>%
add_row(
Copy_threshold = copy,
Prevalence = prev,
W_statistic = W,
P_value = p_value
)
}
}All p values
wilcox_results_table %>%
mutate(
Copy_threshold = case_when(
Copy_threshold == "t0" ~ " no cut-off",
Copy_threshold == "t1" ~ "cut-off 0.0001",
Copy_threshold == "t2" ~ "cut-off 0.001",
Copy_threshold == "t3" ~ "cut-off 0.01",
Copy_threshold == "t4" ~ "cut-off 0.1",
TRUE ~ Copy_threshold
)
) %>%
mutate(
Prevalence = case_when(
Prevalence == "p0" ~ "No prevalence",
Prevalence == "p10" ~ "Prevalence 10%",
Prevalence == "p20" ~ "Prevalence 20%",
Prevalence == "p30" ~ "Prevalence 30%",
Prevalence == "p40" ~ "Prevalence 40%",
TRUE ~ Prevalence
)
) %>%
rename(
`Copy threshold` = Copy_threshold,
`Prevalence` = Prevalence,
`W statistic` = W_statistic,
`P_value` = P_value
) %>%
tt()Significant p values
wilcox_results_table %>%
mutate(
Copy_threshold = case_when(
Copy_threshold == "t0" ~ " no cut-off",
Copy_threshold == "t1" ~ "cut-off 0.0001",
Copy_threshold == "t2" ~ "cut-off 0.001",
Copy_threshold == "t3" ~ "cut-off 0.01",
Copy_threshold == "t4" ~ "cut-off 0.1",
TRUE ~ Copy_threshold
)
) %>%
mutate(
Prevalence = case_when(
Prevalence == "p0" ~ "No prevalence",
Prevalence == "p10" ~ "Prevalence 10%",
Prevalence == "p20" ~ "Prevalence 20%",
Prevalence == "p30" ~ "Prevalence 30%",
Prevalence == "p40" ~ "Prevalence 40%",
TRUE ~ Prevalence
)
) %>%
filter(P_value<0.05) %>%
rename(
`Copy threshold` = Copy_threshold,
`Prevalence` = Prevalence,
`W statistic` = W_statistic,
`P value` = P_value
) %>%
tt()Non-Significant p values
wilcox_results_table %>%
mutate(
Copy_threshold = case_when(
Copy_threshold == "t0" ~ " no cut-off",
Copy_threshold == "t1" ~ "cut-off 0.0001",
Copy_threshold == "t2" ~ "cut-off 0.001",
Copy_threshold == "t3" ~ "cut-off 0.01",
Copy_threshold == "t4" ~ "cut-off 0.1",
TRUE ~ Copy_threshold
)
) %>%
mutate(
Prevalence = case_when(
Prevalence == "p0" ~ "No prevalence",
Prevalence == "p10" ~ "Prevalence 10%",
Prevalence == "p20" ~ "Prevalence 20%",
Prevalence == "p30" ~ "Prevalence 30%",
Prevalence == "p40" ~ "Prevalence 40%",
TRUE ~ Prevalence
)
) %>%
filter(P_value>0.05) %>%
rename(
`Copy threshold` = Copy_threshold,
`Prevalence` = Prevalence,
`W statistic` = W_statistic,
`P value` = P_value
) %>%
tt()7.5.2 Finding the threshold that gives results most similar to metagenomics
load("resources/metagenomics/data_filtered.Rdata")
metagenomics_diversity <- genome_counts_filt %>%
column_to_rownames("genome") %>%
hilldiv(.,q=1) %>%
as.data.frame() %>%
rownames_to_column("filter")
metagenomics_diversity[1,1] <- "metagenomics_metagenomics"
diversity_master <- master_table %>%
mutate(filter=str_c(copy_threshold,"_",prevalence)) %>%
select(-copy_threshold,-prevalence) %>%
group_by(filter,sample) %>%
summarise(diversity=hilldiv(value,q=1)) %>%
pivot_wider(names_from=sample,values_from=diversity)
diversity_master <- bind_rows(metagenomics_diversity,diversity_master)
diversity_average <- diversity_master %>%
mutate(mean_diversity = rowMeans(select(., -filter), na.rm = TRUE)) %>%
select(filter,mean_diversity)
numeric_data <- diversity_master %>% select(-filter)
metadata <- diversity_master$filter
pca_result <- prcomp(numeric_data, scale. = TRUE)
pca_df <- as.data.frame(pca_result$x) %>%
mutate(filter = metadata) %>%
separate(filter, into = c("copy", "prevalence"), sep = "_", fill = "right", remove = FALSE) %>%
left_join(diversity_average,by="filter") %>%
mutate(
copy = case_when(
copy == "metagenomics" ~ "Metagenomics",
copy == "t0" ~ "No filtering",
copy == "t1" ~ "< 0.0001",
copy == "t2" ~ "< 0.001",
copy == "t3" ~ "< 0.01",
copy == "t4" ~ "< 0.1",
TRUE ~ copy
)
) %>%
mutate(
prevalence = case_when(
prevalence == "metagenomics" ~ NA_character_,
prevalence == "p0" ~ "No filtering",
prevalence == "p10" ~ "< 10%",
prevalence == "p20" ~ "< 20%",
prevalence == "p30" ~ "< 30%",
prevalence == "p40" ~ "< 40%",
TRUE ~ prevalence
)
) %>%
mutate(
copy = factor(
copy,
levels = c("Metagenomics", "No filtering", "< 0.0001", "< 0.001", "< 0.01", "< 0.1"),
) )%>%
mutate(
prevalence = factor(
prevalence,
levels = c("No filtering", "< 10%", "< 20%", "< 30%", "< 40%"),
) ) %>%
mutate(is_meta = copy == "Metagenomics")
ggplot(pca_df, aes(x = PC1, y = PC2, color = copy, shape=prevalence, label=filter)) +
geom_point(
data = subset(pca_df, !is_meta),
aes(color = copy, shape = prevalence),
alpha = 0.6, size=5
) +
geom_point(
data = subset(pca_df, is_meta),
shape = 21,
fill = "red",
color = "black", size=7
) +
geom_label_repel(
data = subset(pca_df, is_meta),
aes(label = "Metagenomics"),
color = "black",
fontface = "bold",
size = 4,
label.size = 0.4,
nudge_x = 0.05,
nudge_y = 0.18
)+
scale_shape_discrete(na.translate = FALSE) +
theme_minimal() +
theme(
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 18, face = "bold"),
panel.background = element_blank(),
axis.line = element_line(size = 0.5, linetype = "solid", colour = "black"),
legend.text = element_text(size = 16),
legend.title = element_text(size = 18)
)+
labs(
x = paste0("PC1 (", round(summary(pca_result)$importance[2,1] * 100, 2), "% variance)"),
y = paste0("PC2 (", round(summary(pca_result)$importance[2,2] * 100, 2), "% variance)"),
color = "Read fraction",
shape = "Prevalence"
)pca_scores <- as.data.frame(pca_result$x)
reference_row <- pca_scores[diversity_master$filter == "metagenomics_metagenomics", ]
variance_explained <- summary(pca_result)$importance[2, ]
weighted_distances <- apply(pca_result$x, 1, function(row) {
sum(((row - reference_row)^2) * variance_explained)
})
tibble(filter=diversity_master$filter,distance=weighted_distances) %>%
mutate(distance = round(distance, 2)) %>%
separate(filter, into = c("copy", "prevalence"), sep = "_", fill = "right", remove = FALSE) %>%
mutate(
copy = case_when(
copy == "metagenomics" ~ "Metagenomics",
copy == "t0" ~ "No filtering",
copy == "t1" ~ "< 0.0001",
copy == "t2" ~ "< 0.001",
copy == "t3" ~ "< 0.01",
copy == "t4" ~ "< 0.1",
TRUE ~ copy
)
)%>%
mutate(
prevalence = case_when(
prevalence == "metagenomics" ~ NA_character_,
prevalence == "p0" ~ "No filtering",
prevalence == "p10" ~ "< 10%",
prevalence == "p20" ~ "< 20%",
prevalence == "p30" ~ "< 30%",
prevalence == "p40" ~ "< 40%",
TRUE ~ prevalence
)
) %>%
select(-filter) %>%
arrange(distance) %>%
tt()| copy | prevalence | distance |
|---|---|---|
| Metagenomics | NA | 0.00 |
| < 0.01 | < 10% | 0.77 |
| No filtering | < 30% | 1.10 |
| < 0.0001 | < 30% | 1.20 |
| < 0.001 | < 20% | 1.43 |
| No filtering | < 40% | 2.17 |
| < 0.0001 | < 40% | 2.33 |
| < 0.1 | No filtering | 2.81 |
| < 0.001 | < 30% | 4.25 |
| < 0.0001 | < 20% | 6.74 |
| No filtering | < 20% | 7.28 |
| < 0.001 | < 40% | 8.31 |
| < 0.01 | < 20% | 8.94 |
| < 0.01 | < 30% | 15.49 |
| < 0.1 | < 10% | 15.69 |
| < 0.1 | < 20% | 17.67 |
| < 0.01 | < 40% | 18.25 |
| < 0.1 | < 30% | 18.25 |
| < 0.1 | < 40% | 18.25 |
| < 0.001 | < 10% | 24.79 |
| < 0.01 | No filtering | 37.70 |
| < 0.0001 | < 10% | 48.48 |
| No filtering | < 10% | 50.24 |
| < 0.001 | No filtering | 172.93 |
| < 0.0001 | No filtering | 243.76 |
| No filtering | No filtering | 248.13 |
diversity_master %>%
filter(filter %in% c("metagenomics_metagenomics","t3_p10")) %>%
pivot_longer(!filter, names_to = "sample", values_to = "diversity") %>%
ggplot(aes(x=filter,y=diversity, group=filter)) +
geom_boxplot() +
geom_jitter() +
geom_line(aes(group = sample), linetype = "dashed", size = 1)+
theme_minimal()Save the table with selected filtering thresholds
load("resources/amplicon/data_nocopyfilt.Rdata")
load("resources/amplicon/master_table.Rdata")
genome_counts_filt_amplicon <- master_table %>%
mutate(filter = str_c(copy_threshold, "_", prevalence)) %>%
filter(filter == "t3_p10") %>%
select(-copy_threshold, -prevalence, -filter, -pa, -Species) %>%
pivot_wider(names_from = sample, values_from = value) %>%
mutate(across(everything(), ~ replace_na(., 0))) %>%
select(genome, where( ~ is.numeric(.) && sum(.) > 0)) %>%
filter(rowSums(across(-genome, ~ . != 0)) > 0)
sample_metadata_amplicon <- sample_metadata %>%
filter(sample %in% colnames(genome_counts_filt_amplicon))
genome_metadata_amplicon <- genome_metadata %>%
filter(genome %in% genome_counts_filt_amplicon$genome)
save(sample_metadata_amplicon, genome_metadata_amplicon, genome_counts_filt_amplicon, file = "resources/amplicon/selected_filtering_rel.Rdata")load("resources/amplicon/data_nocopyfilt.Rdata")
load("resources/amplicon/selected_filtering.Rdata")
genome_counts_filt_amplicon <- genome_counts_filt %>%
filter(genome %in% genome_counts_filt_amplicon$genome) %>%
filter(rowSums(across(-genome)) != 0) %>%
select(genome, where(~ !all(. == 0))) %>%
select(all_of(colnames(genome_counts_filt_amplicon)))
# genome_counts_filt_amplicon_pa <- genome_counts_filt_amplicon %>%
# mutate(across(-genome, ~ if_else(. > 0, 1, 0)))
#
# genome_counts_filt_amplicon <- genome_counts_filt %>%
# column_to_rownames("genome") %>%
# `*`(genome_counts_filt_amplicon_pa %>%
# column_to_rownames("genome")) %>%
# rownames_to_column("genome") %>%
# filter(genome %in% genome_metadata_amplicon$genome) %>%
# filter(rowSums(across(-genome)) != 0)
sample_metadata_amplicon <- sample_metadata %>%
filter(sample %in% colnames(genome_counts_filt_amplicon))
genome_metadata_amplicon <- genome_metadata %>%
filter(genome %in% genome_counts_filt_amplicon$genome)
phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
mutate_at(vars(phylum), ~ str_replace(., "[dpcofgs]__", ""))%>%
right_join(genome_metadata_amplicon, by=join_by(phylum == phylum)) %>%
arrange(match(genome, genome_metadata_amplicon$genome)) %>%
select(phylum, colors) %>%
unique() %>%
arrange(phylum) %>%
pull(colors, name=phylum)
save(sample_metadata_amplicon, genome_metadata_amplicon, genome_counts_filt_amplicon, phylum_colors, file = "resources/amplicon/selected_filtering.Rdata")load("resources/amplicon/data_nocopyfilt.Rdata")
genome_counts_filt_amplicon <- master_table %>%
mutate(filter = str_c(copy_threshold, "_", prevalence)) %>%
filter(filter == "t0_p30") %>%
select(-copy_threshold, -prevalence, -filter, -pa, -Species) %>%
pivot_wider(names_from = sample, values_from = value) %>%
mutate(across(everything(), ~ replace_na(., 0))) %>%
select(genome, where( ~ is.numeric(.) && sum(.) > 0)) %>%
filter(rowSums(across(-genome, ~ . != 0)) > 0)
sample_metadata_amplicon <- sample_metadata %>%
filter(sample %in% colnames(genome_counts_filt_amplicon_t0_p30))
genome_metadata_amplicon <- genome_metadata %>%
filter(genome %in% genome_counts_filt_amplicon_t0_p30$genome)
save(sample_metadata_amplicon, genome_metadata_amplicon, genome_counts_filt_amplicon,file = "resources/amplicon/second_filtering_t0_p30.Rdata")7.6 Based on beta diversity
Aggregate counts at family or phylum level in both datasets
load("resources/metagenomics/data_filtered.Rdata")
genome_metadata <- genome_metadata %>%
mutate(
class = if_else(str_trim(class) == "", paste0(phylum, "_Unclassified"), class),
order = if_else(str_trim(order) == "",
if_else(str_detect(class, "Unclassified"), class, paste0(class, "_Unclassified")),
order),
family = if_else(str_trim(family) == "",
if_else(str_detect(order, "Unclassified"), order, paste0(order, "_Unclassified")),
family),
genus = if_else(str_trim(genus) == "",
if_else(str_detect(family, "Unclassified"), family, paste0(family, "_Unclassified")),
genus)
)
genome_tax <- genome_counts_filt %>%
left_join(genome_metadata[c(1,3,4,6)], by = "genome")
# Aggregate counts at the Family level
family_level_agg <- genome_tax %>%
select(-genome) %>%
group_by(phylum, class, family) %>%
summarise(across(where(is.numeric), \(x) sum(x, na.rm = TRUE)), .groups = "drop")%>%
as.data.frame() %>%
select(-c(phylum,class))
meta_dist <- family_level_agg %>%
column_to_rownames("family") %>%
hillpair(q = 1)load("resources/amplicon/data_nocopyfilt.Rdata")
load("resources/amplicon/master_table.Rdata")
filters <- master_table %>%
mutate(filter = str_c(copy_threshold, "_", prevalence)) %>%
distinct(filter) %>%
pull(filter)
genome_metadata_ampli <- genome_metadata %>%
mutate(
class = if_else(is.na(class), paste0(phylum, "_Unclassified"), class),
order = if_else(is.na(order),
if_else(str_detect(class, "Unclassified"), class, paste0(class, "_Unclassified")),
order),
family = if_else(is.na(family),
if_else(str_detect(order, "Unclassified"), order, paste0(order, "_Unclassified")),
family),
genus = if_else(is.na(genus),
if_else(str_detect(family, "Unclassified"), family, paste0(family, "_Unclassified")),
genus)
)
beta_list <- list()
filter_stats <- tibble()
min_taxa <- 5
min_samples <- 51
for (f in filters) {
cat("Processing:", f, "\n")
df_subset <- master_table %>%
mutate(value = ifelse(is.na(value), 0, value)) %>%
mutate(filter = str_c(copy_threshold, "_", prevalence)) %>%
filter(filter == f)
df <- df_subset %>%
select(sample, genome, value) %>%
group_by(genome, sample) %>%
summarise(value = sum(value, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(
names_from = sample,
values_from = value,
values_fill = list(value = 0)
) %>%
filter(rowSums(across(-genome)) != 0) %>%
select(genome, where(~ !all(. == 0)))
mat <- df %>%
left_join(genome_metadata_ampli[c(1, 3, 4, 6)], by = "genome") %>%
select(-genome) %>%
group_by(phylum, class, family) %>%
summarise(across(where(is.numeric), \(x) sum(x, na.rm = TRUE)), .groups = "drop") %>%
as.data.frame() %>%
select(-c(phylum, class)) %>%
select(family, everything()) %>%
column_to_rownames("family")
# Count stats BEFORE filtering out
taxa_count <- nrow(mat)
sample_count <- ncol(mat)
# QC filters
if (taxa_count < min_taxa) next
if (sample_count < min_samples) next
# # 1. Minimum number of families
# if (nrow(mat) < min_taxa) {
# cat("Skipping", f, "- too few taxa:", nrow(mat), "\n")
# next
# }
# # 2. Require all samples (51)
# min_samples <- 50
# if (ncol(mat) < min_samples) {
# cat("Skipping", f, "- missing samples:", ncol(mat), "/", expected_samples, "\n")
# next
# }
# # 3. Remove empty samples (safety)
# mat <- mat[, colSums(mat) > 0, drop = FALSE]
# 4. Compute beta diversity
beta <- hillpair(mat, q = 1)
# 5. Check variance in distance matrix
dist_mat <- as.matrix(beta$S)
if (sd(dist_mat[upper.tri(dist_mat)]) == 0) {
cat("Skipping", f, "- no variation in beta diversity\n")
next
}
beta_list[[f]] <- beta
filter_stats <- bind_rows(
filter_stats,
tibble(
filter = f,
taxa = taxa_count,
samples = sample_count)
)
}
save(beta_list,filter_stats, file="resources/amplicon/distance_thresholds_beta_family.Rdata")#distance_thresholds_beta_family50.Rdataload("resources/amplicon/distance_thresholds_beta_family.Rdata")
# Make sure your metagenomics similarity matrix is a proper matrix
meta_S <- as.matrix(meta_dist$S)
mantel_results <- map_dfr(seq_along(beta_list), function(i) {
# Extract similarity matrix for this filter
S_amp <- as.matrix(beta_list[[i]]$S)
# Find common samples
common_samples <- intersect(colnames(meta_S), colnames(S_amp))
# Skip if fewer than 2 shared samples (Mantel requires >=2)
if(length(common_samples) < 2) {
return(tibble(
filter = names(beta_list)[i],
correlation = NA_real_,
p_value = NA_real_,
n_samples = length(common_samples)
))
}
# Align both matrices in the same order
meta_aligned <- meta_S[common_samples, common_samples, drop = FALSE]
amp_aligned <- S_amp[common_samples, common_samples, drop = FALSE]
# print(names(beta_list)[i])
# print(common_samples)
# print(all(rownames(meta_aligned) == rownames(amp_aligned))) # should be TRUE
# print(all(colnames(meta_aligned) == colnames(amp_aligned))) # should be TRUE
# Convert similarity to distance
meta_dist_mat <- meta_aligned
amp_dist_mat <- amp_aligned
# Run Mantel test
test <- vegan::mantel(
as.dist(meta_dist_mat),
as.dist(amp_dist_mat),
method = "spearman"
)
# Return results
tibble(
filter = names(beta_list)[i],
correlation = test$statistic,
p_value = test$signif
)
})genome_counts_row <- family_level_agg %>%
column_to_rownames("family")
# Metagenomics similarity matrix
meta_S_matrix <- as.matrix(meta_dist$S)
colnames(meta_S_matrix) <- rownames(meta_S_matrix) <- colnames(genome_counts_row) # assign your sample names
# Create aligned list for each amplicon filter
aligned_list <- purrr::map(names(beta_list), function(filter_name) {
# Amplicon similarity matrix
S_amp <- as.matrix(beta_list[[filter_name]]$S)
# Assign sample names from master_table
samples_for_filter <- master_table %>%
mutate(filter = str_c(copy_threshold, "_", prevalence)) %>%
filter(filter == filter_name) %>%
pull(sample) %>%
unique()
rownames(S_amp) <- colnames(S_amp) <- samples_for_filter
# Subset metagenomics matrix to same samples
meta_S_aligned <- meta_S_matrix[samples_for_filter, samples_for_filter, drop = FALSE]
list(meta = meta_S_aligned, amp = S_amp)
})
names(aligned_list) <- names(beta_list)
pca_data <- map_dfr(names(aligned_list), function(filter_name) {
mat_amp <- aligned_list[[filter_name]]$amp
mat_meta <- aligned_list[[filter_name]]$meta
# Convert similarity to distance
dist_amp <- mat_amp
dist_meta <- mat_meta
# Flatten upper triangle to a vector
vec_amp <- dist_amp[upper.tri(dist_amp)]
vec_meta <- dist_meta[upper.tri(dist_meta)]
# Return separate rows for amplicon and metagenomics
tibble(
filter = filter_name,
type = "amplicon",
distances = list(vec_amp)
) %>%
bind_rows(
tibble(
filter = filter_name,
type = "metagenomics",
distances = list(vec_meta)
)
)
})
# Convert distances column into numeric vectors
dist_list <- lapply(pca_data$distances, function(x) {
as.numeric(unlist(x))
})
# Combine into a data frame / matrix for PCA
pca_matrix <- do.call(rbind, dist_list)
# Assign rownames
rownames(pca_matrix) <- paste(pca_data$filter, pca_data$type, sep = "_")
pca_result <- prcomp(pca_matrix, scale. = TRUE)
pca_df <- as.data.frame(pca_result$x) %>%
rownames_to_column("filter_type") %>%
separate(filter_type, into = c("copy", "prevalence", "type"), sep = "_", fill = "right") %>%
mutate(
copy = factor(copy, levels = c("t0","t1","t2","t3","t4")),
prevalence = factor(prevalence, levels = c("p0","p10","p20","p30","p40")),
type = factor(type, levels = c("amplicon","metagenomics"))
)ggplot(pca_df, aes(x = PC1, y = PC2, color = copy, shape = type)) +
geom_point(size = 4, alpha = 0.7) +
geom_text_repel(aes(label = paste(copy, prevalence, sep="_")), size = 3) +
theme_minimal(base_size = 14) +
labs(
title = "PCA of Beta Diversity: Amplicon vs Metagenomics",
x = paste0("PC1 (", round(summary(pca_result)$importance[2,1]*100,2), "% variance)"),
y = paste0("PC2 (", round(summary(pca_result)$importance[2,2]*100,2), "% variance)"),
color = "Copy threshold",
shape = "Data type"
) +
scale_shape_manual(values = c(16, 17)) + # circle for amp, triangle for meta
theme(
legend.position = "right",
panel.grid.major = element_line(color = "gray90"),
panel.grid.minor = element_blank()
)pca_scores <- as.data.frame(pca_result$x)
pca_scores_unique <- pca_scores[
!grepl("metagenomics", rownames(pca_scores)) |
rownames(pca_scores) == rownames(pca_scores)[grepl("metagenomics", rownames(pca_scores))][1],
]
reference_row <- pca_scores_unique[grepl("_metagenomics", rownames(pca_scores_unique)), ]
#reference_row <- reference_rows[1, ]
variance_explained <- summary(pca_result)$importance[2, ]
weighted_distances <- apply(pca_scores_unique, 1, function(row) {
sum(((row - reference_row)^2) * variance_explained)
})
pca_table <- tibble(
filter_type = rownames(pca_scores_unique),
distance = weighted_distances
) %>%
as.data.frame() %>%
rename(filter=filter_type) %>%
filter(filter != "t0_p0_metagenomics") %>%
mutate(filter = gsub("_amplicon", "", filter))Summary_table
filter_stats %>%
left_join(mantel_results, by = "filter") %>%
left_join(pca_table, by = "filter") %>%
arrange(distance,desc(correlation)) %>%
separate(filter, into = c("copy", "prevalence"), sep = "_", fill = "right") %>%
mutate(
copy = case_when(
copy == "t0" ~ "No filtering",
copy == "t1" ~ "< 0.0001",
copy == "t2" ~ "< 0.001",
copy == "t3" ~ "< 0.01",
copy == "t4" ~ "< 0.1",
TRUE ~ copy
)
) %>%
mutate(
prevalence = case_when(
prevalence == "p0" ~ "No filtering",
prevalence == "p10" ~ "< 10%",
prevalence == "p20" ~ "< 20%",
prevalence == "p30" ~ "< 30%",
prevalence == "p40" ~ "< 40%",
TRUE ~ prevalence
)
) %>%
tt()| copy | prevalence | taxa | samples | correlation | p_value | distance |
|---|---|---|---|---|---|---|
| < 0.1 | No filtering | 36 | 51 | 0.16909669 | 0.001 | 708.4683 |
| < 0.01 | No filtering | 83 | 51 | 0.11128305 | 0.016 | 1256.9702 |
| < 0.001 | < 20% | 48 | 51 | 0.03270130 | 0.220 | 1670.4436 |
| < 0.001 | < 10% | 99 | 51 | 0.07761010 | 0.069 | 1670.7044 |
| < 0.0001 | < 30% | 38 | 51 | 0.01610890 | 0.390 | 1823.4039 |
| No filtering | < 30% | 42 | 51 | 0.02399734 | 0.345 | 1827.1078 |
| < 0.0001 | < 20% | 81 | 51 | 0.07318950 | 0.068 | 1829.7276 |
| < 0.0001 | < 10% | 155 | 51 | 0.08384267 | 0.047 | 1830.6092 |
| No filtering | < 10% | 178 | 51 | 0.08596459 | 0.030 | 1830.8377 |
| No filtering | < 20% | 88 | 51 | 0.06833255 | 0.088 | 1851.8249 |
| < 0.001 | No filtering | 179 | 51 | 0.10180009 | 0.026 | 2092.3438 |
| < 0.0001 | No filtering | 299 | 51 | 0.10544208 | 0.024 | 2374.4954 |
| No filtering | No filtering | 329 | 51 | 0.10569201 | 0.020 | 2388.2505 |
Save the table with selected filtering thresholds*
load("resources/amplicon/data_nocopyfilt.Rdata")
load("resources/amplicon/master_table.Rdata")
genome_counts_filt_amplicon <- master_table %>%
mutate(filter = str_c(copy_threshold, "_", prevalence)) %>%
filter(filter == "t4_p0") %>%
select(-copy_threshold, -prevalence, -filter, -pa, -Species) %>%
pivot_wider(names_from = sample, values_from = value) %>%
mutate(across(everything(), ~ replace_na(., 0))) %>%
select(genome, where( ~ is.numeric(.) && sum(.) > 0)) %>%
filter(rowSums(across(-genome, ~ . != 0)) > 0)
sample_metadata_amplicon <- sample_metadata %>%
filter(sample %in% colnames(genome_counts_filt_amplicon))
genome_metadata_amplicon <- genome_metadata %>%
filter(genome %in% genome_counts_filt_amplicon$genome)
save(sample_metadata_amplicon, genome_metadata_amplicon, genome_counts_filt_amplicon, file = "resources/amplicon/selected_filtering_beta_rel.Rdata")load("resources/amplicon/data_nocopyfilt.Rdata")
load("resources/amplicon/selected_filtering_beta_rel.Rdata")
genome_counts_filt_amplicon <- genome_counts_filt %>%
filter(genome %in% genome_counts_filt_amplicon$genome) %>%
filter(rowSums(across(-genome)) != 0) %>%
select(genome, where(~ !all(. == 0))) %>%
select(all_of(colnames(genome_counts_filt_amplicon)))
sample_metadata_amplicon <- sample_metadata %>%
filter(sample %in% colnames(genome_counts_filt_amplicon))
genome_metadata_amplicon <- genome_metadata %>%
filter(genome %in% genome_counts_filt_amplicon$genome)
phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
mutate_at(vars(phylum), ~ str_replace(., "[dpcofgs]__", ""))%>%
right_join(genome_metadata_amplicon, by=join_by(phylum == phylum)) %>%
arrange(match(genome, genome_metadata_amplicon$genome)) %>%
select(phylum, colors) %>%
unique() %>%
arrange(phylum) %>%
pull(colors, name=phylum)
save(sample_metadata_amplicon, genome_metadata_amplicon, genome_counts_filt_amplicon, phylum_colors, file = "resources/amplicon/selected_filtering_beta.Rdata")