Chapter 5 Community composition
5.1 Filter data
Filter samples with high host data
sample_metadata <- sample_metadata %>%
filter(!sample %in% c("EHI02721", "EHI02712", "EHI02700", "EHI02720", "EHI02749", "EHI02719", "EHI02729", "EHI02715", "EHI02722"))
genome_counts_filt <- genome_counts %>%
select(one_of(c("genome",sample_metadata$sample)))%>%
filter(rowSums(. != 0, na.rm = TRUE) > 0) %>%
select_if(~!all(. == 0))
genome_counts <- genome_counts_filt
genome_metadata <- genome_metadata %>%
semi_join(., genome_counts_filt, by = "genome") %>%
arrange(match(genome,genome_counts_filt$genome))
genome_tree <- keep.tip(genome_tree, tip=genome_metadata$genome) # keep only MAG tipsMake a phyloseq object
phylo_samples <- sample_metadata %>%
column_to_rownames("sample") %>%
sample_data() #convert to phyloseq sample_data object
phylo_genome <- genome_counts_filt %>%
column_to_rownames("genome") %>%
otu_table(., taxa_are_rows = TRUE)
phylo_taxonomy <- genome_metadata %>%
column_to_rownames("genome") %>%
as.matrix() %>%
tax_table() #convert to phyloseq tax_table object
phylo_tree <- phy_tree(genome_tree)
physeq_genome <- phyloseq(phylo_genome, phylo_taxonomy, phylo_samples,phylo_tree)
physeq_genome_clr <- microbiome::transform(physeq_genome, 'clr')5.2 Taxonomy overview
5.2.1 Stacked barplot
genome_counts_filt %>%
mutate_at(vars(-genome),~./sum(.)) %>%
pivot_longer(-genome, names_to = "sample", values_to = "count") %>%
left_join(., genome_metadata, by = join_by(genome == genome)) %>%
left_join(., sample_metadata, by = join_by(sample == sample)) %>%
filter(count > 0) %>%
ggplot(., aes(x=sample,y=count, fill=phylum, group=phylum)) +
geom_bar(stat="identity", colour="white", linewidth=0.1) +
scale_fill_manual(values=phylum_colors) +
facet_grid(~ diet, scale="free", space = "free")+
guides(fill = guide_legend(ncol = 1)) +
theme(
axis.title.x = element_blank(),
panel.background = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background = element_rect(fill = "white"),
strip.text = element_text(size = 12, lineheight = 0.6),
strip.placement = "outside",
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.line = element_line(
linewidth = 0.5,
linetype = "solid",
colour = "black"
)
) +
labs(fill = "Phylum", y = "Relative abundance", x = "Samples")Number of bacteria phyla
genome_metadata %>%
filter(domain == "d__Bacteria")%>%
dplyr::select(phylum) %>%
unique() %>%
pull() %>%
length()[1] 14
Bacteria phyla in wild individuals
wild_samples <- sample_metadata %>%
filter(region=="Gipuzkoa") %>%
dplyr::select(sample) %>%
pull()
wild_genomes <- genome_counts %>%
column_to_rownames("genome") %>%
select(all_of(wild_samples)) %>%
as.data.frame() %>%
filter(rowSums(across(where(is.numeric)))!=0)%>%
rownames_to_column("genome")%>%
dplyr::select(genome) %>%
pull()
genome_metadata %>%
filter(genome %in% wild_genomes) %>%
filter(domain == "d__Bacteria")%>%
dplyr::select(phylum) %>%
unique() %>%
pull() %>%
length() [1] 14
Bacteria phyla in captive animals
captive_samples <- sample_metadata %>%
filter(region=="Nafarroa") %>%
dplyr::select(sample) %>%
pull()
captive_genomes <- genome_counts %>%
column_to_rownames("genome") %>%
select(all_of(captive_samples)) %>%
as.data.frame() %>%
filter(rowSums(across(where(is.numeric)))!=0)%>%
rownames_to_column("genome")%>%
dplyr::select(genome) %>%
pull()
genome_metadata %>%
filter(genome %in% captive_genomes) %>%
filter(domain == "d__Bacteria")%>%
dplyr::select(phylum) %>%
unique() %>%
pull() %>%
length() [1] 14
Bacteria phyla before grass is included in the diet
captive_pre_samples <- sample_metadata %>%
filter(diet=="Pre_grass") %>%
dplyr::select(sample) %>%
pull()
captive_pre_genomes <- genome_counts %>%
column_to_rownames("genome") %>%
select(all_of(captive_pre_samples)) %>%
as.data.frame() %>%
mutate(row_sum = rowSums(.)) %>%
filter(row_sum != 0) %>%
select(-row_sum)%>%
rownames_to_column("genome")%>%
dplyr::select(genome) %>%
pull()
genome_metadata %>%
filter(genome %in% captive_pre_genomes) %>%
filter(domain == "d__Bacteria")%>%
dplyr::select(phylum) %>%
unique() %>%
pull() %>%
length() [1] 14
Bacteria phyla after grass introduction
captive_post_samples <- sample_metadata %>%
filter(diet=="Post_grass") %>%
dplyr::select(sample) %>%
pull()
captive_post_genomes <- genome_counts %>%
column_to_rownames("genome") %>%
select(all_of(captive_post_samples)) %>%
as.data.frame() %>%
mutate(row_sum = rowSums(.)) %>%
filter(row_sum != 0) %>%
select(-row_sum)%>%
rownames_to_column("genome")%>%
dplyr::select(genome) %>%
pull()
genome_metadata %>%
filter(genome %in% captive_post_genomes) %>%
filter(domain == "d__Bacteria")%>%
dplyr::select(phylum) %>%
unique() %>%
pull() %>%
length() [1] 14
Number of Archaea phyla
genome_metadata %>%
filter(domain == "d__Archaea")%>%
dplyr::select(phylum) %>%
unique() %>%
pull() %>%
length()[1] 1
Archaea phyla in wild individuals
genome_metadata %>%
filter(genome %in% wild_genomes) %>%
filter(domain == "d__Archaea")%>%
dplyr::select(phylum) %>%
unique() %>%
pull() %>%
length()[1] 0
Archaea phyla before grass is included in the diet
genome_metadata %>%
filter(genome %in% captive_pre_genomes) %>%
filter(domain == "d__Archaea")%>%
dplyr::select(phylum) %>%
unique() %>%
pull()[1] "p__Methanobacteriota"
Archaea phyla after grass introduction
genome_metadata %>%
filter(genome %in% captive_post_genomes) %>%
filter(domain == "d__Archaea")%>%
dplyr::select(phylum) %>%
unique() %>%
pull()[1] "p__Methanobacteriota"
5.2.2 Phylum relative abundances
phylum_summary <- genome_counts_filt %>%
mutate_at(vars(-genome),~./sum(.)) %>%
pivot_longer(-genome, names_to = "sample", values_to = "count") %>%
left_join(sample_metadata, by = join_by(sample == sample)) %>%
left_join(genome_metadata, by = join_by(genome == genome)) %>%
group_by(sample,phylum,region, diet) %>%
summarise(relabun=sum(count))phylum_arrange <- phylum_summary %>%
group_by(phylum) %>%
summarise(mean=mean(relabun)) %>%
arrange(-mean) %>%
select(phylum) %>%
pull()
phylum_summary %>%
filter(phylum %in% phylum_arrange) %>%
mutate(phylum=factor(phylum,levels=rev(phylum_arrange))) %>%
ggplot(aes(x=relabun, y=phylum, group=phylum, color=phylum)) +
scale_color_manual(values=phylum_colors[rev(phylum_arrange)]) +
geom_jitter(alpha=0.5) +
theme_minimal() +
theme(legend.position="none") +
labs(y="Phylum",x="Relative abundance")5.2.2.1 Origin: Wild vs Captive
phylum_summary %>%
group_by(phylum) %>%
summarise(total_mean=mean(relabun*100, na.rm=T),
total_sd=sd(relabun*100, na.rm=T),
Wild_mean=mean(relabun[diet=="Wild"]*100, na.rm=T),
Wild_sd=sd(relabun[diet=="Wild"]*100, na.rm=T),
Captive_mean=mean(relabun[diet=="Pre_grass"]*100, na.rm=T),
Captive_sd=sd(relabun[diet=="Pre_grass"]*100, na.rm=T)) %>%
mutate(total=str_c(round(total_mean,3),"±",round(total_sd,3)),
Wild=str_c(round(Wild_mean,3),"±",round(Wild_sd,3)),
Captive=str_c(round(Captive_mean,3),"±",round(Captive_sd,3))) %>%
arrange(-total_mean) %>%
dplyr::select(phylum,total,Wild,Captive)# A tibble: 15 × 4
phylum total Wild Captive
<chr> <chr> <chr> <chr>
1 p__Bacillota_A 57.243±12.618 48.636±13.52 59.846±10.725
2 p__Bacteroidota 25.787±10.113 26.889±11.805 24.986±10.083
3 p__Bacillota 4.118±7.202 5.066±11.152 4.949±5.297
4 p__Spirochaetota 3.911±10.011 11.731±14.792 0.001±0.001
5 p__Verrucomicrobiota 2.264±5.722 1.472±0.715 1.449±1.295
6 p__Pseudomonadota 1.626±3.328 0.511±0.397 3.082±5.382
7 p__Patescibacteria 0.987±1.542 0.212±0.253 2.177±2.223
8 p__Synergistota 0.96±0.852 1.865±0.83 0.45±0.39
9 p__Bacillota_C 0.844±0.795 1.696±0.674 0.386±0.36
10 p__Actinomycetota 0.78±0.985 0.427±0.459 1.217±1.147
11 p__Cyanobacteriota 0.708±2.081 0.277±0.614 0.916±3.024
12 p__Desulfobacterota 0.586±0.396 0.991±0.315 0.41±0.256
13 p__Bacillota_B 0.104±0.145 0.228±0.203 0.042±0.024
14 p__Methanobacteriota 0.082±0.217 0±0 0.09±0.228
15 p__Campylobacterota 0±0 0±0 0±0
5.2.2.2 Origin and diet
phylum_summary %>%
group_by(phylum) %>%
summarise(
total_mean=mean(relabun*100, na.rm=T),
total_sd=sd(relabun*100, na.rm=T),
Wild_mean=mean(relabun[diet=="Wild"]*100, na.rm=T),
Wild_sd=sd(relabun[diet=="Wild"]*100, na.rm=T),
Pre_grass_mean = mean(relabun[diet == "Pre_grass"] * 100, na.rm = T),
Pre_grass_sd = sd(relabun[diet == "Pre_grass"] * 100, na.rm = T),
Post_grass_mean = mean(relabun[diet == "Post_grass"] * 100, na.rm = T),
Post_grass_sd = sd(relabun[diet == "Post_grass"] * 100, na.rm = T)
) %>%
mutate(
total = str_c(round(total_mean, 2), "±", round(total_sd, 2)),
Wild = str_c(round(Wild_mean, 2), "±", round(Wild_sd, 2)),
Pre_grass = str_c(round(Pre_grass_mean, 6), "±", round(Pre_grass_sd, 6)),
Post_grass = str_c(round(Post_grass_mean, 2), "±", round(Post_grass_sd, 2))
) %>%
arrange(-total_mean) %>%
dplyr::select(phylum, total, Wild, Pre_grass, Post_grass) %>%
paged_table()phylum_arrange <- phylum_summary %>%
group_by(phylum) %>%
summarise(mean=sum(relabun)) %>%
arrange(-mean) %>%
select(phylum) %>%
pull()
phylum_summary %>%
left_join(genome_metadata %>% select(phylum,phylum) %>% unique(),by=join_by(phylum==phylum)) %>%
filter(phylum %in% phylum_arrange[1:20]) %>%
mutate(phylum = factor(phylum, levels = rev(phylum_arrange[1:20]))) %>%
filter(relabun > 0) %>%
ggplot(aes(x=relabun, y=phylum, group=phylum, color=phylum)) +
scale_color_manual(values = phylum_colors[-8]) +
geom_jitter(alpha = 0.5) +
facet_grid(. ~ diet) +
theme_minimal() +
labs(y = "phylum", x = "Relative abundance", color = "Phylum")5.2.3 Family
family_summary <- genome_counts_filt %>%
mutate_at(vars(-genome),~./sum(.)) %>%
pivot_longer(-genome, names_to = "sample", values_to = "count") %>%
left_join(sample_metadata, by = join_by(sample == sample)) %>%
left_join(., genome_metadata, by = join_by(genome == genome)) %>%
group_by(sample,family, diet,region) %>%
summarise(relabun=sum(count))
family_summary$diet <- factor(family_summary$diet, levels=c("Pre_grass", "Post_grass", "Wild"))5.2.3.1 Origin: Wild vs Captive
family_summary %>%
group_by(family) %>%
summarise(
total_mean = mean(relabun * 100, na.rm = T),
total_sd = sd(relabun * 100, na.rm = T),
Wild_mean = mean(relabun[diet == "Wild"] * 100, na.rm = T),
Wild_sd = sd(relabun[diet == "Wild"] * 100, na.rm = T),
Cap_mean = mean(relabun[region == "Nafarroa"] * 100, na.rm = T),
Cap_sd = sd(relabun[region == "Nafarroa"] * 100, na.rm = T)
) %>%
mutate(
Total = str_c(round(total_mean, 2), "±", round(total_sd, 2)),
Wild = str_c(round(Wild_mean, 2), "±", round(Wild_sd, 2)),
Captive = str_c(round(Cap_mean, 2), "±", round(Cap_sd, 2))
) %>%
arrange(-total_mean) %>%
dplyr::select(family, Total, Wild, Captive) %>%
paged_table()5.2.3.2 Origin and Diet
family_summary %>%
group_by(family) %>%
summarise(
total_mean = mean(relabun * 100, na.rm = T),
total_sd = sd(relabun * 100, na.rm = T),
Wild_mean = mean(relabun[diet == "Wild"] * 100, na.rm = T),
Wild_sd = sd(relabun[diet == "Wild"] * 100, na.rm = T),
Pre_grass_mean = mean(relabun[diet == "Pre_grass"] * 100, na.rm = T),
Pre_grass_sd = sd(relabun[diet == "Pre_grass"] * 100, na.rm = T),
Post_grass_mean = mean(relabun[diet == "Post_grass"] * 100, na.rm = T),
Post_grass_sd = sd(relabun[diet == "Post_grass"] * 100, na.rm = T)
) %>%
mutate(
Total = str_c(round(total_mean, 2), "±", round(total_sd, 2)),
Wild = str_c(round(Wild_mean, 2), "±", round(Wild_sd, 2)),
Pre_grass = str_c(round(Pre_grass_mean, 2), "±", round(Pre_grass_sd, 2)),
Post_grass = str_c(round(Post_grass_mean, 2), "±", round(Post_grass_sd, 2))
) %>%
arrange(-total_mean) %>%
dplyr::select(family, Total, Wild, Pre_grass, Post_grass) %>%
paged_table()family_arrange <- family_summary %>%
group_by(family) %>%
summarise(mean=sum(relabun)) %>%
arrange(-mean) %>%
select(family) %>%
pull()
family_summary %>%
left_join(genome_metadata %>% select(family, phylum) %>% unique(), by = join_by(family == family)) %>%
filter(family %in% family_arrange[1:20]) %>%
mutate(family = factor(family, levels = rev(family_arrange[1:20]))) %>%
filter(relabun > 0) %>%
ggplot(aes(x=relabun, y=family, group=family, color=phylum)) +
scale_color_manual(values = phylum_colors[-8]) +
geom_jitter(alpha = 0.5) +
facet_grid(. ~ diet) +
theme_minimal() +
labs(y = "Family", x = "Relative abundance", color = "Phylum")5.2.4 Genus
genus_summary <- genome_counts_filt %>%
mutate_at(vars(-genome),~./sum(.)) %>%
pivot_longer(-genome, names_to = "sample", values_to = "count") %>%
left_join(sample_metadata, by = join_by(sample == sample)) %>%
left_join(genome_metadata, by = join_by(genome == genome)) %>%
group_by(sample,phylum,genus, diet) %>%
summarise(relabun=sum(count)) %>%
filter(genus != "g__") %>%
mutate(genus= sub("^g__", "", genus))
genus_summary$diet <- factor(genus_summary$diet, levels=c("Pre_grass", "Post_grass", "Wild"))5.2.5 Origin and diet
genus_summary %>%
group_by(genus) %>%
summarise(
total_mean = mean(relabun * 100, na.rm = T),
total_sd = sd(relabun * 100, na.rm = T),
Wild_mean = mean(relabun[diet == "Wild"] * 100, na.rm = T),
Wild_sd = sd(relabun[diet == "Wild"] * 100, na.rm = T),
Pre_grass_mean = mean(relabun[diet == "Pre_grass"] * 100, na.rm = T),
Pre_grass_sd = sd(relabun[diet == "Pre_grass"] * 100, na.rm = T),
Post_grass_mean = mean(relabun[diet == "Post_grass"] * 100, na.rm = T),
Post_grass_sd = sd(relabun[diet == "Post_grass"] * 100, na.rm = T)
) %>%
mutate(
Total = str_c(round(total_mean, 2), "±", round(total_sd, 2)),
Wild = str_c(round(Wild_mean, 2), "±", round(Wild_sd, 2)),
Pre_grass = str_c(round(Pre_grass_mean, 2), "±", round(Pre_grass_sd, 2)),
Post_grass = str_c(round(Post_grass_mean, 2), "±", round(Post_grass_sd, 2))
) %>%
arrange(-total_mean) %>%
dplyr::select(genus, Total, Wild, Pre_grass, Post_grass) %>%
paged_table()genus_arrange <- genus_summary %>%
group_by(genus) %>%
summarise(mean=sum(relabun)) %>%
filter(genus != "g__")%>%
arrange(-mean) %>%
select(genus) %>%
mutate(genus= sub("^g__", "", genus)) %>%
pull()
genus_summary_sort <- genus_summary %>%
group_by(genus) %>%
summarise(mean=mean(relabun, na.rm=T),sd=sd(relabun, na.rm=T)) %>%
arrange(-mean)
genus_summary %>%
mutate(genus=factor(genus, levels=rev(genus_summary_sort %>% pull(genus)))) %>%
filter(relabun > 0) %>%
ggplot(aes(x=relabun, y=genus, group=genus, color=phylum)) +
scale_color_manual(values=phylum_colors) +
geom_jitter(alpha=0.5) +
facet_grid(.~diet)+
theme_minimal() +
theme(axis.text.y = element_text(size=6))+
labs(y="Family", x="Relative abundance", color="Phylum")5.2.6 Genus and species annotation
Number of MAGs without species-level annotation
genome_metadata %>%
filter(species == "s__") %>%
summarize(Mag_nospecies = n())%>%
select(Mag_nospecies) %>%
pull()[1] 749
total_mag_phylum <- genome_metadata %>%
group_by(phylum) %>%
summarize(count_total = n())
genome_metadata %>%
filter(species == "s__") %>%
group_by(phylum) %>%
summarize(count_nospecies = n()) %>%
left_join(total_mag_phylum, by = join_by(phylum == phylum)) %>%
mutate(percentage=100*count_nospecies/count_total) %>%
tt()| phylum | count_nospecies | count_total | percentage |
|---|---|---|---|
| p__Actinomycetota | 15 | 15 | 100.00000 |
| p__Bacillota | 52 | 53 | 98.11321 |
| p__Bacillota_A | 516 | 526 | 98.09886 |
| p__Bacillota_B | 2 | 2 | 100.00000 |
| p__Bacillota_C | 6 | 9 | 66.66667 |
| p__Bacteroidota | 43 | 127 | 33.85827 |
| p__Campylobacterota | 1 | 1 | 100.00000 |
| p__Cyanobacteriota | 6 | 7 | 85.71429 |
| p__Desulfobacterota | 12 | 12 | 100.00000 |
| p__Patescibacteria | 13 | 13 | 100.00000 |
| p__Pseudomonadota | 32 | 39 | 82.05128 |
| p__Spirochaetota | 2 | 2 | 100.00000 |
| p__Synergistota | 18 | 18 | 100.00000 |
| p__Verrucomicrobiota | 31 | 35 | 88.57143 |
Percentage of MAGs without species-level annotation
nmags <- nrow(genome_counts)
nonspecies <- genome_metadata %>%
filter(species == "s__") %>%
nrow()
perct <- nonspecies*100/nmags
perct[1] 87.09302
Number of MAGs without genera-level annotation
79