Chapter 5 Community composition

5.1 Filter data

load("data/data.Rdata")

Filter samples with high host data

sample_metadata <- sample_metadata %>%
  filter(!sample %in% c("EHI02721", "EHI02712", "EHI02700", "EHI02720", "EHI02749", "EHI02719", "EHI02729", "EHI02715", "EHI02722"))

genome_counts_filt <- genome_counts %>%
  select(one_of(c("genome",sample_metadata$sample)))%>%
  filter(rowSums(. != 0, na.rm = TRUE) > 0) %>%
  select_if(~!all(. == 0))
genome_counts <- genome_counts_filt
genome_metadata <- genome_metadata %>% 
  semi_join(., genome_counts_filt, by = "genome") %>% 
  arrange(match(genome,genome_counts_filt$genome))

genome_tree <- keep.tip(genome_tree, tip=genome_metadata$genome) # keep only MAG tips

Make a phyloseq object

phylo_samples <- sample_metadata %>% 
  column_to_rownames("sample") %>% 
  sample_data() #convert to phyloseq sample_data object
phylo_genome <- genome_counts_filt %>% 
  column_to_rownames("genome") %>% 
  otu_table(., taxa_are_rows = TRUE)
phylo_taxonomy <- genome_metadata %>%
  column_to_rownames("genome") %>% 
  as.matrix() %>% 
  tax_table() #convert to phyloseq tax_table object
phylo_tree <- phy_tree(genome_tree) 

physeq_genome <- phyloseq(phylo_genome, phylo_taxonomy, phylo_samples,phylo_tree)
physeq_genome_clr <- microbiome::transform(physeq_genome, 'clr')
save(sample_metadata, 
     genome_metadata, 
     read_counts, 
     genome_counts, 
     genome_counts_filt, 
     genome_tree,
     physeq_genome,
     physeq_genome_clr,
     genome_gifts_raw, 
     phylum_colors,
     diet_colors,
     file = "data/data_host_filtered.Rdata")
load("data/data_host_filtered.Rdata")

5.2 Taxonomy overview

5.2.1 Stacked barplot

genome_counts_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>% 
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>% 
  left_join(., genome_metadata, by = join_by(genome == genome)) %>% 
  left_join(., sample_metadata, by = join_by(sample == sample)) %>% 
  filter(count > 0) %>% 
  ggplot(., aes(x=sample,y=count, fill=phylum, group=phylum)) + 
  geom_bar(stat="identity", colour="white", linewidth=0.1) + 
  scale_fill_manual(values=phylum_colors) +
  facet_grid(~ diet, scale="free", space = "free")+
  guides(fill = guide_legend(ncol = 1)) +
  theme(
    axis.title.x = element_blank(),
    panel.background = element_blank(),
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "white"),
    strip.text = element_text(size = 12, lineheight = 0.6),
    strip.placement = "outside",
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.line = element_line(
      linewidth = 0.5,
      linetype = "solid",
      colour = "black"
    )
  ) +
  labs(fill = "Phylum", y = "Relative abundance", x = "Samples")

Number of bacteria phyla

genome_metadata %>% 
  filter(domain == "d__Bacteria")%>%
  dplyr::select(phylum) %>%
  unique() %>%
  pull() %>%
  length()
[1] 14

Bacteria phyla in wild individuals

wild_samples <- sample_metadata %>% 
  filter(region=="Gipuzkoa") %>% 
  dplyr::select(sample) %>% 
  pull()

wild_genomes <- genome_counts %>% 
  column_to_rownames("genome") %>% 
  select(all_of(wild_samples)) %>%
  as.data.frame() %>%
  filter(rowSums(across(where(is.numeric)))!=0)%>% 
  rownames_to_column("genome")%>% 
  dplyr::select(genome) %>% 
  pull()

genome_metadata %>% 
  filter(genome %in% wild_genomes) %>% 
  filter(domain == "d__Bacteria")%>%
  dplyr::select(phylum) %>%
  unique() %>%
  pull() %>%
  length() 
[1] 14

Bacteria phyla in captive animals

captive_samples <- sample_metadata %>% 
  filter(region=="Nafarroa") %>% 
  dplyr::select(sample) %>% 
  pull()

captive_genomes <- genome_counts %>% 
  column_to_rownames("genome") %>% 
  select(all_of(captive_samples)) %>%
  as.data.frame() %>%
  filter(rowSums(across(where(is.numeric)))!=0)%>% 
  rownames_to_column("genome")%>% 
  dplyr::select(genome) %>% 
  pull()

genome_metadata %>% 
  filter(genome %in% captive_genomes) %>% 
  filter(domain == "d__Bacteria")%>%
  dplyr::select(phylum) %>%
  unique() %>%
  pull() %>%
  length() 
[1] 14

Bacteria phyla before grass is included in the diet

captive_pre_samples <- sample_metadata %>% 
  filter(diet=="Pre_grass") %>% 
  dplyr::select(sample) %>% 
  pull()

captive_pre_genomes <- genome_counts %>% 
  column_to_rownames("genome") %>% 
  select(all_of(captive_pre_samples)) %>%
  as.data.frame() %>% 
  mutate(row_sum = rowSums(.)) %>%
  filter(row_sum != 0) %>%
  select(-row_sum)%>% 
  rownames_to_column("genome")%>% 
  dplyr::select(genome) %>% 
  pull()

genome_metadata %>% 
  filter(genome %in% captive_pre_genomes) %>% 
  filter(domain == "d__Bacteria")%>%
  dplyr::select(phylum) %>%
  unique() %>%
  pull() %>%
  length() 
[1] 14

Bacteria phyla after grass introduction

captive_post_samples <- sample_metadata %>% 
  filter(diet=="Post_grass") %>% 
  dplyr::select(sample) %>% 
  pull()

captive_post_genomes <- genome_counts %>% 
  column_to_rownames("genome") %>% 
  select(all_of(captive_post_samples)) %>%
  as.data.frame() %>% 
  mutate(row_sum = rowSums(.)) %>%
  filter(row_sum != 0) %>%
  select(-row_sum)%>% 
  rownames_to_column("genome")%>% 
  dplyr::select(genome) %>% 
  pull()

genome_metadata %>% 
  filter(genome %in% captive_post_genomes) %>% 
  filter(domain == "d__Bacteria")%>%
  dplyr::select(phylum) %>%
  unique() %>%
  pull() %>%
  length() 
[1] 14

Number of Archaea phyla

genome_metadata %>% 
  filter(domain == "d__Archaea")%>%
  dplyr::select(phylum) %>%
  unique() %>%
  pull() %>%
  length()
[1] 1

Archaea phyla in wild individuals

genome_metadata %>% 
  filter(genome %in% wild_genomes) %>% 
  filter(domain == "d__Archaea")%>%
  dplyr::select(phylum) %>%
  unique() %>%
  pull() %>%
  length()
[1] 0

Archaea phyla before grass is included in the diet

genome_metadata %>% 
  filter(genome %in% captive_pre_genomes) %>% 
  filter(domain == "d__Archaea")%>%
  dplyr::select(phylum) %>%
  unique() %>%
  pull()
[1] "p__Methanobacteriota"

Archaea phyla after grass introduction

genome_metadata %>% 
  filter(genome %in% captive_post_genomes) %>% 
  filter(domain == "d__Archaea")%>%
  dplyr::select(phylum) %>%
  unique() %>%
  pull()
[1] "p__Methanobacteriota"

5.2.2 Phylum relative abundances

phylum_summary <- genome_counts_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>% 
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>%
  left_join(sample_metadata, by = join_by(sample == sample)) %>%
  left_join(genome_metadata, by = join_by(genome == genome)) %>%
  group_by(sample,phylum,region, diet) %>%
  summarise(relabun=sum(count))
phylum_arrange <- phylum_summary %>%
    group_by(phylum) %>%
    summarise(mean=mean(relabun)) %>%
    arrange(-mean) %>%
    select(phylum) %>%
    pull()

phylum_summary %>%
    filter(phylum %in% phylum_arrange) %>%
    mutate(phylum=factor(phylum,levels=rev(phylum_arrange))) %>%
    ggplot(aes(x=relabun, y=phylum, group=phylum, color=phylum)) +
        scale_color_manual(values=phylum_colors[rev(phylum_arrange)]) +
        geom_jitter(alpha=0.5) + 
        theme_minimal() + 
        theme(legend.position="none") +
        labs(y="Phylum",x="Relative abundance")

5.2.2.1 Origin: Wild vs Captive

phylum_summary %>%
    group_by(phylum) %>%
    summarise(total_mean=mean(relabun*100, na.rm=T),
              total_sd=sd(relabun*100, na.rm=T),
              Wild_mean=mean(relabun[diet=="Wild"]*100, na.rm=T),
              Wild_sd=sd(relabun[diet=="Wild"]*100, na.rm=T),
              Captive_mean=mean(relabun[diet=="Pre_grass"]*100, na.rm=T),
              Captive_sd=sd(relabun[diet=="Pre_grass"]*100, na.rm=T)) %>%
    mutate(total=str_c(round(total_mean,3),"±",round(total_sd,3)),
           Wild=str_c(round(Wild_mean,3),"±",round(Wild_sd,3)),
           Captive=str_c(round(Captive_mean,3),"±",round(Captive_sd,3))) %>% 
    arrange(-total_mean) %>% 
    dplyr::select(phylum,total,Wild,Captive)
# A tibble: 15 × 4
   phylum               total         Wild          Captive      
   <chr>                <chr>         <chr>         <chr>        
 1 p__Bacillota_A       57.243±12.618 48.636±13.52  59.846±10.725
 2 p__Bacteroidota      25.787±10.113 26.889±11.805 24.986±10.083
 3 p__Bacillota         4.118±7.202   5.066±11.152  4.949±5.297  
 4 p__Spirochaetota     3.911±10.011  11.731±14.792 0.001±0.001  
 5 p__Verrucomicrobiota 2.264±5.722   1.472±0.715   1.449±1.295  
 6 p__Pseudomonadota    1.626±3.328   0.511±0.397   3.082±5.382  
 7 p__Patescibacteria   0.987±1.542   0.212±0.253   2.177±2.223  
 8 p__Synergistota      0.96±0.852    1.865±0.83    0.45±0.39    
 9 p__Bacillota_C       0.844±0.795   1.696±0.674   0.386±0.36   
10 p__Actinomycetota    0.78±0.985    0.427±0.459   1.217±1.147  
11 p__Cyanobacteriota   0.708±2.081   0.277±0.614   0.916±3.024  
12 p__Desulfobacterota  0.586±0.396   0.991±0.315   0.41±0.256   
13 p__Bacillota_B       0.104±0.145   0.228±0.203   0.042±0.024  
14 p__Methanobacteriota 0.082±0.217   0±0           0.09±0.228   
15 p__Campylobacterota  0±0           0±0           0±0          

5.2.2.2 Origin and diet

phylum_summary %>%
  group_by(phylum) %>%
  summarise(
    total_mean=mean(relabun*100, na.rm=T),
    total_sd=sd(relabun*100, na.rm=T),
    Wild_mean=mean(relabun[diet=="Wild"]*100, na.rm=T),
    Wild_sd=sd(relabun[diet=="Wild"]*100, na.rm=T),
    Pre_grass_mean = mean(relabun[diet == "Pre_grass"] * 100, na.rm = T),
    Pre_grass_sd = sd(relabun[diet == "Pre_grass"] * 100, na.rm = T),
    Post_grass_mean = mean(relabun[diet == "Post_grass"] * 100, na.rm = T),
    Post_grass_sd = sd(relabun[diet == "Post_grass"] * 100, na.rm = T)
    ) %>% 
  mutate(
    total = str_c(round(total_mean, 2), "±", round(total_sd, 2)),
    Wild = str_c(round(Wild_mean, 2), "±", round(Wild_sd, 2)),
    Pre_grass = str_c(round(Pre_grass_mean, 6), "±", round(Pre_grass_sd, 6)),
    Post_grass = str_c(round(Post_grass_mean, 2), "±", round(Post_grass_sd, 2))
  ) %>%
  arrange(-total_mean) %>%
  dplyr::select(phylum, total, Wild, Pre_grass, Post_grass) %>%
  paged_table()
phylum_arrange <- phylum_summary %>%
    group_by(phylum) %>%
    summarise(mean=sum(relabun)) %>%
    arrange(-mean) %>%
    select(phylum) %>%
    pull()

phylum_summary %>%
  left_join(genome_metadata %>% select(phylum,phylum) %>% unique(),by=join_by(phylum==phylum)) %>%
  filter(phylum %in% phylum_arrange[1:20]) %>%
  mutate(phylum = factor(phylum, levels = rev(phylum_arrange[1:20]))) %>%
  filter(relabun > 0) %>% 
  ggplot(aes(x=relabun, y=phylum, group=phylum, color=phylum)) +
  scale_color_manual(values = phylum_colors[-8]) +
  geom_jitter(alpha = 0.5) +
  facet_grid(. ~ diet) +
  theme_minimal() +
  labs(y = "phylum", x = "Relative abundance", color = "Phylum")

5.2.3 Family

family_summary <- genome_counts_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>% 
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>% 
  left_join(sample_metadata, by = join_by(sample == sample)) %>% 
  left_join(., genome_metadata, by = join_by(genome == genome)) %>%
  group_by(sample,family, diet,region) %>%
  summarise(relabun=sum(count))

family_summary$diet <- factor(family_summary$diet, levels=c("Pre_grass", "Post_grass", "Wild"))

5.2.3.1 Origin: Wild vs Captive

family_summary %>%
  group_by(family) %>%
  summarise(
    total_mean = mean(relabun * 100, na.rm = T),
    total_sd = sd(relabun * 100, na.rm = T),
    Wild_mean = mean(relabun[diet == "Wild"] * 100, na.rm = T),
    Wild_sd = sd(relabun[diet == "Wild"] * 100, na.rm = T),
    Cap_mean = mean(relabun[region == "Nafarroa"] * 100, na.rm = T),
    Cap_sd = sd(relabun[region == "Nafarroa"] * 100, na.rm = T)
  ) %>%
  mutate(
    Total = str_c(round(total_mean, 2), "±", round(total_sd, 2)),
    Wild = str_c(round(Wild_mean, 2), "±", round(Wild_sd, 2)),
    Captive = str_c(round(Cap_mean, 2), "±", round(Cap_sd, 2))
  ) %>%
  arrange(-total_mean) %>%
  dplyr::select(family, Total, Wild, Captive) %>%
  paged_table()

5.2.3.2 Origin and Diet

family_summary %>%
  group_by(family) %>%
  summarise(
    total_mean = mean(relabun * 100, na.rm = T),
    total_sd = sd(relabun * 100, na.rm = T),
    Wild_mean = mean(relabun[diet == "Wild"] * 100, na.rm = T),
    Wild_sd = sd(relabun[diet == "Wild"] * 100, na.rm = T),
    Pre_grass_mean = mean(relabun[diet == "Pre_grass"] * 100, na.rm = T),
    Pre_grass_sd = sd(relabun[diet == "Pre_grass"] * 100, na.rm = T),
    Post_grass_mean = mean(relabun[diet == "Post_grass"] * 100, na.rm = T),
    Post_grass_sd = sd(relabun[diet == "Post_grass"] * 100, na.rm = T)
  ) %>%
  mutate(
    Total = str_c(round(total_mean, 2), "±", round(total_sd, 2)),
    Wild = str_c(round(Wild_mean, 2), "±", round(Wild_sd, 2)),
    Pre_grass = str_c(round(Pre_grass_mean, 2), "±", round(Pre_grass_sd, 2)),
    Post_grass = str_c(round(Post_grass_mean, 2), "±", round(Post_grass_sd, 2))
  ) %>%
  arrange(-total_mean) %>%
  dplyr::select(family, Total, Wild, Pre_grass, Post_grass) %>%
  paged_table()
family_arrange <- family_summary %>%
    group_by(family) %>%
    summarise(mean=sum(relabun)) %>%
    arrange(-mean) %>%
    select(family) %>%
    pull()

family_summary %>%
  left_join(genome_metadata %>% select(family, phylum) %>% unique(), by = join_by(family == family)) %>%
  filter(family %in% family_arrange[1:20]) %>%
  mutate(family = factor(family, levels = rev(family_arrange[1:20]))) %>%
  filter(relabun > 0) %>%
  ggplot(aes(x=relabun, y=family, group=family, color=phylum)) +
  scale_color_manual(values = phylum_colors[-8]) +
  geom_jitter(alpha = 0.5) +
  facet_grid(. ~ diet) +
  theme_minimal() +
  labs(y = "Family", x = "Relative abundance", color = "Phylum")

5.2.4 Genus

genus_summary <- genome_counts_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>% 
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>% 
  left_join(sample_metadata, by = join_by(sample == sample)) %>% 
  left_join(genome_metadata, by = join_by(genome == genome)) %>% 
  group_by(sample,phylum,genus, diet) %>%
  summarise(relabun=sum(count)) %>%
  filter(genus != "g__") %>%
  mutate(genus= sub("^g__", "", genus))
genus_summary$diet <- factor(genus_summary$diet, levels=c("Pre_grass", "Post_grass", "Wild"))

5.2.5 Origin and diet

genus_summary %>%
  group_by(genus) %>%
  summarise(
    total_mean = mean(relabun * 100, na.rm = T),
    total_sd = sd(relabun * 100, na.rm = T),
    Wild_mean = mean(relabun[diet == "Wild"] * 100, na.rm = T),
    Wild_sd = sd(relabun[diet == "Wild"] * 100, na.rm = T),
    Pre_grass_mean = mean(relabun[diet == "Pre_grass"] * 100, na.rm = T),
    Pre_grass_sd = sd(relabun[diet == "Pre_grass"] * 100, na.rm = T),
    Post_grass_mean = mean(relabun[diet == "Post_grass"] * 100, na.rm = T),
    Post_grass_sd = sd(relabun[diet == "Post_grass"] * 100, na.rm = T)
  ) %>%
  mutate(
    Total = str_c(round(total_mean, 2), "±", round(total_sd, 2)),
    Wild = str_c(round(Wild_mean, 2), "±", round(Wild_sd, 2)),
    Pre_grass = str_c(round(Pre_grass_mean, 2), "±", round(Pre_grass_sd, 2)),
    Post_grass = str_c(round(Post_grass_mean, 2), "±", round(Post_grass_sd, 2))
  ) %>%
  arrange(-total_mean) %>%
  dplyr::select(genus, Total, Wild, Pre_grass, Post_grass) %>%
  paged_table()
genus_arrange <- genus_summary %>%
    group_by(genus) %>%
    summarise(mean=sum(relabun)) %>%
    filter(genus != "g__")%>%
    arrange(-mean) %>%
    select(genus) %>%
    mutate(genus= sub("^g__", "", genus)) %>%
    pull()

genus_summary_sort <- genus_summary %>%
    group_by(genus) %>%
    summarise(mean=mean(relabun, na.rm=T),sd=sd(relabun, na.rm=T)) %>%
    arrange(-mean) 

genus_summary %>%
  mutate(genus=factor(genus, levels=rev(genus_summary_sort %>% pull(genus)))) %>%
  filter(relabun > 0) %>%
  ggplot(aes(x=relabun, y=genus, group=genus, color=phylum)) +
  scale_color_manual(values=phylum_colors) +
  geom_jitter(alpha=0.5) + 
  facet_grid(.~diet)+
  theme_minimal() + 
  theme(axis.text.y = element_text(size=6))+
  labs(y="Family", x="Relative abundance", color="Phylum")

5.2.6 Genus and species annotation

Number of MAGs without species-level annotation

genome_metadata %>%
  filter(species == "s__") %>%
  summarize(Mag_nospecies = n())%>%
  select(Mag_nospecies) %>% 
  pull()
[1] 749
total_mag_phylum <- genome_metadata %>%
  group_by(phylum) %>%
  summarize(count_total = n())
genome_metadata %>%
  filter(species == "s__") %>%
  group_by(phylum) %>%
  summarize(count_nospecies = n()) %>% 
  left_join(total_mag_phylum, by = join_by(phylum == phylum)) %>% 
  mutate(percentage=100*count_nospecies/count_total) %>% 
  tt()
phylum count_nospecies count_total percentage
p__Actinomycetota 15 15 100.00000
p__Bacillota 52 53 98.11321
p__Bacillota_A 516 526 98.09886
p__Bacillota_B 2 2 100.00000
p__Bacillota_C 6 9 66.66667
p__Bacteroidota 43 127 33.85827
p__Campylobacterota 1 1 100.00000
p__Cyanobacteriota 6 7 85.71429
p__Desulfobacterota 12 12 100.00000
p__Patescibacteria 13 13 100.00000
p__Pseudomonadota 32 39 82.05128
p__Spirochaetota 2 2 100.00000
p__Synergistota 18 18 100.00000
p__Verrucomicrobiota 31 35 88.57143

Percentage of MAGs without species-level annotation

nmags <- nrow(genome_counts)
nonspecies <- genome_metadata %>%
    filter(species == "s__") %>%
    nrow()
perct <- nonspecies*100/nmags
perct
[1] 87.09302

Number of MAGs without genera-level annotation

nongenera <- genome_metadata %>%
    filter(genus == "g__") %>%
    nrow()
cat(nongenera)
79