Chapter 3 Data statistics

load("data/data.Rdata")
extra_metadata <- read_csv("data/Bombina_microbiome_metadata.csv") %>% 
  rename(animal=sample) %>% 
  rename(sample=EHI_code)
filtered_reads <- read_counts %>% 
  column_to_rownames(., "genome") %>% 
  colSums() %>% 
  as.data.frame() %>% 
  rename("depth"=".") %>% 
  rownames_to_column(., "sample") %>% 
  left_join(extra_metadata, by="sample") %>% 
  arrange(animal) %>% 
  group_by(animal) %>%
  slice_max(order_by = depth, n = 1, with_ties = FALSE)

3.1 Sequencing reads statistics

sample_metadata %>%
  filter(sample %in% filtered_reads$sample) %>% 
  summarise(Total=sum(reads_post_fastp * 150 / 1000000000) %>% round(2), 
            mean=mean(reads_post_fastp * 150 / 1000000000) %>% round(2),
            sd=sd(reads_post_fastp * 150 / 1000000000) %>% round(2)) %>%
  unite("Average",mean, sd, sep = " ± ", remove = TRUE) %>%
  tt()
Total Average
448.68 11.5 ± 5.92

3.2 DNA fractions

sequence_fractions <- read_counts %>%
  select(all_of(c("genome",filtered_reads$sample))) %>% 
  pivot_longer(-genome, names_to = "sample", values_to = "value") %>%
  group_by(sample) %>%
  summarise(mags = sum(value)) %>%
    left_join(sample_metadata, by = join_by(sample == sample)) %>%
    select(sample,mags,metagenomic_bases,host_bases,bases_lost_fastp_percent) %>%
    mutate(mags_bases = mags*146) %>%
    mutate(lowqual_bases = ((metagenomic_bases+host_bases)/(1-bases_lost_fastp_percent))-(metagenomic_bases+host_bases)) %>%
    mutate(unmapped_bases = metagenomic_bases - mags_bases) %>%
    mutate(unmapped_bases = ifelse(unmapped_bases < 0, 0, unmapped_bases)) %>%
    select(sample, lowqual_bases, host_bases, unmapped_bases, mags_bases)

sequence_fractions %>%
  mutate_at(vars(-sample), ~./1000000000) %>%
  rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
  tt()
Sample Low quality Mapped to host Unmapped Mapped to MAGs
EHI01695 0.3461828 0.04127063 4.7400240 1.38511397
EHI01696 0.4424049 0.22406116 5.2067968 4.16427478
EHI01697 0.4538495 1.90941404 2.6774953 4.97927029
EHI01698 0.3573398 0.09858151 1.5648997 5.72228546
EHI01699 0.7050913 0.96951518 9.2592759 3.23009714
EHI01700 0.6530752 3.56898382 5.4267930 2.69687302
EHI01701 0.6766729 1.02592777 9.5467908 5.35279961
EHI01702 2.0469307 6.03379949 3.6116400 0.73573430
EHI02563 0.1714629 0.64207433 0.9746465 0.23628290
EHI02564 0.3836564 2.85852293 1.6403990 0.51015831
EHI02569 0.3707787 1.83758111 2.4210436 0.23655095
EHI02570 0.4532710 4.17998256 0.9421899 0.06150104
EHI02661 0.5505824 3.58589734 1.1707615 4.16354989
EHI02662 0.9068807 0.23641569 12.7506346 3.66885561
EHI02663 1.6590876 8.68528216 17.5021482 1.77573172
EHI02664 0.8797522 1.67842756 10.8189021 3.59334368
EHI02665 1.3530223 8.45927663 10.8622509 3.56494434
EHI02666 0.7772609 5.05533393 8.5615855 1.44358288
EHI02667 0.6646793 2.25720854 7.5154444 2.54387451
EHI02668 0.7713225 0.61533293 11.1440351 1.60854376
EHI02669 0.6186671 2.19198064 6.2047538 1.73961044
EHI02670 0.7066807 6.48534358 2.8398548 1.35885748
EHI02671 0.6099077 6.69793619 4.4765836 0.25575900
EHI02672 0.7555689 4.27773321 6.8023493 1.22402779
EHI02673 1.1212504 6.00359678 14.4151757 1.25960215
EHI02674 1.3226743 1.14702137 11.5001905 10.53462956
EHI02675 0.7725862 11.50682250 2.0966824 0.23969346
EHI02676 0.5425090 5.29023643 3.2681275 0.40619273
EHI02677 0.7336743 0.36706295 14.2915536 0.28614657
EHI02678 0.6623755 1.47718139 8.1501070 0.98578572
EHI02679 0.6506381 0.43512590 10.2627640 1.23309249
EHI02680 0.3328090 3.88288708 1.4210251 0.14881605
EHI02681 0.3389079 1.76220214 3.4155844 0.64653370
EHI02683 0.3769425 2.40063549 0.9947723 2.00839381
EHI02684 0.4055536 3.04660391 3.1401241 0.21321577
EHI02687 0.4719383 4.82008929 2.4576517 0.25446676
EHI02688 0.6297850 0.79285573 8.3707803 2.60182892
EHI02689 0.2969110 1.11480534 2.0197544 0.19344212
EHI02690 0.5017530 2.70043724 2.3286060 2.47396095
sequence_fractions %>%
    pivot_longer(!sample, names_to = "fraction", values_to = "value") %>%
  left_join(., extra_metadata, by="sample") %>% 
    mutate(value = value / 1000000000) %>%
    mutate(fraction = factor(fraction, levels = c("lowqual_bases","host_bases","unmapped_bases","mags_bases"))) %>%
    ggplot(., aes(x = animal, y = value, fill=fraction)) +
  geom_bar(position="stack", stat = "identity") +
  scale_fill_manual(name="Sequence type",
                    breaks=c("lowqual_bases","host_bases","unmapped_bases","mags_bases"),
                    labels=c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
                    values=c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c"))+
  facet_grid(~ Bd_status, scale="free", space="free")+
  labs(x = "Samples", y = "Amount of data (GB)") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "bottom")

3.3 Recovered microbial fraction

singlem_table <- sequence_fractions %>%
    mutate(mags_proportion = round((mags_bases / (mags_bases + unmapped_bases))*100,2)) %>%
    left_join(sample_metadata, by = join_by(sample == sample))  %>%
    mutate(singlem_proportion = round(singlem_fraction*100,2)) %>%
    select(sample,mags_proportion,singlem_proportion) %>%
    mutate(mags_proportion = ifelse(singlem_proportion == 0, 0, mags_proportion)) %>% #convert zeros to NA
    mutate(singlem_proportion = ifelse(singlem_proportion == 0, NA, singlem_proportion)) %>% #convert zeros to NA
    mutate(singlem_proportion = ifelse(singlem_proportion < mags_proportion, NA, singlem_proportion)) %>% #if singlem is smaller, then NA, to simplify plot
    mutate(singlem_proportion = ifelse(singlem_proportion > 100, 100, singlem_proportion)) #simplify

singlem_table %>%
    pivot_longer(!sample, names_to = "proportion", values_to = "value") %>%
    left_join(sample_metadata, by = join_by(sample == sample))  %>%
    mutate(proportion = factor(proportion, levels = c("mags_proportion","singlem_proportion"))) %>%
    ggplot(., aes(x = value, y = sample, color=proportion)) +
            geom_line(aes(group = sample), color = "#f8a538") +
            geom_point() +
      scale_color_manual(name="Proportion",
                    breaks=c("mags_proportion","singlem_proportion"),
                    labels=c("Recovered","Estimated"),
                    values=c("#52e1e8", "#876b53"))+
      facet_nested(species + sample_type ~ ., scales="free",space="free")+
            theme_classic() +
            labs(y = "Samples", x = "Prokaryotic fraction (%)") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),
              legend.position = "right",
              strip.background.y=element_rect(color = NA, fill= "#f4f4f4"))