Chapter 3 Data statistics

load("data/data.Rdata")

3.1 filter samples with high host data

sample_metadata <- sample_metadata%>%
  filter(!sample %in% c("EHI02721", "EHI02712", "EHI02700", "EHI02720", "EHI02749", "EHI02719", "EHI02729", "EHI02715", "EHI02722"))

read_counts <- read_counts %>% 
    select(one_of(c("genome",sample_metadata$sample)))%>%
  filter(rowSums(. != 0, na.rm = TRUE) > 0) %>%
  select_if(~!all(. == 0))

genome_metadata <- genome_metadata %>% 
  semi_join(., read_counts, by = "genome") %>% 
  arrange(match(genome,read_counts$genome))

genome_tree <- keep.tip(genome_tree, tip=genome_metadata$genome) # keep only MAG tips

#load("data/genome_gifts.Rdata")

3.2 Sequencing reads statistics

sample_metadata %>% 
    summarise(Total=sum(reads_post_fastp * 150 / 1000000000) %>% round(2), 
              mean=mean(reads_post_fastp * 150 / 1000000000) %>% round(2),
              sd=sd(reads_post_fastp * 150 / 1000000000) %>% round(2)) %>%
    unite("Average",mean, sd, sep = " ± ", remove = TRUE) %>%
    tt()
Total Average
222.06 4.93 ± 1.08

3.3 DNA fractions

sequence_fractions <- read_counts %>%
  pivot_longer(-genome, names_to = "sample", values_to = "value") %>%
  group_by(sample) %>%
  summarise(mags = sum(value)) %>%
    left_join(sample_metadata, by = join_by(sample == sample)) %>%
    select(sample,mags,metagenomic_bases,host_bases,bases_lost_fastp_percent) %>%
    mutate(mags_bases = mags*146) %>%
    mutate(lowqual_bases = ((metagenomic_bases+host_bases)/(1-bases_lost_fastp_percent))-(metagenomic_bases+host_bases)) %>%
    mutate(unmapped_bases = metagenomic_bases - mags_bases) %>%
    mutate(unmapped_bases = ifelse(unmapped_bases < 0, 0, unmapped_bases)) %>%
    select(sample, lowqual_bases, host_bases, unmapped_bases, mags_bases)

sequence_fractions %>%
  mutate_at(vars(-sample), ~./1000000000) %>%
  rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
  tt()
Sample Low quality Mapped to host Unmapped Mapped to MAGs
EHI01861 0.2815721 0.011886522 1.2899665 2.94492935
EHI01963 0.2379339 0.377956197 0.7686364 2.26737109
EHI01964 0.3498799 2.739970327 0.5635412 1.32056898
EHI01965 0.3583169 0.398887483 1.3439432 3.82431144
EHI01969 0.2444974 0.026139904 0.7635264 2.53338864
EHI01971 0.3382550 0.014993899 1.0255034 3.94601821
EHI01972 0.4703327 0.054667824 1.3684974 5.49040300
EHI01973 0.3347793 0.064729480 1.5130592 4.84065102
EHI01974 0.3671355 0.052747090 0.8921925 2.95848005
EHI01975 0.4305111 0.123580711 1.9003492 5.31791948
EHI01976 0.3856202 0.011574334 1.2666400 3.26276449
EHI01977 0.4158680 0.747098387 1.7867623 3.87494103
EHI01978 0.3929906 0.004864216 1.2394723 3.63470402
EHI01980 0.3711274 0.013798033 1.7409090 5.56779249
EHI01981 0.3633649 0.020488667 1.2293197 3.03525532
EHI01983 0.2418940 0.146776195 0.8284920 2.84914941
EHI01984 0.4431283 0.007477209 1.2629388 5.76052943
EHI01986 0.3104557 0.016788774 0.9597449 3.38641569
EHI01987 0.4292930 0.001120181 0.9160842 3.32218036
EHI01988 0.2963428 0.197271999 1.0599471 3.92577283
EHI01990 0.3186345 0.010910854 1.0992821 3.87615385
EHI01991 0.3161992 0.021159611 1.1871737 2.82147205
EHI01993 0.2026808 0.006054774 0.5074436 2.59915566
EHI01994 0.3359364 0.064796021 0.9475828 3.88007790
EHI02699 0.2717760 0.007940738 1.1803471 3.05661322
EHI02700 0.3187046 3.815365599 0.7469281 0.43060671
EHI02701 0.2059134 0.135887208 0.9201707 2.92496093
EHI02705 0.2326028 0.134644975 1.0103929 3.05008177
EHI02707 0.2133975 0.197419011 1.0367591 2.56043485
EHI02708 0.3287951 0.247517459 1.4970876 3.72574889
EHI02710 0.2905179 0.004863423 1.1544139 3.25985646
EHI02712 0.3735243 4.162982418 0.5973263 0.12825939
EHI02714 0.2229262 0.166057029 0.7697399 3.24492767
EHI02715 0.3350685 3.050470506 0.9265001 0.43417816
EHI02718 0.2124597 1.622070570 0.4994065 1.33393017
EHI02719 0.3330425 2.607725990 1.8321792 0.33907536
EHI02720 0.3031357 3.089773324 0.7364466 0.19044999
EHI02721 0.3228625 3.677630914 0.6704712 0.08601152
EHI02722 0.3095520 2.799929187 0.7276464 0.41217450
EHI02723 0.2730593 0.113227422 0.9750562 3.21338043
EHI02724 0.3551211 0.121469083 0.9809314 3.69454095
EHI02728 0.3109121 1.043367740 0.9818139 2.94984474
EHI02729 0.3043389 2.024187975 1.5671125 0.47668212
EHI02749 0.4389880 4.177705067 0.7870159 0.53921786
EHI02751 0.4202354 3.712236730 0.7804933 1.77651793
sequence_fractions %>%
  mutate_at(vars(-sample), ~./1000000000) %>%
  rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
  summarise(across(where(is.numeric), mean, na.rm = TRUE)) %>%
  tt()
Low quality Mapped to host Unmapped Mapped to MAGs
0.3247485 0.9344047 1.063094 2.779287
sequence_fractions %>%
    pivot_longer(!sample, names_to = "fraction", values_to = "value") %>%
    mutate(value = value / 1000000000) %>%
    mutate(fraction = factor(fraction, levels = c("lowqual_bases","host_bases","unmapped_bases","mags_bases"))) %>%
  
    ggplot(., aes(x = sample, y = value, fill=fraction)) +
        geom_bar(position="stack", stat = "identity") +
      scale_fill_manual(name="Sequence type",
                    breaks=c("lowqual_bases","host_bases","unmapped_bases","mags_bases"),
                    labels=c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
                    values=c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c"))+
        labs(x = "Samples", y = "Amount of data (GB)") +
        theme_classic() +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "bottom")

3.4 Recovered microbial fraction

singlem_table <- sequence_fractions %>%
    mutate(mags_proportion = round((mags_bases / (mags_bases + unmapped_bases))*100,2)) %>%
  left_join(sample_metadata, by = join_by(sample == sample))  %>%
    mutate(singlem_proportion = round(singlem_fraction,2)) %>%
    select(sample,mags_proportion,singlem_proportion) %>%
    mutate(mags_proportion = ifelse(singlem_proportion == 0, 0, mags_proportion)) %>% #convert zeros to NA
    mutate(singlem_proportion = ifelse(singlem_proportion == 0, NA, singlem_proportion)) %>% #convert zeros to NA
    mutate(singlem_proportion = ifelse(singlem_proportion < mags_proportion, NA, singlem_proportion)) %>% #if singlem is smaller, then NA, to simplify plot
    mutate(singlem_proportion = ifelse(singlem_proportion > 100, 100, singlem_proportion)) #simplify

singlem_table %>%
    pivot_longer(!sample, names_to = "proportion", values_to = "value") %>%
    mutate(proportion = factor(proportion, levels = c("mags_proportion","singlem_proportion"))) %>%
    ggplot(., aes(x = value, y = sample, color=proportion)) +
            geom_line(aes(group = sample), color = "#f8a538") +
            geom_point() +
      scale_color_manual(name="Proportion",
                    breaks=c("mags_proportion","singlem_proportion"),
                    labels=c("Recovered","Estimated"),
                    values=c("#52e1e8", "#876b53"))+
            theme_classic() +
            labs(y = "Samples", x = "Prokaryotic fraction (%)") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "right")

damr <- singlem_table %>%
  mutate(damr=ifelse(is.na(singlem_proportion),100,mags_proportion/singlem_proportion*100)) %>%
  select(sample,damr)

damr %>% 
  left_join(sample_metadata, by = join_by(sample == sample)) %>%
  filter(diet=="Wild")%>% 
  summarise(mean=mean(damr),sd=sd(damr)) %>% 
  tt()
mean sd
71.22498 34.9342
damr %>% 
  left_join(sample_metadata, by = join_by(sample == sample)) %>%
  filter(region=="Nafarroa")%>% 
  summarise(mean=mean(damr),sd=sd(damr)) %>% 
  tt()
mean sd
100 0
damr %>% 
  tt()
sample damr
EHI01861 100.00000
EHI01963 100.00000
EHI01964 100.00000
EHI01965 100.00000
EHI01969 100.00000
EHI01971 100.00000
EHI01972 100.00000
EHI01973 100.00000
EHI01974 100.00000
EHI01975 100.00000
EHI01976 100.00000
EHI01977 100.00000
EHI01978 100.00000
EHI01980 100.00000
EHI01981 100.00000
EHI01983 100.00000
EHI01984 100.00000
EHI01986 100.00000
EHI01987 100.00000
EHI01988 100.00000
EHI01990 100.00000
EHI01991 100.00000
EHI01993 100.00000
EHI01994 100.00000
EHI02699 100.00000
EHI02700 45.95376
EHI02701 100.00000
EHI02705 100.00000
EHI02707 100.00000
EHI02708 100.00000
EHI02710 100.00000
EHI02712 24.41989
EHI02714 100.00000
EHI02715 40.58764
EHI02718 100.00000
EHI02719 20.44770
EHI02720 24.69655
EHI02721 15.61599
EHI02722 46.16956
EHI02723 100.00000
EHI02724 100.00000
EHI02728 100.00000
EHI02729 30.17208
EHI02749 47.66147
EHI02751 100.00000
damr %>% 
  filter(damr>95) %>%
  nrow()
[1] 36
damr %>% 
  summarise(mean=mean(damr),sd=sd(damr)) %>% 
  tt()
mean sd
86.57166 27.6675