Chapter 4 Data statistics


4.1 Sequencing reads statistics

# Summarise the post-filtering sequencing effort across all samples.
# Read counts are converted with the factor 150 / 1e9 (presumably 150 bases
# per read, expressed in billions of bases -- confirm against the sequencing
# setup). The conversion is computed once and reused for all three statistics.
sample_preprocessing %>%
  mutate(gb_post_filt = reads_post_filt * 150 / 1e9) %>%
  summarise(
    Total = round(sum(gb_post_filt), 2),
    mean = round(mean(gb_post_filt), 2),
    sd = round(sd(gb_post_filt), 2)
  ) %>%
  # Collapse mean and sd into a single "mean ± sd" display column.
  # The pipeline ends here: the original trailing pipe had no right-hand
  # side, which made the chunk unparseable.
  unite("Average", mean, sd, sep = " ± ", remove = TRUE)
Total Average
712.18 7.83 ± 8.08

4.2 DNA fractions

# Build a per-sample table splitting the sequenced bases into four fractions:
# low-quality, host-mapped, unmapped and MAG-mapped.
sequence_fractions <- read_counts %>%
  # Reshape to one row per genome/sample pair, then total the counts per sample.
  pivot_longer(cols = -genome, names_to = "sample", values_to = "value") %>%
  group_by(sample) %>%
  summarise(mags = sum(value)) %>%
  # Attach the per-sample preprocessing statistics.
  left_join(sample_preprocessing, by = join_by(sample == sample)) %>%
  mutate(
    # Summed counts are scaled by 150 (presumably bases per read -- confirm).
    mags_bases = mags * 150,
    # Bases removed by filtering.
    lowqual_bases = bases_pre_filt - bases_post_filt,
    # Metagenomic bases not assigned to MAGs; negative differences are
    # clamped to zero.
    unmapped_bases = pmax(metagenomic_bases - mags_bases, 0)
  ) %>%
  select(sample, lowqual_bases, host_bases, unmapped_bases, mags_bases)

# Display the per-sample fractions scaled by 1e9 (billions of bases).
# across() replaces the superseded mutate_at(), and columns are renamed by
# name rather than by position so a change in column order cannot silently
# mislabel a fraction. The pipeline ends at rename(): the original trailing
# pipe had no right-hand side, which made the chunk unparseable.
sequence_fractions %>%
  mutate(across(-sample, ~ .x / 1e9)) %>%
  rename(
    "Sample" = sample,
    "Low quality" = lowqual_bases,
    "Mapped to host" = host_bases,
    "Unmapped" = unmapped_bases,
    "Mapped to MAGs" = mags_bases
  )
Sample Low quality Mapped to host Unmapped Mapped to MAGs
E18 4.7850204 6.9637290 24.05986617 0.52864665
E25 1.5376473 7.9804608 6.00871944 0.85221990
E27 0.4230030 0.0911577 1.95160083 0.44069160
E30 1.7928992 1.4413299 4.00947792 5.12918235
E31 0.6816818 0.0320553 0.50570530 2.84763450
E34 2.1948529 0.8249310 11.67502744 2.54218275
E44 0.4234418 5.9270472 0.44183940 0.24959295
E48 2.3197861 1.9469865 7.44564809 5.90044110
E56 0.5108719 0.0847401 1.09609707 1.82499015
E58 0.4880990 2.1264312 1.36506017 0.11759265
H06 0.2841785 2.0398764 1.46660757 0.01427955
H07 1.4500404 10.9416210 3.39146197 0.01941135
H08 0.5011987 0.2131668 0.57356153 0.45570180
H09 1.2653958 10.8166410 7.00507483 0.15877860
H10 0.7942211 1.3782558 4.03545973 0.11882340
H12 2.5813911 8.4009948 4.51254046 0.84020835
H16 2.1486462 4.0113864 9.56424671 3.69521370
H19 0.5557962 4.2411288 2.50314959 0.02286150
H20 0.9677842 4.3890999 3.78083553 0.07778835
H23 0.9420728 3.9595926 1.30102424 0.29551170
H25 1.0738812 7.7087100 3.42255644 0.68422815
H32 0.5427161 5.8415880 1.38975295 0.06290070
H34 1.3188004 12.4245048 5.12752821 0.02616405
H37 2.5737627 27.8754720 9.15837446 0.25084785
H39 0.2938515 2.4124689 0.75581958 0.02116935
H40 1.1315544 3.8592006 5.83728129 0.42791340
H43 0.7161526 6.3701718 2.50320009 0.03324300
H45 0.9215094 1.5407343 3.94772969 0.81685470
H47 0.9581476 5.6254728 2.15816743 0.32372385
P01 0.5733407 1.8086574 1.33450269 0.76149270
P05 1.1668645 9.3271338 0.63288528 0.36958890
P09 0.7574713 7.1642826 0.52521364 0.04058895
P14 0.5034011 3.8678286 0.03866504 0.64239540
P25 0.2950900 4.5486294 0.50927300 0.38403045
P33 3.3222731 33.2508960 7.86558092 0.43485525
P34 0.3459813 0.8618043 1.84351514 0.01439085
P36 1.2281449 10.5844332 0.28646419 0.71007735
P43 3.4204299 3.2932512 7.78711290 13.30940625
P45 1.6790221 2.9649723 5.42286382 0.12008145
P47 0.4775963 2.8718187 1.06401988 0.80290395
P48 3.0192702 0.9945204 17.82268997 6.15753405
P51 1.8101944 8.2580928 3.46629537 2.64862665
P53 1.5688483 24.5797020 2.14349384 0.12960855
P58 0.7963638 6.4119048 1.11305236 0.07796910
P60 2.3930517 33.2359848 8.57518064 0.20491080
P64 0.2656280 0.3023889 1.17634560 0.47208750
P65 0.4114934 0.2113239 0.89754587 1.90757895
P67 1.7026024 17.3557032 0.62786285 2.02223145
P72 0.2702016 3.9134910 0.57524887 0.20525790
P75 1.4392490 2.6274414 6.26632261 1.62440925
P79 1.7376207 12.1993284 7.73575004 1.52807535
# Stacked bar chart of the four sequence fractions per sample.
sequence_fractions %>%
  pivot_longer(cols = -sample, names_to = "fraction", values_to = "value") %>%
  mutate(
    # Same 1e9 scaling as the table of fractions.
    value = value / 1e9,
    # Fix the stacking/legend order; the labels in scale_fill_manual() below
    # are matched to these levels by position.
    fraction = factor(
      fraction,
      levels = c("lowqual_bases", "host_bases", "unmapped_bases", "mags_bases")
    )
  ) %>%
  ggplot(aes(x = sample, y = value, fill = fraction)) +
  geom_col(position = "stack") +
  scale_fill_manual(
    name = "Sequence type",
    labels = c("Low quality", "Mapped to host", "Unmapped", "Mapped to MAGs"),
    values = c("#CCCCCC", "#bcdee1", "#d8b8a3", "#93655c")
  ) +
  labs(x = "Samples", y = "Amount of data (GB)") +
  theme_classic() +
  theme(
    # Rotate the sample labels so they do not overlap.
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
    legend.position = "bottom"
  )

bat_species mean_host_perc sd_host_perc max_host_perc min_host_perc
Eptesicus bottae 17.20026 24.00217 74.55613 0.39599794
Hypsugo ariel 36.45110 16.35553 58.92606 6.53771139
Pipistrellus kuhlii 43.54900 27.90509 78.35920 1.81265533
NA 45.52395 33.97328 92.13648 0.02651141