Chapter 4 Data statistics

4.1 Sequencing reads statistics

# Sequencing effort summary: overall total plus mean ± sd across samples.
# Every read count is multiplied by 150 and divided by 1e9 before being
# aggregated (presumably 150 bp reads expressed in gigabases -- confirm).
sample_metadata %>%
  mutate(gigabases = reads_post_fastp * 150 / 1000000000) %>%
  summarise(
    Total = round(sum(gigabases), 2),
    mean = round(mean(gigabases), 2),
    sd = round(sd(gigabases), 2)
  ) %>%
  # Fuse the two rounded statistics into a single "mean ± sd" text column.
  unite("Average", mean, sd, sep = " ± ", remove = TRUE) %>%
  tt()
Total Average
652.04 6.21 ± 2.77

4.2 Sequencing depth

# Per-sample totals of the count table: the `genome` column is moved into the
# row names so that only the remaining (sample) columns are summed.
sequencing_depth <- colSums(
  column_to_rownames(read_counts, var = "genome")
)

4.3 DNA fractions

# Split each sample's data into four fractions: low quality, mapped to host,
# unmapped, and mapped to MAGs. One row per sample.
sequence_fractions <- read_counts %>%
  pivot_longer(-genome, names_to = "sample", values_to = "value") %>%
  group_by(sample) %>%
  summarise(mags = sum(value)) %>%
  left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
  select(
    sample,
    mags,
    metagenomic_bases,
    host_bases,
    bases_lost_fastp_percent
  ) %>%
  mutate(
    # Per-sample MAG total scaled by 146 (presumably the mean read length
    # after trimming -- TODO confirm).
    mags_bases = mags * 146,
    retained_bases = metagenomic_bases + host_bases,
    # Amount removed before mapping: retained / (1 - lost) - retained.
    # NOTE(review): `bases_lost_fastp_percent` is used as a 0-1 fraction here
    # despite its name -- confirm against the metadata.
    lowqual_bases =
      (retained_bases / (1 - bases_lost_fastp_percent)) - retained_bases,
    unmapped_bases = metagenomic_bases - mags_bases,
    # The MAG estimate can exceed the metagenomic total; floor at zero.
    unmapped_bases = ifelse(unmapped_bases < 0, 0, unmapped_bases)
  ) %>%
  select(sample, lowqual_bases, host_bases, unmapped_bases, mags_bases)

# Print the per-sample fractions as a table, with every numeric column divided
# by 1e9. `across()` replaces the superseded `mutate_at(vars(...))` form; the
# selection (everything except `sample`) and the result are unchanged.
sequence_fractions %>%
  mutate(across(-sample, ~ .x / 1000000000)) %>%
  # Columns are renamed by position, so this relies on the column order fixed
  # by the final `select()` that builds `sequence_fractions`.
  rename(
    "Sample" = 1,
    "Low quality" = 2,
    "Mapped to host" = 3,
    "Unmapped" = 4,
    "Mapped to MAGs" = 5
  ) %>%
  tt()
Sample Low quality Mapped to host Unmapped Mapped to MAGs
EHI00069 1.1441521 0.920532571 2.0517957 4.1058263
EHI00070 0.2348919 0.937599880 0.8332809 1.6364476
EHI00072 0.3717090 0.078319449 0.9507999 4.0430716
EHI00073 0.1504762 0.007300164 0.6458066 1.8453666
EHI00074 0.5700275 0.034671349 1.7906564 5.7526051
EHI00075 0.2702205 0.444390387 0.9599202 2.4896265
EHI00076 0.3350547 0.081391903 1.4172925 3.8389468
EHI00077 0.1353495 0.641770627 0.3885242 1.3947384
EHI00079 0.3377906 2.721037061 0.5653554 0.8084465
EHI00080 0.7291964 1.605187153 2.3611523 2.6883254
EHI00081 0.2184180 0.727448090 0.5615342 1.3358755
EHI00085 0.2300087 0.212261834 0.5665817 1.9281604
EHI00086 0.3471176 0.417593612 1.0770869 1.7440344
EHI00088 0.2278663 0.071769210 0.5556301 3.0141044
EHI00089 0.1751910 0.034915171 0.5162821 1.9576155
EHI00091 0.2622745 0.230039482 0.8255133 2.4924602
EHI00092 0.3917724 0.170842450 1.0255541 4.9181624
EHI00093 0.3725739 0.199633318 1.1938025 2.3137473
EHI00095 0.1768550 0.255990148 0.3449738 1.9242079
EHI00097 0.5249214 0.366551383 0.8028468 2.9685729
EHI00098 0.3176395 0.040047629 1.7271586 3.1498120
EHI00100 0.4386907 2.398470758 0.8228622 4.1093643
EHI00101 0.7257361 0.451928365 4.9291736 1.9943098
EHI00103 0.2036111 0.843529516 0.5714195 1.3395113
EHI00104 0.1769983 0.643701669 0.6807099 1.2574891
EHI00105 0.2711051 0.005694785 0.9087805 2.7311372
EHI00106 1.1082626 1.058626752 1.8709694 2.0746217
EHI00107 0.3818355 0.184257358 1.5836979 3.0592637
EHI00108 0.3420046 0.394171066 0.5356632 2.7441687
EHI00110 0.3648881 0.591747269 0.5588077 2.6145261
EHI00111 0.4392502 0.151954279 1.1992388 3.5773008
EHI00112 0.7134927 0.085183185 1.1328903 3.6469985
EHI00113 0.8620893 0.617092903 1.4939464 7.3254896
EHI00114 0.4127445 0.087174427 2.3985081 4.6641659
EHI00115 0.6290438 0.062352261 2.6081814 3.0085937
EHI00116 0.4130247 0.039596686 0.8559983 4.7270478
EHI00117 0.6850197 0.423816725 1.2454024 7.0782364
EHI00118 0.3963952 0.784391517 0.9049477 3.7006953
EHI00119 0.4448016 0.038727564 1.5277021 4.6761692
EHI00120 0.3549466 0.051038023 0.7165254 3.8122875
EHI00121 0.2487524 1.196143747 0.9172490 1.9448859
EHI00122 0.3967333 0.056247498 1.7140526 5.3880487
EHI00124 0.3101617 0.113171041 1.6986936 3.2732918
EHI00125 0.3008464 0.313488992 1.2381544 3.1620784
EHI00128 0.3688688 0.149336152 1.0558709 2.9648899
EHI00129 0.2827708 0.212726094 0.7352845 2.7738704
EHI00130 0.2917071 0.082344877 1.0170695 2.1151693
EHI00131 0.1940832 0.943415382 0.4863996 1.2048589
EHI00133 0.2569782 0.429506925 0.3484713 3.3747982
EHI00134 0.5620395 1.988142143 1.9520876 3.9427347
EHI00137 0.2948339 0.250489910 1.0341318 2.4844583
EHI00138 0.2673241 0.179238573 0.5826993 2.8578460
EHI00139 0.3001940 0.034225947 0.9413847 2.1426906
EHI00176 0.7041777 0.100761385 0.7196926 2.7465205
EHI00177 0.2555111 0.016053624 0.5228890 2.1639513
EHI00178 0.2415179 0.045338274 1.3634101 2.6038297
EHI00179 0.1816121 0.023148016 0.9401620 2.3926575
EHI00180 0.5363481 0.037200326 0.7786810 2.4699635
EHI00181 0.1978972 0.301399301 0.4733087 2.4456488
EHI00422 0.1660426 0.058188268 0.4631427 2.3301737
EHI00426 0.6048076 0.374841259 1.6091057 5.6754435
EHI00428 0.4361908 0.354862015 0.9866519 3.7380622
EHI00433 0.6941201 0.730117194 1.6017157 3.7066518
EHI00438 0.2938182 0.100587490 0.8490354 4.7546269
EHI00441 0.5217550 0.202212815 2.3199030 6.8309541
EHI00451 0.4696437 0.318898071 1.2315115 5.0794188
EHI00456 0.5747753 0.097152439 0.6337192 4.7496272
EHI00458 0.5541280 0.326249839 1.7364884 4.9185106
EHI00462 0.5139086 0.330809687 1.3870295 4.8485639
EHI00464 0.5604449 0.962080506 1.5487451 5.0533364
EHI00465 0.1355889 0.041795770 0.7088416 1.4262076
EHI00467 0.4291035 0.854761759 1.6444666 3.7590494
EHI00470 0.6625583 1.679223689 1.8675738 8.1840405
EHI00472 0.3619652 0.586418138 1.8074071 3.3732380
EHI00473 0.4841385 0.477433785 1.2481742 4.8043613
EHI00477 0.4782444 0.087653441 2.7186490 3.7317223
EHI00479 0.8395454 0.074374376 1.9936025 12.8156007
EHI00480 0.7475415 0.793212202 3.1481051 4.3553735
EHI00481 0.5040521 0.094726409 1.3739735 5.3792865
EHI00483 0.4545170 1.532171008 0.8653232 4.7402692
EHI00484 0.3037426 0.269250378 1.3870200 5.2563238
EHI00488 0.4900356 2.212642337 2.0850126 4.3632065
EHI00490 0.6234248 0.180662818 3.5993286 7.8297890
EHI00493 0.4026458 2.030328521 2.7098633 2.4780854
EHI00496 0.4536382 0.415085470 1.7045368 6.1699650
EHI00499 3.5142773 2.204637514 3.4291586 3.2083353
EHI00504 0.5335873 2.167943278 1.2709452 4.9663386
EHI00506 0.7953759 0.639271744 1.7418071 8.0883653
EHI00507 0.4077748 0.136492692 1.5474048 5.5083429
EHI00512 0.6299478 8.067540048 0.5964127 1.3433168
EHI00514 0.4590217 0.336676348 1.5023882 6.2571308
EHI00518 0.4082296 0.012341521 1.0122951 5.8379043
EHI00524 0.4918121 1.320091255 2.0577845 4.8996823
EHI00525 0.3238390 0.105964164 1.1420157 5.0531429
EHI00527 0.5623443 0.128178226 2.3107744 7.6106934
EHI00529 0.3350620 0.129930230 1.3102065 5.1130080
EHI00536 0.4725239 0.010876654 1.2534820 7.7176039
EHI00537 0.3665586 0.089579032 2.4631156 1.4492223
EHI00538 0.4501201 2.638414328 0.6397030 3.1637601
EHI00541 0.7086242 0.152318595 1.4153324 6.8350770
EHI00547 1.0225445 2.648359289 7.0526142 6.1760904
EHI00566 0.5706734 0.218664855 1.7237625 7.3779017
EHI00567 0.4565067 0.133011864 1.9051503 4.4466414
EHI00568 0.5246686 0.219110833 1.7989251 3.5919103
EHI00569 0.6859077 0.169999886 2.6761990 6.7957645
# Stacked bar chart of the four sequence fractions, one bar per sample.
sequence_fractions %>%
  pivot_longer(!sample, names_to = "fraction", values_to = "value") %>%
  mutate(
    value = value / 1000000000,
    # Factor level order fixes the stacking order of the bar segments.
    fraction = factor(
      fraction,
      levels = c("lowqual_bases", "host_bases", "unmapped_bases", "mags_bases")
    )
  ) %>%
  ggplot(aes(x = sample, y = value, fill = fraction)) +
  geom_col(position = "stack") +
  scale_fill_manual(
    name = "Sequence type",
    breaks = c("lowqual_bases", "host_bases", "unmapped_bases", "mags_bases"),
    labels = c("Low quality", "Mapped to host", "Unmapped", "Mapped to MAGs"),
    values = c("#CCCCCC", "#bcdee1", "#d8b8a3", "#93655c")
  ) +
  labs(x = "Samples", y = "Amount of data (GB)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
    legend.position = "bottom"
  )

4.4 Recovered microbial fraction

# Per-sample comparison of two percentages:
#   mags_proportion    = mags_bases / (mags_bases + unmapped_bases) * 100
#   singlem_proportion = singlem_fraction * 100 (from the sample metadata)
# Both are rounded to two decimals and then cleaned up for plotting.
singlem_table <- sequence_fractions %>%
  mutate(
    mags_proportion =
      round((mags_bases / (mags_bases + unmapped_bases)) * 100, 2)
  ) %>%
  left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
  mutate(singlem_proportion = round(singlem_fraction * 100, 2)) %>%
  select(sample, mags_proportion, singlem_proportion) %>%
  # The steps below run in order; each one sees the result of the previous.
  mutate(
    # Where the estimate is exactly zero, the recovered value is set to zero.
    # NOTE(review): `ifelse()` yields NA when its test is NA, so a sample with
    # a missing estimate also ends up with NA here -- confirm this is intended.
    mags_proportion = ifelse(singlem_proportion == 0, 0, mags_proportion),
    # A zero estimate is then turned into NA.
    singlem_proportion =
      ifelse(singlem_proportion == 0, NA, singlem_proportion),
    # An estimate below the recovered value is dropped to simplify the plot.
    singlem_proportion =
      ifelse(singlem_proportion < mags_proportion, NA, singlem_proportion),
    # Estimates above 100 % are capped at 100.
    singlem_proportion =
      ifelse(singlem_proportion > 100, 100, singlem_proportion)
  )

# Dumbbell-style plot: for every sample, the two percentages are drawn as
# points joined by a line, in panels defined by `species` and `sample_type`.
singlem_table %>%
  pivot_longer(!sample, names_to = "proportion", values_to = "value") %>%
  # Bring the metadata back in; the facets use `species` and `sample_type`.
  left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
  mutate(
    proportion = factor(
      proportion,
      levels = c("mags_proportion", "singlem_proportion")
    )
  ) %>%
  ggplot(aes(x = value, y = sample, color = proportion)) +
  # One connecting segment per sample, drawn beneath the points.
  geom_line(aes(group = sample), color = "#f8a538") +
  geom_point() +
  scale_color_manual(
    name = "Proportion",
    breaks = c("mags_proportion", "singlem_proportion"),
    labels = c("Recovered", "Estimated"),
    values = c("#52e1e8", "#876b53")
  ) +
  facet_nested(species + sample_type ~ ., scales = "free", space = "free") +
  labs(y = "Samples", x = "Prokaryotic fraction (%)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
    legend.position = "right",
    strip.background.y = element_rect(color = NA, fill = "#f4f4f4")
  )