Chapter 3 Data statistics

3.1 Sequencing reads statistics

load("data/data.Rdata")

3.1.1 Loading data

preprocessing <- read_tsv("data/preprocessing.tsv")
Rows: 48 Columns: 9
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): sample
dbl (8): reads_raw, reads_discarded, reads_host, reads_metagenomic, bases_raw, bases_discarded, bases_host, b...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Total number of bases sequenced (Gb)

preprocessing %>% 
    summarise(Total=sum(reads_metagenomic * 150 / 1000000000) %>% round(2), 
              mean=mean(reads_metagenomic * 150 / 1000000000) %>% round(2),
              sd=sd(reads_metagenomic * 150 / 1000000000) %>% round(2)) %>%
    unite("Average",mean, sd, sep = " ± ", remove = TRUE) %>%
    tt()
Total Average
96.56 2.01 ± 2.86

3.1.2 DNA fractions

sequence_fractions <- read_counts %>%
  pivot_longer(-genome, names_to = "sample", values_to = "value") %>%
  group_by(sample) %>%
  summarise(mags = sum(value)) %>%
    left_join(preprocessing, by = join_by(sample == sample)) %>%
    dplyr::select(sample,mags,bases_metagenomic,bases_host ,bases_discarded, bases_raw) %>%
    mutate(mags_bases = mags*146) %>% #total number of bases mapped to mags (multiplying by effective read length)
  mutate(bases_discarded_proportion = bases_discarded/bases_raw) %>%
    mutate(lowqual_bases = ((bases_metagenomic+bases_host)/(1-bases_discarded_proportion))-(bases_metagenomic+bases_host)) %>%
    mutate(unmapped_bases = bases_metagenomic - mags_bases) %>%
    mutate(unmapped_bases = ifelse(unmapped_bases < 0, 0, unmapped_bases)) 
sequence_fractions %>%
    dplyr::select(sample, lowqual_bases, bases_host, unmapped_bases, mags_bases)  %>%
  mutate_at(vars(-sample), ~./1000000000) %>%
  rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
  tt()
Sample Low quality Mapped to host Unmapped Mapped to MAGs
fox10_D_37 2.6301023 8.1216298 0.88459923 1.65543326
fox10_F_39 2.2970487 2.8406966 2.77779874 4.80441528
fox10_K_38 0.5180657 8.8218626 0.04823171 0.05601246
fox10_M_40 0.5131760 9.5470219 0.22409171 0.36565627
fox14_D_1 1.5586674 7.5129415 0.20575144 0.47793027
fox14_F_3 3.0494709 3.4435576 1.62980735 3.32223394
fox14_K_2 0.5995430 5.6724002 0.02957405 0.03470770
fox14_M_4 0.3790174 5.2019894 0.03456248 0.04290575
fox16_D_33 1.8642653 16.0480617 0.11617587 0.11494288
fox16_F_35 1.8038610 8.1444304 1.65409742 3.60815903
fox16_K_34 0.8935251 6.1766422 0.03076079 0.03541931
fox16_M_36 0.7906496 7.2696400 0.04106251 0.06308295
fox1_D_17 1.0365225 9.5821911 0.41629582 0.68272520
fox1_K_18 1.2751933 13.7506079 0.07496196 0.10288401
fox1_M_20 0.3523016 2.9740678 0.01786800 0.02319049
fox22_D_29 0.8179759 3.9995783 1.18966373 7.16354957
fox22_F_31 1.2420612 0.5808306 3.04907749 6.16145112
fox22_K_30 1.4172912 10.1558721 0.05224991 0.06107603
fox22_M_32 0.7611953 9.4840202 0.31379754 0.58815662
fox24_D_5 2.2316394 10.1593674 0.12140898 0.47912002
fox24_F_7 1.4618429 7.2107388 0.35280228 0.64631557
fox24_K_6 0.5889175 11.1312892 0.05502918 0.06345175
fox24_M_8 0.5168462 9.9273909 0.05831746 0.08046586
fox26_D_41 4.8972087 11.8114623 0.57036670 0.15562082
fox26_F_43 3.2084935 4.3609827 2.43630187 2.91103049
fox26_K_42 0.5383661 9.6877966 0.06740662 0.06352402
fox26_M_44 0.8497864 11.0920109 0.28636408 0.43980879
fox29_D_9 2.7560848 7.8789344 0.17264936 0.36172230
fox29_F_11 1.5935373 5.6150417 0.70333918 0.93580540
fox29_K_10 0.8593511 6.4283843 0.03036465 0.03872504
fox29_M_12 0.6192141 5.8888423 0.03290705 0.04689389
fox30_D_13 3.7178316 9.4089714 0.63316607 0.37918361
fox30_F_15 1.5528117 4.8552355 2.51266195 3.40603999
fox30_K_14 0.5678181 9.7021594 0.05598970 0.06023989
fox30_M_16 0.7024185 10.0939226 0.29300930 0.44862749
fox3_D_25 1.7112057 13.1973431 0.18071781 0.38979255
fox3_F_27 1.5159373 2.0277157 1.24050570 2.83525985
fox3_M_28 1.4034647 7.1535047 0.04753161 0.08069362
fox5_D_10 2.7287199 13.6063868 0.61487849 1.54580318
fox5_F_12 1.8250414 4.3329962 1.55799919 3.72590584
fox5_K_11 0.3455828 3.3354846 0.01929405 0.01995294
fox5_M_9 0.2979118 3.3643827 0.02800099 0.02913255
fox8_D_21 2.3661247 4.0665726 2.66481434 5.19974240
fox8_F_23 2.5074581 2.2107557 2.67704976 4.26260403
fox8_K_22 0.6695787 11.3845138 0.07137123 0.08182030
fox8_M_24 0.5294341 9.9194827 0.17877318 0.33240900
#mean
sequence_fractions %>%
    dplyr::select(sample, lowqual_bases, bases_host, unmapped_bases, mags_bases)  %>%
  mutate_at(vars(-sample), ~./1000000000) %>%
  rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
  summarise(across(where(is.numeric), mean, na.rm = TRUE)) %>%
  tt()
Warning: There was 1 warning in `summarise()`.
ℹ In argument: `across(where(is.numeric), mean, na.rm = TRUE)`.
Caused by warning:
! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.

  # Previously
  across(a:b, mean, na.rm = TRUE)

  # Now
  across(a:b, \(x) mean(x, na.rm = TRUE))
Low quality Mapped to host Unmapped Mapped to MAGs
1.442664 7.590863 0.6620315 1.269209
#SD
sequence_fractions %>%
    dplyr::select(sample, lowqual_bases, bases_host, unmapped_bases, mags_bases)  %>%
  mutate_at(vars(-sample), ~./1000000000) %>%
  rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
  summarise(across(where(is.numeric), sd, na.rm = TRUE)) %>% 
  tt()
Low quality Mapped to host Unmapped Mapped to MAGs
1.023265 3.559173 0.9109667 1.876046
sequence_fractions_plot <- sequence_fractions %>%
  dplyr::select(sample, lowqual_bases, bases_host, unmapped_bases, mags_bases)  %>%
    pivot_longer(!sample, names_to = "fraction", values_to = "value") %>%
    mutate(value = value / 1000000000) %>%
    mutate(fraction = factor(fraction, levels = c("lowqual_bases","bases_host","unmapped_bases","mags_bases"))) 
  
ggplot(sequence_fractions_plot, aes(x = sample, y = value, fill=fraction)) +
    geom_bar(position="stack", stat = "identity") +
    scale_fill_manual(name="Sequence type",
                  breaks=c("lowqual_bases","bases_host","unmapped_bases","mags_bases"),
                  labels=c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
                  values=c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c"))+
    labs(x = "Samples", y = "Amount of data (GB)") +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "bottom")

sequence_fractions_type <- sequence_fractions_plot %>%
  left_join(sample_metadata) 

ggplot(sequence_fractions_type, aes(x = sample, y = value, fill = fraction)) +
    geom_bar(position = "stack", stat = "identity") +
    scale_fill_manual(name = "Sequence type",
                      breaks = c("lowqual_bases","bases_host","unmapped_bases","mags_bases"),
                      labels = c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
                      values = c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c")) +
    labs(x = "Samples", y = "Amount of data (GB)") +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
          legend.position = "bottom") +
    facet_wrap(~ sample_type, scales = "free_x", nrow = 1)

ggplot(sequence_fractions_type, aes(x = sample, y = value, fill = fraction)) +
    geom_bar(position = "stack", stat = "identity") +
    scale_fill_manual(name = "Sequence type",
                      breaks = c("lowqual_bases","bases_host","unmapped_bases","mags_bases"),
                      labels = c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
                      values = c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c")) +
    labs(x = "Samples", y = "Amount of data (GB)") +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
          legend.position = "bottom") +
    facet_wrap(~ gut_location, scales = "free_x", nrow = 1)

ggplot(sequence_fractions_type, aes(x = sample, y = value, fill = fraction)) +
    geom_bar(position = "stack", stat = "identity") +
    scale_fill_manual(name = "Sequence type",
                      breaks = c("lowqual_bases","bases_host","unmapped_bases","mags_bases"),
                      labels = c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
                      values = c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c")) +
       labs(x = "Samples", y = "Amount of data (GB)") +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
          legend.position = "bottom") +
    facet_nested(~ sample_type + gut_location,
                 scales = "free_x", space = "free_x")

ggplot(sequence_fractions_type, aes(x = sample, y = value, fill = fraction)) +
    geom_bar(position = "stack", stat = "identity") +
    scale_fill_manual(name = "Sequence type",
                      breaks = c("lowqual_bases","bases_host","unmapped_bases","mags_bases"),
                      labels = c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
                      values = c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c")) +
    labs(x = "Samples", y = "Amount of data (GB)") +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
          legend.position = "bottom") +
    facet_wrap(~ fox_behaviour, scales = "free_x", nrow = 1)

3.2 Filtering

To remove samples with high host data or low metagenomic data

host_data_plot <- sequence_fractions %>%
  left_join(sample_metadata) %>%
  mutate(host_fraction = bases_host / bases_raw,
         sample = reorder(sample, -host_fraction))  #reorder by decreasing host_fraction
Joining with `by = join_by(sample)`
ggplot(host_data_plot,aes(x = sample, y = host_fraction, fill = gut_location)) +
  geom_col() +
  scale_fill_manual(
  values = c(
    F_colon = "#d6604d",
    M_colon = "#542788",
    D_ileum = "#fdb863",
    K_ileum = "#bb99d8"
  )
)+
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "black")+
  labs(y = "Host fraction", x = "Sample",
       title = "Fraction of host bases per sample")+
   theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1)  
  )

threshold <- 0.5

samples_to_remove <- host_data_plot %>%
  filter(host_fraction > threshold) %>%
  pull(sample)

samples_to_remove
 [1] fox10_D_37 fox10_K_38 fox10_M_40 fox14_D_1  fox14_K_2  fox14_M_4  fox16_D_33 fox16_F_35 fox16_K_34
[10] fox16_M_36 fox1_D_17  fox1_K_18  fox1_M_20  fox22_K_30 fox22_M_32 fox24_D_5  fox24_F_7  fox24_K_6 
[19] fox24_M_8  fox26_D_41 fox26_K_42 fox26_M_44 fox29_D_9  fox29_F_11 fox29_K_10 fox29_M_12 fox30_D_13
[28] fox30_K_14 fox30_M_16 fox3_D_25  fox3_M_28  fox5_D_10  fox5_K_11  fox5_M_9   fox8_K_22  fox8_M_24 
attr(,"scores")
 fox1_D_17  fox1_K_18  fox1_M_20 fox10_D_37 fox10_F_39 fox10_K_38 fox10_M_40  fox14_D_1  fox14_F_3  fox14_K_2 
-0.8115655 -0.8981025 -0.8761180 -0.6046012 -0.2216993 -0.9191864 -0.8715206 -0.7659288 -0.2999397 -0.8854033 
 fox14_M_4 fox16_D_33 fox16_F_35 fox16_K_34 fox16_M_36 fox22_D_29 fox22_F_31 fox22_K_30 fox22_M_32  fox24_D_5 
-0.9109764 -0.8774608 -0.5329078 -0.8579955 -0.8821516 -0.3019548 -0.0524776 -0.8576919 -0.8299505 -0.7748819 
 fox24_F_7  fox24_K_6  fox24_M_8 fox26_D_41 fox26_F_43 fox26_K_42 fox26_M_44  fox29_D_9 fox29_F_11 fox29_K_10 
-0.7307543 -0.9258449 -0.9190592 -0.6490895 -0.3298888 -0.9090424 -0.8483041 -0.7013799 -0.6212880 -0.8665419 
fox29_M_12  fox3_D_25  fox3_F_27  fox3_M_28 fox30_D_13 fox30_F_15 fox30_K_14 fox30_M_16  fox5_D_10  fox5_F_12 
-0.8858230 -0.8468042 -0.2655088 -0.8165677 -0.6407747 -0.3802882 -0.9141509 -0.8583687 -0.7319965 -0.3761559 
 fox5_K_11   fox5_M_9  fox8_D_21  fox8_F_23  fox8_K_22  fox8_M_24 
-0.8893286 -0.8956104 -0.2836635 -0.1883364 -0.9188259 -0.8814484 
46 Levels: fox24_K_6 fox10_K_38 fox24_M_8 fox8_K_22 fox30_K_14 fox14_M_4 fox26_K_42 fox1_K_18 ... fox22_F_31
host_data_plot <- sequence_fractions %>%
  left_join(sample_metadata) %>%
  mutate(host_fraction = bases_host / bases_raw,
                bases_metagenomic_GB = bases_metagenomic / 1e9, sample = reorder(sample, -bases_metagenomic))  #reorder by decreasing bases
Joining with `by = join_by(sample)`
ggplot(host_data_plot,aes(x = sample, y = bases_metagenomic_GB, fill = gut_location)) +
  geom_col() +
  scale_fill_manual(
  values = c(
    F_colon = "#d6604d",
    M_colon = "#542788",
    D_ileum = "#fdb863",
    K_ileum = "#bb99d8"
  )
)+
  geom_hline(yintercept = 1, linetype = "dashed", color = "black")+
  labs(y = "Metagenomic bases (Gb)", x = "Sample",
       title = "GB of metagenomic bases per sample")+
   theme_bw() +
  scale_y_continuous(
    breaks = seq(0, max(host_data_plot$bases_metagenomic_GB), by = 1)
  ) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1)  
  )

ggplot(host_data_plot, aes(x= host_fraction, y= bases_metagenomic_GB, color = gut_location))+
   scale_color_manual(
  values = c(
    F_colon = "#d6604d",
    M_colon = "#542788",
    D_ileum = "#fdb863",
    K_ileum = "#bb99d8"
  ))+
  scale_y_continuous(
    breaks = seq(0, max(host_data_plot$bases_metagenomic_GB), by = 1)
  ) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "black")+
  geom_vline(xintercept = 0.75, linetype = "dashed", color = "black")+
  labs(y = "Metagenomic Bases", x = "Host fraction")+
  geom_point()+
  theme_bw()

samples_to_remove_total <- host_data_plot %>%
  filter(bases_metagenomic_GB < 1) %>%  #remove samples with less than 1Gb of metagenomic reads
   filter(host_fraction > 0.75) %>% #remove samples with more than 0.75 host fraction
  pull(sample)

samples_to_remove_total
 [1] fox10_K_38 fox10_M_40 fox14_D_1  fox14_K_2  fox14_M_4  fox16_D_33 fox16_K_34 fox16_M_36 fox1_K_18 
[10] fox1_M_20  fox22_K_30 fox22_M_32 fox24_D_5  fox24_K_6  fox24_M_8  fox26_K_42 fox26_M_44 fox29_K_10
[19] fox29_M_12 fox30_K_14 fox30_M_16 fox3_D_25  fox3_M_28  fox5_K_11  fox5_M_9   fox8_K_22  fox8_M_24 
attr(,"scores")
  fox1_D_17   fox1_K_18   fox1_M_20  fox10_D_37  fox10_F_39  fox10_K_38  fox10_M_40   fox14_D_1   fox14_F_3 
-1099021021  -177845966   -41058495 -2540032497 -7582214024  -104244170  -589747982  -683681706 -4952041291 
  fox14_K_2   fox14_M_4  fox16_D_33  fox16_F_35  fox16_K_34  fox16_M_36  fox22_D_29  fox22_F_31  fox22_K_30 
  -64281749   -77468230  -231118746 -5262256451   -66180098  -104145463 -8353213301 -9210528613  -113325947 
 fox22_M_32   fox24_D_5   fox24_F_7   fox24_K_6   fox24_M_8  fox26_D_41  fox26_F_43  fox26_K_42  fox26_M_44 
 -901954156  -600529006  -999117854  -118480924  -138783312  -725987517 -5347332359  -130930631  -726172874 
  fox29_D_9  fox29_F_11  fox29_K_10  fox29_M_12   fox3_D_25   fox3_F_27   fox3_M_28  fox30_D_13  fox30_F_15 
 -534371657 -1639144575   -69089686   -79800931  -570510362 -4075765550  -128225225 -1012349683 -5918701937 
 fox30_K_14  fox30_M_16   fox5_D_10   fox5_F_12   fox5_K_11    fox5_M_9   fox8_D_21   fox8_F_23   fox8_K_22 
 -116229592  -741636782 -2160681671 -5283905027   -39246993   -57133542 -7864556744 -6939653783  -153191527 
  fox8_M_24 
 -511182182 
46 Levels: fox22_F_31 fox22_D_29 fox8_D_21 fox10_F_39 fox8_F_23 fox30_F_15 fox26_F_43 fox5_F_12 ... fox5_K_11
#Add samples to remove to the Rdata
save(samples_to_remove_total,  file = "data.RData")
host_data_filtered <- host_data_plot %>%
  filter(!sample %in% samples_to_remove_total) %>%
  mutate(host_fraction = bases_host / bases_raw,
         sample = reorder(sample, -host_fraction)) 

ggplot(host_data_filtered,aes(x = sample, y = host_fraction, fill = gut_location)) +
  geom_col() +
  scale_fill_manual(
  values = c(
    F_colon = "#d6604d",
    M_colon = "#542788",
    D_ileum = "#fdb863",
    K_ileum = "#bb99d8"
  )
)+
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "black")+
  labs(y = "Host fraction", x = "Sample",
       title = "Fraction of host bases per sample")+
   theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1)  
  )