Chapter 3 Data statistics

3.1 Sequencing reads statistics

load("data/data.Rdata")

3.1.1 Loading data

preprocessing <- read_tsv("data/preprocessing.tsv")

Rows: 48 Columns: 9
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): sample
dbl (8): reads_raw, reads_discarded, reads_host, reads_metagenomic, bases_raw, bases_discarded, bases_host, b...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Total number of bases sequenced (Gb)

preprocessing %>% 
    summarise(Total=sum(reads_metagenomic * 150 / 1000000000) %>% round(2), 
              mean=mean(reads_metagenomic * 150 / 1000000000) %>% round(2),
              sd=sd(reads_metagenomic * 150 / 1000000000) %>% round(2)) %>%
    unite("Average",mean, sd, sep = " ± ", remove = TRUE) %>%
    tt()

Total	Average
96.56	2.01 ± 2.86

3.1.2 DNA fractions

sequence_fractions <- read_counts %>%
  pivot_longer(-genome, names_to = "sample", values_to = "value") %>%
  group_by(sample) %>%
  summarise(mags = sum(value)) %>%
    left_join(preprocessing, by = join_by(sample == sample)) %>%
    dplyr::select(sample,mags,bases_metagenomic,bases_host ,bases_discarded, bases_raw) %>%
    mutate(mags_bases = mags*146) %>% #total number of bases mapped to mags (multiplying by effective read length)
  mutate(bases_discarded_proportion = bases_discarded/bases_raw) %>%
    mutate(lowqual_bases = ((bases_metagenomic+bases_host)/(1-bases_discarded_proportion))-(bases_metagenomic+bases_host)) %>%
    mutate(unmapped_bases = bases_metagenomic - mags_bases) %>%
    mutate(unmapped_bases = ifelse(unmapped_bases < 0, 0, unmapped_bases))

sequence_fractions %>%
    dplyr::select(sample, lowqual_bases, bases_host, unmapped_bases, mags_bases)  %>%
  mutate_at(vars(-sample), ~./1000000000) %>%
  rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
  tt()

Sample	Low quality	Mapped to host	Unmapped	Mapped to MAGs
fox10_D_37	2.6301023	8.1216298	0.88459923	1.65543326
fox10_F_39	2.2970487	2.8406966	2.77779874	4.80441528
fox10_K_38	0.5180657	8.8218626	0.04823171	0.05601246
fox10_M_40	0.5131760	9.5470219	0.22409171	0.36565627
fox14_D_1	1.5586674	7.5129415	0.20575144	0.47793027
fox14_F_3	3.0494709	3.4435576	1.62980735	3.32223394
fox14_K_2	0.5995430	5.6724002	0.02957405	0.03470770
fox14_M_4	0.3790174	5.2019894	0.03456248	0.04290575
fox16_D_33	1.8642653	16.0480617	0.11617587	0.11494288
fox16_F_35	1.8038610	8.1444304	1.65409742	3.60815903
fox16_K_34	0.8935251	6.1766422	0.03076079	0.03541931
fox16_M_36	0.7906496	7.2696400	0.04106251	0.06308295
fox1_D_17	1.0365225	9.5821911	0.41629582	0.68272520
fox1_K_18	1.2751933	13.7506079	0.07496196	0.10288401
fox1_M_20	0.3523016	2.9740678	0.01786800	0.02319049
fox22_D_29	0.8179759	3.9995783	1.18966373	7.16354957
fox22_F_31	1.2420612	0.5808306	3.04907749	6.16145112
fox22_K_30	1.4172912	10.1558721	0.05224991	0.06107603
fox22_M_32	0.7611953	9.4840202	0.31379754	0.58815662
fox24_D_5	2.2316394	10.1593674	0.12140898	0.47912002
fox24_F_7	1.4618429	7.2107388	0.35280228	0.64631557
fox24_K_6	0.5889175	11.1312892	0.05502918	0.06345175
fox24_M_8	0.5168462	9.9273909	0.05831746	0.08046586
fox26_D_41	4.8972087	11.8114623	0.57036670	0.15562082
fox26_F_43	3.2084935	4.3609827	2.43630187	2.91103049
fox26_K_42	0.5383661	9.6877966	0.06740662	0.06352402
fox26_M_44	0.8497864	11.0920109	0.28636408	0.43980879
fox29_D_9	2.7560848	7.8789344	0.17264936	0.36172230
fox29_F_11	1.5935373	5.6150417	0.70333918	0.93580540
fox29_K_10	0.8593511	6.4283843	0.03036465	0.03872504
fox29_M_12	0.6192141	5.8888423	0.03290705	0.04689389
fox30_D_13	3.7178316	9.4089714	0.63316607	0.37918361
fox30_F_15	1.5528117	4.8552355	2.51266195	3.40603999
fox30_K_14	0.5678181	9.7021594	0.05598970	0.06023989
fox30_M_16	0.7024185	10.0939226	0.29300930	0.44862749
fox3_D_25	1.7112057	13.1973431	0.18071781	0.38979255
fox3_F_27	1.5159373	2.0277157	1.24050570	2.83525985
fox3_M_28	1.4034647	7.1535047	0.04753161	0.08069362
fox5_D_10	2.7287199	13.6063868	0.61487849	1.54580318
fox5_F_12	1.8250414	4.3329962	1.55799919	3.72590584
fox5_K_11	0.3455828	3.3354846	0.01929405	0.01995294
fox5_M_9	0.2979118	3.3643827	0.02800099	0.02913255
fox8_D_21	2.3661247	4.0665726	2.66481434	5.19974240
fox8_F_23	2.5074581	2.2107557	2.67704976	4.26260403
fox8_K_22	0.6695787	11.3845138	0.07137123	0.08182030
fox8_M_24	0.5294341	9.9194827	0.17877318	0.33240900

#mean
sequence_fractions %>%
    dplyr::select(sample, lowqual_bases, bases_host, unmapped_bases, mags_bases)  %>%
  mutate_at(vars(-sample), ~./1000000000) %>%
  rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
  summarise(across(where(is.numeric), mean, na.rm = TRUE)) %>%
  tt()

Warning: There was 1 warning in `summarise()`.
ℹ In argument: `across(where(is.numeric), mean, na.rm = TRUE)`.
Caused by warning:
! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.

  # Previously
  across(a:b, mean, na.rm = TRUE)

  # Now
  across(a:b, \(x) mean(x, na.rm = TRUE))

Low quality	Mapped to host	Unmapped	Mapped to MAGs
1.442664	7.590863	0.6620315	1.269209

#SD
sequence_fractions %>%
    dplyr::select(sample, lowqual_bases, bases_host, unmapped_bases, mags_bases)  %>%
  mutate_at(vars(-sample), ~./1000000000) %>%
  rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
  summarise(across(where(is.numeric), sd, na.rm = TRUE)) %>% 
  tt()

Low quality	Mapped to host	Unmapped	Mapped to MAGs
1.023265	3.559173	0.9109667	1.876046

sequence_fractions_plot <- sequence_fractions %>%
  dplyr::select(sample, lowqual_bases, bases_host, unmapped_bases, mags_bases)  %>%
    pivot_longer(!sample, names_to = "fraction", values_to = "value") %>%
    mutate(value = value / 1000000000) %>%
    mutate(fraction = factor(fraction, levels = c("lowqual_bases","bases_host","unmapped_bases","mags_bases"))) 
  
ggplot(sequence_fractions_plot, aes(x = sample, y = value, fill=fraction)) +
    geom_bar(position="stack", stat = "identity") +
    scale_fill_manual(name="Sequence type",
                  breaks=c("lowqual_bases","bases_host","unmapped_bases","mags_bases"),
                  labels=c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
                  values=c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c"))+
    labs(x = "Samples", y = "Amount of data (GB)") +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "bottom")

sequence_fractions_type <- sequence_fractions_plot %>%
  left_join(sample_metadata) 

ggplot(sequence_fractions_type, aes(x = sample, y = value, fill = fraction)) +
    geom_bar(position = "stack", stat = "identity") +
    scale_fill_manual(name = "Sequence type",
                      breaks = c("lowqual_bases","bases_host","unmapped_bases","mags_bases"),
                      labels = c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
                      values = c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c")) +
    labs(x = "Samples", y = "Amount of data (GB)") +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
          legend.position = "bottom") +
    facet_wrap(~ sample_type, scales = "free_x", nrow = 1)

ggplot(sequence_fractions_type, aes(x = sample, y = value, fill = fraction)) +
    geom_bar(position = "stack", stat = "identity") +
    scale_fill_manual(name = "Sequence type",
                      breaks = c("lowqual_bases","bases_host","unmapped_bases","mags_bases"),
                      labels = c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
                      values = c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c")) +
    labs(x = "Samples", y = "Amount of data (GB)") +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
          legend.position = "bottom") +
    facet_wrap(~ gut_location, scales = "free_x", nrow = 1)

ggplot(sequence_fractions_type, aes(x = sample, y = value, fill = fraction)) +
    geom_bar(position = "stack", stat = "identity") +
    scale_fill_manual(name = "Sequence type",
                      breaks = c("lowqual_bases","bases_host","unmapped_bases","mags_bases"),
                      labels = c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
                      values = c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c")) +
       labs(x = "Samples", y = "Amount of data (GB)") +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
          legend.position = "bottom") +
    facet_nested(~ sample_type + gut_location,
                 scales = "free_x", space = "free_x")

ggplot(sequence_fractions_type, aes(x = sample, y = value, fill = fraction)) +
    geom_bar(position = "stack", stat = "identity") +
    scale_fill_manual(name = "Sequence type",
                      breaks = c("lowqual_bases","bases_host","unmapped_bases","mags_bases"),
                      labels = c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
                      values = c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c")) +
    labs(x = "Samples", y = "Amount of data (GB)") +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
          legend.position = "bottom") +
    facet_wrap(~ fox_behaviour, scales = "free_x", nrow = 1)

3.2 Filtering

To remove samples with high host data or low metagenomic data

host_data_plot <- sequence_fractions %>%
  left_join(sample_metadata) %>%
  mutate(host_fraction = bases_host / bases_raw,
         sample = reorder(sample, -host_fraction))  #reorder by decreasing host_fraction

Joining with `by = join_by(sample)`

ggplot(host_data_plot,aes(x = sample, y = host_fraction, fill = gut_location)) +
  geom_col() +
  scale_fill_manual(
  values = c(
    F_colon = "#d6604d",
    M_colon = "#542788",
    D_ileum = "#fdb863",
    K_ileum = "#bb99d8"
  )
)+
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "black")+
  labs(y = "Host fraction", x = "Sample",
       title = "Fraction of host bases per sample")+
   theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1)  
  )

threshold <- 0.5

samples_to_remove <- host_data_plot %>%
  filter(host_fraction > threshold) %>%
  pull(sample)

samples_to_remove

 [1] fox10_D_37 fox10_K_38 fox10_M_40 fox14_D_1  fox14_K_2  fox14_M_4  fox16_D_33 fox16_F_35 fox16_K_34
[10] fox16_M_36 fox1_D_17  fox1_K_18  fox1_M_20  fox22_K_30 fox22_M_32 fox24_D_5  fox24_F_7  fox24_K_6 
[19] fox24_M_8  fox26_D_41 fox26_K_42 fox26_M_44 fox29_D_9  fox29_F_11 fox29_K_10 fox29_M_12 fox30_D_13
[28] fox30_K_14 fox30_M_16 fox3_D_25  fox3_M_28  fox5_D_10  fox5_K_11  fox5_M_9   fox8_K_22  fox8_M_24 
attr(,"scores")
 fox1_D_17  fox1_K_18  fox1_M_20 fox10_D_37 fox10_F_39 fox10_K_38 fox10_M_40  fox14_D_1  fox14_F_3  fox14_K_2 
-0.8115655 -0.8981025 -0.8761180 -0.6046012 -0.2216993 -0.9191864 -0.8715206 -0.7659288 -0.2999397 -0.8854033 
 fox14_M_4 fox16_D_33 fox16_F_35 fox16_K_34 fox16_M_36 fox22_D_29 fox22_F_31 fox22_K_30 fox22_M_32  fox24_D_5 
-0.9109764 -0.8774608 -0.5329078 -0.8579955 -0.8821516 -0.3019548 -0.0524776 -0.8576919 -0.8299505 -0.7748819 
 fox24_F_7  fox24_K_6  fox24_M_8 fox26_D_41 fox26_F_43 fox26_K_42 fox26_M_44  fox29_D_9 fox29_F_11 fox29_K_10 
-0.7307543 -0.9258449 -0.9190592 -0.6490895 -0.3298888 -0.9090424 -0.8483041 -0.7013799 -0.6212880 -0.8665419 
fox29_M_12  fox3_D_25  fox3_F_27  fox3_M_28 fox30_D_13 fox30_F_15 fox30_K_14 fox30_M_16  fox5_D_10  fox5_F_12 
-0.8858230 -0.8468042 -0.2655088 -0.8165677 -0.6407747 -0.3802882 -0.9141509 -0.8583687 -0.7319965 -0.3761559 
 fox5_K_11   fox5_M_9  fox8_D_21  fox8_F_23  fox8_K_22  fox8_M_24 
-0.8893286 -0.8956104 -0.2836635 -0.1883364 -0.9188259 -0.8814484 
46 Levels: fox24_K_6 fox10_K_38 fox24_M_8 fox8_K_22 fox30_K_14 fox14_M_4 fox26_K_42 fox1_K_18 ... fox22_F_31

host_data_plot <- sequence_fractions %>%
  left_join(sample_metadata) %>%
  mutate(host_fraction = bases_host / bases_raw,
                bases_metagenomic_GB = bases_metagenomic / 1e9, sample = reorder(sample, -bases_metagenomic))  #reorder by decreasing bases

Joining with `by = join_by(sample)`

ggplot(host_data_plot,aes(x = sample, y = bases_metagenomic_GB, fill = gut_location)) +
  geom_col() +
  scale_fill_manual(
  values = c(
    F_colon = "#d6604d",
    M_colon = "#542788",
    D_ileum = "#fdb863",
    K_ileum = "#bb99d8"
  )
)+
  geom_hline(yintercept = 1, linetype = "dashed", color = "black")+
  labs(y = "Metagenomic bases (Gb)", x = "Sample",
       title = "GB of metagenomic bases per sample")+
   theme_bw() +
  scale_y_continuous(
    breaks = seq(0, max(host_data_plot$bases_metagenomic_GB), by = 1)
  ) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1)  
  )

ggplot(host_data_plot, aes(x= host_fraction, y= bases_metagenomic_GB, color = gut_location))+
   scale_color_manual(
  values = c(
    F_colon = "#d6604d",
    M_colon = "#542788",
    D_ileum = "#fdb863",
    K_ileum = "#bb99d8"
  ))+
  scale_y_continuous(
    breaks = seq(0, max(host_data_plot$bases_metagenomic_GB), by = 1)
  ) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "black")+
  geom_vline(xintercept = 0.75, linetype = "dashed", color = "black")+
  labs(y = "Metagenomic Bases", x = "Host fraction")+
  geom_point()+
  theme_bw()

samples_to_remove_total <- host_data_plot %>%
  filter(bases_metagenomic_GB < 1) %>%  #remove samples with less than 1Gb of metagenomic reads
   filter(host_fraction > 0.75) %>% #remove samples with more than 0.75 host fraction
  pull(sample)

samples_to_remove_total

 [1] fox10_K_38 fox10_M_40 fox14_D_1  fox14_K_2  fox14_M_4  fox16_D_33 fox16_K_34 fox16_M_36 fox1_K_18 
[10] fox1_M_20  fox22_K_30 fox22_M_32 fox24_D_5  fox24_K_6  fox24_M_8  fox26_K_42 fox26_M_44 fox29_K_10
[19] fox29_M_12 fox30_K_14 fox30_M_16 fox3_D_25  fox3_M_28  fox5_K_11  fox5_M_9   fox8_K_22  fox8_M_24 
attr(,"scores")
  fox1_D_17   fox1_K_18   fox1_M_20  fox10_D_37  fox10_F_39  fox10_K_38  fox10_M_40   fox14_D_1   fox14_F_3 
-1099021021  -177845966   -41058495 -2540032497 -7582214024  -104244170  -589747982  -683681706 -4952041291 
  fox14_K_2   fox14_M_4  fox16_D_33  fox16_F_35  fox16_K_34  fox16_M_36  fox22_D_29  fox22_F_31  fox22_K_30 
  -64281749   -77468230  -231118746 -5262256451   -66180098  -104145463 -8353213301 -9210528613  -113325947 
 fox22_M_32   fox24_D_5   fox24_F_7   fox24_K_6   fox24_M_8  fox26_D_41  fox26_F_43  fox26_K_42  fox26_M_44 
 -901954156  -600529006  -999117854  -118480924  -138783312  -725987517 -5347332359  -130930631  -726172874 
  fox29_D_9  fox29_F_11  fox29_K_10  fox29_M_12   fox3_D_25   fox3_F_27   fox3_M_28  fox30_D_13  fox30_F_15 
 -534371657 -1639144575   -69089686   -79800931  -570510362 -4075765550  -128225225 -1012349683 -5918701937 
 fox30_K_14  fox30_M_16   fox5_D_10   fox5_F_12   fox5_K_11    fox5_M_9   fox8_D_21   fox8_F_23   fox8_K_22 
 -116229592  -741636782 -2160681671 -5283905027   -39246993   -57133542 -7864556744 -6939653783  -153191527 
  fox8_M_24 
 -511182182 
46 Levels: fox22_F_31 fox22_D_29 fox8_D_21 fox10_F_39 fox8_F_23 fox30_F_15 fox26_F_43 fox5_F_12 ... fox5_K_11

#Add samples to remove to the Rdata
save(samples_to_remove_total,  file = "data.RData")

host_data_filtered <- host_data_plot %>%
  filter(!sample %in% samples_to_remove_total) %>%
  mutate(host_fraction = bases_host / bases_raw,
         sample = reorder(sample, -host_fraction)) 

ggplot(host_data_filtered,aes(x = sample, y = host_fraction, fill = gut_location)) +
  geom_col() +
  scale_fill_manual(
  values = c(
    F_colon = "#d6604d",
    M_colon = "#542788",
    D_ileum = "#fdb863",
    K_ileum = "#bb99d8"
  )
)+
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "black")+
  labs(y = "Host fraction", x = "Sample",
       title = "Fraction of host bases per sample")+
   theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1)  
  )