Loading data
preprocessing <- read_tsv("data/preprocessing.tsv")
Rows: 48 Columns: 9
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): sample
dbl (8): reads_raw, reads_discarded, reads_host, reads_metagenomic, bases_raw, bases_discarded, bases_host, b...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Total number of bases sequenced (Gb)
preprocessing %>%
summarise(Total=sum(reads_metagenomic * 150 / 1000000000) %>% round(2),
mean=mean(reads_metagenomic * 150 / 1000000000) %>% round(2),
sd=sd(reads_metagenomic * 150 / 1000000000) %>% round(2)) %>%
unite("Average",mean, sd, sep = " ± ", remove = TRUE) %>%
tt()
| Total |
Average |
| 96.56 |
2.01 ± 2.86 |
How many samples do we have of each group (behaviour)?
sample_metadata %>%
group_by(fox_behaviour) %>%
summarize(n_samples = n()) %>%
tt()
| fox_behaviour |
n_samples |
| aggr |
20 |
| tame |
20 |
| unsel |
8 |
How many MAGs?
genome_metadata %>%
summarize(n_genomes = n())%>%
tt()
genome_metadata %>%
summarise(
mean_c = mean(completeness, na.rm = TRUE) %>% round(2),
sd_c = sd(completeness, na.rm = TRUE) %>% round(2),
mean_con = mean(contamination, na.rm = TRUE) %>% round(2),
sd_con = sd(contamination, na.rm = TRUE) %>% round(2)
) %>%
unite("Completeness", mean_c, sd_c, sep = " ± ") %>%
unite("Contamination", mean_con, sd_con, sep = " ± ") %>%
tt()
| Completeness |
Contamination |
| 92.6 ± 6.75 |
1.92 ± 2.11 |
DNA fractions
sequence_fractions <- read_counts %>%
pivot_longer(-genome, names_to = "sample", values_to = "value") %>%
group_by(sample) %>%
summarise(mags = sum(value)) %>%
left_join(preprocessing, by = join_by(sample == sample)) %>%
dplyr::select(sample,mags,bases_metagenomic,bases_host ,bases_discarded, bases_raw) %>%
mutate(mags_bases = mags*146) %>% #total number of bases mapped to mags (multiplying by effective read length)
mutate(bases_discarded_proportion = bases_discarded/bases_raw) %>%
mutate(lowqual_bases = ((bases_metagenomic+bases_host)/(1-bases_discarded_proportion))-(bases_metagenomic+bases_host)) %>%
mutate(unmapped_bases = bases_metagenomic - mags_bases) %>%
mutate(unmapped_bases = ifelse(unmapped_bases < 0, 0, unmapped_bases))
sequence_fractions %>%
dplyr::select(sample, lowqual_bases, bases_host, unmapped_bases, mags_bases) %>%
mutate_at(vars(-sample), ~./1000000000) %>%
rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
tt()
| Sample |
Low quality |
Mapped to host |
Unmapped |
Mapped to MAGs |
| fox10_D_37 |
2.6301023 |
8.1216298 |
0.88459923 |
1.65543326 |
| fox10_F_39 |
2.2970487 |
2.8406966 |
2.77779874 |
4.80441528 |
| fox10_K_38 |
0.5180657 |
8.8218626 |
0.04823171 |
0.05601246 |
| fox10_M_40 |
0.5131760 |
9.5470219 |
0.22409171 |
0.36565627 |
| fox14_D_1 |
1.5586674 |
7.5129415 |
0.20575144 |
0.47793027 |
| fox14_F_3 |
3.0494709 |
3.4435576 |
1.62980735 |
3.32223394 |
| fox14_K_2 |
0.5995430 |
5.6724002 |
0.02957405 |
0.03470770 |
| fox14_M_4 |
0.3790174 |
5.2019894 |
0.03456248 |
0.04290575 |
| fox16_D_33 |
1.8642653 |
16.0480617 |
0.11617587 |
0.11494288 |
| fox16_F_35 |
1.8038610 |
8.1444304 |
1.65409742 |
3.60815903 |
| fox16_K_34 |
0.8935251 |
6.1766422 |
0.03076079 |
0.03541931 |
| fox16_M_36 |
0.7906496 |
7.2696400 |
0.04106251 |
0.06308295 |
| fox1_D_17 |
1.0365225 |
9.5821911 |
0.41629582 |
0.68272520 |
| fox1_K_18 |
1.2751933 |
13.7506079 |
0.07496196 |
0.10288401 |
| fox1_M_20 |
0.3523016 |
2.9740678 |
0.01786800 |
0.02319049 |
| fox22_D_29 |
0.8179759 |
3.9995783 |
1.18966373 |
7.16354957 |
| fox22_F_31 |
1.2420612 |
0.5808306 |
3.04907749 |
6.16145112 |
| fox22_K_30 |
1.4172912 |
10.1558721 |
0.05224991 |
0.06107603 |
| fox22_M_32 |
0.7611953 |
9.4840202 |
0.31379754 |
0.58815662 |
| fox24_D_5 |
2.2316394 |
10.1593674 |
0.12140898 |
0.47912002 |
| fox24_F_7 |
1.4618429 |
7.2107388 |
0.35280228 |
0.64631557 |
| fox24_K_6 |
0.5889175 |
11.1312892 |
0.05502918 |
0.06345175 |
| fox24_M_8 |
0.5168462 |
9.9273909 |
0.05831746 |
0.08046586 |
| fox26_D_41 |
4.8972087 |
11.8114623 |
0.57036670 |
0.15562082 |
| fox26_F_43 |
3.2084935 |
4.3609827 |
2.43630187 |
2.91103049 |
| fox26_K_42 |
0.5383661 |
9.6877966 |
0.06740662 |
0.06352402 |
| fox26_M_44 |
0.8497864 |
11.0920109 |
0.28636408 |
0.43980879 |
| fox29_D_9 |
2.7560848 |
7.8789344 |
0.17264936 |
0.36172230 |
| fox29_F_11 |
1.5935373 |
5.6150417 |
0.70333918 |
0.93580540 |
| fox29_K_10 |
0.8593511 |
6.4283843 |
0.03036465 |
0.03872504 |
| fox29_M_12 |
0.6192141 |
5.8888423 |
0.03290705 |
0.04689389 |
| fox30_D_13 |
3.7178316 |
9.4089714 |
0.63316607 |
0.37918361 |
| fox30_F_15 |
1.5528117 |
4.8552355 |
2.51266195 |
3.40603999 |
| fox30_K_14 |
0.5678181 |
9.7021594 |
0.05598970 |
0.06023989 |
| fox30_M_16 |
0.7024185 |
10.0939226 |
0.29300930 |
0.44862749 |
| fox3_D_25 |
1.7112057 |
13.1973431 |
0.18071781 |
0.38979255 |
| fox3_F_27 |
1.5159373 |
2.0277157 |
1.24050570 |
2.83525985 |
| fox3_M_28 |
1.4034647 |
7.1535047 |
0.04753161 |
0.08069362 |
| fox5_D_10 |
2.7287199 |
13.6063868 |
0.61487849 |
1.54580318 |
| fox5_F_12 |
1.8250414 |
4.3329962 |
1.55799919 |
3.72590584 |
| fox5_K_11 |
0.3455828 |
3.3354846 |
0.01929405 |
0.01995294 |
| fox5_M_9 |
0.2979118 |
3.3643827 |
0.02800099 |
0.02913255 |
| fox8_D_21 |
2.3661247 |
4.0665726 |
2.66481434 |
5.19974240 |
| fox8_F_23 |
2.5074581 |
2.2107557 |
2.67704976 |
4.26260403 |
| fox8_K_22 |
0.6695787 |
11.3845138 |
0.07137123 |
0.08182030 |
| fox8_M_24 |
0.5294341 |
9.9194827 |
0.17877318 |
0.33240900 |
#mean
sequence_fractions %>%
dplyr::select(sample, lowqual_bases, bases_host, unmapped_bases, mags_bases) %>%
mutate_at(vars(-sample), ~./1000000000) %>%
rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
summarise(across(where(is.numeric), mean, na.rm = TRUE)) %>%
tt()
| Low quality |
Mapped to host |
Unmapped |
Mapped to MAGs |
| 1.442664 |
7.590863 |
0.6620315 |
1.269209 |
#SD
sequence_fractions %>%
dplyr::select(sample, lowqual_bases, bases_host, unmapped_bases, mags_bases) %>%
mutate_at(vars(-sample), ~./1000000000) %>%
rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
summarise(across(where(is.numeric), sd, na.rm = TRUE)) %>%
tt()
| Low quality |
Mapped to host |
Unmapped |
Mapped to MAGs |
| 1.023265 |
3.559173 |
0.9109667 |
1.876046 |
sequence_fractions_plot <- sequence_fractions %>%
dplyr::select(sample, lowqual_bases, bases_host, unmapped_bases, mags_bases) %>%
pivot_longer(!sample, names_to = "fraction", values_to = "value") %>%
mutate(value = value / 1000000000) %>%
mutate(fraction = factor(fraction, levels = c("lowqual_bases","bases_host","unmapped_bases","mags_bases")))
ggplot(sequence_fractions_plot, aes(x = sample, y = value, fill=fraction)) +
geom_bar(position="stack", stat = "identity") +
scale_fill_manual(name="Sequence type",
breaks=c("lowqual_bases","bases_host","unmapped_bases","mags_bases"),
labels=c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
values=c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c"))+
labs(x = "Samples", y = "Amount of data (GB)") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "bottom")

sequence_fractions_type <- sequence_fractions_plot %>%
left_join(sample_metadata)
ggplot(sequence_fractions_type, aes(x = sample, y = value, fill = fraction)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_manual(name = "Sequence type",
breaks = c("lowqual_bases","bases_host","unmapped_bases","mags_bases"),
labels = c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
values = c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c")) +
labs(x = "Samples", y = "Amount of data (GB)") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
legend.position = "bottom") +
facet_wrap(~ sample_type, scales = "free_x", nrow = 1)

ggplot(sequence_fractions_type, aes(x = sample, y = value, fill = fraction)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_manual(name = "Sequence type",
breaks = c("lowqual_bases","bases_host","unmapped_bases","mags_bases"),
labels = c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
values = c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c")) +
labs(x = "Samples", y = "Amount of data (GB)") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
legend.position = "bottom") +
facet_wrap(~ gut_location, scales = "free_x", nrow = 1)

ggplot(sequence_fractions_type, aes(x = sample, y = value, fill = fraction)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_manual(name = "Sequence type",
breaks = c("lowqual_bases","bases_host","unmapped_bases","mags_bases"),
labels = c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
values = c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c")) +
labs(x = "Samples", y = "Amount of data (GB)") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
legend.position = "bottom") +
facet_nested(~ sample_type + gut_location,
scales = "free_x", space = "free_x")

host_df <- sequence_fractions_type %>%
filter(fraction == "bases_host")
#check normality:
shapiro.test(host_df$value)
Shapiro-Wilk normality test
data: host_df$value
W = 0.97962, p-value = 0.5903
#check if different between gut locations:
kruskal.test(value ~ gut_location, data = host_df)
Kruskal-Wallis rank sum test
data: value by gut_location
Kruskal-Wallis chi-squared = 14.852, df = 3, p-value = 0.001948
#check which differences:
pairwise.wilcox.test(host_df$value, host_df$gut_location, p.adjust.method = "fdr")
Pairwise comparisons using Wilcoxon rank sum exact test
data: host_df$value and host_df$gut_location
D_ileum F_colon K_ileum
F_colon 0.0031 - -
K_ileum 0.6947 0.0031 -
M_colon 0.2967 0.0112 0.4552
P value adjustment method: fdr
mag_df <- sequence_fractions_type %>%
filter(fraction == "mags_bases")
#check normality:
shapiro.test(mag_df$value)
Shapiro-Wilk normality test
data: mag_df$value
W = 0.70118, p-value = 2.254e-08
#check if different between gut locations:
kruskal.test(value ~ gut_location, data = mag_df)
Kruskal-Wallis rank sum test
data: value by gut_location
Kruskal-Wallis chi-squared = 32.004, df = 3, p-value = 5.224e-07
#check which differences:
pairwise.wilcox.test(mag_df$value, mag_df$gut_location, p.adjust.method = "fdr")
Pairwise comparisons using Wilcoxon rank sum exact test
data: mag_df$value and mag_df$gut_location
D_ileum F_colon K_ileum
F_colon 0.0155 - -
K_ileum 4.4e-06 5.7e-06 -
M_colon 0.0044 4.4e-06 0.1507
P value adjustment method: fdr
host_summary <- host_df %>%
group_by(gut_location) %>%
summarise(
median = median(value),
IQR = IQR(value)
)
mag_summary <- mag_df %>%
group_by(gut_location) %>%
summarise(
median = median(value),
IQR = IQR(value)
)
host_summary
# A tibble: 4 × 3
gut_location median IQR
<chr> <dbl> <dbl>
1 D_ileum 9.50 4.37
2 F_colon 4.33 2.71
3 K_ileum 9.69 4.34
4 M_colon 8.38 4.20
# A tibble: 4 × 3
gut_location median IQR
<chr> <dbl> <dbl>
1 D_ileum 0.479 1.20
2 F_colon 3.41 1.12
3 K_ileum 0.0602 0.0264
4 M_colon 0.0806 0.338
ggplot(sequence_fractions_type, aes(x = sample, y = value, fill = fraction)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_manual(name = "Sequence type",
breaks = c("lowqual_bases","bases_host","unmapped_bases","mags_bases"),
labels = c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
values = c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c")) +
labs(x = "Samples", y = "Amount of data (GB)") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
legend.position = "bottom") +
facet_wrap(~ fox_behaviour, scales = "free_x", nrow = 1)
