Chapter 3 Data statistics
load("data/data.Rdata")
extra_metadata <- read_csv("data/Bombina_microbiome_metadata.csv") %>%
rename(animal=sample) %>%
rename(sample=EHI_code)filtered_reads <- read_counts %>%
column_to_rownames(., "genome") %>%
colSums() %>%
as.data.frame() %>%
rename("depth"=".") %>%
rownames_to_column(., "sample") %>%
left_join(extra_metadata, by="sample") %>%
arrange(animal) %>%
group_by(animal) %>%
slice_max(order_by = depth, n = 1, with_ties = FALSE)3.1 Sequencing reads statistics
sample_metadata %>%
filter(sample %in% filtered_reads$sample) %>%
summarise(Total=sum(reads_post_fastp * 150 / 1000000000) %>% round(2),
mean=mean(reads_post_fastp * 150 / 1000000000) %>% round(2),
sd=sd(reads_post_fastp * 150 / 1000000000) %>% round(2)) %>%
unite("Average",mean, sd, sep = " ± ", remove = TRUE) %>%
tt()| Total | Average |
|---|---|
| 448.68 | 11.5 ± 5.92 |
3.2 DNA fractions
sequence_fractions <- read_counts %>%
select(all_of(c("genome",filtered_reads$sample))) %>%
pivot_longer(-genome, names_to = "sample", values_to = "value") %>%
group_by(sample) %>%
summarise(mags = sum(value)) %>%
left_join(sample_metadata, by = join_by(sample == sample)) %>%
select(sample,mags,metagenomic_bases,host_bases,bases_lost_fastp_percent) %>%
mutate(mags_bases = mags*146) %>%
mutate(lowqual_bases = ((metagenomic_bases+host_bases)/(1-bases_lost_fastp_percent))-(metagenomic_bases+host_bases)) %>%
mutate(unmapped_bases = metagenomic_bases - mags_bases) %>%
mutate(unmapped_bases = ifelse(unmapped_bases < 0, 0, unmapped_bases)) %>%
select(sample, lowqual_bases, host_bases, unmapped_bases, mags_bases)
sequence_fractions %>%
mutate_at(vars(-sample), ~./1000000000) %>%
rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
tt()| Sample | Low quality | Mapped to host | Unmapped | Mapped to MAGs |
|---|---|---|---|---|
| EHI01695 | 0.3461828 | 0.04127063 | 4.7400240 | 1.38511397 |
| EHI01696 | 0.4424049 | 0.22406116 | 5.2067968 | 4.16427478 |
| EHI01697 | 0.4538495 | 1.90941404 | 2.6774953 | 4.97927029 |
| EHI01698 | 0.3573398 | 0.09858151 | 1.5648997 | 5.72228546 |
| EHI01699 | 0.7050913 | 0.96951518 | 9.2592759 | 3.23009714 |
| EHI01700 | 0.6530752 | 3.56898382 | 5.4267930 | 2.69687302 |
| EHI01701 | 0.6766729 | 1.02592777 | 9.5467908 | 5.35279961 |
| EHI01702 | 2.0469307 | 6.03379949 | 3.6116400 | 0.73573430 |
| EHI02563 | 0.1714629 | 0.64207433 | 0.9746465 | 0.23628290 |
| EHI02564 | 0.3836564 | 2.85852293 | 1.6403990 | 0.51015831 |
| EHI02569 | 0.3707787 | 1.83758111 | 2.4210436 | 0.23655095 |
| EHI02570 | 0.4532710 | 4.17998256 | 0.9421899 | 0.06150104 |
| EHI02661 | 0.5505824 | 3.58589734 | 1.1707615 | 4.16354989 |
| EHI02662 | 0.9068807 | 0.23641569 | 12.7506346 | 3.66885561 |
| EHI02663 | 1.6590876 | 8.68528216 | 17.5021482 | 1.77573172 |
| EHI02664 | 0.8797522 | 1.67842756 | 10.8189021 | 3.59334368 |
| EHI02665 | 1.3530223 | 8.45927663 | 10.8622509 | 3.56494434 |
| EHI02666 | 0.7772609 | 5.05533393 | 8.5615855 | 1.44358288 |
| EHI02667 | 0.6646793 | 2.25720854 | 7.5154444 | 2.54387451 |
| EHI02668 | 0.7713225 | 0.61533293 | 11.1440351 | 1.60854376 |
| EHI02669 | 0.6186671 | 2.19198064 | 6.2047538 | 1.73961044 |
| EHI02670 | 0.7066807 | 6.48534358 | 2.8398548 | 1.35885748 |
| EHI02671 | 0.6099077 | 6.69793619 | 4.4765836 | 0.25575900 |
| EHI02672 | 0.7555689 | 4.27773321 | 6.8023493 | 1.22402779 |
| EHI02673 | 1.1212504 | 6.00359678 | 14.4151757 | 1.25960215 |
| EHI02674 | 1.3226743 | 1.14702137 | 11.5001905 | 10.53462956 |
| EHI02675 | 0.7725862 | 11.50682250 | 2.0966824 | 0.23969346 |
| EHI02676 | 0.5425090 | 5.29023643 | 3.2681275 | 0.40619273 |
| EHI02677 | 0.7336743 | 0.36706295 | 14.2915536 | 0.28614657 |
| EHI02678 | 0.6623755 | 1.47718139 | 8.1501070 | 0.98578572 |
| EHI02679 | 0.6506381 | 0.43512590 | 10.2627640 | 1.23309249 |
| EHI02680 | 0.3328090 | 3.88288708 | 1.4210251 | 0.14881605 |
| EHI02681 | 0.3389079 | 1.76220214 | 3.4155844 | 0.64653370 |
| EHI02683 | 0.3769425 | 2.40063549 | 0.9947723 | 2.00839381 |
| EHI02684 | 0.4055536 | 3.04660391 | 3.1401241 | 0.21321577 |
| EHI02687 | 0.4719383 | 4.82008929 | 2.4576517 | 0.25446676 |
| EHI02688 | 0.6297850 | 0.79285573 | 8.3707803 | 2.60182892 |
| EHI02689 | 0.2969110 | 1.11480534 | 2.0197544 | 0.19344212 |
| EHI02690 | 0.5017530 | 2.70043724 | 2.3286060 | 2.47396095 |
sequence_fractions %>%
pivot_longer(!sample, names_to = "fraction", values_to = "value") %>%
left_join(., extra_metadata, by="sample") %>%
mutate(value = value / 1000000000) %>%
mutate(fraction = factor(fraction, levels = c("lowqual_bases","host_bases","unmapped_bases","mags_bases"))) %>%
ggplot(., aes(x = animal, y = value, fill=fraction)) +
geom_bar(position="stack", stat = "identity") +
scale_fill_manual(name="Sequence type",
breaks=c("lowqual_bases","host_bases","unmapped_bases","mags_bases"),
labels=c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
values=c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c"))+
facet_grid(~ Bd_status, scale="free", space="free")+
labs(x = "Samples", y = "Amount of data (GB)") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "bottom")3.3 Recovered microbial fraction
singlem_table <- sequence_fractions %>%
mutate(mags_proportion = round((mags_bases / (mags_bases + unmapped_bases))*100,2)) %>%
left_join(sample_metadata, by = join_by(sample == sample)) %>%
mutate(singlem_proportion = round(singlem_fraction*100,2)) %>%
select(sample,mags_proportion,singlem_proportion) %>%
mutate(mags_proportion = ifelse(singlem_proportion == 0, 0, mags_proportion)) %>% #convert zeros to NA
mutate(singlem_proportion = ifelse(singlem_proportion == 0, NA, singlem_proportion)) %>% #convert zeros to NA
mutate(singlem_proportion = ifelse(singlem_proportion < mags_proportion, NA, singlem_proportion)) %>% #if singlem is smaller, then NA, to simplify plot
mutate(singlem_proportion = ifelse(singlem_proportion > 100, 100, singlem_proportion)) #simplify
singlem_table %>%
pivot_longer(!sample, names_to = "proportion", values_to = "value") %>%
left_join(sample_metadata, by = join_by(sample == sample)) %>%
mutate(proportion = factor(proportion, levels = c("mags_proportion","singlem_proportion"))) %>%
ggplot(., aes(x = value, y = sample, color=proportion)) +
geom_line(aes(group = sample), color = "#f8a538") +
geom_point() +
scale_color_manual(name="Proportion",
breaks=c("mags_proportion","singlem_proportion"),
labels=c("Recovered","Estimated"),
values=c("#52e1e8", "#876b53"))+
facet_nested(species + sample_type ~ ., scales="free",space="free")+
theme_classic() +
labs(y = "Samples", x = "Prokaryotic fraction (%)") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),
legend.position = "right",
strip.background.y=element_rect(color = NA, fill= "#f4f4f4"))