Chapter 2 Prepare data
2.1 Load data
Load the original data files produced by the bioinformatics pipeline.
2.2 Create working objects
Transform the original data files into working objects for downstream analyses.
2.2.4 Transform reads into genome counts
2.3 Load data statistics
2.3.1 Raw reads
# Raw read counts per sample.
#
# Reads the FastQC total-sequence column from the general-stats table of the
# reads step, collapses the per-file rows of each sample into one row and sums
# their read counts. The result has two columns: `sample` and `raw_reads`.
raw_reads <-
  read_tsv(
    "resources/report/by_step/reads_data/multiqc_general_stats.txt.xz"
  ) %>%
  select(
    sample = Sample,
    raw_reads = `FastQC_mqc-generalstats-fastqc-total_sequences`
  ) %>%
  # Strip exactly ONE trailing "_1" or "_2" (presumably the forward/reverse
  # file marker -- confirm against the pipeline's naming). Removing "_1" and
  # then "_2" in two passes would turn "X_2_1" into "X" but "X_2_2" into
  # "X_2", splitting the two files of a sample named "X_2" into two groups.
  mutate(sample = str_remove(sample, "_[12]$")) %>%
  summarise(raw_reads = sum(raw_reads), .by = sample)
2.3.2 Quality-filtered reads
# Quality-filtered read counts per sample.
#
# Keeps only the rows of the preprocessing general-stats table whose `Sample`
# contains "fastp", reduces each row name to the bare sample name and sums the
# FastQC total-sequence counts per sample. The result has two columns:
# `sample` and `trimmed_reads`.
fastp_reads <-
  read_tsv(
    "resources/report/by_step/preprocessing_data/multiqc_general_stats.txt.xz"
  ) %>%
  filter(str_detect(Sample, "fastp")) %>%
  select(
    sample = Sample,
    trimmed_reads = `FastQC_mqc-generalstats-fastqc-total_sequences`
  ) %>%
  mutate(
    # Drop the trailing "_" + run of "u"/"1"/"2" characters (e.g. "_1", "_u2")
    sample = str_remove_all(sample, "_[u12]+$"),
    # Drop the leading "fastp | " label
    sample = str_remove_all(sample, "^fastp \\| ")
  ) %>%
  summarise(trimmed_reads = sum(trimmed_reads), .by = sample)
2.3.3 Host-mapped reads
# Host-mapped and host-unmapped read counts per sample, one column pair per
# host.
#
# Uses the rows of the preprocessing general-stats table whose `Sample` does
# NOT contain "fastp" and that have a Samtools mapped-read count. `Sample` is
# split on " | " into a host name and a sample name, and the table is reshaped
# so that each sample is one row with columns `mapped_<host>` and
# `unmapped_<host>`.
host_mapped <-
  read_tsv(
    "resources/report/by_step/preprocessing_data/multiqc_general_stats.txt.xz"
  ) %>%
  filter(!str_detect(Sample, "fastp")) %>%
  select(
    sample = Sample,
    mapped = `Samtools_mqc-generalstats-samtools-reads_mapped`,
    total = `Samtools_mqc-generalstats-samtools-raw_total_sequences`
  ) %>%
  # Rows without a Samtools mapped count carry no mapping information
  filter(!is.na(mapped)) %>%
  mutate(unmapped = total - mapped) %>%
  select(-total) %>%
  separate(
    col = sample,
    into = c("host_name", "sample"),
    sep = " \\| "
  ) %>%
  # Long format first, so the host name can be appended to each metric name
  pivot_longer(cols = c(mapped, unmapped)) %>%
  mutate(name = str_glue("{name}_{host_name}")) %>%
  select(-host_name) %>%
  pivot_wider(names_from = name, values_from = value)
2.3.4 Prokaryotic fraction
# SingleM microbial-fraction estimates per sample.
#
# Reads the microbial-fraction table, drops duplicated rows, removes a trailing
# "_1" from the sample name and converts `read_fraction` from a percentage
# string (e.g. "12.3%") to a proportion. The kept columns are renamed with a
# `singlem_` prefix.
singlem <-
  read_tsv("resources/singlem/microbial_fraction.tsv.xz") %>%
  distinct() %>%
  mutate(
    sample = str_remove_all(sample, "_1$"),
    # "12.3%" -> 12.3 -> 0.123
    read_fraction = as.numeric(str_remove(read_fraction, "%")) / 100
  ) %>%
  select(
    sample,
    singlem_prokaryotic_bases = bacterial_archaeal_bases,
    singlem_metagenome_size = metagenome_size,
    singlem_read_fraction = read_fraction
  )
2.4 Prepare color scheme
AlberdiLab projects use unified color schemes developed for the Earth Hologenome Initiative, to facilitate figure interpretation.
# Named vector of phylum colors (names = phylum, values = color), restricted
# to the phyla present in `genome_metadata` and sorted alphabetically by
# phylum. `genome_metadata` and `genome_tree` must already exist in the
# session. Phyla missing from the color table are kept by the right join and
# receive an NA color.
phylum_colors <-
  "https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv" %>%
  read_tsv() %>%
  right_join(genome_metadata, by = join_by(phylum)) %>%
  # Order genomes as in the tree before collapsing to one row per phylum/color
  arrange(match(genome, genome_tree$tip.label)) %>%
  distinct(phylum, colors) %>%
  arrange(phylum) %>%
  pull(colors, name = phylum)