Chapter 2 Prepare data
2.1 Load data
Load the original data files outputted by the bioinformatic pipeline.
2.1.2 Read counts
read_counts_raw <- read_tsv("data/genome.count.tsv") %>%
rename(genome=1)
#Transformation of read_counts to combine data from both sequence rounds
merge_and_rename <- function(read_counts_raw) {
read_counts_raw %>%
# Gather the columns into long format
pivot_longer(cols = -genome, names_to = "col") %>%
# Extract prefix
mutate(prefix = gsub("^(.*?)_.*", "\\1", col)) %>%
# Group by prefix and genome, then summarize
group_by(prefix, genome) %>%
summarize(value = sum(value)) %>%
# Spread the data back to wide format
pivot_wider(names_from = prefix, values_from = value)
}
read_counts <- merge_and_rename(read_counts_raw)2.1.3 Genome base hits
genome_hits_raw <- read_tsv("data/genome.covered_bases.tsv") %>%
rename(genome=1)
#Transformation of genome_hits to combine data from both sequence rounds
merge_and_rename <- function(genome_hits_raw) {
genome_hits_raw %>%
# Gather the columns into long format
pivot_longer(cols = -genome, names_to = "col") %>%
# Extract prefix
mutate(prefix = gsub("^(.*?)_.*", "\\1", col)) %>%
# Group by prefix and genome, then summarize
group_by(prefix, genome) %>%
summarize(value = sum(value)) %>%
# Spread the data back to wide format
pivot_wider(names_from = prefix, values_from = value)
}
genome_hits <- merge_and_rename(genome_hits_raw)2.1.5 Genome quality
genome_quality <- read_tsv("data/quality_report.tsv") %>%
select(
genome = Name,
completeness = Completeness,
contamination = Contamination,
length = Genome_Size,
gc = GC_Content
)
genome_quality<-genome_quality %>%
mutate (genome=str_remove_all(genome,".fa"))
#Filter MAGs with over 70% completeness and less than 10% contamination
genome_quality <- genome_quality %>%
filter(completeness > 70 & contamination < 10)2.1.6 Genome tree
genome_tree <- read_tree("data/gtdbtk.backbone.bac120.classify.tree")
genome_tree$tip.label <- str_replace_all(genome_tree$tip.label,"'", "") #remove single quotes in MAG names
#Filter genome_taxonomy to keep MAGs with over 70% completeness and less than 10% contamination
genome_taxonomy <- genome_taxonomy %>%
semi_join(genome_quality, by = "genome")
genome_tree <- keep.tip(genome_tree, tip=genome_taxonomy$genome) # keep only MAG tips2.2 Create working objects
Transform the original data files into working objects for downstream analyses.
2.3 Filtering
Two samples are removed from the analysis due to their low sequencing depth.
#Counts_raw
columns_to_exclude <- c("AD91", "AC85","AD16","AD23", "AD25") # Columns to exclude
read_counts <- read_counts %>%
select(-columns_to_exclude)
#Coverage_table
genome_coverage <- genome_coverage %>%
select(-columns_to_exclude)
#Metadata
sample_metadata <- sample_metadata %>%
filter(Tube_code != "AC85") %>%
filter(Tube_code != "AD16") %>%
filter(Tube_code != "AD23") %>%
filter(Tube_code != "AD25") %>%
filter(Tube_code != "AD91")2.3.1 Filter reads by coverage
#Filter read_counts for the MAGs with over 70% completeness and less than 10% contamination
read_counts <- read_counts %>%
semi_join(genome_quality, by = "genome")
min_coverage=0.3
read_counts_filt <- genome_coverage %>%
mutate(across(where(is.numeric), ~ ifelse(. > min_coverage, 1, 0))) %>%
mutate(across(-1, ~ . * read_counts[[cur_column()]])) 2.4 Load data statistics
2.4.1 Raw reads
raw_reads <-
"data/multiqc_general_stats_2.txt" %>%
read_tsv() %>%
select(
sample = Sample,
raw_reads = `total_sequences`
) %>%
mutate(sample_prefix = str_extract(sample, "^[^_]+")) %>%
group_by(sample_prefix) %>%
summarize(raw_reads = sum(raw_reads, na.rm = TRUE)) %>%
rename(sample = sample_prefix) %>%
mutate(sample = str_remove(sample, "^fastp \\|\\s*")) %>%
filter(sample %in% sample_metadata$Tube_code)2.4.2 Quality-filtered reads
fastp_reads <-
"data/multiqc_general_stats.txt" %>%
read_tsv() %>%
select(
sample = Sample,
trimmed_reads = `total_sequences`
) %>%
mutate(sample_prefix = str_extract(sample, "^[^_]+")) %>%
group_by(sample_prefix) %>%
summarize(trimmed_reads = sum(trimmed_reads, na.rm = TRUE)) %>%
rename(sample = sample_prefix) %>%
filter(!str_detect(sample, "nonlizard \\|")) %>%
filter(!str_detect(sample, "lizard \\|")) %>%
filter(!str_detect(sample, "refseq500 \\|")) %>%
mutate(sample = str_remove(sample, "^fastp \\|\\s*")) %>%
filter(sample %in% sample_metadata$Tube_code)2.4.3 Host-mapped reads
host_mapped <-
"data/multiqc_general_stats.txt" %>%
read_tsv() %>%
filter(str_detect(Sample, "lizard")) %>%
select(
sample = Sample,
host_mapped = `reads_mapped`,
mapping_total = `raw_total_sequences`
) %>%
mutate(
host_unmapped = mapping_total - host_mapped
) %>%
filter(!is.na(host_mapped)) %>%
separate(
col = sample,
into = c("host_name", "sample"),
sep = " \\| "
) %>%
rename(mapped = host_mapped, unmapped = host_unmapped) %>%
select(-mapping_total) %>%
pivot_longer(-host_name:-sample) %>%
mutate(
name = str_glue("{name}_{host_name}")
) %>%
select(-host_name) %>%
pivot_wider() %>%
mutate(sample = str_extract(sample, "^[^_]+")) %>%
group_by(sample) %>%
summarize(
mapped_lizard = sum(mapped_lizard),
unmapped_lizard = sum(unmapped_lizard)
) %>%
filter(sample %in% sample_metadata$Tube_code)2.4.4 Prokaryotic fraction
singlem <-
"data/singleM.tsv" %>%
read_tsv() %>%
distinct() %>%
mutate(
sample = sample %>% str_remove_all("_1$"),
read_fraction = read_fraction %>% str_remove("%") %>% as.numeric(),
read_fraction = read_fraction / 100
) %>%
select(
sample,
singlem_prokaryotic_bases = bacterial_archaeal_bases,
singlem_metagenome_size = metagenome_size,
singlem_read_fraction = read_fraction,
) %>%
mutate(sample_prefix = str_extract(sample, "^[^_]+")) %>%
group_by(sample_prefix) %>%
summarize(
singlem_prokaryotic_bases = sum(singlem_prokaryotic_bases),
singlem_metagenome_size = sum(singlem_metagenome_size),
singlem_read_fraction = mean(singlem_read_fraction)
) %>%
rename(sample = sample_prefix)%>%
filter(sample %in% sample_metadata$Tube_code)2.4.5 MAG-mapped reads
mag_mapping <-
read_tsv("data/contig.count.tsv.xz") %>%
pivot_longer(-sequence_id) %>%
summarise(value = sum(value), .by = "name") %>%
rename(sample = name, mapped_mags = value) %>%
mutate(sample_prefix = str_extract(sample, "^[^_]+")) %>%
group_by(sample_prefix) %>%
summarize(
mapped_mags = sum(mapped_mags)
) %>%
rename(sample = sample_prefix)%>%
filter(sample %in% sample_metadata$Tube_code)2.5 Prepare color scheme
AlberdiLab projects use unified color schemes developed for the Earth Hologenome Initiative, to facilitate figure interpretation.
phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
arrange(match(genome, genome_tree$tip.label)) %>%
select(phylum, colors) %>%
unique() %>%
arrange(phylum) %>%
pull(colors, name=phylum)