2 Data preparation
2.1 Metadata
bin_metadata <- read_delim("data/bin_metadata.tsv.gz") %>%
rename(genome=bin_id) %>%
mutate(comp_cont = completeness - contamination) %>%
mutate(overall_strategy = case_when(
overall_strategy == "individual_individual_binning" ~ "single_coverage",
overall_strategy == "cobinning_longitudinal" ~ "multicoverage_animal",
overall_strategy == "cobinning_treatment" ~ "multicoverage_timepoint_all",
overall_strategy == "cobinning_cage_treatment" ~ "multicoverage_timepoint_cage",
overall_strategy == "coassembly_longitudinal" ~ "coassembly_animal",
overall_strategy == "coassembly_treatment" ~ "coassembly_timepoint_all",
overall_strategy == "coassembly_cage_treatment" ~ "coassembly_timepoint_cage",
overall_strategy == "multi_split_longitudinal" ~ "multisplit_animal",
overall_strategy == "multi_split_treatment" ~ "multisplit_timepoint_all",
overall_strategy == "multi_split_cage_treatment" ~ "multisplit_timepoint_cage",
TRUE ~ overall_strategy)) %>%
mutate(assembly = case_when(
assembly == "individual" ~ "single_coverage",
assembly == "cobinning" ~ "multi_coverage",
assembly == "coassembly" ~ "coassembly",
assembly == "multi_split" ~ "multi_split",
TRUE ~ assembly)) %>%
mutate(strategy = case_when(
strategy == "individual_binning" ~ "NA",
strategy == "longitudinal" ~ "animal",
strategy == "treatment" ~ "timepoint_all",
strategy == "cage_treatment" ~ "timepoint_cage",
TRUE ~ strategy)) %>%
mutate(genome = str_replace(genome, "individual", "single_coverage"),
genome = str_replace(genome, "cobinning_long", "multicoverage_animal"),
genome = str_replace(genome, "cobinning_treat", "multicoverage_timepoint_all"),
genome = str_replace(genome, "cobinning_cage_treat", "multicoverage_timepoint_cage"),
genome = str_replace(genome, "coassembly_long", "coassembly_animal"),
genome = str_replace(genome, "coassembly_treat", "coassembly_timepoint_all"),
genome = str_replace(genome, "coassembly_cage_treat", "coassembly_timepoint_cage"),
genome = str_replace(genome, "vamb_long", "multisplit_animal"),
genome = str_replace(genome, "vamb_treat", "multisplit_timepoint_all"),
genome = str_replace(genome, "vamb_cage_treat", "multisplit_timepoint_cage"))
assemblies <- c("single_coverage",
"multi_coverage",
"coassembly",
"multi_split")
strategies <- c("single_coverage",
"multicoverage_animal",
"multicoverage_timepoint_all",
"multicoverage_timepoint_cage",
"coassembly_animal",
"coassembly_timepoint_all",
"coassembly_timepoint_cage",
"multisplit_animal",
"multisplit_timepoint_all",
"multisplit_timepoint_cage")2.2 Read counts
import_count_data <- function(strategy) {
read_delim(file = paste0('data/count_tables/count_table_',strategy,'.tsv.gz')) %>%
rename(genome=Genome) %>%
filter(genome != "unmapped") %>%
pivot_longer(!genome,
names_to = c("sample", ".value"),
names_sep = " ") %>%
rename("read_count" = "Read") %>%
rename("covered_fraction" = "Covered") %>%
rename("MAG_length" = "Length") %>%
mutate(genome = str_replace_all(genome, "^", paste0(strategy, "_")),
genome = str_replace_all(genome, "_OP_OP", "_OP"),
genome = str_replace_all(genome, "vamb_long_C.*_C", "vamb_long_C"),
sample = str_remove(sample, "_subsam_M"),
sample = str_remove(sample, "_subsam"),
sample = str_replace(sample, "CD","CT")) %>%
left_join(bin_metadata, by = "genome") %>%
select(genome, sample, read_count, covered_fraction, MAG_length, primary_cluster,
secondary_cluster, overall_strategy, assembly, strategy)
}
bin_counts <- purrr::map(strategies,import_count_data) %>%
list_rbind()2.6 Cluster functions
cluster_kegg <- read_delim("data/cluster_kegg.tsv") %>%
select(-genome) %>%
group_by(secondary_cluster,overall_strategy) %>%
summarise(across(where(is.numeric), max)) %>%
mutate(overall_strategy = case_when(
overall_strategy == "individual_individual_binning" ~ "single_coverage",
overall_strategy == "cobinning_longitudinal" ~ "multicoverage_animal",
overall_strategy == "cobinning_treatment" ~ "multicoverage_timepoint_all",
overall_strategy == "cobinning_cage_treatment" ~ "multicoverage_timepoint_cage",
overall_strategy == "coassembly_longitudinal" ~ "coassembly_animal",
overall_strategy == "coassembly_treatment" ~ "coassembly_timepoint_all",
overall_strategy == "coassembly_cage_treatment" ~ "coassembly_timepoint_cage",
overall_strategy == "multi_split_longitudinal" ~ "multisplit_animal",
overall_strategy == "multi_split_treatment" ~ "multisplit_timepoint_all",
overall_strategy == "multi_split_cage_treatment" ~ "multisplit_timepoint_cage",
TRUE ~ overall_strategy))2.7 Color scheme
AlberdiLab projects use unified color schemes developed for the Earth Hologenome Initiative, to facilitate figure interpretation.
phylum_colors <- cluster_taxonomy %>%
left_join(read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>% mutate(phylum=gsub("p__","",phylum)), by=join_by(Phylum == phylum)) %>%
select(Phylum, colors) %>%
unique() %>%
arrange(Phylum) %>%
select(colors) %>%
pull()strategy_colors <- c(single_coverage="#5254A3",
multicoverage_animal="#637939",
multicoverage_timepoint_all="#8CA252",
multicoverage_timepoint_cage="#B5CF6B",
coassembly_animal="#8C6D31",
coassembly_timepoint_all="#BD9E39",
coassembly_timepoint_cage="#E7BA52",
multisplit_animal="#843C39",
multisplit_timepoint_all="#AD494A",
multisplit_timepoint_cage="#D6616B")