Chapter 2 Data preparation
2.1 Load data
Load the original data files produced by the bioinformatic pipeline.
2.1.5 Genome taxonomy
# Read the tab-separated genome taxonomy table and name its first column
# `genome`, whatever that column was called in the file.
genome_taxonomy <- rename(
  read_tsv("data/genome_taxonomy.tsv"),
  genome = 1
)
# Expand the semicolon-delimited `classification` string into one column per
# taxonomic rank, strip the rank prefixes (e.g. "p__"), and rename a set of
# phyla to their updated names.
#
# - `classification` is kept alongside the new rank columns (remove = FALSE).
# - Strings with fewer than seven ranks are padded with NA on the right.
# - Missing phylum values stay NA; phyla not listed below are left untouched.
genome_taxonomy_expanded <- genome_taxonomy %>%
  separate(
    classification,
    into = c(
      "domain", "phylum", "class", "order",
      "family", "genus", "species"
    ),
    sep = ";",
    fill = "right",
    remove = FALSE
  ) %>%
  # Drop the single-letter rank prefix ("d__", "p__", ...) from every rank.
  mutate(across(domain:species, ~ str_remove(.x, "^[a-z]__"))) %>%
  mutate(
    phylum = case_when(
      phylum == "Actinobacteriota" ~ "Actinomycetota",
      # Covers "Firmicutes" itself and every suffixed placeholder
      # ("Firmicutes_A", "Firmicutes_B", ...), keeping the suffix, so that
      # no Firmicutes lineage is left behind under the old name.
      str_detect(phylum, "^Firmicutes(_[A-Z]+)?$") ~
        str_replace(phylum, "^Firmicutes", "Bacillota"),
      phylum == "Cyanobacteria" ~ "Cyanobacteriota",
      phylum == "Proteobacteria" ~ "Pseudomonadota",
      .default = phylum
    )
  )
2.2 Create working objects
Transform the original data files into working objects for downstream analyses.
2.3 Prepare color scheme
AlberdiLab projects use unified color schemes developed for the Earth Hologenome Initiative, to facilitate figure interpretation.
# Build a named character vector of colours (names = phylum, values = colour),
# sorted alphabetically by phylum, restricted to the phyla present in
# `genome_metadata`.
#
# The right join keeps every phylum found in `genome_metadata`; a phylum that
# is absent from the downloaded colour table therefore appears with an NA
# colour instead of being dropped.
phylum_colors <- read_tsv(
  "https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv"
) %>%
  # Strip only the leading rank prefix so the names match `genome_metadata`.
  mutate(phylum = str_remove(phylum, "^p__")) %>%
  # Join against the distinct phyla rather than every genome row: only the
  # phylum/colour pairs are needed, and the final order is set by `phylum`.
  right_join(distinct(genome_metadata, phylum), by = join_by(phylum)) %>%
  distinct(phylum, colors) %>%
  arrange(phylum) %>%
  pull(colors, name = phylum)
# Fixed six-colour palette (hex codes).
# NOTE(review): presumably one colour per sampling location — confirm the
# intended order against the plots that use it.
location_colors <- c(
  "#3D5C61", "#41B6C0", "#90C8C5",
  "#E5D388", "#BFA366", "#6E5244"
)
# Fixed two-colour palette (hex codes).
# NOTE(review): presumably one colour per sample origin — confirm which
# level each colour is meant for.
origin_colors <- c(
  "#bd70ae",
  "#949293"
)