Chapter 2 Data preparation

2.1 Load data

Load the original data files outputted by the bioinformatic pipeline.

2.1.1 Sample metadata

sample_metadata <- read_csv("data/sample_metadata.csv") %>%
  rename(sample =1, 
         gut_location = 2,
         fox_behaviour = 3, 
         fox_sex = 4,
         sample_type = 5, 
         year = 6)

2.1.2 Read counts

read_counts <- read_tsv("data/counts.tsv") %>%
    rename(genome=1)%>%
    select(any_of(c("genome", sample_metadata$sample)))

2.1.3 Filter read counts

read_counts_filt <- read_counts %>%
  filter(rowSums(. != 0, na.rm = TRUE) > 0) %>%
  select_if(~!all(. == 0)) %>%
  select(one_of(c("genome",sample_metadata$sample)))

2.1.4 Genome info

genome_info <- read_csv("data/genomeInfo.csv") %>%
   mutate(genome = gsub(".fa$", "", genome)) %>%
   semi_join(., read_counts, by = "genome") %>%
   arrange(match(genome,read_counts$genome))

2.1.5 Genome taxonomy

genome_taxonomy <- read_tsv("data/genome_taxonomy.tsv") %>%
  rename(genome = 1)

genome_taxonomy_expanded <- genome_taxonomy %>% 
  separate(classification,
           into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep = ";",
           fill = "right", remove = FALSE) %>%
  mutate(across(domain:species, ~ str_replace(.x, "^[a-z]__", "")))%>%
  mutate(phylum = case_when(
    phylum == "Actinobacteriota" ~ "Actinomycetota",
    phylum == "Firmicutes" ~ "Bacillota",
    phylum == "Firmicutes_A" ~ "Bacillota_A",
    phylum == "Firmicutes_C" ~ "Bacillota_C",
    phylum == "Cyanobacteria" ~ "Cyanobacteriota",
    phylum == "Proteobacteria" ~ "Pseudomonadota",
    TRUE ~ phylum))

2.1.6 Genome metadata

genome_metadata <- genome_taxonomy_expanded %>%
  left_join(genome_info, by = "genome")

2.1.7 Genome coverage

genome_coverage <- read_tsv("data/bases.tsv") %>%
    rename(genome=1) %>%
    select(one_of(c("genome",sample_metadata$sample))) %>%
    left_join(genome_info %>% select(genome, length), by = "genome") %>%
    mutate(across(where(is.numeric), ~ .x / length))

2.1.8 Genome tree

genome_tree <- read_tree("data/gtdbtk.backbone.bac120.classify.tree")
genome_tree$tip.label <- str_replace_all(genome_tree$tip.label,"'", "") #remove single quotes in MAG names
genome_tree <- keep.tip(genome_tree, tip=genome_metadata$genome) # keep only MAG tips

2.1.9 Genome annotations

genome_annotations <- read_tsv("data/gene_annotations.tsv.xz") %>%
  mutate(genome = sub("\\^.*", "", gene)) %>%
  select(1, genome, everything())

2.2 Create working objects

Transform the original data files into working objects for downstream analyses.

2.2.1 Filter reads by coverage

min_coverage=0.3
read_counts_filt <- genome_coverage %>%
  select(-length) %>% 
  mutate(across(where(is.numeric), ~ ifelse(. > min_coverage, 1, 0))) %>%
  mutate(across(-1, ~ . * read_counts[[cur_column()]]))

2.2.2 Transform reads into genome counts

readlength=150
genome_counts <- read_counts %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata$length / readlength) ))

readlength=150
genome_counts_filt <- read_counts_filt %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata$length / readlength) ))

2.2.3 Distill annotations into GIFTs

genome_gifts <- distill(genome_annotations,GIFT_db,genomecol= 2, annotcol=c(6,7,8,9), verbosity = F)

2.3 Prepare color scheme

AlberdiLab projects use unified color schemes developed for the Earth Hologenome Initiative, to facilitate figure interpretation.

phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    mutate(phylum=str_remove_all(phylum, "p__")) %>%
    right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree$tip.label)) %>%
    dplyr::select(phylum, colors) %>% 
    unique() %>%
    arrange(phylum) %>%
    pull(colors, name=phylum)

location_colors <- c('#9467bd','#7b3294','#ff7f0e','#e66101','#BFA366','#6E5244')

behaviour_colors <- c(
  tame = "#1f77b4",   
  aggr = "#d62728",   
  unsel = "#2ca02c"   
)

2.4 Wrap working objects

All working objects are wrapped into a single Rdata object to facilitate downstream usage.

save(sample_metadata, 
     genome_metadata, 
     read_counts, 
     genome_annotations,
     genome_counts, 
     genome_counts_filt, 
     genome_tree,
     genome_gifts, 
     phylum_colors,
     location_colors,
     behaviour_colors,
     file = "data/data.Rdata")