Chapter 3 Prepare data

3.1 Load data

Load the original data files outputted by the bioinformatic pipeline.

3.1.1 Sample metadata

sample_metadata <- read_tsv("data/sample_metadata.tsv")

3.1.2 Read counts

read_counts <- read_tsv("data/metagenomics/read_counts.tsv")

3.1.3 Genome coverage

genome_coverage <- read_tsv("data/metagenomics/genome_coverage.tsv")

3.1.4 Genome taxonomy

genome_taxonomy <- read_tsv("data/metagenomics/genome_taxonomy.tsv") %>%
    select(user_genome,classification) %>%
      separate(classification, c("domain","phylum","class","order","family","genus","species"),  sep =";") %>%
      rename(genome=1) %>%
      mutate(genome = str_remove(genome, "\\.fa$"))

3.1.5 Genome quality

genome_quality <- read_tsv("data/metagenomics/genome_quality.tsv")

3.1.6 Genome metadata

genome_metadata <- inner_join(genome_taxonomy,genome_quality,by=join_by(genome==genome)) %>%
    select(-c(lineage,binner)) %>%
    rename(length=size)

3.1.7 Genome tree

genome_tree <- read_tree("data/metagenomics/genome_tree.tre")
genome_tree$tip.label <- str_replace_all(genome_tree$tip.label,"'", "") #remove single quotes in MAG names
genome_tree$tip.label <- str_remove(genome_tree$tip.label, "\\.fa$") #remove .fa suffix
genome_tree <- keep.tip(genome_tree, tip=genome_taxonomy$genome) # keep only MAG tips

3.1.8 Genome annotations

genome_annotations <- read_tsv("data/metagenomics/genome_annotations.tsv.xz") %>%
    rename(gene=1, genome=2, contig=3)

3.1.9 Preprocessing statistics

sample_preprocessing <- read_tsv("data/metagenomics/preprocessing_stats.tsv") %>%
  mutate(host_bases=host_reads*300)

3.2 Create working objects

Transform the original data files into working objects for downstream analyses.

3.2.1 Filter reads by coverage

min_coverage=0.3
read_counts_filt <- genome_coverage %>%
  mutate(across(where(is.numeric), ~ ifelse(. > min_coverage, 1, 0))) %>%
  mutate(across(-1, ~ . * read_counts[[cur_column()]])) 

3.2.2 Transform reads into genome counts

readlength=150
genome_counts <- read_counts %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata$length / readlength) ))
readlength=150
genome_counts_filt <- read_counts_filt %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata$length / readlength) ))

3.2.3 Distill annotations into GIFTs

genome_gifts <- distill(genome_annotations,GIFT_db,genomecol=2,annotcol=c(9,10,19), verbosity=F)

Identifiers in the annotation table: 1810 
Identifiers in the database: 1547 
Identifiers in both: 259 
Percentage of annotation table identifiers used for distillation: 14.31%
Percentage of database identifiers used for distillation: 16.74%

3.3 Prepare color scheme

AlberdiLab projects use unified color schemes developed for the Earth Hologenome Initiative, to facilitate figure interpretation.

phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree$tip.label)) %>%
    select(phylum, colors) %>% 
    unique() %>%
    arrange(phylum) %>%
    pull(colors, name=phylum)

3.4 Wrap working objects

All working objects are wrapped into a single Rdata object to facilitate downstream usage.

save(sample_metadata, 
     sample_preprocessing,
     genome_metadata, 
     read_counts, 
     genome_counts, 
     genome_counts_filt, 
     genome_tree,
     genome_gifts,
     phylum_colors,
     file = "data/metagenomics/data.Rdata")