Chapter 2 Prepare data

2.1 Load data

Load the original data files outputted by the bioinformatic pipeline.

2.1.1 Sample metadata

sample_metadata <- read_tsv("data/DMB0117_metadata.tsv.gz") %>%
    rename(sample=1)
individuals = read.csv("vulture_individuals.csv")

#Add State data
sample_metadata$state = substr(word(sample_metadata$region),1,nchar(word(sample_metadata$region))-1)
#Join sample number with metadata
sample_metadata = left_join(sample_metadata, individuals, by = "sample")

removed_samples = sample_metadata[which(sample_metadata$metagenomic_bases==0),]$sample
sample_metadata = sample_metadata[-which(sample_metadata$metagenomic_bases==0),]

2.1.2 Read counts

read_counts <- read_tsv("data/DMB0117_counts.tsv.gz") %>%
    rename(genome=1)
read_counts = read_counts[,-which(match(colnames(read_counts), removed_samples)!=0)]

2.1.3 Genome base hits

genome_coverage <- read_tsv("data/DMB0117_coverage.tsv.gz") %>%
    rename(genome=1)
genome_coverage = genome_coverage[,-which(match(colnames(genome_coverage), removed_samples)!=0)]

2.1.4 Genome taxonomy

genome_metadata <- read_tsv("data/DMB0117_mag_info.tsv.gz") %>%
    rename(length=mag_size)

2.1.5 Genome tree

genome_tree <- read_tree("data/DMB0117.tree")
genome_tree$tip.label <- str_replace_all(genome_tree$tip.label,"'", "") #remove single quotes in MAG names
genome_tree <- keep.tip(genome_tree, tip=genome_metadata$genome) # keep only MAG tips

2.1.6 Genome annotations

Downloading individual annotation files from ERDA using information in Airtable and writing them to a single compressed table takes a while. The following chunk only needs to be run once, to generate the genome_annotations table that is saved in the data directory. Note that the airtable connection requires a personal access token.

 airtable("MAGs", "appWbHBNLE6iAsMRV") %>% #get base ID from Airtable browser URL
   read_airtable(., fields = c("ID","mag_name","number_genes","anno_url"), id_to_col = TRUE) %>% #get 3 columns from MAGs table
   filter(mag_name %in% paste0(genome_metadata$genome,".fa")) %>% #filter by MAG name
   filter(number_genes > 0) %>% #genes need to exist
   select(anno_url) %>% #list MAG annotation urls
   pull() %>%
   read_tsv() %>% #load all tables
   rename(gene=1, genome=2, contig=3) %>% #rename first 3 columns
   write_tsv(file="data/genome_annotations.tsv.xz") #write to overall compressed file

genome_annotations <- read_tsv("data/genome_annotations.tsv.xz") %>%
    rename(gene=1, genome=2, contig=3)

2.2 Create working objects

Transform the original data files into working objects for downstream analyses.

2.2.1 Filter reads by coverage

min_coverage=0.3
read_counts_filt <- genome_coverage %>%
  mutate(across(where(is.numeric), ~ ifelse(. > min_coverage, 1, 0))) %>%
  mutate(across(-1, ~ . * read_counts[[cur_column()]]))

2.2.2 Transform reads into genome counts

readlength=150
genome_counts <- read_counts %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata$length / readlength) ))

readlength=150
genome_counts_filt <- read_counts_filt %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata$length / readlength) ))

2.2.3 Distill annotations into GIFTs

genome_gifts <- distill(genome_annotations,GIFT_db2,genomecol=2,annotcol=c(9,10,19), verbosity=F)


Identifiers in the annotation table: 1362 
Identifiers in the database: 1547 
Identifiers in both: 228 
Percentage of annotation table identifiers used for distillation: 16.74%
Percentage of database identifiers used for distillation: 14.74%

2.3 Prepare color scheme

AlberdiLab projects use unified color schemes developed for the Earth Hologenome Initiative, to facilitate figure interpretation.

colors = read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv")
phylum_colors <- colors %>%
    right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree$tip.label)) %>%
    select(phylum, colors) %>% 
    unique() %>%
    arrange(phylum) %>%
    pull(colors, name=phylum)
#Colors for samples
species_colors = c("#d83034", "#0d0d0d")
sampletype_colors = c("#f45f74", "#008dff")
sex_colors = c("#bdd373", "#003a7d")
state_colors = c("#D55E00", "#CC79A7")

2.4 Wrap working objects

All working objects are wrapped into a single Rdata object to facilitate downstream usage.

save(sample_metadata, 
     genome_metadata, 
     read_counts, 
     genome_counts, 
     genome_counts_filt, 
     genome_tree,
     genome_gifts, 
     phylum_colors,
     file = "data/data.Rdata")