Chapter 2 Prepare data
2.1 Load data
Load the original data files outputted by the bioinformatic pipeline.
2.1.1 Sample metadata
sample_metadata <- read_tsv("data/DMB0117_metadata.tsv.gz") %>%
rename(sample=1)
individuals = read.csv("vulture_individuals.csv")
#Add State data
sample_metadata$state = substr(word(sample_metadata$region),1,nchar(word(sample_metadata$region))-1)
#Join sample number with metadata
sample_metadata = left_join(sample_metadata, individuals, by = "sample")
removed_samples = sample_metadata[which(sample_metadata$metagenomic_bases==0),]$sample
sample_metadata = sample_metadata[-which(sample_metadata$metagenomic_bases==0),]
2.1.6 Genome annotations
Downloading individual annotation files from ERDA using information in Airtable and writing them to a single compressed table takes a while. The following chunk only needs to be run once, to generate the genome_annotations table that is saved in the data directory. Note that the airtable connection requires a personal access token.
airtable("MAGs", "appWbHBNLE6iAsMRV") %>% #get base ID from Airtable browser URL
read_airtable(., fields = c("ID","mag_name","number_genes","anno_url"), id_to_col = TRUE) %>% #get 3 columns from MAGs table
filter(mag_name %in% paste0(genome_metadata$genome,".fa")) %>% #filter by MAG name
filter(number_genes > 0) %>% #genes need to exist
select(anno_url) %>% #list MAG annotation urls
pull() %>%
read_tsv() %>% #load all tables
rename(gene=1, genome=2, contig=3) %>% #rename first 3 columns
write_tsv(file="data/genome_annotations.tsv.xz") #write to overall compressed file
2.2 Create working objects
Transform the original data files into working objects for downstream analyses.
2.2.2 Transform reads into genome counts
2.3 Prepare color scheme
AlberdiLab projects use unified color schemes developed for the Earth Hologenome Initiative, to facilitate figure interpretation.
colors = read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv")
phylum_colors <- colors %>%
right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
arrange(match(genome, genome_tree$tip.label)) %>%
select(phylum, colors) %>%
unique() %>%
arrange(phylum) %>%
pull(colors, name=phylum)
#Colors for samples
species_colors = c("#d83034", "#0d0d0d")
sampletype_colors = c("#f45f74", "#008dff")
sex_colors = c("#bdd373", "#003a7d")
state_colors = c("#D55E00", "#CC79A7")