Chapter 2 Prepare data

2.1 Load data

Load the original data files outputted by the bioinformatic pipeline.

2.1.1 Sample metadata

sample_metadata <- read.csv("data/Pyrenees_metadata_all_v2.csv",sep=",",header=T)%>%
  filter(EHI_number != "EHI00102") %>%
  filter(EHI_number != "EHI00182") %>%
  filter(EHI_number !="EHI00435") %>%
  filter(EHI_number !="EHI00126") #genome not P.muralis

2.1.2 Read counts

read_counts <- read_tsv("data/DMB0113_counts.tsv") %>%
  rename(genome = 1) %>%
  select(-EHI00102, -EHI00182,-EHI00435,-EHI00126) #remove samples

2.1.3 Genome taxonomy

genome_metadata <- read_tsv("data/DMB0113_mag_info.tsv") %>%
    rename(length=mag_size)

2.1.4 Genome base hits

genome_coverage <- read_tsv("data/DMB0113_coverage.tsv") %>%
  rename(genome = 1) %>%
  select(-EHI00102, -EHI00182,-EHI00435,-EHI00126) %>% #remove samples
  semi_join(genome_metadata, by = "genome")

2.1.5 Genome tree

genome_tree <- read_tree("data/DMB0113.tree")
genome_tree$tip.label <- str_replace_all(genome_tree$tip.label,"'", "") #remove single quotes in MAG names
genome_tree <- keep.tip(genome_tree, tip=genome_metadata$genome) # keep only MAG tips

2.1.6 Genome annotations

Distill annotations already into GIFTs

genome_gifts_raw="data/GIFTs.tsv"
genome_gifts <- read.table(genome_gifts_raw,header=T, sep="\t", row.names=1)

2.2 Create working objects

Transform the original data files into working objects for downstream analyses.

2.2.1 Filter reads by coverage

read_counts <- read_counts %>%
  semi_join(genome_metadata, by = "genome")

min_coverage=0.3
read_counts_filt <- genome_coverage %>%
  mutate(across(where(is.numeric), ~ ifelse(. > min_coverage, 1, 0))) %>%
  mutate(across(-1, ~ . * read_counts[[cur_column()]])) 

2.2.2 Transform reads into genome counts

readlength=150
genome_counts <- read_counts %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata$length / readlength) ))
readlength=150
genome_counts_filt <- read_counts_filt %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata$length / readlength) ))

2.3 Prepare color scheme

AlberdiLab projects use unified color schemes developed for the Earth Hologenome Initiative, to facilitate figure interpretation.

phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree$tip.label)) %>%
    select(phylum, colors) %>% 
    unique() %>%
    arrange(phylum) %>%
    pull(colors, name=phylum)

2.4 Wrap working objects

All working objects are wrapped into a single Rdata object to facilitate downstream usage.

save(sample_metadata, 
     genome_metadata, 
     read_counts, 
     genome_counts, 
     genome_counts_filt, 
     genome_tree,
    genome_gifts, 
     phylum_colors,
     file = "data/data.Rdata")