Chapter 2 Prepare data

2.1 Load data

Load the original data files outputted by the bioinformatic pipeline.

2.1.1 Sample metadata

sample_metadata <- read_tsv("resources/metadata.tsv")

2.1.2 Read counts

read_counts <- read_tsv("resources/coverm/genome.count.tsv.xz") %>%
    rename(genome=1)

2.1.3 Genome base hits

genome_hits <- read_tsv("resources/coverm/genome.covered_bases.tsv.xz") %>%
    rename(genome=1)

2.1.4 Genome taxonomy

genome_taxonomy <- read_tsv("resources/gtdbtk/gtdbtk.summary.tsv.xz") %>%
  select(mag_id = user_genome, classification) %>%
  separate(
    classification,
    into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
    sep = ";") %>%
    rename(genome=1)

2.1.5 Genome quality

genome_quality <- read_tsv("resources/checkm2/quality_report.tsv.xz") %>%
  select(
    genome = Name, 
    completeness = Completeness, 
    contamination = Contamination,
    length = Genome_Size, 
    gc = GC_Content
  )

2.1.6 Genome tree

genome_tree <- read_tree("resources/gtdbtk/gtdbtk.backbone.bac120.classify.tree")
genome_tree$tip.label <- str_replace_all(genome_tree$tip.label,"'", "") #remove single quotes in MAG names
genome_tree <- keep.tip(genome_tree, tip=genome_taxonomy$genome) # keep only MAG tips

2.1.7 Genome annotations

genome_annotations <- read_tsv("resources/dram/annotations.tsv.xz") %>%
    rename(gene=1, genome=2, contig=3)

2.2 Create working objects

Transform the original data files into working objects for downstream analyses.

2.2.1 Merge genome taxonomy and quality

genome_metadata <- genome_taxonomy %>%
  left_join(genome_quality,by=join_by(genome==genome)) #join quality

2.2.2 Calculate genome coverage

genome_coverage <- genome_hits %>%
  mutate(across(where(is.numeric), ~ ./genome_metadata$length))

2.2.3 Filter reads by coverage

min_coverage=0.3
read_counts_filt <- genome_coverage %>%
  mutate(across(where(is.numeric), ~ ifelse(. > min_coverage, 1, 0))) %>%
  mutate(across(-1, ~ . * read_counts[[cur_column()]])) 

2.2.4 Transform reads into genome counts

readlength=150
genome_counts <- read_counts %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata$length / readlength) ))
readlength=150
genome_counts_filt <- read_counts_filt %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata$length / readlength) ))

2.2.5 Distill annotations into GIFTs

genome_gifts <- distill(genome_annotations,GIFT_db,genomecol=2,annotcol=c(9,10,19), verbosity=F)

Identifiers in the annotation table: 1866 
Identifiers in the database: 1547 
Identifiers in both: 275 
Percentage of annotation table identifiers used for distillation: 14.74%
Percentage of database identifiers used for distillation: 17.78%

2.3 Load data statistics

2.3.1 Raw reads

raw_reads <-
  "resources/report/by_step/reads_data/multiqc_general_stats.txt.xz" %>%
  read_tsv() %>%
  select(
    sample = Sample,
    raw_reads = `FastQC_mqc-generalstats-fastqc-total_sequences`
  ) %>%
  mutate(
    sample = sample %>% str_remove_all("_1$") %>% str_remove_all("_2$")
  ) %>%
  summarise(raw_reads = sum(raw_reads), .by = sample)

2.3.2 Quality-filtered reads

fastp_reads <-
  "resources/report/by_step/preprocessing_data/multiqc_general_stats.txt.xz" %>%
  read_tsv() %>%
  filter(str_detect(Sample, "fastp")) %>%
  select(
    sample = Sample,
    trimmed_reads = `FastQC_mqc-generalstats-fastqc-total_sequences`
  ) %>%
  mutate(
    sample =
      sample %>%
        str_remove_all("_[u12]+$") %>%
        str_remove_all("^fastp \\| ")
  ) %>%
  summarise(trimmed_reads = sum(trimmed_reads), .by = sample)

2.3.3 Host-mapped reads

host_mapped <-
  "resources/report/by_step/preprocessing_data/multiqc_general_stats.txt.xz" %>%
  read_tsv() %>%
  filter(!str_detect(Sample, "fastp")) %>%
  select(
    sample = Sample,
    host_mapped = `Samtools_mqc-generalstats-samtools-reads_mapped`,
    mapping_total = `Samtools_mqc-generalstats-samtools-raw_total_sequences`
  ) %>%
  mutate(
    host_unmapped = mapping_total - host_mapped
  ) %>%
  filter(!is.na(host_mapped)) %>%
  separate(
    col = sample,
    into = c("host_name", "sample"),
    sep = " \\| "
  ) %>%
  rename(mapped = host_mapped, unmapped = host_unmapped) %>%
  select(-mapping_total) %>%
  pivot_longer(-host_name:-sample) %>%
  mutate(
    name = str_glue("{name}_{host_name}")
  ) %>%
  select(-host_name) %>%
  pivot_wider()

2.3.4 Prokaryotic fraction

singlem <-
  "resources/singlem/microbial_fraction.tsv.xz" %>%
  read_tsv() %>%
  distinct() %>%
  mutate(
    sample = sample %>% str_remove_all("_1$"),
    read_fraction = read_fraction %>% str_remove("%") %>% as.numeric(),
    read_fraction = read_fraction / 100
  ) %>%
  select(
    sample,
    singlem_prokaryotic_bases = bacterial_archaeal_bases,
    singlem_metagenome_size = metagenome_size,
    singlem_read_fraction = read_fraction,
  )

2.3.5 Metagenomic complexity

nonpareil <-
  "resources/nonpareil/nonpareil.tsv.xz" %>%
  read_tsv() %>%
  select(sample=sample_id, nonpareil_c = C, nonpareil_diversity = diversity)

2.3.6 MAG-mapped reads

mag_mapping <-
  "resources/coverm/contig.count.tsv.xz" %>%
  read_tsv() %>%
  pivot_longer(-sequence_id) %>%
  summarise(value = sum(value), .by = "name") %>%
  rename(sample = name, mapped_mags = value)

2.3.7 Wrap data statistics

data_stats <- raw_reads %>%
  left_join(fastp_reads) %>%
  left_join(host_mapped) %>%
  left_join(singlem) %>%
  left_join(nonpareil) %>%
  left_join(mag_mapping)

2.4 Prepare color scheme

AlberdiLab projects use unified color schemes developed for the Earth Hologenome Initiative, to facilitate figure interpretation.

phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree$tip.label)) %>%
    select(phylum, colors) %>% 
    unique() %>%
    arrange(phylum) %>%
    pull(colors, name=phylum)

2.5 Wrap working objects

All working objects are wrapped into a single Rdata object to facilitate downstream usage.

save(sample_metadata, 
     genome_metadata, 
     read_counts, 
     genome_counts, 
     genome_counts_filt, 
     genome_tree,
     genome_gifts, 
     phylum_colors,
     data_stats,
     file = "resources/data.Rdata")