Chapter 2 Prepare data

2.1 Metagenomics

2.1.1 Load raw data

counts_raw <- read_tsv("resources/metagenomics/all_species_unfiltered_count_table.txt") %>% 
  rename(genome=1)

genomes_length <- counts_raw[,c(1,4)] %>% 
  rename(length=2)

2.1.2 Read counts

## Melt dataframe
counts_raw1 <- counts_raw                  
colnames(counts_raw1) <- gsub("\\.", " ", colnames(counts_raw1))

molten_table <- counts_raw1 %>%
  pivot_longer(!genome,
               names_to = c("sample", ".value"),
               names_sep = " ") %>%
  dplyr::rename("read_count" = "Read") %>%
  dplyr::rename("covered_fraction" = "Covered") %>%
  dplyr::rename("MAG_length" = "Length")

read_counts <- molten_table %>%
  select(genome, sample, read_count) %>%
  pivot_wider(names_from = "sample", values_from = "read_count")%>% 
  column_to_rownames(., "genome") %>%
  filter(rowSums(. != 0, na.rm = TRUE) > 0) %>%
  select_if(~!all(. == 0))%>% 
  rownames_to_column(., "genome")
smf <- read_tsv("resources/metagenomics/microbial_fraction_estimation.tsv") %>% 
  mutate(sample = str_remove(sample, "_M_1")) %>% 
  dplyr::rename(metagenome_bases=metagenome_size)

preprocessing <- read_tsv("resources/metagenomics/preprocessing_report.tsv") %>% 
  mutate(host_bases=bases_post_filt-metagenomic_bases) %>% 
  mutate(lowqual_bases=bases_pre_filt-bases_post_filt)

microbial_fraction <- molten_table %>%
  select(genome, sample, read_count) %>% 
  group_by(sample) %>% 
  summarise(mapped_bases=sum(read_count*146)) %>% 
  inner_join(smf, by="sample") %>% 
  inner_join(preprocessing %>% select(sample,lowqual_bases,host_bases), by="sample") %>% 
  mutate(unmapped_bases=metagenome_bases-mapped_bases) %>% 
  mutate(estimated_microbial_fraction=bacterial_archaeal_bases/metagenome_bases*100) %>% 
  mutate(mapped_microbial_fraction=mapped_bases/metagenome_bases*100) %>% 
  select(sample,lowqual_bases,host_bases,metagenome_bases,mapped_bases,unmapped_bases,bacterial_archaeal_bases,estimated_microbial_fraction,mapped_microbial_fraction)

2.1.3 Genome counts

genome_counts_filt <- molten_table %>%
  mutate(read_count = if_else(covered_fraction < 0.3, 0, read_count)) %>%
  mutate(read_count = read_count*150/MAG_length) %>%
  select(genome, sample, read_count) %>%
  pivot_wider(names_from = sample, values_from = read_count) %>%
  rename(genome=1)%>% 
  column_to_rownames(., "genome") %>%
  filter(rowSums(. != 0, na.rm = TRUE) > 0) %>%
  select_if(~!all(. == 0)) %>% 
  rownames_to_column(., "genome")

readlength=150
genome_counts <- read_counts %>% 
  arrange(match(genome,genomes_length$genome))%>%
  mutate(across(where(is.numeric), ~ . / (genomes_length$length / readlength) ))%>% 
  column_to_rownames(., "genome") %>%
  filter(rowSums(. != 0, na.rm = TRUE) > 0) %>%
  select_if(~!all(. == 0)) %>% 
  rownames_to_column(., "genome")

2.1.4 Sample metadata

sample_metadata <- read_tsv("resources/metagenomics/metadata.tsv") %>%
  rename(sample=1)
genome_counts_filt_r <- genome_counts_filt %>% 
  column_to_rownames(., "genome")
sample_metadata <- sample_metadata[match(colnames(genome_counts_filt_r), sample_metadata$sample), ]

2.1.5 Genome metadata

Unify the taxonomy

genome_metadata <- read_tsv("resources/metagenomics/all_together_gtdbtk.tsv") %>%
  rename(genome=1) %>%
  mutate(genome = str_replace(genome, "\\.fa$", "")) %>%
  semi_join(., genome_counts_filt, by = "genome") %>% 
  arrange(match(genome,genome_counts_filt$genome)) %>% 
  mutate(classification = str_replace_all(classification, ".__", "")) %>%
  separate(col = classification, sep = ";", into = c("domain", "phylum", "class", "order", "family", "genus", "species"))%>% 
  left_join(., genomes_length, by=join_by(genome==genome))%>%
    mutate(phylum = case_when(
        phylum == "Bacillota_A" ~ "Bacillota",
        phylum == "Bacillota_C" ~ "Bacillota",
        TRUE ~ phylum))


  # mutate(phylum = case_when(
  #       phylum == "Actinobacteriota" ~ "Actinomycetota",
  #       phylum == "Firmicutes" ~ "Bacillota",
  #       phylum == "Firmicutes_A" ~ "Bacillota_A",
  #       phylum == "Firmicutes_C" ~ "Bacillota_C",
  #       phylum == "Cyanobacteria" ~ "Cyanobacteriota",
  #       phylum == "Proteobacteria" ~ "Pseudomonadota",
  #       TRUE ~ phylum))

2.1.6 Genome tree

genome_tree <- read_tree("resources/metagenomics/coassembly_species_gtdbtk_full.tree")
genome_tree$tip.label <- gsub(".fa","",genome_tree$tip.label) 
genome_tree <- keep.tip(genome_tree, tip=genome_metadata$genome) 

2.1.7 Preprocessing info

unmapped <- read_tsv("resources/metagenomics/all_species_mapping_rate.txt")
mapping <- unmapped[c(1),] %>% 
 t() %>%
  row_to_names(row_number = 1) %>% 
 as.data.frame() %>%
  mutate_if(is.character, as.numeric) %>% 
  rownames_to_column("sample") %>% 
  mutate(mapped=100-unmapped)

preprocess_info <- read_tsv("resources/metagenomics/preprocessing_report.tsv") %>% 
  arrange(match(sample,sample_metadata$sample)) %>% 
  mutate(host_percent=100*(host_reads)/(reads_post_filt)) %>% 
  left_join(., mapping, by=join_by(sample==sample))

preprocess_info <- preprocess_info[match(sample_metadata$sample, preprocess_info$sample), ]

2.1.8 Genome annotations

genome_annotations <- read_tsv("resources/metagenomics/all_annotations.tsv.xz")

2.1.8.1 Genome functions distillation

genome_gifts <- distill(genome_annotations,GIFT_db,genomecol=2,annotcol=c(9,10,19), verbosity=F)
genome_gifts <- genome_gifts[rownames(genome_gifts) %in% genome_counts_filt$genome,]
genome_counts_filt$genome %in% rownames(genome_gifts) 
rownames(genome_gifts) %in% genome_counts_filt$genome

2.1.9 Phyloseq object

#phyloseq object without structural zeros
phylo_samples <- sample_metadata %>% 
  column_to_rownames("sample") %>% 
  sample_data() 

phylo_counts <- genome_counts_filt %>% 
  column_to_rownames("genome") %>% 
  otu_table(., taxa_are_rows = TRUE) 

phylo_taxonomy <- genome_metadata %>% 
  dplyr::select(domain,phylum,class,order,family,genus,species,genome,length) %>%
  column_to_rownames("genome") %>% 
  as.matrix() %>% 
  tax_table() 

tree <- phy_tree(genome_tree)

genome_data <- phyloseq(phylo_counts, phylo_taxonomy, phylo_samples, tree)

2.1.10 Prepare color scheme

phylum_colors <- read_tsv(
  "https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
  mutate_at(vars(phylum), ~ str_replace(., "[dpcofgs]__", "")) %>%
  right_join(genome_metadata, by = join_by(phylum == phylum)) %>%
  arrange(match(genome, genome_tree$tip.label)) %>%
  select(phylum, colors) %>%
  unique() %>%
  arrange(phylum) %>%
  pull(colors, name = phylum)
save(sample_metadata,
  genome_metadata,
  read_counts,
  genome_counts,
  genome_counts_filt,
  preprocess_info,
  genome_tree,
  genome_gifts,
  microbial_fraction,
  phylum_colors,
  file = "resources/metagenomics/data.Rdata")

2.2 Amplicon

2.2.1 Load data

read_counts <- read_tsv("resources/amplicon/ASVs_count_overlap10_nochim0.tsv") %>% 
  dplyr::rename(asv = 1) %>%
  as.data.frame() %>% 
  column_to_rownames(., "asv")

genome_metadata_16S <- read_tsv("resources/amplicon/taxonomy_species_tax_overlap10_nochim0.tsv") %>%
  dplyr::rename(asv = 1) %>%
  as.data.frame() %>% 
  column_to_rownames(., "asv")

metadata <- read_csv("resources/amplicon/metadata.csv")%>%
  filter(Sample %in% colnames(read_counts)) %>%
  tibble::column_to_rownames(., "Sample")

2.2.2 Create a phyloseq object

count_phy <- otu_table(read_counts, taxa_are_rows = T)
sample_info_tab_phy <- sample_data(metadata)
asv_tax.m <- as.matrix(genome_metadata_16S)
TAX = tax_table(asv_tax.m)
physeq.raw = phyloseq(count_phy, TAX, sample_info_tab_phy)
physeq.raw <- prune_taxa(taxa_sums(physeq.raw) > 0, physeq.raw)
physeq.raw <- prune_samples(sample_sums(physeq.raw) > 0, physeq.raw)

physeq_sample <- subset_samples(physeq.raw, Control == "True sample")
physeq_sample <- prune_taxa(taxa_sums(physeq_sample) > 0, physeq_sample)

2.2.3 Remove contaminants

sample_data(physeq.raw)$is.neg <- sample_data(physeq.raw)$Control == "Control sample"
contamdf.prev <- isContaminant(physeq.raw, method = "prevalence", neg = "is.neg")
table(contamdf.prev$contaminant)
contam_asvs <- row.names(contamdf.prev[contamdf.prev$contaminant == TRUE, ])
contaminants <- asv_tax.m[row.names(asv_tax.m) %in% contam_asvs, ]
physeq <- subset_samples(physeq.raw, Control == "True sample")
physeq <- prune_taxa(taxa_sums(physeq) > 0, physeq)
goodTaxa <- setdiff(taxa_names(physeq), contam_asvs)
physeq.final <- prune_taxa(goodTaxa, physeq)

2.2.4 Basic filtering

read_counts_final <- data.frame(physeq.final@otu_table)
count_filtdepth <- read_counts_final[, colSums(read_counts_final) > 1000]
count_phy_filt <- otu_table(count_filtdepth, taxa_are_rows = T)

otu_table(physeq.final) <- count_phy_filt

#taxonomy filtering

physeq.final <- subset_taxa(physeq.final, Kingdom != "Eukaryota")
physeq.final <- prune_taxa(taxa_sums(physeq.final) > 0, physeq.final)

physeq_bacteria <- subset_taxa(physeq.final, Kingdom == "Bacteria")
physeq_others <- subset_taxa(physeq.final, Kingdom != "Bacteria")
physeq_archaea <- subset_taxa(physeq.final, Kingdom == "Archaea")
physeq_phylum <- subset_taxa(physeq_bacteria, !is.na(Phylum))
physeq_phylumNA <- subset_taxa(physeq_bacteria, is.na(Phylum))
physeq_class <- subset_taxa(physeq_phylum, !is.na(Class))
physeq_classNA <- subset_taxa(physeq_phylum, is.na(Class))

2.2.5 Removing chloroplast and mitochondria

taxonomy <- physeq.final@tax_table
is.mitochondria <- physeq.final@tax_table[, "Family"] %in% "Mitochondria"
taxonomy <- taxonomy[!is.mitochondria, ]
is.chloroplast <- taxonomy[, "Order"] %in% "Chloroplast"
taxonomy <- taxonomy[!is.chloroplast, ]
taxonomy.m <- as.matrix(taxonomy)
tax_table(physeq.final) <- taxonomy.m
physeq.final.filt <- prune_taxa(taxa_sums(physeq.final) > 0, physeq.final)

2.2.6 Save

saveRDS(physeq.final.filt,"resources/amplicon/physeq_final_nocopyfilter.RData")

2.3 Merging amplicon and metagenomic data

physeq_16S <- readRDS("resources/amplicon/physeq_final_nocopyfilter.RData")
physeq_16S <- prune_taxa(taxa_sums(physeq_16S)>0, physeq_16S)
#list duplicated data
remov_samples <- c("P19_r1", "P19_r2", "P10_r2", "P10_r1", "E06_r3", "E06_r2", "E15_r1", "E29_r1", "E29_r2", "E37_r2", "E37_r3", "E47_r2", "E50_r1", "H09_r1", "H13_r2", "H39_r1", "P35_r3", "P64_r2", "P64_r3")

asv_16S_filt <- physeq_16S@otu_table %>%
  as.data.frame() %>%
  rename_with(~ sub("^[^.]+\\.", "", .x)) %>%
  select(-any_of(remov_samples)) %>%
  rename_with(~ gsub("_r[1-3]$", "", .x)) %>%
  filter(rowSums(. > 0) != 0) %>%
  rownames_to_column("asv")

#Prepare the samples data and filter
sample_metadata_16S <- data.frame(physeq_16S@sam_data) %>%
  rownames_to_column("sample_id") %>%
  # split sample_id into id + sample, drop id
  separate(sample_id, into = c("id", "sample"), sep = "\\.", extra = "merge", fill = "right") %>%
  select(-id) %>%
  # split Sp_Biome into two columns
  tidyr::separate_wider_delim(Sp_Biome, "_", names = c("Sp", "Biome")) %>%
  # remove unwanted samples
  filter(!sample %in% remov_samples) %>%
  # clean replicate suffixes in sample names
  mutate(sample = gsub("_r[1-3]$", "", sample))

genome_metadata_16S <- physeq_16S@tax_table %>%
  as.data.frame() %>%
  rownames_to_column("asv") %>%
  filter(asv %in% asv_16S_filt$asv)
# save(sample_metadata_16S, 
#      genome_metadata_16S,
#      file = "resources/amplicon/data_withoutcopy.Rdata")

2.3.1 Filtering the metagenomic data to match amplicon data

load("resources/metagenomics/data.Rdata")

#keep samples present in metagenomic study
sample_metadata_16S_filt <- sample_metadata_16S[sample_metadata_16S$sample %in% sample_metadata$sample,]

#keep samples present in 16S study
sample_metadata <- sample_metadata[sample_metadata$sample %in% sample_metadata_16S$sample,]
genome_counts_filt <- genome_counts_filt %>% 
  select(any_of(c("genome",intersect(sample_metadata$sample, colnames(genome_counts_filt))))) %>% 
  filter(rowSums(across(where(is.numeric)))!=0) %>%
  select(genome, where(~ !all(. == 0)))
  
genome_metadata <- genome_metadata %>% 
  filter(genome %in% genome_counts_filt$genome)

genome_metadata <- genome_metadata %>%
  mutate(
    phylum = case_when(
      phylum == "Bacillota_A" ~ "Bacillota",
      phylum == "Bacillota_C" ~ "Bacillota",
      TRUE ~ phylum
    )) %>%
  mutate(
    family = if_else(
      (genus %in% c("Morganella", "Arsenophonus")) & family == "Enterobacteriaceae", "Morganellaceae", family
  )) %>% 
  mutate(
    order = case_when(order == "Enterobacterales_A" ~ "Enterobacterales", TRUE ~ order
    )) %>% 
  mutate(
    family = case_when(family == "Burkholderiaceae_B" ~ "Burkholderiaceae", TRUE ~ family
    )) %>% 
  mutate(
    genus = case_when(
      genus == "Enterococcus_B" ~ "Enterococcus",
      genus == "Fusobacterium_A" ~ "Fusobacterium",
      genus == "Enterococcus_D" ~ "Enterococcus",
      genus == "Citrobacter_A" ~ "Citrobacter",
      TRUE ~ genus
    )) %>% 
  mutate(species = sub("^[^ ]+\\s+", "", species))
 
preprocess_info <- preprocess_info %>% 
  filter(sample %in% sample_metadata$sample)

genome_tree <- keep.tip(genome_tree, tip=genome_metadata$genome) 

genome_gifts <- genome_gifts[rownames(genome_gifts) %in% genome_counts_filt$genome,]
save(sample_metadata, 
     genome_metadata, 
     read_counts, 
     genome_counts,
     genome_counts_filt,
     preprocess_info,
     genome_tree, 
     genome_gifts,
     microbial_fraction,
     phylum_colors,
     file = "resources/metagenomics/data_filtered.Rdata")

2.3.2 Matching the 16S data to metagenomic data

phylo_samples <- sample_metadata_16S_filt %>% 
  remove_rownames() %>% 
  column_to_rownames(., "sample") %>% 
  sample_data() 

phylo_counts <- asv_16S_filt %>% 
  column_to_rownames(., "asv") %>% 
  otu_table(., taxa_are_rows = TRUE) 

phylo_taxonomy <- genome_metadata_16S %>% 
  column_to_rownames(., "asv")%>% 
  as.matrix() %>% 
  tax_table() 

physeq_16S_filt <- phyloseq(phylo_counts,  phylo_samples, phylo_taxonomy)
phyloseq_16S_filtered <- prune_taxa(taxa_sums(physeq_16S_filt)>0, physeq_16S_filt)

sample_metadata <- data.frame(phyloseq_16S_filtered@sam_data)%>% 
  rownames_to_column(., "sample") %>% 
  rename(species_name=Species) %>% 
  rename(Species=Sp) %>%
  mutate(Habitat = ifelse(Habitat == "Natural", "Nat", Habitat)) %>%
  mutate(Habitat = ifelse(Habitat == "Anthropogenic", "Anth", Habitat))

genome_metadata <- data.frame(phyloseq_16S_filtered@tax_table) %>% 
  rownames_to_column(., "genome") %>% 
  mutate(Phylum = case_when(
        Phylum == "Actinobacteriota" ~ "Actinomycetota",
        Phylum == "Firmicutes" ~ "Bacillota",
        Phylum == "Firmicutes_A" ~ "Bacillota_A",
        Phylum == "Firmicutes_C" ~ "Bacillota_C",
        Phylum == "Cyanobacteria" ~ "Cyanobacteriota",
        Phylum == "Proteobacteria" ~ "Pseudomonadota",
        TRUE ~ Phylum))

colnames(genome_metadata)<- c("genome", "domain", "phylum", "class", "order", "family", "genus", "species")
genome_counts_filt <- data.frame(phyloseq_16S_filtered@otu_table) %>% 
  rownames_to_column(., "genome")

## Prepare color scheme
phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
  mutate_at(vars(phylum), ~ str_replace(., "[dpcofgs]__", ""))%>%
    right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_metadata$genome)) %>%
    select(phylum, colors) %>% 
    unique() %>%
    arrange(phylum) %>%
    pull(colors, name=phylum)

taxonomy <- genome_metadata %>% 
  column_to_rownames("genome") %>% 
  as.matrix() %>% 
  tax_table()

sample_data <- sample_metadata %>% 
  column_to_rownames("sample") %>% 
  sample_data()

tax_table(phyloseq_16S_filtered) <- taxonomy

sample_data(phyloseq_16S_filtered) <- sample_data
save(sample_metadata, 
     genome_metadata,
     genome_counts_filt,
     phyloseq_16S_filtered,
     phylum_colors,
     file = "resources/amplicon/data_nocopyfilt.Rdata")

2.3.3 Standard data filtering criteria

load("resources/amplicon/data_nocopyfilt.Rdata")
genome_metadata <- genome_metadata %>%
  filter(!is.na(phylum))

genome_counts_filt_abs <- genome_counts_filt %>%
  mutate(across(-genome,  ~ . / sum(.))) %>%
  mutate(across(-genome, ~ if_else(. < 0.0001, 0, .))) %>% 
  mutate(across(-genome, ~ if_else(. > 0, 1, 0)))

genome_counts_filt <- genome_counts_filt %>%
  column_to_rownames("genome") %>%
  `*`(genome_counts_filt_abs %>%
        column_to_rownames("genome")) %>%
  rownames_to_column("genome") %>% 
  filter(genome %in% genome_metadata$genome) %>% 
  filter(rowSums(across(-genome)) != 0)
  
genome_metadata <- genome_metadata %>%
  filter(genome %in% genome_counts_filt$genome)

phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
  mutate_at(vars(phylum), ~ str_replace(., "[dpcofgs]__", ""))%>%
    right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_metadata$genome)) %>%
    select(phylum, colors) %>% 
    unique() %>%
    arrange(phylum) %>%
    pull(colors, name=phylum)

phyloseq_16S_stand <- prune_taxa(taxa_names(phyloseq_16S_filtered) %in% genome_counts_filt$genome, phyloseq_16S_filtered)
save(sample_metadata, 
     genome_metadata,
     genome_counts_filt,
     phylum_colors,
     phyloseq_16S_stand,
     file = "resources/amplicon/data_standard_filt.Rdata")

2.3.4 Create master table for amplicon data

load("resources/amplicon/data_nocopyfilt.Rdata")
threshold_table <- genome_counts_filt %>% 
  mutate_at(vars(-genome), ~ ./sum(.)) %>%
  pivot_longer(!genome, names_to = "sample",values_to = "t0") %>%
  left_join(sample_metadata %>% select(sample,Species), by="sample") %>%
  mutate(t1 = if_else(t0 < 0.0001, 0, t0),
         t2 = if_else(t0 < 0.001, 0, t0),
         t3 = if_else(t0 < 0.01, 0, t0),
         t4 = if_else(t0 < 0.1, 0, t0)) %>%
  pivot_longer(!c(genome,sample,Species), names_to = "copy_threshold",values_to = "raw_value") %>%
  ungroup()

prevalence_table <- threshold_table %>%
  group_by(Species,genome,copy_threshold) %>% 
  summarise(present = sum(raw_value != 0, na.rm = TRUE),
            total = n()) %>% 
  mutate(prevalence=present/total) %>% 
  select(-present,-total)

master_table <- inner_join(threshold_table,prevalence_table,by=join_by("Species"=="Species","genome"=="genome","copy_threshold"=="copy_threshold")) %>% 
  rename(p0=raw_value) %>% 
  mutate(p10 = if_else(prevalence < 0.1, 0, p0),
         p20 = if_else(prevalence < 0.2, 0, p0),
         p30 = if_else(prevalence < 0.3, 0, p0),
         p40 = if_else(prevalence < 0.4, 0, p0)) %>%
  select(-prevalence) %>% 
  pivot_longer(!c(genome,sample,Species,copy_threshold), names_to = "prevalence",values_to = "value") %>% 
  mutate(pa = if_else(value > 0, 1, 0)) %>%
  group_by(Species, sample, copy_threshold,prevalence) %>%
  mutate(value = value / sum(value)) %>%
  ungroup()
save(master_table, file = "resources/amplicon/master_table.Rdata")