Chapter 2 Prepare data
2.1 Metagenomics
2.1.2 Read counts
## Melt dataframe
counts_raw1 <- counts_raw
colnames(counts_raw1) <- gsub("\\.", " ", colnames(counts_raw1))
molten_table <- counts_raw1 %>%
pivot_longer(!genome,
names_to = c("sample", ".value"),
names_sep = " ") %>%
dplyr::rename("read_count" = "Read") %>%
dplyr::rename("covered_fraction" = "Covered") %>%
dplyr::rename("MAG_length" = "Length")
read_counts <- molten_table %>%
select(genome, sample, read_count) %>%
pivot_wider(names_from = "sample", values_from = "read_count")%>%
column_to_rownames(., "genome") %>%
filter(rowSums(. != 0, na.rm = TRUE) > 0) %>%
select_if(~!all(. == 0))%>%
rownames_to_column(., "genome")smf <- read_tsv("resources/metagenomics/microbial_fraction_estimation.tsv") %>%
mutate(sample = str_remove(sample, "_M_1")) %>%
dplyr::rename(metagenome_bases=metagenome_size)
preprocessing <- read_tsv("resources/metagenomics/preprocessing_report.tsv") %>%
mutate(host_bases=bases_post_filt-metagenomic_bases) %>%
mutate(lowqual_bases=bases_pre_filt-bases_post_filt)
microbial_fraction <- molten_table %>%
select(genome, sample, read_count) %>%
group_by(sample) %>%
summarise(mapped_bases=sum(read_count*146)) %>%
inner_join(smf, by="sample") %>%
inner_join(preprocessing %>% select(sample,lowqual_bases,host_bases), by="sample") %>%
mutate(unmapped_bases=metagenome_bases-mapped_bases) %>%
mutate(estimated_microbial_fraction=bacterial_archaeal_bases/metagenome_bases*100) %>%
mutate(mapped_microbial_fraction=mapped_bases/metagenome_bases*100) %>%
select(sample,lowqual_bases,host_bases,metagenome_bases,mapped_bases,unmapped_bases,bacterial_archaeal_bases,estimated_microbial_fraction,mapped_microbial_fraction)2.1.3 Genome counts
genome_counts_filt <- molten_table %>%
mutate(read_count = if_else(covered_fraction < 0.3, 0, read_count)) %>%
mutate(read_count = read_count*150/MAG_length) %>%
select(genome, sample, read_count) %>%
pivot_wider(names_from = sample, values_from = read_count) %>%
rename(genome=1)%>%
column_to_rownames(., "genome") %>%
filter(rowSums(. != 0, na.rm = TRUE) > 0) %>%
select_if(~!all(. == 0)) %>%
rownames_to_column(., "genome")
readlength=150
genome_counts <- read_counts %>%
arrange(match(genome,genomes_length$genome))%>%
mutate(across(where(is.numeric), ~ . / (genomes_length$length / readlength) ))%>%
column_to_rownames(., "genome") %>%
filter(rowSums(. != 0, na.rm = TRUE) > 0) %>%
select_if(~!all(. == 0)) %>%
rownames_to_column(., "genome")2.1.5 Genome metadata
Unify the taxonomy
genome_metadata <- read_tsv("resources/metagenomics/all_together_gtdbtk.tsv") %>%
rename(genome=1) %>%
mutate(genome = str_replace(genome, "\\.fa$", "")) %>%
semi_join(., genome_counts_filt, by = "genome") %>%
arrange(match(genome,genome_counts_filt$genome)) %>%
mutate(classification = str_replace_all(classification, ".__", "")) %>%
separate(col = classification, sep = ";", into = c("domain", "phylum", "class", "order", "family", "genus", "species"))%>%
left_join(., genomes_length, by=join_by(genome==genome))%>%
mutate(phylum = case_when(
phylum == "Bacillota_A" ~ "Bacillota",
phylum == "Bacillota_C" ~ "Bacillota",
TRUE ~ phylum))
# mutate(phylum = case_when(
# phylum == "Actinobacteriota" ~ "Actinomycetota",
# phylum == "Firmicutes" ~ "Bacillota",
# phylum == "Firmicutes_A" ~ "Bacillota_A",
# phylum == "Firmicutes_C" ~ "Bacillota_C",
# phylum == "Cyanobacteria" ~ "Cyanobacteriota",
# phylum == "Proteobacteria" ~ "Pseudomonadota",
# TRUE ~ phylum))2.1.7 Preprocessing info
unmapped <- read_tsv("resources/metagenomics/all_species_mapping_rate.txt")
mapping <- unmapped[c(1),] %>%
t() %>%
row_to_names(row_number = 1) %>%
as.data.frame() %>%
mutate_if(is.character, as.numeric) %>%
rownames_to_column("sample") %>%
mutate(mapped=100-unmapped)
preprocess_info <- read_tsv("resources/metagenomics/preprocessing_report.tsv") %>%
arrange(match(sample,sample_metadata$sample)) %>%
mutate(host_percent=100*(host_reads)/(reads_post_filt)) %>%
left_join(., mapping, by=join_by(sample==sample))
preprocess_info <- preprocess_info[match(sample_metadata$sample, preprocess_info$sample), ]2.1.9 Phyloseq object
#phyloseq object without structural zeros
phylo_samples <- sample_metadata %>%
column_to_rownames("sample") %>%
sample_data()
phylo_counts <- genome_counts_filt %>%
column_to_rownames("genome") %>%
otu_table(., taxa_are_rows = TRUE)
phylo_taxonomy <- genome_metadata %>%
dplyr::select(domain,phylum,class,order,family,genus,species,genome,length) %>%
column_to_rownames("genome") %>%
as.matrix() %>%
tax_table()
tree <- phy_tree(genome_tree)
genome_data <- phyloseq(phylo_counts, phylo_taxonomy, phylo_samples, tree)2.1.10 Prepare color scheme
phylum_colors <- read_tsv(
"https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
mutate_at(vars(phylum), ~ str_replace(., "[dpcofgs]__", "")) %>%
right_join(genome_metadata, by = join_by(phylum == phylum)) %>%
arrange(match(genome, genome_tree$tip.label)) %>%
select(phylum, colors) %>%
unique() %>%
arrange(phylum) %>%
pull(colors, name = phylum)2.2 Amplicon
2.2.1 Load data
read_counts <- read_tsv("resources/amplicon/ASVs_count_overlap10_nochim0.tsv") %>%
dplyr::rename(asv = 1) %>%
as.data.frame() %>%
column_to_rownames(., "asv")
genome_metadata_16S <- read_tsv("resources/amplicon/taxonomy_species_tax_overlap10_nochim0.tsv") %>%
dplyr::rename(asv = 1) %>%
as.data.frame() %>%
column_to_rownames(., "asv")
metadata <- read_csv("resources/amplicon/metadata.csv")%>%
filter(Sample %in% colnames(read_counts)) %>%
tibble::column_to_rownames(., "Sample")2.2.2 Create a phyloseq object
count_phy <- otu_table(read_counts, taxa_are_rows = T)
sample_info_tab_phy <- sample_data(metadata)
asv_tax.m <- as.matrix(genome_metadata_16S)
TAX = tax_table(asv_tax.m)
physeq.raw = phyloseq(count_phy, TAX, sample_info_tab_phy)
physeq.raw <- prune_taxa(taxa_sums(physeq.raw) > 0, physeq.raw)
physeq.raw <- prune_samples(sample_sums(physeq.raw) > 0, physeq.raw)
physeq_sample <- subset_samples(physeq.raw, Control == "True sample")
physeq_sample <- prune_taxa(taxa_sums(physeq_sample) > 0, physeq_sample)2.2.3 Remove contaminants
sample_data(physeq.raw)$is.neg <- sample_data(physeq.raw)$Control == "Control sample"
contamdf.prev <- isContaminant(physeq.raw, method = "prevalence", neg = "is.neg")
table(contamdf.prev$contaminant)
contam_asvs <- row.names(contamdf.prev[contamdf.prev$contaminant == TRUE, ])
contaminants <- asv_tax.m[row.names(asv_tax.m) %in% contam_asvs, ]2.2.4 Basic filtering
read_counts_final <- data.frame(physeq.final@otu_table)
count_filtdepth <- read_counts_final[, colSums(read_counts_final) > 1000]
count_phy_filt <- otu_table(count_filtdepth, taxa_are_rows = T)
otu_table(physeq.final) <- count_phy_filt
#taxonomy filtering
physeq.final <- subset_taxa(physeq.final, Kingdom != "Eukaryota")
physeq.final <- prune_taxa(taxa_sums(physeq.final) > 0, physeq.final)
physeq_bacteria <- subset_taxa(physeq.final, Kingdom == "Bacteria")
physeq_others <- subset_taxa(physeq.final, Kingdom != "Bacteria")
physeq_archaea <- subset_taxa(physeq.final, Kingdom == "Archaea")
physeq_phylum <- subset_taxa(physeq_bacteria, !is.na(Phylum))
physeq_phylumNA <- subset_taxa(physeq_bacteria, is.na(Phylum))
physeq_class <- subset_taxa(physeq_phylum, !is.na(Class))
physeq_classNA <- subset_taxa(physeq_phylum, is.na(Class))2.2.5 Removing chloroplast and mitochondria
taxonomy <- physeq.final@tax_table
is.mitochondria <- physeq.final@tax_table[, "Family"] %in% "Mitochondria"
taxonomy <- taxonomy[!is.mitochondria, ]
is.chloroplast <- taxonomy[, "Order"] %in% "Chloroplast"
taxonomy <- taxonomy[!is.chloroplast, ]
taxonomy.m <- as.matrix(taxonomy)
tax_table(physeq.final) <- taxonomy.m
physeq.final.filt <- prune_taxa(taxa_sums(physeq.final) > 0, physeq.final)2.3 Merging amplicon and metagenomic data
physeq_16S <- readRDS("resources/amplicon/physeq_final_nocopyfilter.RData")
physeq_16S <- prune_taxa(taxa_sums(physeq_16S)>0, physeq_16S)#list duplicated data
remov_samples <- c("P19_r1", "P19_r2", "P10_r2", "P10_r1", "E06_r3", "E06_r2", "E15_r1", "E29_r1", "E29_r2", "E37_r2", "E37_r3", "E47_r2", "E50_r1", "H09_r1", "H13_r2", "H39_r1", "P35_r3", "P64_r2", "P64_r3")
asv_16S_filt <- physeq_16S@otu_table %>%
as.data.frame() %>%
rename_with(~ sub("^[^.]+\\.", "", .x)) %>%
select(-any_of(remov_samples)) %>%
rename_with(~ gsub("_r[1-3]$", "", .x)) %>%
filter(rowSums(. > 0) != 0) %>%
rownames_to_column("asv")
#Prepare the samples data and filter
sample_metadata_16S <- data.frame(physeq_16S@sam_data) %>%
rownames_to_column("sample_id") %>%
# split sample_id into id + sample, drop id
separate(sample_id, into = c("id", "sample"), sep = "\\.", extra = "merge", fill = "right") %>%
select(-id) %>%
# split Sp_Biome into two columns
tidyr::separate_wider_delim(Sp_Biome, "_", names = c("Sp", "Biome")) %>%
# remove unwanted samples
filter(!sample %in% remov_samples) %>%
# clean replicate suffixes in sample names
mutate(sample = gsub("_r[1-3]$", "", sample))
genome_metadata_16S <- physeq_16S@tax_table %>%
as.data.frame() %>%
rownames_to_column("asv") %>%
filter(asv %in% asv_16S_filt$asv)# save(sample_metadata_16S,
# genome_metadata_16S,
# file = "resources/amplicon/data_withoutcopy.Rdata")2.3.1 Filtering the metagenomic data to match amplicon data
load("resources/metagenomics/data.Rdata")
#keep samples present in metagenomic study
sample_metadata_16S_filt <- sample_metadata_16S[sample_metadata_16S$sample %in% sample_metadata$sample,]
#keep samples present in 16S study
sample_metadata <- sample_metadata[sample_metadata$sample %in% sample_metadata_16S$sample,]genome_counts_filt <- genome_counts_filt %>%
select(any_of(c("genome",intersect(sample_metadata$sample, colnames(genome_counts_filt))))) %>%
filter(rowSums(across(where(is.numeric)))!=0) %>%
select(genome, where(~ !all(. == 0)))
genome_metadata <- genome_metadata %>%
filter(genome %in% genome_counts_filt$genome)
genome_metadata <- genome_metadata %>%
mutate(
phylum = case_when(
phylum == "Bacillota_A" ~ "Bacillota",
phylum == "Bacillota_C" ~ "Bacillota",
TRUE ~ phylum
)) %>%
mutate(
family = if_else(
(genus %in% c("Morganella", "Arsenophonus")) & family == "Enterobacteriaceae", "Morganellaceae", family
)) %>%
mutate(
order = case_when(order == "Enterobacterales_A" ~ "Enterobacterales", TRUE ~ order
)) %>%
mutate(
family = case_when(family == "Burkholderiaceae_B" ~ "Burkholderiaceae", TRUE ~ family
)) %>%
mutate(
genus = case_when(
genus == "Enterococcus_B" ~ "Enterococcus",
genus == "Fusobacterium_A" ~ "Fusobacterium",
genus == "Enterococcus_D" ~ "Enterococcus",
genus == "Citrobacter_A" ~ "Citrobacter",
TRUE ~ genus
)) %>%
mutate(species = sub("^[^ ]+\\s+", "", species))
preprocess_info <- preprocess_info %>%
filter(sample %in% sample_metadata$sample)
genome_tree <- keep.tip(genome_tree, tip=genome_metadata$genome)
genome_gifts <- genome_gifts[rownames(genome_gifts) %in% genome_counts_filt$genome,]2.3.2 Matching the 16S data to metagenomic data
phylo_samples <- sample_metadata_16S_filt %>%
remove_rownames() %>%
column_to_rownames(., "sample") %>%
sample_data()
phylo_counts <- asv_16S_filt %>%
column_to_rownames(., "asv") %>%
otu_table(., taxa_are_rows = TRUE)
phylo_taxonomy <- genome_metadata_16S %>%
column_to_rownames(., "asv")%>%
as.matrix() %>%
tax_table()
physeq_16S_filt <- phyloseq(phylo_counts, phylo_samples, phylo_taxonomy)
phyloseq_16S_filtered <- prune_taxa(taxa_sums(physeq_16S_filt)>0, physeq_16S_filt)
sample_metadata <- data.frame(phyloseq_16S_filtered@sam_data)%>%
rownames_to_column(., "sample") %>%
rename(species_name=Species) %>%
rename(Species=Sp) %>%
mutate(Habitat = ifelse(Habitat == "Natural", "Nat", Habitat)) %>%
mutate(Habitat = ifelse(Habitat == "Anthropogenic", "Anth", Habitat))
genome_metadata <- data.frame(phyloseq_16S_filtered@tax_table) %>%
rownames_to_column(., "genome") %>%
mutate(Phylum = case_when(
Phylum == "Actinobacteriota" ~ "Actinomycetota",
Phylum == "Firmicutes" ~ "Bacillota",
Phylum == "Firmicutes_A" ~ "Bacillota_A",
Phylum == "Firmicutes_C" ~ "Bacillota_C",
Phylum == "Cyanobacteria" ~ "Cyanobacteriota",
Phylum == "Proteobacteria" ~ "Pseudomonadota",
TRUE ~ Phylum))
colnames(genome_metadata)<- c("genome", "domain", "phylum", "class", "order", "family", "genus", "species")
genome_counts_filt <- data.frame(phyloseq_16S_filtered@otu_table) %>%
rownames_to_column(., "genome")
## Prepare color scheme
phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
mutate_at(vars(phylum), ~ str_replace(., "[dpcofgs]__", ""))%>%
right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
arrange(match(genome, genome_metadata$genome)) %>%
select(phylum, colors) %>%
unique() %>%
arrange(phylum) %>%
pull(colors, name=phylum)
taxonomy <- genome_metadata %>%
column_to_rownames("genome") %>%
as.matrix() %>%
tax_table()
sample_data <- sample_metadata %>%
column_to_rownames("sample") %>%
sample_data()
tax_table(phyloseq_16S_filtered) <- taxonomy
sample_data(phyloseq_16S_filtered) <- sample_data2.3.3 Standard data filtering criteria
genome_metadata <- genome_metadata %>%
filter(!is.na(phylum))
genome_counts_filt_abs <- genome_counts_filt %>%
mutate(across(-genome, ~ . / sum(.))) %>%
mutate(across(-genome, ~ if_else(. < 0.0001, 0, .))) %>%
mutate(across(-genome, ~ if_else(. > 0, 1, 0)))
genome_counts_filt <- genome_counts_filt %>%
column_to_rownames("genome") %>%
`*`(genome_counts_filt_abs %>%
column_to_rownames("genome")) %>%
rownames_to_column("genome") %>%
filter(genome %in% genome_metadata$genome) %>%
filter(rowSums(across(-genome)) != 0)
genome_metadata <- genome_metadata %>%
filter(genome %in% genome_counts_filt$genome)
phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
mutate_at(vars(phylum), ~ str_replace(., "[dpcofgs]__", ""))%>%
right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
arrange(match(genome, genome_metadata$genome)) %>%
select(phylum, colors) %>%
unique() %>%
arrange(phylum) %>%
pull(colors, name=phylum)
phyloseq_16S_stand <- prune_taxa(taxa_names(phyloseq_16S_filtered) %in% genome_counts_filt$genome, phyloseq_16S_filtered)2.3.4 Create master table for amplicon data
threshold_table <- genome_counts_filt %>%
mutate_at(vars(-genome), ~ ./sum(.)) %>%
pivot_longer(!genome, names_to = "sample",values_to = "t0") %>%
left_join(sample_metadata %>% select(sample,Species), by="sample") %>%
mutate(t1 = if_else(t0 < 0.0001, 0, t0),
t2 = if_else(t0 < 0.001, 0, t0),
t3 = if_else(t0 < 0.01, 0, t0),
t4 = if_else(t0 < 0.1, 0, t0)) %>%
pivot_longer(!c(genome,sample,Species), names_to = "copy_threshold",values_to = "raw_value") %>%
ungroup()
prevalence_table <- threshold_table %>%
group_by(Species,genome,copy_threshold) %>%
summarise(present = sum(raw_value != 0, na.rm = TRUE),
total = n()) %>%
mutate(prevalence=present/total) %>%
select(-present,-total)
master_table <- inner_join(threshold_table,prevalence_table,by=join_by("Species"=="Species","genome"=="genome","copy_threshold"=="copy_threshold")) %>%
rename(p0=raw_value) %>%
mutate(p10 = if_else(prevalence < 0.1, 0, p0),
p20 = if_else(prevalence < 0.2, 0, p0),
p30 = if_else(prevalence < 0.3, 0, p0),
p40 = if_else(prevalence < 0.4, 0, p0)) %>%
select(-prevalence) %>%
pivot_longer(!c(genome,sample,Species,copy_threshold), names_to = "prevalence",values_to = "value") %>%
mutate(pa = if_else(value > 0, 1, 0)) %>%
group_by(Species, sample, copy_threshold,prevalence) %>%
mutate(value = value / sum(value)) %>%
ungroup()