Chapter 3 Preliminary visualizations
3.1 Filtering and normalization
# Create edgeR DGEList
dge <- DGEList(counts = read_counts)
# Filter lowly expressed genes (keep genes with CPM>1 in at least 2 samples)
keep <- rowSums(cpm(dge) > 1) >= 2
dge <- dge[keep, , keep.lib.sizes = FALSE]
# Normalize for library size (TMM normalization)
dge <- calcNormFactors(dge)
# Log2 CPM transformation
logCPM <- cpm(dge, log = TRUE, prior.count = 1) # log2(CPM + 1)3.2 Preliminary visualizations
3.2.1 Library size plot
In this plot we see a fold difference of 3-4X between the samples with the lowest number of reads and the samples with the highest number of reads. Most of the difference is between the 3 h samples (higher expression, exponential growth curve) and 24 h samples (lower expression, stationary phase). At 24 h DREX protocol seems yield more RNA than most of Zymo samples. Goethite samples at 3 h -(except DREX sample) have less reads than the rest of 3 h samples, which agrees with the concentrations calculated with Qubit. We expect some cell death in goethite at 3 h.
# Total reads per sample (wihtout filtering and normalizing)
lib_sizes <- colSums(read_counts)
par(mgp = c(3.4, 1, 0))
barplot(lib_sizes,
las = 2,
cex.axis = 0.7,
ylab = "Total reads",
main = "Library sizes per sample")3.2.2 Box plots of transformed counts
Most of the samples processed with DREX protocol show narrower distributions (reads are more evenly distributed across genes), is this good or bad?
boxplot(logCPM,
las = 2,
outline = FALSE,
ylab = "log2 CPM",
main = "Distribution of log2-CPM per sample")3.2.3 Hierarchical clustering
The clustering separates each mineral sample quite good at 3 h, but not that good at 24 h (except for goethite)
distance_mat <- as.dist(1 - cor(logCPM)) # Pearson correlation distance
hclust_res <- hclust(distance_mat, method = "complete")
plot(hclust_res, labels = colnames(logCPM), main = "Hierarchical Clustering (log2 CPM)")3.2.4 Sample-sample distance heatmap
Distance calculation is the same as the previous one but here with all the metadata and pairwise distances visible
heatmap_metadata <- sample_metadata %>%
dplyr::select(`Qubit conc ng/uL`, Extraction, timepoint, mineral_code)
sample_dist <- as.matrix(distance_mat)
pheatmap(sample_dist,
# cluster_rows = hclust_res,
# cluster_cols = hclust_res,
annotation_col = heatmap_metadata,
main = "Sample–sample distance heatmap",
clustering_distance_cols = "correlation",
clustering_distance_rows = "correlation")3.2.5 PCA
In the PCA there are some interesting observations:
The first PC explains most of the variance in the samples and it clearly separates the samples at 3 h and 24 h for all the minerals and RNA extraction protocols.
PC2 captures some of the variance between samples processed with DREX at 24 h. It seems for samples at 3 h the extraction protocol did not have that effect.
PC2 and especially PC3 caputres some of the variance that differentiates the goethite samples at 3 h (maybe stress response?)
pca_res <- prcomp(t(logCPM)) # samples as rows
pca_scores <- as.data.frame(pca_res$x) %>%
tibble::rownames_to_column("sample_pca")
pca_metadata <- sample_metadata %>%
tibble::rownames_to_column("sample_pca")
pca_df <- pca_scores %>%
dplyr::left_join(pca_metadata, by = "sample_pca")
# Calculate percent of variance explained
# Variance explained per PC
pca_var <- pca_res$sdev^2
pca_var_frac <- pca_var / sum(pca_var)
# Percent variance explained
pca_var_pct <- round(100 * pca_var_frac, 1)
ggplot(pca_df, aes(x = PC1, y = PC2,
color = mineral_code,
shape = timepoint)) +
geom_point(size = 3) +
geom_text(aes(label = sample_pca), vjust = -0.7, size = 3) +
theme_bw() +
labs(
title = "PCA of log2-CPM counts",
x = paste0("PC1 (", pca_var_pct[1], "%)"),
y = paste0("PC2 (", pca_var_pct[2], "%)")
)ggplot(pca_df, aes(x = PC1, y = PC3,
color = mineral_code,
shape = timepoint)) +
geom_point(size = 3) +
geom_text(aes(label = sample_pca), vjust = -0.7, size = 3) +
theme_bw() +
labs(
title = "PCA of log2-CPM counts",
x = paste0("PC1 (", pca_var_pct[1], "%)"),
y = paste0("PC3 (", pca_var_pct[3], "%)")
)ggplot(pca_df, aes(x = PC2, y = PC3,
color = mineral_code,
shape = timepoint)) +
geom_point(size = 3) +
geom_text(aes(label = sample_pca), vjust = -0.7, size = 3) +
theme_bw() +
labs(
title = "PCA of log2-CPM counts",
x = paste0("PC2 (", pca_var_pct[2], "%)"),
y = paste0("PC3 (", pca_var_pct[3], "%)")
)pca_var_df <- data.frame(
PC = factor(paste0("PC", seq_along(pca_var_pct)),
levels = paste0("PC", seq_along(pca_var_pct))),
variance = pca_var_pct
)
ggplot(pca_var_df[1:15, ], aes(x = PC, y = variance)) +
geom_col(fill = "steelblue") +
geom_line(aes(group = 1)) +
geom_point() +
theme_bw() +
labs(
title = "Variance explained by principal components",
y = "Variance explained (%)",
x = ""
)pca_var_df$cumulative <- cumsum(pca_var_df$variance)
ggplot(pca_var_df[1:20, ], aes(x = PC, y = cumulative)) +
geom_line(group = 1) +
geom_point() +
theme_bw() +
labs(
title = "Cumulative variance explained",
y = "Cumulative variance (%)",
x = ""
)3.2.6 PERMANOVA and dispersion test
Here we do PERMANOVA using the option “terms” that adds predictors one at a time, in order, and seeing how much extra variation each of them adds.
y ~ timepoint * mineral is the same as y ~ timepoint + mineral + timepoint:mineral
# Distance matrix
dist_mat <- as.dist(1 - cor(logCPM, method = "pearson"))
# by = "terms" means adonis2 tests terms in order, like Type I sums of squares in ANOVA. The differences between terms and margin are not very important when you have balanced samples like we have here.
# Permanova
perm_res <- adonis2(
dist_mat ~ timepoint * mineral_code,
data = sample_metadata,
permutations = 999,
by = "terms"
)
# Nice table
perm_table <- as.data.frame(perm_res) %>%
rownames_to_column("term")
perm_table term Df SumOfSqs R2 F Pr(>F)
1 timepoint 1 0.46814880 0.55480841 95.781150 0.001
2 mineral_code 3 0.15978610 0.18936431 10.897174 0.001
3 timepoint:mineral_code 3 0.05946157 0.07046858 4.055191 0.003
4 Residual 32 0.15640616 0.18535870 NA NA
5 Total 39 0.84380264 1.00000000 NA NA
The PERMANOVA on sample–sample expression distances reveals a significant effect of the mineral (R² = 0.12, p = 0.014) and a strong effect of timepoint × mineral interaction (R² = 0.41, p = 0.001), while the main effect of timepoint alone was not significant (R² = 0.03, p = 0.119). The timepoint not being significant means that the 3h and 24h variation is different for different minerals? They were clearly separated in the PCA. Does it mean that is important but only when accounting for the interaction (considering the R2)? Conclusion of the permanova is that it suggests mineral-associated transcriptional responses differ substantially between 3h and 24h.
# Create combined group factor: timepoint:mineral
sample_metadata <- sample_metadata %>%
mutate(group = interaction(timepoint, mineral_code, drop = TRUE))
bd_group <- betadisper(dist_mat, sample_metadata$group)
disp_group <- anova(bd_group)
bd_time <- betadisper(dist_mat, sample_metadata$timepoint)
bd_mineral <- betadisper(dist_mat, sample_metadata$mineral_code)
disp_time <- anova(bd_time)
disp_mineral <- anova(bd_mineral)
disp_table <- bind_rows(
data.frame(test = "Dispersion: timepoint",
F = disp_time$`F value`[1],
p = disp_time$`Pr(>F)`[1]),
data.frame(test = "Dispersion: mineral",
F = disp_mineral$`F value`[1],
p = disp_mineral$`Pr(>F)`[1]),
data.frame(test = "Dispersion: timepoint:mineral",
F = disp_group$`F value`[1],
p = disp_group$`Pr(>F)`[1])
) %>%
mutate(
F = round(F, 3),
p = signif(p, 3)
)
disp_table test F p
1 Dispersion: timepoint 7.091 0.0113
2 Dispersion: mineral 0.663 0.5800
3 Dispersion: timepoint:mineral 1.460 0.2170
final_table <- perm_table %>%
dplyr::select(term, Df, F, R2, `Pr(>F)`) %>%
dplyr::rename(p = `Pr(>F)`)
final_table term Df F R2 p
1 timepoint 1 95.781150 0.55480841 0.001
2 mineral_code 3 10.897174 0.18936431 0.001
3 timepoint:mineral_code 3 4.055191 0.07046858 0.003
4 Residual 32 NA 0.18535870 NA
5 Total 39 NA 1.00000000 NA
The 8 timepoint*mieneral groups do NOT differ significantly in their spread, so the permanova interaction is not likely an artifact of dispersion differences. 3h and 24h have similar within group variability and minerals do not differ in dispersion either. This means that we can trust the PERMANOVA.
3.3 Preliminary visualizations wo DREX group
We plot some of the same visualizations after removing the DREX samples.
3.3.1 Filtering DREX group, filtering and normalization
# Create edgeR DGEList
dge <- DGEList(counts = counts_wo_drex)
# Filter lowly expressed genes (keep genes with CPM>1 in at least 2 samples)
keep <- rowSums(cpm(dge) > 1) >= 2
dge <- dge[keep, , keep.lib.sizes = FALSE]
# Normalize for library size (TMM normalization)
dge <- calcNormFactors(dge)
# Log2 CPM transformation
logCPM <- cpm(dge, log = TRUE, prior.count = 1) # log2(CPM + 1)3.3.2 Hierarchical clustering
distance_mat <- as.dist(1 - cor(logCPM)) # Pearson correlation distance
hclust_res <- hclust(distance_mat, method = "complete")
plot(hclust_res, labels = colnames(logCPM), main = "Hierarchical Clustering (log2 CPM)")3.3.3 Sample-sample distance heatmap
heatmap_metadata <- metadata_wo_drex %>%
dplyr::select(`Qubit conc ng/uL`, Extraction, timepoint, mineral_code)
sample_dist <- as.matrix(distance_mat)
pheatmap(sample_dist,
# cluster_rows = hclust_res,
# cluster_cols = hclust_res,
annotation_col = heatmap_metadata,
main = "Sample–sample distance heatmap",
clustering_distance_cols = "correlation",
clustering_distance_rows = "correlation")3.3.4 PCA
Observations after removing the DREX samples:
PC1 keeps capturing the variance related to the timepoint of sample collection.
PC2 clearly separates the goetihte smaples at 3 h and a little bit at 24 h, some of the expression pattern in goethite is quite different at the beginning and the difference start fading after 24 h.
PC3 separates the goethite samples at 24 h but not at 3 h.
PC4 separates the controls at 24 h
pca_res <- prcomp(t(logCPM)) # samples as rows
pca_scores <- as.data.frame(pca_res$x) %>%
tibble::rownames_to_column("sample_pca")
pca_metadata <- metadata_wo_drex %>%
tibble::rownames_to_column("sample_pca")
pca_df <- pca_scores %>%
dplyr::left_join(pca_metadata, by = "sample_pca")
# Calculate percent of variance explained
# Variance explained per PC
pca_var <- pca_res$sdev^2
pca_var_frac <- pca_var / sum(pca_var)
# Percent variance explained
pca_var_pct <- round(100 * pca_var_frac, 1)
ggplot(pca_df, aes(x = PC1, y = PC2,
color = mineral_code,
shape = timepoint)) +
geom_point(size = 3) +
geom_text(aes(label = sample_pca), vjust = -0.7, size = 3) +
theme_bw() +
labs(
title = "PCA of log2-CPM counts",
x = paste0("PC1 (", pca_var_pct[1], "%)"),
y = paste0("PC2 (", pca_var_pct[2], "%)")
)ggplot(pca_df, aes(x = PC1, y = PC3,
color = mineral_code,
shape = timepoint)) +
geom_point(size = 3) +
geom_text(aes(label = sample_pca), vjust = -0.7, size = 3) +
theme_bw() +
labs(
title = "PCA of log2-CPM counts",
x = paste0("PC1 (", pca_var_pct[1], "%)"),
y = paste0("PC3 (", pca_var_pct[3], "%)")
)ggplot(pca_df, aes(x = PC2, y = PC3,
color = mineral_code,
shape = timepoint)) +
geom_point(size = 3) +
geom_text(aes(label = sample_pca), vjust = -0.7, size = 3) +
theme_bw() +
labs(
title = "PCA of log2-CPM counts",
x = paste0("PC2 (", pca_var_pct[2], "%)"),
y = paste0("PC3 (", pca_var_pct[3], "%)")
)ggplot(pca_df, aes(x = PC3, y = PC4,
color = mineral_code,
shape = timepoint)) +
geom_point(size = 3) +
geom_text(aes(label = sample_pca), vjust = -0.7, size = 3) +
theme_bw() +
labs(
title = "PCA of log2-CPM counts",
x = paste0("PC3 (", pca_var_pct[3], "%)"),
y = paste0("PC4 (", pca_var_pct[4], "%)")
)pca_var_df <- data.frame(
PC = factor(paste0("PC", seq_along(pca_var_pct)),
levels = paste0("PC", seq_along(pca_var_pct))),
variance = pca_var_pct
)
ggplot(pca_var_df[1:15, ], aes(x = PC, y = variance)) +
geom_col(fill = "steelblue") +
geom_line(aes(group = 1)) +
geom_point() +
theme_bw() +
labs(
title = "Variance explained by principal components",
y = "Variance explained (%)",
x = ""
)pca_var_df$cumulative <- cumsum(pca_var_df$variance)
ggplot(pca_var_df[1:20, ], aes(x = PC, y = cumulative)) +
geom_line(group = 1) +
geom_point() +
theme_bw() +
labs(
title = "Cumulative variance explained",
y = "Cumulative variance (%)",
x = ""
)3.3.5 PERMANOVA and dispersion test
# Create edgeR DGEList
dge <- DGEList(counts = counts_wo_drex)
# Filter lowly expressed genes (keep genes with CPM>1 in at least 2 samples)
keep <- rowSums(cpm(dge) > 1) >= 2
dge <- dge[keep, , keep.lib.sizes = FALSE]
# Normalize for library size (TMM normalization)
dge <- calcNormFactors(dge)
# Log2 CPM transformation
logCPM <- cpm(dge, log = TRUE, prior.count = 1) # log2(CPM + 1)
# Distance matrix
dist_mat <- as.dist(1 - cor(logCPM, method = "pearson"))
# by = "terms" means adonis2 tests terms in order, like Type I sums of squares in ANOVA
# Permanova
perm_res <- adonis2(
dist_mat ~ timepoint * mineral_code,
data = metadata_wo_drex,
permutations = 999,
by = "terms"
)
# Nice table
perm_table <- as.data.frame(perm_res) %>%
rownames_to_column("term")
perm_table term Df SumOfSqs R2 F Pr(>F)
1 timepoint 1 0.37113609 0.65935109 332.457894 0.001
2 mineral_code 3 0.13154087 0.23369223 39.277419 0.001
3 timepoint:mineral_code 3 0.03341171 0.05935840 9.976561 0.001
4 Residual 24 0.02679216 0.04759829 NA NA
5 Total 31 0.56288083 1.00000000 NA NA
Here we get similar results as when including the DREX samples. The tiempoint by itself does not show an effect in the PERMANOVA (R² = 0.03, p = 0.156), we observe effect from the mineral when considering alredy timepoint in the model (R² = 0.14, p = 0.031) and again a strong timepoint × mineral interaction (R² = 0.44, p = 0.001).
# Create combined group factor: timepoint:mineral
metadata_wo_drex <- metadata_wo_drex %>%
mutate(group = interaction(timepoint, mineral_code, drop = TRUE))
bd_group <- betadisper(dist_mat, metadata_wo_drex$group)
disp_group <- anova(bd_group)
bd_time <- betadisper(dist_mat, metadata_wo_drex$timepoint)
bd_mineral <- betadisper(dist_mat, metadata_wo_drex$mineral_code)
disp_time <- anova(bd_time)
disp_mineral <- anova(bd_mineral)
disp_table <- bind_rows(
data.frame(test = "Dispersion: timepoint",
F = disp_time$`F value`[1],
p = disp_time$`Pr(>F)`[1]),
data.frame(test = "Dispersion: mineral",
F = disp_mineral$`F value`[1],
p = disp_mineral$`Pr(>F)`[1]),
data.frame(test = "Dispersion: timepoint:mineral",
F = disp_group$`F value`[1],
p = disp_group$`Pr(>F)`[1])
) %>%
mutate(
F = round(F, 3),
p = signif(p, 3)
)
disp_table test F p
1 Dispersion: timepoint 4.764 0.0370
2 Dispersion: mineral 0.370 0.7750
3 Dispersion: timepoint:mineral 2.609 0.0374
final_table <- perm_table %>%
dplyr::select(term, Df, F, R2, `Pr(>F)`) %>%
dplyr::rename(p = `Pr(>F)`)
final_table term Df F R2 p
1 timepoint 1 332.457894 0.65935109 0.001
2 mineral_code 3 39.277419 0.23369223 0.001
3 timepoint:mineral_code 3 9.976561 0.05935840 0.001
4 Residual 24 NA 0.04759829 NA
5 Total 31 NA 1.00000000 NA
Dispersion also remained being not significant and so again the PERMANOVA interaction is not likely an artifact of dispersion differences. In the boxplot we see that after removing the DREX group the variations within groups decreased a little bit.
3.4 Other PCAs
3.4.1 Samples at 3h
Here we check PCAs information by subsetting the data in different groups. First we look at PCA of samples at 3h.
At 3h most of the variation separates goethite from the rest of the samples (PC1).
PC2 separates some of the minerals, but not completely. The PC3 separates the minerals quite well, especially mica and the control. It keeps goethite and hematite together. This is interesting because these two are both iron oxides and there might be some biological meaning behind this.
# Filter out samples processed using DREX protocol and keep only samples at 3h
read_counts_3h <- read_counts %>%
dplyr::select(-ends_with("D")) %>%
dplyr::select((matches("3_")))
# Create edgeR DGEList
dge <- DGEList(counts = read_counts_3h)
# Filter lowly expressed genes (keep genes with CPM>1 in at least 2 samples)
keep <- rowSums(cpm(dge) > 1) >= 2
dge <- dge[keep, , keep.lib.sizes = FALSE]
# Normalize for library size (TMM normalization)
dge <- calcNormFactors(dge)
# Log2 CPM transformation
logCPM <- cpm(dge, log = TRUE, prior.count = 1) # log2(CPM + 1)
pca_res <- prcomp(t(logCPM)) # samples as rows
pca_scores <- as.data.frame(pca_res$x) %>%
tibble::rownames_to_column("sample_pca")
pca_metadata <- sample_metadata %>%
tibble::rownames_to_column("sample_pca")
pca_df <- pca_scores %>%
dplyr::left_join(pca_metadata, by = "sample_pca")
# Calculate percent of variance explained
# Variance explained per PC
pca_var <- pca_res$sdev^2
pca_var_frac <- pca_var / sum(pca_var)
# Percent variance explained
pca_var_pct <- round(100 * pca_var_frac, 1)
ggplot(pca_df, aes(x = PC1, y = PC2,
color = mineral_code,
shape = timepoint)) +
geom_point(size = 3) +
geom_text(aes(label = sample_pca), vjust = -0.7, size = 3) +
theme_bw() +
labs(
title = "PCA of log2-CPM counts",
x = paste0("PC1 (", pca_var_pct[1], "%)"),
y = paste0("PC2 (", pca_var_pct[2], "%)")
)ggplot(pca_df, aes(x = PC1, y = PC3,
color = mineral_code,
shape = timepoint)) +
geom_point(size = 3) +
geom_text(aes(label = sample_pca), vjust = -0.7, size = 3) +
theme_bw() +
labs(
title = "PCA of log2-CPM counts",
x = paste0("PC1 (", pca_var_pct[1], "%)"),
y = paste0("PC3 (", pca_var_pct[3], "%)")
)ggplot(pca_df, aes(x = PC2, y = PC3,
color = mineral_code,
shape = timepoint)) +
geom_point(size = 3) +
geom_text(aes(label = sample_pca), vjust = -0.7, size = 3) +
theme_bw() +
labs(
title = "PCA of log2-CPM counts",
x = paste0("PC2 (", pca_var_pct[2], "%)"),
y = paste0("PC3 (", pca_var_pct[3], "%)")
)ggplot(pca_df, aes(x = PC3, y = PC4,
color = mineral_code,
shape = timepoint)) +
geom_point(size = 3) +
geom_text(aes(label = sample_pca), vjust = -0.7, size = 3) +
theme_bw() +
labs(
title = "PCA of log2-CPM counts",
x = paste0("PC3 (", pca_var_pct[3], "%)"),
y = paste0("PC4 (", pca_var_pct[4], "%)")
)3.4.2 Samples at 24h
Here we see again that PC1 separates the goethite samples from the rest and that has most of the variation. Here we dont get as clear samples’s clusters as we did for 3h. Might suggest that 24h samples are more similar than the 3h samples.
# Filter out samples processed using DREX protocol and keep only samples at 24 h
read_counts_24h <- read_counts %>%
dplyr::select(-ends_with("D")) %>%
dplyr::select((matches("24_")))
# Create edgeR DGEList
dge <- DGEList(counts = read_counts_24h)
# Filter lowly expressed genes (keep genes with CPM>1 in at least 2 samples)
keep <- rowSums(cpm(dge) > 1) >= 2
dge <- dge[keep, , keep.lib.sizes = FALSE]
# Normalize for library size (TMM normalization)
dge <- calcNormFactors(dge)
# Log2 CPM transformation
logCPM <- cpm(dge, log = TRUE, prior.count = 1) # log2(CPM + 1)
pca_res <- prcomp(t(logCPM)) # samples as rows
pca_scores <- as.data.frame(pca_res$x) %>%
tibble::rownames_to_column("sample_pca")
pca_metadata <- sample_metadata %>%
tibble::rownames_to_column("sample_pca")
pca_df <- pca_scores %>%
dplyr::left_join(pca_metadata, by = "sample_pca")
# Calculate percent of variance explained
# Variance explained per PC
pca_var <- pca_res$sdev^2
pca_var_frac <- pca_var / sum(pca_var)
# Percent variance explained
pca_var_pct <- round(100 * pca_var_frac, 1)
ggplot(pca_df, aes(x = PC1, y = PC2,
color = mineral_code,
shape = timepoint)) +
geom_point(size = 3) +
geom_text(aes(label = sample_pca), vjust = -0.7, size = 3) +
theme_bw() +
labs(
title = "PCA of log2-CPM counts",
x = paste0("PC1 (", pca_var_pct[1], "%)"),
y = paste0("PC2 (", pca_var_pct[2], "%)")
)ggplot(pca_df, aes(x = PC1, y = PC3,
color = mineral_code,
shape = timepoint)) +
geom_point(size = 3) +
geom_text(aes(label = sample_pca), vjust = -0.7, size = 3) +
theme_bw() +
labs(
title = "PCA of log2-CPM counts",
x = paste0("PC1 (", pca_var_pct[1], "%)"),
y = paste0("PC3 (", pca_var_pct[3], "%)")
)ggplot(pca_df, aes(x = PC2, y = PC3,
color = mineral_code,
shape = timepoint)) +
geom_point(size = 3) +
geom_text(aes(label = sample_pca), vjust = -0.7, size = 3) +
theme_bw() +
labs(
title = "PCA of log2-CPM counts",
x = paste0("PC2 (", pca_var_pct[2], "%)"),
y = paste0("PC3 (", pca_var_pct[3], "%)")
)ggplot(pca_df, aes(x = PC3, y = PC4,
color = mineral_code,
shape = timepoint)) +
geom_point(size = 3) +
geom_text(aes(label = sample_pca), vjust = -0.7, size = 3) +
theme_bw() +
labs(
title = "PCA of log2-CPM counts",
x = paste0("PC3 (", pca_var_pct[3], "%)"),
y = paste0("PC4 (", pca_var_pct[4], "%)")
)