Chapter 4 Differential expression analysis

4.1 Data preparation for DE

4.1.1 Prepare metadata

# Define factors needed for contrasts
metadata_wo_drex <- metadata_wo_drex %>%
  mutate(
    timepoint = factor(timepoint, levels = c("3", "24")),
    mineral_code = factor(mineral_code, levels = c("C", "G", "H", "M"))
  )


stopifnot(all(colnames(counts_wo_drex) == rownames(metadata_wo_drex)))

4.1.2 Custom plot functions

# Volcano plots
plot_volcano <- function(res, lfc_cutoff = 1, fdr_cutoff = 0.05, title = "") {
  res <- res %>%
    mutate(
      Significance = case_when(
        adj.P.Val < fdr_cutoff & logFC > lfc_cutoff ~ "Up",
        adj.P.Val < fdr_cutoff & logFC < -lfc_cutoff ~ "Down",
        TRUE ~ "NS"
      )
    )
  
  ggplot(res, aes(x = logFC, y = -log10(adj.P.Val), color = Significance)) +
    geom_point(alpha = 0.6) +
    scale_color_manual(values = c("Up" = "red", "Down" = "blue", "NS" = "grey")) +
    geom_vline(xintercept = c(-lfc_cutoff, lfc_cutoff), linetype = "dashed") +
    geom_hline(yintercept = -log10(fdr_cutoff), linetype = "dashed") +
    theme_bw() +
    ggtitle(title) +
    xlab("log2 Fold Change") +
    ylab("-log10 FDR")
}

plot_MA <- function(res, title = "", fdr = 0.05, lfc = 1) {

  res2 <- res %>%
    dplyr::mutate(DE = adj.P.Val < fdr & abs(logFC) >= lfc)

  ggplot(res2, aes(x = AveExpr, y = logFC, color = DE)) +
    geom_point(alpha = 0.6, size = 1) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    theme_bw() +
    xlab("Average log2-CPM") +
    ylab("log2 Fold Change") +
    ggtitle(title) +
    scale_color_manual(values = c("FALSE" = "grey70", "TRUE" = "red")) +
    guides(color = guide_legend(title = paste0("FDR<", fdr, " & |logFC|≥", lfc)))
}

4.2 Differential expression analysis

dge <- DGEList(counts = counts_wo_drex)

# Filter: keep genes expressed (CPM > 1) in at least 2 samples
keep <- rowSums(cpm(dge) > 1) >= 2
dge <- dge[keep, , keep.lib.sizes = FALSE]

# TMM normalization
dge <- calcNormFactors(dge)

4.2.1 3 hours

meta_3h <- metadata_wo_drex %>% filter(timepoint == "3")
dge_3h <- dge[, rownames(meta_3h)]

design_3h <- model.matrix(~ mineral_code, data = meta_3h)

v_3h <- voom(dge_3h, design_3h, plot = TRUE)

fit_3h <- lmFit(v_3h, design_3h)

contr_3h <- makeContrasts(
  G_vs_C = mineral_codeG,
  H_vs_C = mineral_codeH,
  M_vs_C = mineral_codeM,
  levels = design_3h
)

fit_3h2 <- contrasts.fit(fit_3h, contr_3h)
fit_3h2 <- eBayes(fit_3h2)

# Save tables
res_3h_G <- topTable(fit_3h2, coef = "G_vs_C", number = Inf, sort.by = "P")
res_3h_H <- topTable(fit_3h2, coef = "H_vs_C", number = Inf, sort.by = "P")
res_3h_M <- topTable(fit_3h2, coef = "M_vs_C", number = Inf, sort.by = "P")
# Goethite
plot_volcano(res_3h_G, title = "Volcano plot: Goethite vs Control (3h)")

# Hematite
plot_volcano(res_3h_H, title = "Volcano plot: Hematite vs Control (3h)")

# Mica
plot_volcano(res_3h_M, title = "Volcano plot: Mica vs Control (3h)")

y_thresh <- 2          # example: -log10(p) > 5
lfc_thresh <- 4        # log2FC > 2 or < -2

res_3h_G_filt <- res_3h_G %>%
  dplyr::mutate(log10P = -log10(P.Value)) %>%
  dplyr::filter(log10P >= y_thresh,
                abs(logFC) >= lfc_thresh)


res_3h_G_filt
                     logFC     AveExpr          t      P.Value    adj.P.Val           B    log10P
NC_000964.3_1368 -4.042256  9.89618458 -34.774446 3.496545e-17 1.464353e-13 29.47251582 16.456361
NC_000964.3_3402 -6.155483 10.52203036 -28.408554 1.013809e-15 1.415277e-12 26.26267407 14.994044
NC_000964.3_1369 -4.955862 12.10632121 -27.585203 1.651222e-15 1.728829e-12 25.84561707 14.782195
NC_000964.3_3300  4.287365  3.98371467  27.090107 2.229122e-15 1.867113e-12 25.38560068 14.651866
NC_000964.3_3299  4.169577  4.09275131  26.274710 3.696833e-15 2.580389e-12 24.88753209 14.432170
NC_000964.3_2588  4.562007  7.61313694  24.398407 1.256402e-14 6.577264e-12 23.85004934 13.900871
NC_000964.3_3298  4.127578  3.06642338  23.745325 1.964248e-14 8.514012e-12 23.19632100 13.706804
NC_000964.3_868   4.601745  4.70948839  23.201160 2.876008e-14 1.001919e-11 22.96546004 13.541210
NC_000964.3_1601 -4.315125  9.39674463 -22.688665 4.150764e-14 1.053498e-11 22.66416847 13.381872
NC_000964.3_1660  4.048154  8.60180723  21.806030 7.955003e-14 1.332622e-11 22.01479248 13.099360
NC_000964.3_3048  4.192985  5.64790630  21.000602 1.472079e-13 2.125885e-11 21.40355102 12.832069
NC_000964.3_1911  4.155400  5.02165838  19.903044 3.531288e-13 3.711243e-11 20.52821962 12.452067
NC_000964.3_1600 -4.751609  8.84939901 -19.601099 4.527638e-13 4.514702e-11 20.28101543 12.344128
NC_000964.3_626   4.302640  9.24594390  19.164967 6.523955e-13 5.575985e-11 19.89470221 12.185489
NC_000964.3_2565  4.624844  3.96972109  19.016253 7.402223e-13 6.078531e-11 19.77428365 12.130638
NC_000964.3_1599 -5.055277  8.53367032 -18.513251 1.142460e-12 8.394077e-11 19.35910903 11.942159
NC_000964.3_1271  4.303512  5.19653090  17.626375 2.522378e-12 1.563532e-10 18.56264202 11.598190
NC_000964.3_3526  4.321915  9.24640315  17.277123 3.479965e-12 1.868474e-10 18.19434595 11.458425
NC_000964.3_1604 -4.046683  7.33000763 -17.074154 4.206990e-12 2.122756e-10 18.05810701 11.376029
NC_000964.3_625   6.368384  3.82837747  16.286448 8.959197e-12 3.619936e-10 17.31527955 11.047731
NC_000964.3_1597 -4.448474  8.68953910 -15.899048 1.315028e-11 4.873751e-10 16.88536456 10.881065
NC_000964.3_2591  4.116523  6.49482807  15.001716 3.304687e-11 8.871814e-10 15.92029511 10.480870
NC_000964.3_274   4.612438  2.40696861  14.289394 7.110527e-11 1.575603e-09 15.21175320 10.148098
NC_000964.3_1984 -5.348197  4.80890702 -14.020958 9.572095e-11 1.936615e-09 14.50067231 10.018993
NC_000964.3_3704  4.442730  8.84442188  13.825306 1.192445e-10 2.259710e-09 14.57389994  9.923562
NC_000964.3_406   4.215793  6.58070389  13.527329 1.674932e-10 2.874843e-09 14.25546977  9.776003
NC_000964.3_407   4.364758  9.02094372  13.355110 2.044241e-10 3.396796e-09 14.01524361  9.689468
NC_000964.3_3705  4.910768  7.65343450  13.315727 2.140189e-10 3.481782e-09 13.99147381  9.669548
NC_000964.3_4041  4.266453  7.32227703  12.833044 3.790213e-10 5.589229e-09 13.39646365  9.421336
NC_000964.3_1281 -4.593500  5.77455563 -12.807366 3.909159e-10 5.674591e-09 13.56657883  9.407917
NC_000964.3_3706  5.186826  7.85510230  12.696022 4.472301e-10 6.222590e-09 13.22698905  9.349469
NC_000964.3_3571  4.065735  2.18058775  12.557708 5.293272e-10 7.006452e-09 13.22717481  9.276276
NC_000964.3_3707  5.158712  7.11304240  11.786170 1.394590e-09 1.452872e-08 12.07686783  8.855553
NC_000964.3_3162  4.338110  4.93885877  11.702474 1.553829e-09 1.594959e-08 12.02898451  8.808597
NC_000964.3_1979 -4.192778  2.50928358 -11.681368 1.596932e-09 1.627238e-08 11.80443808  8.796714
NC_000964.3_1464  4.632495  0.97139213  11.084559 3.519712e-09 3.136288e-08 11.36222467  8.453493
NC_000964.3_4040  4.155862  6.01206919  10.902760 4.507207e-09 3.860160e-08 10.90181684  8.346092
NC_000964.3_3754 -4.849073  4.88231315 -10.407076 8.991468e-09 7.091576e-08 10.43942543  8.046169
NC_000964.3_2232  4.369551  5.69437294  10.333447 9.983707e-09 7.637588e-08 10.06151075  8.000708
NC_000964.3_457   4.097563  8.22735962   9.313065 4.518796e-08 2.730840e-07  8.44125447  7.344977
NC_000964.3_1465  4.803673 -0.05367729   8.532753 1.552638e-07 8.037635e-07  7.61601581  6.808930
NC_000964.3_3970  4.169778  3.04686060   8.513284 1.602699e-07 8.215548e-07  7.48907571  6.795148
NC_000964.3_4075  4.634956  5.36550293   8.350463 2.093680e-07 1.034002e-06  6.97909460  6.679090
NC_000964.3_903  -4.495023  0.87836990  -7.783422 5.451068e-07 2.387429e-06  5.98868198  6.263518
NC_000964.3_1463  4.803247 -0.68480792   7.641880 6.966841e-07 2.980299e-06  6.18580389  6.156964
NC_000964.3_450   4.016022  2.05666989   7.273687 1.335440e-06 5.256412e-06  5.47393569  5.874376
NC_000964.3_2400 -4.838769  3.99487851  -7.165107 1.623558e-06 6.278356e-06  5.41958480  5.789532
NC_000964.3_2879 -4.448527  1.61727330  -6.719665 3.679617e-06 1.261067e-05  4.57501760  5.434197
NC_000964.3_1229  4.534827  1.90638630   6.121856 1.151138e-05 3.455889e-05  3.41635358  4.938873
NC_000964.3_1447 -4.312754  4.62968556  -5.430029 4.572088e-05 1.186363e-04  2.08283175  4.339885
NC_000964.3_1933 -4.229052  0.94292655  -4.996466 1.118260e-04 2.653412e-04  1.40102758  3.951457
NC_000964.3_3144 -4.041985  0.02481842  -4.928645 1.288616e-04 3.025069e-04  1.23253983  3.889877
NC_000964.3_499  -5.079678  4.46389026  -4.922780 1.304542e-04 3.055606e-04  1.06005828  3.884542
NC_000964.3_495  -5.802429  4.83783433  -4.917003 1.320429e-04 3.091088e-04  1.04427279  3.879285
NC_000964.3_494  -4.698508  4.63339318  -4.779566 1.762782e-04 4.027567e-04  0.67380056  3.753801
NC_000964.3_498  -4.085327  4.55972695  -4.730892 1.953577e-04 4.427262e-04  0.50690227  3.709169
NC_000964.3_497  -5.122464  3.52693204  -4.387127 4.060470e-04 8.610253e-04  0.04607402  3.391424
NC_000964.3_496  -6.468176  3.38995007  -3.710540 1.748853e-03 3.272652e-03 -1.24181275  2.757247
NC_000964.3_2135 -5.049997  1.42323235  -3.301436 4.237209e-03 7.462334e-03 -1.92073895  2.372920
plot_MA(res_3h_G, title = "MA: Goethite vs Control (3h)", fdr = 0.05, lfc = 1)

plot_MA(res_3h_H, title = "MA: Hematite vs Control (3h)", fdr = 0.05, lfc = 1)

plot_MA(res_3h_M, title = "MA: Mica vs Control (3h)", fdr = 0.05, lfc = 1)

gene_map <- complete_gene_annotation %>%
  dplyr::select(ID_prodigal, gene, locus_tag)

# For genes that dont have a name
gene_map <- complete_gene_annotation %>%
  dplyr::mutate(gene_label = ifelse(is.na(gene), locus_tag, gene)) %>%
  dplyr::select(ID_prodigal, gene_label)



# Top 30 DE genes in Goethite vs Control
top_genes <- res_3h_G %>%
  arrange(adj.P.Val) %>%
  head(30) %>%
  rownames()

matG <- v_3h$E[top_genes, ]  # log2-CPM from voom

# Replace rownames with gene names
new_names <- gene_map$gene_label[match(top_genes, gene_map$ID_prodigal)]
# Replace NA with original gene IDs
new_names[is.na(new_names)] <- top_genes[is.na(new_names)]

rownames(matG) <- new_names


annotation <- meta_3h[, "mineral_code", drop = FALSE]
rownames(annotation) <- rownames(meta_3h)


pheatmap(
  matG,
  annotation_col = annotation,
  scale = "row",
  clustering_distance_rows = "correlation",
  clustering_distance_cols = "correlation",
  main = "Top 30 DE genes: Goethite vs Control (3h)"
)

# Top 30 DE genes in Hematite vs Control
top_genes <- res_3h_H %>%
  arrange(adj.P.Val) %>%
  head(30) %>%
  rownames()

matH <- v_3h$E[top_genes, ]  # log2-CPM from voom

# Replace rownames with gene names
new_names <- gene_map$gene_label[match(top_genes, gene_map$ID_prodigal)]
# Replace NA with original gene IDs
new_names[is.na(new_names)] <- top_genes[is.na(new_names)]

rownames(matH) <- new_names



annotation <- meta_3h[, "mineral_code", drop = FALSE]
rownames(annotation) <- rownames(meta_3h)


pheatmap(
  matH,
  annotation_col = annotation,
  scale = "row",
  clustering_distance_rows = "correlation",
  clustering_distance_cols = "correlation",
  main = "Top 30 DE genes: Hematite vs Control (3h)"
)

# Top 30 DE genes in Mica vs Control
top_genes <- res_3h_M %>%
  arrange(adj.P.Val) %>%
  head(30) %>%
  rownames()

matM <- v_3h$E[top_genes, ]  # log2-CPM from voom

# Replace rownames with gene names
new_names <- gene_map$gene_label[match(top_genes, gene_map$ID_prodigal)]
# Replace NA with original gene IDs
new_names[is.na(new_names)] <- top_genes[is.na(new_names)]

rownames(matM) <- new_names



annotation <- meta_3h[, "mineral_code", drop = FALSE]
rownames(annotation) <- rownames(meta_3h)

pheatmap(
  matM,
  annotation_col = annotation,
  scale = "row",
  clustering_distance_rows = "correlation",
  clustering_distance_cols = "correlation",
  main = "Top 30 DE genes: Mica vs Control (3h)"
)

4.2.2 24 hours

# The reference is choosen based on the first level in levels(meta_24h$mineral_code) so each mineral is compared to the control
meta_24h <- metadata_wo_drex %>% filter(timepoint == "24")
dge_24h <- dge[, rownames(meta_24h)]

design_24h <- model.matrix(~ mineral_code, data = meta_24h)

v_24h <- voom(dge_24h, design_24h, plot = TRUE)

fit_24h <- lmFit(v_24h, design_24h)

contr_24h <- makeContrasts(
  G_vs_C = mineral_codeG,
  H_vs_C = mineral_codeH,
  M_vs_C = mineral_codeM,
  levels = design_24h
)

fit_24h2 <- contrasts.fit(fit_24h, contr_24h)
fit_24h2 <- eBayes(fit_24h2)


# Save tables
res_24h_G <- topTable(fit_24h2, coef = "G_vs_C", number = Inf, sort.by = "P")
res_24h_H <- topTable(fit_24h2, coef = "H_vs_C", number = Inf, sort.by = "P")
res_24h_M <- topTable(fit_24h2, coef = "M_vs_C", number = Inf, sort.by = "P")
# Goethite
plot_volcano(res_24h_G, title = "Volcano plot: Goethite vs Control (24h)")

# Hematite
plot_volcano(res_24h_H, title = "Volcano plot: Hematite vs Control (24h)")

# Mica
plot_volcano(res_24h_M, title = "Volcano plot: Mica vs Control (24h)")

plot_MA(res_24h_G, title = "MA plot: Goethite vs Control (24h)", fdr = 0.05, lfc = 1)

plot_MA(res_24h_H, title = "MA plot: Hematite vs Control (24h)", fdr = 0.05, lfc = 1)

plot_MA(res_24h_M, title = "MA plot: Mica vs Control (24h)", fdr = 0.05, lfc = 1)

# Top 30 DE genes in Goethite vs Control
top_genes <- res_24h_G %>%
  arrange(adj.P.Val) %>%
  head(30) %>%
  rownames()

matG <- v_24h$E[top_genes, ]  # log2-CPM from voom

# Replace rownames with gene names
new_names <- gene_map$gene_label[match(top_genes, gene_map$ID_prodigal)]
# Replace NA with original gene IDs
new_names[is.na(new_names)] <- top_genes[is.na(new_names)]

rownames(matG) <- new_names


annotation <- meta_24h[, "mineral_code", drop = FALSE]
rownames(annotation) <- rownames(meta_24h)


pheatmap(
  matG,
  annotation_col = annotation,
  scale = "row",
  clustering_distance_rows = "correlation",
  clustering_distance_cols = "correlation",
  main = "Top 30 DE genes: Goethite vs Control (24h)"
)

# Top 30 DE genes in Hematite vs Control
top_genes <- res_24h_H %>%
  arrange(adj.P.Val) %>%
  head(30) %>%
  rownames()

matH <- v_24h$E[top_genes, ]  # log2-CPM from voom

# Replace rownames with gene names
new_names <- gene_map$gene_label[match(top_genes, gene_map$ID_prodigal)]
# Replace NA with original gene IDs
new_names[is.na(new_names)] <- top_genes[is.na(new_names)]

rownames(matH) <- new_names



annotation <- meta_24h[, "mineral_code", drop = FALSE]
rownames(annotation) <- rownames(meta_24h)


pheatmap(
  matH,
  annotation_col = annotation,
  scale = "row",
  clustering_distance_rows = "correlation",
  clustering_distance_cols = "correlation",
  main = "Top 30 DE genes: Hematite vs Control (24h)"
)

# Top 30 DE genes in Mica vs Control
top_genes <- res_24h_M %>%
  arrange(adj.P.Val) %>%
  head(30) %>%
  rownames()

matM <- v_24h$E[top_genes, ]  # log2-CPM from voom

# Replace rownames with gene names
new_names <- gene_map$gene_label[match(top_genes, gene_map$ID_prodigal)]
# Replace NA with original gene IDs
new_names[is.na(new_names)] <- top_genes[is.na(new_names)]

rownames(matM) <- new_names



annotation <- meta_24h[, "mineral_code", drop = FALSE]
rownames(annotation) <- rownames(meta_24h)

pheatmap(
  matM,
  annotation_col = annotation,
  scale = "row",
  clustering_distance_rows = "correlation",
  clustering_distance_cols = "correlation",
  main = "Top 30 DE genes: Mica vs Control (24h)"
)

4.2.3 Results summary

summarise_contrast <- function(res, name) {
  data.frame(
    contrast = name,
    
    # High fold change (strict)
    high_up = sum(res$logFC >= 4.5 & res$adj.P.Val <= 0.01, na.rm = TRUE),
    high_down = sum(res$logFC <= -4.5 & res$adj.P.Val <= 0.01, na.rm = TRUE),
    
    # Top hits
    top_up = sum(res$logFC >= 2 & res$adj.P.Val <= 0.05, na.rm = TRUE),
    top_down = sum(res$logFC <= -2 & res$adj.P.Val <= 0.05, na.rm = TRUE),
    
    # Total DE
    total_up = sum(res$logFC > 0 & res$adj.P.Val <= 0.05, na.rm = TRUE),
    total_down = sum(res$logFC < 0 & res$adj.P.Val <= 0.05, na.rm = TRUE)
  )
}
DE_summary_table <- dplyr::bind_rows(
  summarise_contrast(res_3h_G, "3h Goethite vs Control"),
  summarise_contrast(res_3h_H, "3h Hematite vs Control"),
  summarise_contrast(res_3h_M, "3h Mica vs Control"),
  summarise_contrast(res_24h_G, "24h Goethite vs Control"),
  summarise_contrast(res_24h_H, "24h Hematite vs Control"),
  summarise_contrast(res_24h_M, "24h Mica vs Control")
  
)

DE_summary_table
                 contrast high_up high_down top_up top_down total_up total_down
1  3h Goethite vs Control      13        14    393      278     1517       1358
2  3h Hematite vs Control       0         0     29       18      676        655
3      3h Mica vs Control       0         0     17       14      608        650
4 24h Goethite vs Control       5         8    209      234      716        834
5 24h Hematite vs Control       0         0     34       81      179        159
6     24h Mica vs Control      24         3    157      157      657        614

4.2.4 Top highly expressed genes heatmaps

# Goethite vs Control 3h
high_up_genes   <- res_3h_G %>% filter(logFC >= 4.5 & adj.P.Val <= 0.01) %>% rownames()
high_down_genes <- res_3h_G %>% filter(logFC <= -4.5 & adj.P.Val <= 0.01) %>% rownames()

high_genes <- c(high_up_genes, high_down_genes)

# subset the expression matrix
matG_high <- v_3h$E[high_genes, ]

# replace rownames with gene names where available
gene_map <- complete_gene_annotation %>%
  mutate(gene_label = ifelse(is.na(gene), locus_tag, gene)) %>%
  select(ID_prodigal, gene_label)

new_names <- gene_map$gene_label[match(high_genes, gene_map$ID_prodigal)]
new_names[is.na(new_names)] <- high_genes[is.na(new_names)]

rownames(matG_high) <- new_names

# column annotation
annotation <- meta_3h[, "mineral_code", drop = FALSE]
rownames(annotation) <- rownames(meta_3h)


pheatmap(
  matG_high,
  annotation_col = annotation,
  scale = "row",
  clustering_distance_rows = "correlation",
  clustering_distance_cols = "correlation",
  main = "High DE genes: Goethite vs Control (3h)"
)

# Goethite vs Control 24h
high_up_genes   <- res_24h_G %>% filter(logFC >= 4.5 & adj.P.Val <= 0.01) %>% rownames()
high_down_genes <- res_24h_G %>% filter(logFC <= -4.5 & adj.P.Val <= 0.01) %>% rownames()

high_genes <- c(high_up_genes, high_down_genes)

# subset the expression matrix
matG_high <- v_24h$E[high_genes, ]

# replace rownames with gene names where available
gene_map <- complete_gene_annotation %>%
  mutate(gene_label = ifelse(is.na(gene), locus_tag, gene)) %>%
  select(ID_prodigal, gene_label)

new_names <- gene_map$gene_label[match(high_genes, gene_map$ID_prodigal)]
new_names[is.na(new_names)] <- high_genes[is.na(new_names)]

rownames(matG_high) <- new_names

# column annotation
annotation <- meta_24h[, "mineral_code", drop = FALSE]
rownames(annotation) <- rownames(meta_24h)

pheatmap(
  matG_high,
  annotation_col = annotation,
  scale = "row",
  clustering_distance_rows = "correlation",
  clustering_distance_cols = "correlation",
  main = "High DE genes: Goethite vs Control (24h)"
)

# Mica vs Control 24h
high_up_genes   <- res_24h_M %>% filter(logFC >= 4.5 & adj.P.Val <= 0.01) %>% rownames()
high_down_genes <- res_24h_M %>% filter(logFC <= -4.5 & adj.P.Val <= 0.01) %>% rownames()

high_genes <- c(high_up_genes, high_down_genes)

# subset the expression matrix
matM_high <- v_24h$E[high_genes, ]

# replace rownames with gene names where available
gene_map <- complete_gene_annotation %>%
  mutate(gene_label = ifelse(is.na(gene), locus_tag, gene)) %>%
  select(ID_prodigal, gene_label)

new_names <- gene_map$gene_label[match(high_genes, gene_map$ID_prodigal)]
new_names[is.na(new_names)] <- high_genes[is.na(new_names)]

rownames(matM_high) <- new_names

# column annotation
annotation <- meta_24h[, "mineral_code", drop = FALSE]
rownames(annotation) <- rownames(meta_24h)

pheatmap(
  matM_high,
  annotation_col = annotation,
  scale = "row",
  clustering_distance_rows = "correlation",
  clustering_distance_cols = "correlation",
  main = "High DE genes: Mica vs Control (24h)"
)

4.2.5 Interaction DE - UNDER CONSTRUCTION

design_full <- model.matrix(~ timepoint * mineral_code, data = metadata_wo_drex)

v <- voom(dge, design_full, plot = TRUE)

fit <- lmFit(v, design_full)

colnames(design_full)
[1] "(Intercept)"               "timepoint24"               "mineral_codeG"            
[4] "mineral_codeH"             "mineral_codeM"             "timepoint24:mineral_codeG"
[7] "timepoint24:mineral_codeH" "timepoint24:mineral_codeM"
colnames(design_full) <- make.names(colnames(design_full))

colnames(design_full)
[1] "X.Intercept."              "timepoint24"               "mineral_codeG"            
[4] "mineral_codeH"             "mineral_codeM"             "timepoint24.mineral_codeG"
[7] "timepoint24.mineral_codeH" "timepoint24.mineral_codeM"
contr_interaction <- makeContrasts(
  G_change_24_vs_3 = timepoint24.mineral_codeG,
  H_change_24_vs_3 = timepoint24.mineral_codeH,
  M_change_24_vs_3 = timepoint24.mineral_codeM,
  levels = design_full
)


fit_int <- contrasts.fit(fit, contr_interaction)
fit_int <- eBayes(fit_int)

res_int_G <- topTable(fit_int, coef = "G_change_24_vs_3", number = Inf, sort.by = "P")
res_int_H <- topTable(fit_int, coef = "H_change_24_vs_3", number = Inf, sort.by = "P")
res_int_M <- topTable(fit_int, coef = "M_change_24_vs_3", number = Inf, sort.by = "P")

# write.csv(res_3h_G,  "DE_3h_G_vs_C.csv")
# write.csv(res_3h_H,  "DE_3h_H_vs_C.csv")
# write.csv(res_3h_M,  "DE_3h_M_vs_C.csv")
# 
# write.csv(res_24h_G, "DE_24h_G_vs_C.csv")
# write.csv(res_24h_H, "DE_24h_H_vs_C.csv")
# write.csv(res_24h_M, "DE_24h_M_vs_C.csv")
# 
# write.csv(res_int_G, "DE_interaction_G_change_24_vs_3.csv")
# write.csv(res_int_H, "DE_interaction_H_change_24_vs_3.csv")
# write.csv(res_int_M, "DE_interaction_M_change_24_vs_3.csv")
plot_volcano(res_int_G, title = "Interaction volcano: Goethite (24h vs 3h relative to control)")

plot_volcano(res_int_H, title = "Interaction volcano: Hematite (24h vs 3h relative to control)")

plot_volcano(res_int_M, title = "Interaction volcano: Mica (24h vs 3h relative to control)")

top_genes_G <- rownames(res_int_G %>% arrange(adj.P.Val) %>% head(30))

samples_CG <- rownames(metadata_wo_drex) %>%
  {.[metadata_wo_drex$mineral_code %in% c("C","G")]}

mat_G <- v$E[top_genes_G, samples_CG]

ann_G <- metadata_wo_drex[samples_CG, c("timepoint", "mineral_code"), drop = FALSE]

pheatmap(mat_G,
         annotation_col = ann_G,
         scale = "row",
         clustering_distance_rows = "correlation",
         clustering_distance_cols = "correlation",
         main = "Top 30 interaction genes: Goethite (C vs G only)")

top_genes_H <- rownames(res_int_H %>% arrange(adj.P.Val) %>% head(30))

samples_CH <- rownames(metadata_wo_drex) %>%
  {.[metadata_wo_drex$mineral_code %in% c("C","H")]}

mat_H <- v$E[top_genes_H, samples_CH]

ann_H <- metadata_wo_drex[samples_CH, c("timepoint", "mineral_code"), drop = FALSE]

pheatmap(mat_H,
         annotation_col = ann_H,
         scale = "row",
         clustering_distance_rows = "correlation",
         clustering_distance_cols = "correlation",
         main = "Top 30 interaction genes: Hematite (C vs H only)")

top_genes_M <- rownames(res_int_M %>% arrange(adj.P.Val) %>% head(30))

samples_CM <- rownames(metadata_wo_drex) %>%
  {.[metadata_wo_drex$mineral_code %in% c("C","M")]}

mat_M <- v$E[top_genes_M, samples_CM]

ann_M <- metadata_wo_drex[samples_CM, c("timepoint", "mineral_code"), drop = FALSE]

pheatmap(mat_M,
         annotation_col = ann_M,
         scale = "row",
         clustering_distance_rows = "correlation",
         clustering_distance_cols = "correlation",
         main = "Top 30 interaction genes: Mica (C vs M only)")

plot_profiles <- function(gene_ids, v_obj, meta, mineral = "G") {
  
  # expression matrix for selected genes
  expr <- v_obj$E[gene_ids, , drop = FALSE]
  
  df <- as.data.frame(t(expr)) %>%
    tibble::rownames_to_column("sample_id") %>%
    left_join(meta %>% tibble::rownames_to_column("sample_id"), by = "sample_id") %>%
    filter(mineral_code %in% c("C", mineral)) %>%
    mutate(timepoint = factor(timepoint, levels = c("3", "24")),
           mineral_code = factor(mineral_code, levels = c("C", mineral))) %>%
    pivot_longer(cols = all_of(gene_ids), names_to = "gene", values_to = "logCPM")
  
  df_sum <- df %>%
    group_by(gene, mineral_code, timepoint) %>%
    summarise(mean = mean(logCPM), .groups = "drop")
  
  ggplot(df_sum, aes(x = timepoint, y = mean, group = mineral_code, color = mineral_code)) +
    geom_line(linewidth = 1) +
    geom_point(size = 2) +
    facet_wrap(~ gene, scales = "free_y") +
    theme_bw() +
    ggtitle(paste0("Interaction profiles: ", mineral, " vs Control")) +
    ylab("Mean log2-CPM") +
    xlab("Timepoint")
}

top6_G <- rownames(res_int_G %>% arrange(adj.P.Val) %>% head(6))
plot_profiles(top6_G, v, metadata_wo_drex, mineral = "G")

delta_delta_plot <- function(v_obj, meta, mineral = "G", ntop = 3000) {
  
  expr <- v_obj$E
  
  # keep top variable genes to make it fast
  rv <- matrixStats::rowVars(expr)
  keep <- order(rv, decreasing = TRUE)[seq_len(min(ntop, nrow(expr)))]
  expr <- expr[keep, , drop = FALSE]
  
  # group means
  meta2 <- meta %>%
    mutate(timepoint = factor(timepoint, levels = c("3","24"))) %>%
    filter(mineral_code %in% c("C", mineral))
  
  samples_C3  <- rownames(meta2)[meta2$mineral_code == "C" & meta2$timepoint == "3"]
  samples_C24 <- rownames(meta2)[meta2$mineral_code == "C" & meta2$timepoint == "24"]
  samples_M3  <- rownames(meta2)[meta2$mineral_code == mineral & meta2$timepoint == "3"]
  samples_M24 <- rownames(meta2)[meta2$mineral_code == mineral & meta2$timepoint == "24"]
  
  C3  <- rowMeans(expr[, samples_C3,  drop = FALSE])
  C24 <- rowMeans(expr[, samples_C24, drop = FALSE])
  M3  <- rowMeans(expr[, samples_M3,  drop = FALSE])
  M24 <- rowMeans(expr[, samples_M24, drop = FALSE])
  
  df <- data.frame(
    gene = rownames(expr),
    delta_control = C24 - C3,
    delta_mineral = M24 - M3
  )
  
  ggplot(df, aes(x = delta_control, y = delta_mineral)) +
    geom_point(alpha = 0.4) +
    geom_abline(slope = 1, intercept = 0) +
    theme_bw() +
    ggtitle(paste0("Delta–delta plot: ", mineral, " vs Control")) +
    xlab("Control: (24h - 3h) log2-CPM") +
    ylab(paste0(mineral, ": (24h - 3h) log2-CPM"))
}

delta_delta_plot(v, metadata_wo_drex, mineral = "G")