Chapter 2 Differential Expression Analyses

2.1 Pre-processing of data

2.1.1 Adapt the format

Define a function that reads and processes RNASeq input files, adapting the format to produce matrices containing only integer count values, with genes as rows and samples as columns (without annotation columns).

read_and_adapt <- function(file_path) {
  counts <- read_tsv(file_path, col_names = TRUE)
  counts_matrix <- counts %>%
    select(-gene_name) %>%
    column_to_rownames(var = "gene_id") %>%
    as.matrix()
  mode(counts_matrix) <- "numeric"
  return(counts_matrix)
}

Apply the function to all the tissues (amygdala, cortex, hyppocampus and liver).

amygdala_counts <- read_and_adapt("RNASeq_tables/RNASeq_out_Amigdala/salmon.merged.gene_counts.tsv")
cortex_counts <- read_and_adapt("RNASeq_tables/RNASeq_out_Cortex/salmon.merged.gene_counts.tsv")
hippocampus_counts <- read_and_adapt("RNASeq_tables/RNASeq_out_Hippocampus/salmon.merged.gene_counts.tsv")
liver_counts <- read_and_adapt("RNASeq_tables/RNASeq_out_Liver/salmon.merged.gene_counts.tsv")

#Save for later use
write.table(amygdala_counts, file = "differential_analysis/amygdala_counts.tsv", sep = "\t", quote = FALSE, row.names = TRUE)

Example of the amygdala count matrix (first 6 rows, first 5 columns):

(#tab:amygdala_input)First rows and columns of the amygdala count matrix
A10	A11	A12	A13	A14
ENSSSCG00000000002	6	5	44	59
ENSSSCG00000000003	481	476	473	474
ENSSSCG00000000005	541	666	617	693
ENSSSCG00000000006	228	413	310	268
ENSSSCG00000000007	301	467	348	347
ENSSSCG00000000010	749	1192	1154	1825

2.1.2 Filter low-expressed genes

Keep genes with CPM values greater than one in at least two samples. The CPM value is used instead of the read count to eliminate the deviation caused by different sequencing depths.

filter_CPM <- function(counts_matrix, min_cpm = 1, min_samples = 2){
  cpm_values <- cpm(counts_matrix)
  keep <- rowSums(cpm_values > min_cpm) >= min_samples
  filtered_matrix <- counts_matrix[keep, , drop=FALSE]
  return(filtered_matrix)
}

Apply the function to all the tissues (amygdala, cortex, hyppocampus and liver).

amygdala_counts_f <- filter_CPM(amygdala_counts)
cortex_counts_f   <- filter_CPM(cortex_counts)
hippocampus_counts_f <- filter_CPM(hippocampus_counts)
liver_counts_f    <- filter_CPM(liver_counts)

# Save for later use
saveRDS(amygdala_counts_f, "differential_analysis/limma_voom/amygdala_counts_f.rds")
saveRDS(cortex_counts_f, "differential_analysis/limma_voom/cortex_counts_f.rds")
saveRDS(hippocampus_counts_f, "differential_analysis/limma_voom/hippocampus_counts_f.rds")
saveRDS(liver_counts_f, "differential_analysis/limma_voom/liver_counts_f.rds")

2.1.3 Import sample metadata

Import all the sample information regarding treatment condition and sex for all tissues.

# Read metadata
metadata_amygdala <- read.table("metadata/metadata_amygdala.txt", header = TRUE)
metadata_cortex <- read.table("metadata/metadata_cortex.txt", header = TRUE)
metadata_hippocampus <- read.table("metadata/metadata_hippocampus.txt", header = TRUE)
metadata_liver <- read.table("metadata/metadata_liver.txt", header = TRUE)

# Make sure sample order is the same in data and metadata
metadata_liver <- metadata_liver[
  match(colnames(liver_counts), metadata_liver$sample),
]

Example of metadata for amygdala samples.

(#tab:amygdala_metadata)First rows of amygdala metadata
sample	tissue	condition	sex	Code_Xavi	Day_sacrif	ID_code	ID
A10	Amygdala	control	F	8	Oct10	F850	946
A11	Amygdala	control	M	9	Oct10	M997	991
A12	Amygdala	control	M	10	Oct10	M329	993
A13	Amygdala	control	F	11	Oct10	F830	987
A14	Amygdala	control	F	12	Oct10	F825	990
A15	Amygdala	control	F	13	Oct10	F4936	988

Group data and metadata into lists for later use.

# Group data and metadata into lists for later use
counts_list <- list(amygdala = amygdala_counts_f, cortex = cortex_counts_f, hippocampus = hippocampus_counts_f, liver = liver_counts_f)
metadata_list <-list(amygdala = metadata_amygdala, cortex = metadata_cortex, hippocampus = metadata_hippocampus, liver = metadata_liver)

2.1.4 Identify outliers

We applied two different approaches to objectively identify sample outliers in each tissue using Principal Component Analysis (PCA).

Both methods use the 500 most variable genes, log-transformed counts, and color samples by experimental group (e.g., control or stress). Outliers are highlighted with a different shape and labeled in the plot. Ellipses indicate the 95% confidence region for each group.

2.1.4.1 Euclidean distance to the robust centroid (MAD method)

Samples are considered outliers if their distance to the median sample in PCA space exceeds a threshold based on the median absolute deviation (MAD).

detect_pca_outliers_euclidean <- function(counts_matrix, metadata, tissue, 
                                           n_top_genes = 500, n_pcs = 2, mad_thresh = 3, ellipse.level = 0.95,
                                           group_col = "condition") {
  # Top 500 most variable genes
  vars <- apply(counts_matrix, 1, var)
  top_genes <- names(sort(vars, decreasing = TRUE))[1:min(n_top_genes, length(vars))]
  log_counts <- log2(counts_matrix[top_genes, ] + 1)
  
  # PCA
  pca <- prcomp(t(log_counts), scale. = TRUE)
  pc_scores <- pca$x[, 1:n_pcs, drop = FALSE]
  
  # Centroide robusto y distancia euclidiana
  center <- apply(pc_scores, 2, median)
  dists <- sqrt(rowSums((pc_scores - matrix(center, nrow=nrow(pc_scores), ncol=n_pcs, byrow=TRUE))^2))
  
  # Outliers
  mad_val <- mad(dists)
  outlier_samples <- names(dists)[dists > median(dists) + mad_thresh * mad_val]
  
  # Dataframe
  pca_df <- data.frame(
    Sample = rownames(pc_scores),
    PC1 = pc_scores[,1],
    PC2 = pc_scores[,2],
    Outlier = factor(ifelse(rownames(pc_scores) %in% outlier_samples, "Outlier", "Normal"), levels = c("Normal", "Outlier")),
    Group = as.factor(metadata[match(rownames(pc_scores), metadata$sample), group_col])
  )
  
  # Plot
  p <- ggplot(pca_df, aes(x = PC1, y = PC2, color = Group, shape = Outlier, label = Sample)) +
    geom_point(size = 3) +
    stat_ellipse(aes(group = Group, color = Group), level = ellipse.level, linetype = 2) +
    geom_text(data = subset(pca_df, Outlier == "Outlier"), vjust = -1, size = 3) +
    labs(title = paste("PCA with Outlier Detection (", tissue, ")", sep = ""),
         x = "PC1", y = "PC2") +
    scale_shape_manual(values = c(16, 17)) +
    theme_bw()
  
  return(list(plot = p, outliers = outlier_samples, pca_df = pca_df))
}

Apply to all the tissues:

results_pca_outliers <- list()
for (tissue in names(counts_list)) {
  results_pca_outliers[[tissue]] <- detect_pca_outliers_euclidean(
    counts_matrix = counts_list[[tissue]],
    metadata = metadata_list[[tissue]],
    tissue = tissue
  )
  print(results_pca_outliers[[tissue]]$plot)
  print(results_pca_outliers[[tissue]]$outliers)
  
  outlier_dir <- "differential_analysis/outliers"
  if (!dir.exists(outlier_dir)) {
  dir.create(outlier_dir, recursive = TRUE)
  }
  
  ggsave(
    filename = file.path(outlier_dir, paste0("PCA_outliers_MAD_", tissue, ".png")),
    plot = results_pca_outliers[[tissue]]$plot,
    width = 6, height = 5, dpi = 300
  )
}

2.1.4.2 Mahalanobis distance

Samples are considered outliers if their Mahalanobis distance in PCA space corresponds to a p-value below a given threshold, accounting for the covariance structure.

detect_pca_outliers_mahalanobis <- function(counts_matrix, metadata, tissue, 
                                            n_top_genes = 500, n_pcs = 2, pval_thresh = 0.01, ellipse.level = 0.95,
                                            group_col = "condition") {
  # Top variable genes
  vars <- apply(counts_matrix, 1, var)
  top_genes <- names(sort(vars, decreasing = TRUE))[1:min(n_top_genes, length(vars))]
  log_counts <- log2(counts_matrix[top_genes, ] + 1)
  
  # PCA
  pca <- prcomp(t(log_counts), scale. = TRUE)
  pc_scores <- pca$x[, 1:n_pcs, drop = FALSE]
  
  # Mahalanobis distance
  center <- colMeans(pc_scores)
  covariance <- cov(pc_scores)
  mahal_dist <- mahalanobis(pc_scores, center, covariance)
  pvals <- pchisq(mahal_dist, df = n_pcs, lower.tail = FALSE)
  
  outlier_samples <- names(pvals)[pvals < pval_thresh]
  
  # Dataframe for plotting
  pca_df <- data.frame(
    Sample = rownames(pc_scores),
    PC1 = pc_scores[,1],
    PC2 = pc_scores[,2],
    Outlier = factor(ifelse(rownames(pc_scores) %in% outlier_samples, "Outlier", "Normal"), levels = c("Normal", "Outlier")),
    Group = as.factor(metadata[match(rownames(pc_scores), metadata$sample), group_col]),
    pval = pvals
  )
  
  # Plot with ggplot2
  p <- ggplot(pca_df, aes(x = PC1, y = PC2, color = Group, shape = Outlier, label = Sample)) +
    geom_point(size = 3) +
    stat_ellipse(aes(group = Group, color = Group), level = ellipse.level, linetype = 2) +
    geom_text(data = subset(pca_df, Outlier == "Outlier"), vjust = -1, size = 3) +
    labs(title = paste("PCA with Mahalanobis Outlier Detection (", tissue, ")", sep = ""),
         x = "PC1", y = "PC2") +
    scale_shape_manual(values = c(16, 17)) +
    theme_bw()
  
  return(list(plot = p, outliers = outlier_samples, pca_df = pca_df))
}

Apply to all tissues

results_pca_outliers_mahal <- list()
for (tissue in names(counts_list)) {
  results_pca_outliers_mahal[[tissue]] <- detect_pca_outliers_mahalanobis(
    counts_matrix = counts_list[[tissue]],
    metadata = metadata_list[[tissue]],
    tissue = tissue
  )
  print(results_pca_outliers_mahal[[tissue]]$plot)
  print(results_pca_outliers_mahal[[tissue]]$outliers)
  
  outlier_dir <- "differential_analysis/outliers"
  if (!dir.exists(outlier_dir)) {
  dir.create(outlier_dir, recursive = TRUE)
  }
  
  ggsave(
    filename = file.path(outlier_dir, paste0("PCA_outliers_MH_", tissue, ".png")),
    plot = results_pca_outliers_mahal[[tissue]]$plot,
    width = 6, height = 5, dpi = 300
  )
}

2.1.4.3 PCA plots

PCA outlier plots for each tissue are saved in differential_analysis/outliers/ and displayed below. Outliers are highlighted for each method, enabling objective comparison and selection of outlier samples prior to downstream analysis.

2.1.4.3.1 Amygdala

Euclidean (MAD)

Mahalanobis

2.1.4.3.2 Cortex

Euclidean (MAD)

Mahalanobis

2.1.4.3.3 Hippocampus

Euclidean (MAD)

Mahalanobis

2.1.4.3.4 Liver

Euclidean (MAD)

Mahalanobis

2.2 Limma voom

2.2.1 Methodology

First, we create a function to apply Limma‑voom analysis to all tissues. In this function, the first step is to create a design matrix specific for Limma‑voom analysis using a group vector. Then we create a DGEList object and normalise the data by the TMM method. Next, we apply voom, which transforms the count data to log‑counts per million (logCPM) and computes observation‑level weights to account for the mean‑variance relationship typical of RNA‑seq data. After voom, we fit a linear model using lmFit and then apply empirical Bayes moderation with eBayes to obtain moderated t‑statistics, p‑values and false discovery rates (FDR). The function then extracts the results table for the contrast of interest (stress vs. control) using topTable. Based on user‑defined thresholds (log₂FC > 2, FDR < 0.05), it classifies each gene as up‑regulated, down‑regulated or not significant. Finally, the function returns a list containing the results table, the DGEList object, the voom object, the design matrix and a summary of the number of differentially expressed genes.

run_limma_voom <- function(counts_matrix, metadata, group_col = "condition", covariates = c("sex"), logFC_thresh = 2, pval_thresh = 0.05) {

  # Create group (factor) using condition column
  group <- factor(metadata[[group_col]], levels = c("control", "stress"))
  
  # Build the design formula including group and covariables
  design_formula <- as.formula(paste("~", group_col, "+", paste0(covariates, collapse = "+")))
  design <- model.matrix(design_formula, data = metadata)
  rownames(design) <- metadata$sample
  
  # Create DGEList and normalise it by TMM method
  dge <- DGEList(counts = counts_matrix, group = group)
  dge <- calcNormFactors(dge, method = "TMM")
  
  # Apply voom
  v <- voom(dge, design, plot = FALSE)
  
  # Adjust the linear model
  fit <- lmFit(v, design)
  fit <- eBayes(fit)
  
  # Extract result table
  res <- topTable(fit, coef = "conditionstress", n = Inf, sort.by = "none")
  
  # Create significance column based on thresholds
  res$sig <- factor(
    ifelse(res$adj.P.Val < pval_thresh & abs(res$logFC) > logFC_thresh,
           ifelse(res$logFC > logFC_thresh, "up", "down"),
           "not"),
    levels = c("up", "down", "not")
  )
  
  # Summary of DEGs
  deg_summary <- summary(res$sig)
  
  # Results
  return(list(results = res, dge = dge, v = v, design = design, deg_summary = deg_summary))
}

Apply the function to all the tissues (amygdala, cortex, hyppocampus and liver).

#Apply the function to each pair of data and metadata
limma_results <- map2(counts_list, metadata_list, ~ run_limma_voom(counts_matrix = .x, metadata = .y))

#Save result tables
walk2(limma_results, names(limma_results), ~ write.csv(.x$results, file = file.path("differential_analysis/limma_voom", paste0("result_limma_voom_", .y, ".csv")), row.names = TRUE))

2.2.2 Results

The following tables show the first few lines of the results for each tissue (genes with logFC, p-values, etc.). Full tables are available in the differential_analysis/limma_voom/ folder.

2.2.2.1 Amygdala

Top 6 rows of limma-voom results for amygdala.

(#tab:limma_voom_amygdala)Limma‑voom results for amygdala (top 6 rows)
	logFC	AveExpr	t	P.Value	adj.P.Val	B	sig
ENSSSCG00000000002	0.1272223	-1.720627	0.4688020	0.6411920	0.8646184	-4.815137	not
ENSSSCG00000000003	0.1471246	3.441317	1.9250392	0.0597626	0.3621335	-4.114587	not
ENSSSCG00000000005	0.0311047	3.624529	0.4917307	0.6250017	0.8570678	-5.666735	not
ENSSSCG00000000006	-0.0068489	2.696342	-0.0739173	0.9413629	0.9817962	-5.628632	not
ENSSSCG00000000007	0.0700214	3.040492	0.6960261	0.4895456	0.7909370	-5.472915	not
ENSSSCG00000000010	0.0126956	4.677031	0.0960375	0.9238640	0.9755966	-5.872082	not

Summary of DEGs in amygdala (up/down/not):


  not 
16196

2.2.2.2 Cortex

Top 6 rows of limma-voom results for cortex.

(#tab:limma_voom_cortex)Limma‑voom results for cortex (top 6 rows)
	logFC	AveExpr	t	P.Value	adj.P.Val	B	sig
ENSSSCG00000000003	0.0158026	3.267039	0.1867906	0.8525458	0.9298206	-5.909755	not
ENSSSCG00000000005	-0.1503963	3.638338	-2.0207378	0.0484278	0.2260110	-4.110167	not
ENSSSCG00000000006	-0.0335232	2.683880	-0.2923176	0.7711975	0.8858632	-5.780618	not
ENSSSCG00000000007	-0.0631811	3.329472	-0.7951129	0.4301329	0.6661372	-5.638570	not
ENSSSCG00000000010	-0.4141370	5.146826	-1.7194817	0.0914261	0.3068071	-4.735873	not
ENSSSCG00000000014	0.1002355	3.733248	1.5714403	0.1220967	0.3561970	-4.844817	not

Summary of DEGs in cortex (up/down/not):


 down   not 
    1 15986

2.2.2.3 Hippocampus

Top 6 rows of limma-voom results for hippocampus.

(#tab:limma_voom_hippocampus)Limma‑voom results for hippocampus (top 6 rows)
	logFC	AveExpr	t	P.Value	adj.P.Val	B	sig
ENSSSCG00000000003	0.1490716	3.372600	2.0723106	0.0432079	0.1152114	-4.357393	not
ENSSSCG00000000005	-0.0470691	3.654761	-0.5897833	0.5578883	0.6860923	-6.268605	not
ENSSSCG00000000006	-0.2591378	2.540821	-1.8905322	0.0642612	0.1510190	-4.563778	not
ENSSSCG00000000007	-0.1970656	3.114347	-2.1530055	0.0359746	0.1018307	-4.166996	not
ENSSSCG00000000010	-0.4180822	4.979186	-2.0716377	0.0432731	0.1153451	-4.500896	not
ENSSSCG00000000014	-0.0036010	3.566223	-0.0507714	0.9597021	0.9769166	-6.427767	not

Summary of DEGs in hippocampus (up/down/not):


 down   not    up 
   51 15965    28

2.2.2.4 Liver

Top 6 rows of limma-voom results for liver.

(#tab:limma_voom_liver)Limma‑voom results for liver (top 6 rows)
	logFC	AveExpr	t	P.Value	adj.P.Val	B	sig
ENSSSCG00000000003	0.5283572	6.2095058	5.0514294	0.0000054	0.0001118	3.823475	not
ENSSSCG00000000005	0.8546173	6.0164992	4.9550395	0.0000076	0.0001426	3.493476	not
ENSSSCG00000000006	1.1894580	4.3657484	5.1540402	0.0000038	0.0000854	4.151317	not
ENSSSCG00000000007	0.1605117	2.6281456	1.6179670	0.1115530	0.2097834	-5.160816	not
ENSSSCG00000000010	-0.1217155	7.6243238	-1.2215405	0.2272387	0.3570506	-5.706670	not
ENSSSCG00000000014	-0.0140391	0.8048311	-0.1286167	0.8981434	0.9356727	-6.033441	not

Summary of DEGs in liver (up/down/not):


 down   not    up 
    4 13143    17

2.2.3 Volcano plots

Create volcano plots for each tissue using the volcano_plot function (thresholds: log₂FC > 2, FDR < 0.05). The plots are saved as PNG and PDF in differential_analysis/limma_voom/volcano_plots/. They are displayed below.

# Volcano plot function
volcano_plot <- function(data, thre_logFC = 2, thre_pval = 0.05) {
  data$color <- as.character(
    ifelse(data$adj.P.Val < thre_pval & abs(data$logFC) > thre_logFC,
           ifelse(data$logFC > thre_logFC, 'red', 'green'),
           'grey'))
  
  p <- ggplot(data, aes(x = logFC, y = -log10(adj.P.Val))) +
    geom_point(size = 0.25, colour = data$color) +
    theme_bw() +
    geom_hline(yintercept = -log10(thre_pval), colour = "black", linetype = "dashed") +
    geom_vline(xintercept = -thre_logFC, colour = "black", linetype = "dashed") +
    geom_vline(xintercept = thre_logFC, colour = "black", linetype = "dashed") +
    theme(axis.text.y = element_text(size = 10, face = "plain"),
          axis.title.y = element_text(size = 10, face = "bold"),
          panel.border = element_blank(),
          axis.line = element_line(colour = "black", size = 0.8),
          legend.position = "none",
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.background = element_rect(fill = "white")) +
    ylab("-log10(adj.P.Val)") +
    xlab("log2FC")
  
  return(p)
}

Apply the function to all the tissues and save the plots.

#Define output directory
output_dir <- "differential_analysis/limma_voom"
volcano_dir <- file.path(output_dir, "volcano_plots")
if (!dir.exists(volcano_dir)) dir.create(volcano_dir, recursive = TRUE)

# Iterate over the results and save each volcano plot
walk2(limma_results, names(limma_results), function(res, tissue) {
  # Extract results
  df <- res$results
  # Generate plot
  p <- volcano_plot(df, thre_logFC = 2, thre_pval = 0.05)
  # Save as png
  ggsave(filename = file.path(volcano_dir, paste0("volcano_", tissue, ".png")),
         plot = p, width = 6, height = 5, dpi = 300)
  # Save as pdf
  ggsave(filename = file.path(volcano_dir, paste0("volcano_", tissue, ".pdf")),
         plot = p, width = 6, height = 5)
})

Amygdala

Cortex

Hippocampus

Liver

2.3 EdgeR

2.3.1 Methodology

This code implements a function in R to perform differential expression analysis using edgeR. Unlike limma-voom, which transforms count data for linear modeling, edgeR directly models raw count data using negative binomial generalized linear models. The function takes a count matrix and a metadata table as input, builds a design matrix including covariates (such as sex), fits the statistical model, and returns a table of differentially expressed genes between groups of interest, correcting for the specified covariates.

run_edgeR <- function(counts_matrix, metadata, group_col = "condition", covariates = c("sex"), logFC_thresh = 2, pval_thresh = 0.05) {
  # Ensure that the order of the columns in `counts_matrix` matches that in `metadata$sample
  counts_matrix <- counts_matrix[, metadata$sample]
  
  # Create the group vector
  group <- factor(metadata[[group_col]], levels = c("control", "stress"))
  
  # Build the design matrix using covariates
  design_formula <- as.formula(paste("~", group_col, "+", paste0(covariates, collapse = "+")))
  design <- model.matrix(design_formula, data = metadata)
  rownames(design) <- metadata$sample
  
  # Create the DGEList object and normalise it using TMM method
  dge <- DGEList(counts = counts_matrix, group = group)
  dge <- calcNormFactors(dge, method = "TMM")
  
  # Estimate the dispersion
  dge <- estimateDisp(dge, design, robust = TRUE)
  
  # Fits the generalised linear model (GLM) and performs the quasi-likelihood F-test
  fit <- glmQLFit(dge, design)
  qlf <- glmQLFTest(fit, coef = "conditionstress")
  
  # Extract the results table
  res <- topTags(qlf, n = Inf)$table
  
  # Create a significance column based on thresholds
  res$sig <- factor(
    ifelse(res$FDR < pval_thresh & abs(res$logFC) > logFC_thresh,
           ifelse(res$logFC > logFC_thresh, "up", "down"),
           "not"),
    levels = c("up", "down", "not")
  )
  
  # Summary of DEGs
  deg_summary <- summary(res$sig)
  
  
  # Results
  return(list(results = res, dge = dge, fit = fit, design = design, deg_summary = deg_summary))
}

Apply the function to all the tissues (amygdala, cortex, hyppocampus and liver).

# Apply the edgeR function to each pair
edgeR_results <- map2(counts_list, metadata_list, ~ run_edgeR(counts_matrix = .x, metadata = .y))

# Save result tables
walk2(edgeR_results, names(edgeR_results), ~ 
  write.csv(.x$results, file = file.path("differential_analysis/edgeR", paste0("result_edgeR_", .y, ".csv")), row.names = TRUE)
)

2.3.2 Results

The following tables show the first few lines of the results for each tissue (genes with logFC, p-values, etc.). Full tables are available in the differential_analysis/edgeR/ folder.

2.3.2.1 Amygdala

Top 6 rows of edgeR results for amygdala:

(#tab:edgeR_amygdala)edgeR results for amygdala (top 6 rows)
	logFC	logCPM	F	PValue	FDR	sig
ENSSSCG00000008011	3.578722	-1.116083	27.94790	1.3e-06	0.0136809	up
ENSSSCG00000012346	3.439099	-2.345614	26.05372	1.8e-06	0.0136809	up
ENSSSCG00000054680	3.203517	-2.094333	25.28025	2.5e-06	0.0136809	up
ENSSSCG00000001483	2.943937	-1.753810	24.74153	3.8e-06	0.0152573	up
ENSSSCG00000039413	2.748904	-1.233918	23.57103	7.4e-06	0.0238941	up
ENSSSCG00000009665	3.106066	1.329164	24.36399	9.0e-06	0.0242851	up

Summary of DEGs in amygdala (up/down/not):


  not    up 
16183    13

2.3.2.2 Cortex

Top 6 rows of edgeR results for cortex:

(#tab:edgeR_cortex)edgeR results for cortex (top 6 rows)
	logFC	logCPM	F	PValue	FDR	sig
ENSSSCG00000000932	5.1358420	1.700749	33.96057	2.0e-07	0.0038276	up
ENSSSCG00000002729	-0.2021708	5.536446	28.17967	2.3e-06	0.0144869	not
ENSSSCG00000005904	-0.2019873	6.684174	27.24570	3.1e-06	0.0144869	not
ENSSSCG00000025293	0.2321773	5.943974	26.02142	4.8e-06	0.0144869	not
ENSSSCG00000015048	4.3451482	2.092224	24.56285	5.0e-06	0.0144869	up
ENSSSCG00000024413	-0.2047066	8.944727	25.59156	5.5e-06	0.0144869	not

Summary of DEGs in cortex (up/down/not):


 down   not    up 
    7 15973     7

2.3.2.3 Hippocampus

Top 6 rows of edgeR results for hippocampus:

(#tab:edgeR_hippocampus)edgeR results for hippocampus (top 6 rows)
	logFC	logCPM	F	PValue	FDR	sig
ENSSSCG00000021018	3.594479	-0.1780427	35.63069	2e-07	0.0008321	up
ENSSSCG00000034102	1.931049	5.3105377	35.80301	2e-07	0.0008321	not
ENSSSCG00000014124	2.863863	4.1445362	35.65344	2e-07	0.0008321	up
ENSSSCG00000055661	3.176130	1.3597646	35.16443	3e-07	0.0008321	up
ENSSSCG00000003778	3.758276	-1.0977240	31.18360	3e-07	0.0008321	up
ENSSSCG00000017909	2.687305	2.2701125	34.46075	3e-07	0.0008321	up

Summary of DEGs in hippocampus (up/down/not):


 down   not    up 
    2 15934   108

2.3.2.4 Liver

Top 6 rows of edgeR results for liver:

(#tab:edgeR_liver)edgeR results for liver(top 6 rows)
	logFC	logCPM	F	sig
ENSSSCG00000001424	0.5619546	4.367224	78.02654	not
ENSSSCG00000017882	0.6987771	6.106816	76.94772	not
ENSSSCG00000021289	0.5652605	4.862521	76.56571	not
ENSSSCG00000040624	0.6443372	3.535575	76.45935	not
ENSSSCG00000001409	0.7205550	5.880420	75.79880	not
ENSSSCG00000013900	0.7083329	4.101120	71.49054	not

Summary of DEGs in liver (up/down/not):


 down   not    up 
    3 13153     8

2.3.3 Volcano plots

The following volcano plots were generated for each tissue using the same volcano_plot function as for the limma-voom analysis. The function is applied to the edgeR results, with the same thresholds (log₂FC > 2, FDR < 0.05). Plots are saved as PNG and PDF files in differential_analysis/edgeR/volcano_plots/, and are displayed below.

Apply the function to all the tissues and save the plots.

# Define output directory for edgeR volcano plots
output_dir <- "differential_analysis/edgeR"
volcano_dir <- file.path(output_dir, "volcano_plots")
if (!dir.exists(volcano_dir)) dir.create(volcano_dir, recursive = TRUE)

# Iterate over the edgeR results and save each volcano plot
walk2(edgeR_results, names(edgeR_results), function(res, tissue) {
  df <- res$results
  # Note: For edgeR, use FDR instead of adj.P.Val
  colnames(df)[colnames(df) == "FDR"] <- "adj.P.Val"  # For compatibility with volcano_plot
  p <- volcano_plot(df, thre_logFC = 2, thre_pval = 0.05)
  # Save as PNG
  ggsave(filename = file.path(volcano_dir, paste0("volcano_", tissue, ".png")),
         plot = p, width = 6, height = 5, dpi = 300)
  # Save as PDF
  ggsave(filename = file.path(volcano_dir, paste0("volcano_", tissue, ".pdf")),
         plot = p, width = 6, height = 5)
})

Amygdala

Cortex

Hippocampus

Liver

2.4 DESeq2

2.4.1 Methodology

This code defines a function to perform differential expression analysis using DESeq2. The function takes an integer count matrix and a metadata table as input, builds a design matrix including covariates (such as sex), and runs the DESeq2 pipeline. Unlike limma-voom, which transforms count data for linear modeling, and edgeR, which uses negative binomial models with different estimation procedures, DESeq2 also models count data with a negative binomial distribution but uses its own methods for normalisation and dispersion estimation. The function returns a table of differentially expressed genes between groups of interest, applying user-defined thresholds (log₂FC > 2, FDR < 0.05), and provides a summary of upregulated and downregulated genes.

run_DESeq2 <- function(counts_matrix, metadata, group_col = "condition", covariates = c("sex"), logFC_thresh = 2, pval_thresh = 0.05) {
  
  # Convert counts to integer (DESeq2 requirement)
  counts_matrix <- round(counts_matrix)
  storage.mode(counts_matrix) <- "integer"
  
  # Build the design formula
  design_formula <- as.formula(paste("~", group_col, "+", paste0(covariates, collapse = "+")))
  
  # Create DESeqDataSet
  dds <- DESeqDataSetFromMatrix(
    countData = counts_matrix,
    colData = metadata,
    design = design_formula
  )
  
  # Run DESeq2
  dds <- DESeq(dds)
  
  # Extract results for the main group contrast (e.g. stress vs control)
  contrast <- c(group_col, "stress", "control")
  res <- results(dds, contrast = contrast)
  res <- as.data.frame(res)
  
  # Add gene IDs as rownames (if not already)
  if (is.null(rownames(res))) rownames(res) <- rownames(counts_matrix)
  
  # Add logFC column for compatibility
  res$logFC <- res$log2FoldChange
  
  # Add significance column
  res$sig <- factor(
    ifelse(res$padj < pval_thresh & abs(res$logFC) > logFC_thresh,
           ifelse(res$logFC > logFC_thresh, "up", "down"),
           "not"),
    levels = c("up", "down", "not")
  )
  
  deg_summary <- summary(res$sig)
  
  return(list(results = res, dds = dds, deg_summary = deg_summary))
}

Apply the function to all the tissues.

# Apply the DESeq2 function to each pair of data and metadata
DESeq2_results <- map2(counts_list, metadata_list, ~ run_DESeq2(counts_matrix = .x, metadata = .y))

# Save result tables
walk2(DESeq2_results, names(DESeq2_results), ~ 
  write.csv(.x$results, file = file.path("differential_analysis/DESeq2", paste0("result_DESeq2_", .y, ".csv")), row.names = TRUE)
)

2.4.2 Results

2.4.2.1 Amygdala

Top 6 rows of DESeq2 results for amygdala:

(#tab:DESeq2_amygdala)DESeq2 results for amygdala (top 6 rows)
	baseMean	log2FoldChange	lfcSE	stat	pvalue	padj	logFC	sig
ENSSSCG00000000002	18.58295	0.1090747	0.2806962	0.3885864	0.6975821	0.9016860	0.1090747	not
ENSSSCG00000000003	550.75668	0.1481962	0.0762700	1.9430465	0.0520105	0.3119777	0.1481962	not
ENSSSCG00000000005	620.30083	0.0320247	0.0621768	0.5150584	0.6065122	0.8617177	0.0320247	not
ENSSSCG00000000006	330.75408	-0.0069834	0.0952062	-0.0733506	0.9415272	0.9822891	-0.0069834	not
ENSSSCG00000000007	421.80793	0.0804937	0.1018349	0.7904338	0.4292745	0.7591323	0.0804937	not
ENSSSCG00000000010	1352.54281	0.0014799	0.1367087	0.0108255	0.9913627	0.9976863	0.0014799	not

Summary of DEGs in amygdala (up/down/not):


  not    up 
16184    12

2.4.2.2 Cortex

Top 6 rows of DESeq2 results for cortex:

(#tab:DESeq2_cortex)DESeq2 results for cortex (top 6 rows)
	baseMean	log2FoldChange	lfcSE	stat	pvalue	padj	logFC	sig
ENSSSCG00000000003	524.6040	-0.0040772	0.0848951	-0.0480262	0.9616954	0.9787844	-0.0040772	not
ENSSSCG00000000005	678.5726	-0.1929807	0.0799856	-2.4126927	0.0158352	0.1044043	-0.1929807	not
ENSSSCG00000000006	356.7804	-0.0550498	0.1157944	-0.4754098	0.6344949	0.7863427	-0.0550498	not
ENSSSCG00000000007	547.2443	-0.0925411	0.0811355	-1.1405760	0.2540464	0.4703198	-0.0925411	not
ENSSSCG00000000010	2338.4609	-0.6098899	0.2457401	-2.4818492	0.0130703	0.0961893	-0.6098899	not
ENSSSCG00000000014	719.0256	0.0969554	0.0661094	1.4665901	0.1424876	0.3354712	0.0969554	not

Summary of DEGs in cortex (up/down/not):


  not 
15983

2.4.2.3 Hippocampus

Top 6 rows of DESeq2 results for hippocampus:

(#tab:DESeq2_hippocampus)DESeq2 results for hippocampus (top 6 rows)
	baseMean	log2FoldChange	lfcSE	stat	pvalue	padj	logFC	sig
ENSSSCG00000000003	560.7572	0.1472090	0.0722623	2.0371494	0.0416351	0.1081946	0.1472090	not
ENSSSCG00000000005	683.9935	-0.0342954	0.0813476	-0.4215907	0.6733238	0.7756971	-0.0342954	not
ENSSSCG00000000006	326.7181	-0.2231543	0.1294650	-1.7236653	0.0847683	0.1788632	-0.2231543	not
ENSSSCG00000000007	473.9274	-0.1831979	0.0935225	-1.9588643	0.0501287	0.1229574	-0.1831979	not
ENSSSCG00000000010	1972.6162	-0.4061736	0.2111636	-1.9235020	0.0544170	0.1305618	-0.4061736	not
ENSSSCG00000000014	639.8191	0.0029228	0.0706021	0.0413977	0.9669788	0.9794323	0.0029228	not

Summary of DEGs in hippocampus (up/down/not):


 down   not    up 
    3 15925   116

2.4.2.4 Liver

Top 6 rows of DESeq2 results for liver:

(#tab:DESeq2_liver)DESeq2 results for liver (top 6 rows)
	baseMean	log2FoldChange	lfcSE	stat	pvalue	padj	logFC	sig
ENSSSCG00000000003	4124.46119	0.4984285	0.1027084	4.8528494	0.0000012	0.0000212	0.4984285	not
ENSSSCG00000000005	3894.62173	0.7076342	0.1695440	4.1737487	0.0000300	0.0003043	0.7076342	not
ENSSSCG00000000006	1379.78243	1.1114666	0.2156778	5.1533667	0.0000003	0.0000058	1.1114666	not
ENSSSCG00000000007	340.23022	0.1009453	0.1086170	0.9293697	0.3526975	0.4884925	0.1009453	not
ENSSSCG00000000010	10817.93454	-0.1616482	0.0978577	-1.6518698	0.0985611	0.1858290	-0.1616482	not
ENSSSCG00000000014	95.84374	0.0190787	0.1131682	0.1685875	0.8661211	0.9121717	0.0190787	not

Summary of DEGs in liver (up/down/not):


 down   not    up 
    4 13152     8

2.4.3 Volcano plots

The following volcano plots were generated for each tissue using the same volcano_plot function as for the limma-voom and edgeR analyses. The function is applied to the DESeq2 results, with the same thresholds (log₂FC > 2, padj < 0.05). Plots are saved as PNG and PDF files in differential_analysis/DESeq2/volcano_plots/, and are displayed below.

# Define output directory for DESeq2 volcano plots
output_dir <- "differential_analysis/DESeq2"
volcano_dir <- file.path(output_dir, "volcano_plots")
if (!dir.exists(volcano_dir)) dir.create(volcano_dir, recursive = TRUE)

# Iterate over the DESeq2 results and save each volcano plot
walk2(DESeq2_results, names(DESeq2_results), function(res, tissue) {
  df <- res$results
  # For compatibility with volcano_plot function, rename padj to adj.P.Val if necessary
  if ("padj" %in% colnames(df)) colnames(df)[colnames(df) == "padj"] <- "adj.P.Val"
  p <- volcano_plot(df, thre_logFC = 2, thre_pval = 0.05)
  # Save as PNG
  ggsave(filename = file.path(volcano_dir, paste0("volcano_", tissue, ".png")),
         plot = p, width = 6, height = 5, dpi = 300)
  # Save as PDF
  ggsave(filename = file.path(volcano_dir, paste0("volcano_", tissue, ".pdf")),
         plot = p, width = 6, height = 5)
})

Amygdala

Cortex

Hippocampus

Liver

2.5 Comparison of methods

This code defines a function to compare the differential expression results obtained from limma-voom, edgeR, and DESeq2 for each tissue. For each method, the function extracts the lists of upregulated and downregulated genes and generates Venn diagrams to visualize the overlap between methods. The resulting plots, saved in the differential_analysis/venn_diagrams/ folder, allow for a straightforward comparison of the consistency and differences in gene detection across the three differential expression analysis approaches.

run_venn <- function(
  limma_file, edgeR_file, DESeq2_file, 
  output_dir = "differential_analysis/venn_diagrams",
  tissue = "Tissue"
) {
  # Create output directory if it doesn't exist
  if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)
  
  # Read result files
  res_limma  <- read.csv(limma_file, row.names = 1)
  res_edgeR  <- read.csv(edgeR_file, row.names = 1)
  res_DESeq2 <- read.csv(DESeq2_file, row.names = 1)
  
  for (sig_type in c("up", "down")) {
    # Remove NAs from 'sig' column in each result
    res_limma_filt  <- res_limma[!is.na(res_limma$sig), ]
    res_edgeR_filt  <- res_edgeR[!is.na(res_edgeR$sig), ]
    res_DESeq2_filt <- res_DESeq2[!is.na(res_DESeq2$sig), ]
    
    # Extract DEGs for the current type
    DEGs_limma  <- rownames(res_limma_filt)[res_limma_filt$sig == sig_type]
    DEGs_edgeR  <- rownames(res_edgeR_filt)[res_edgeR_filt$sig == sig_type]
    DEGs_DESeq2 <- rownames(res_DESeq2_filt)[res_DESeq2_filt$sig == sig_type]
    
    # Skip if all sets are empty
    if (length(DEGs_limma) == 0 & length(DEGs_edgeR) == 0 & length(DEGs_DESeq2) == 0) {
      message(paste("No", sig_type, "genes in any method for", tissue, "- skipping Venn diagram."))
      next
    }
    
    venn_list <- list(
      limma = DEGs_limma,
      edgeR = DEGs_edgeR,
      DESeq2 = DEGs_DESeq2
    )
    
    main_title <- paste(
      toupper(substring(sig_type, 1, 1)), substring(sig_type, 2),
      "-regulated DEGs overlap (", tissue, ")", sep = ""
    )
    file_name <- paste0("venn_", tolower(tissue), "_", sig_type, ".png")
    output_path <- file.path(output_dir, file_name)
    
    venn.plot <- venn.diagram(
      x = venn_list,
      filename = NULL,
      fill = c("#FF6666", "#FFFF00", "#993366"),
      alpha = 0.5,
      cex = 1.5,
      cat.cex = 1.5,
      main = main_title
    )
    
    png(output_path, width=800, height=800, res=120)
    grid.newpage()
    grid.draw(venn.plot)
    dev.off()
    
    #clean up temporary files
    tmp_files <- list.files(pattern = "^VennDiagram\\.")
    if (length(tmp_files) > 0) file.remove(tmp_files)
  }
}

Apply run_venn function to all the tissues:

# Amygdala
run_venn(
  limma_file  = "differential_analysis/limma_voom/result_limma_voom_amygdala.csv",
  edgeR_file  = "differential_analysis/edgeR/result_edgeR_amygdala.csv",
  DESeq2_file = "differential_analysis/DESeq2/result_DESeq2_amygdala.csv",
  tissue = "Amygdala"
)
# Cortex
run_venn(
  limma_file  = "differential_analysis/limma_voom/result_limma_voom_cortex.csv",
  edgeR_file  = "differential_analysis/edgeR/result_edgeR_cortex.csv",
  DESeq2_file = "differential_analysis/DESeq2/result_DESeq2_cortex.csv",
  tissue = "Cortex"
)
# Hippocampus
run_venn(
  limma_file  = "differential_analysis/limma_voom/result_limma_voom_hippocampus.csv",
  edgeR_file  = "differential_analysis/edgeR/result_edgeR_hippocampus.csv",
  DESeq2_file = "differential_analysis/DESeq2/result_DESeq2_hippocampus.csv",
  tissue = "Hippocampus"
)
# Liver
run_venn(
  limma_file  = "differential_analysis/limma_voom/result_limma_voom_liver.csv",
  edgeR_file  = "differential_analysis/edgeR/result_edgeR_liver.csv",
  DESeq2_file = "differential_analysis/DESeq2/result_DESeq2_liver.csv",
  tissue = "Liver"
)

2.5.1 Venn diagram plots

2.5.1.1 Amygdala

Amygdala – Up-regulated

Amygdala – Down-regulated

2.5.1.2 Cortex

Cortex – Up-regulated

Cortex – Down-regulated

2.5.1.3 Hippocampus

Hippocampus – Up-regulated

Hippocampus – Down-regulated

2.5.1.4 Liver

Liver – Up-regulated

Liver – Down-regulated