Chapter 2 Prepare data
2.1 Bioinformatics
2.1.1 Get data
2.1.3 Experimental design
This project consists of 40 RNAseq samples. Twenty samples were collected at 3 h and the remaining 20 at 24 h. The experiment includes one control group (C) and three treatment groups, in which bacteria were exposed to 5 mg/mL of the following minerals: goethite (G), hematite (H) and mica (M).
Each condition includes 4 biological replicates for which RNA was extracted using the ZymoBIOMICS RNA Kit, and one replicate for which RNA was extracted using the DREX protocol.
2.2 Load data
Samples have been split in different lanes during sequencing so gene_counts.tsv.xz has more columns that what is expected for the conditions of the experiments (40 different samples).
read_counts <- read_tsv(
"data/gene_counts.tsv.xz",
comment = "#", # skip featureCounts metadata
col_names = TRUE, # header is present
show_col_types = FALSE # silence messages
) %>%
dplyr::rename(gene = Geneid) %>%
pivot_longer(!c(gene,Chr,Start,End,Strand,Length),names_to="library",values_to = "counts") %>%
mutate(sample = str_extract(library, "^[^_]+")) %>%
group_by(gene,Chr,Start,End,Strand,Length,sample) %>%
summarise(counts=sum(counts)) %>%
pivot_wider(c(gene,Chr,Start,End,Strand,Length), names_from = "sample", values_from = "counts") 2.3 Prepare data
Preliminary processing of the data to prepare it for visualizations
read_counts <- read_counts %>%
pivot_longer(!c(gene,Chr,Start,End,Strand,Length),names_to="sample",values_to = "counts") %>%
inner_join(sample_metadata, by="sample") %>%
mutate(ID = paste(`Sample ID`, str_sub(Extraction, 1, 1), sep = "_")) %>%
dplyr::select(gene, ID, counts) %>%
pivot_wider(c(gene), names_from = "ID", values_from = "counts") %>%
column_to_rownames(var = "gene")sample_metadata <- sample_metadata %>%
mutate(ID = paste(`Sample ID`, str_sub(Extraction, 1, 1), sep = "_")) %>%
column_to_rownames(var = "ID")
# Make sure order is the same as in metadata
read_counts <- read_counts[, rownames(sample_metadata)]2.3.1 Prepare data subsets
Subset data that will be used in the following analysis
# Filter out DREX samples
counts_wo_drex <- read_counts %>%
dplyr::select(-ends_with("D"))
metadata_wo_drex <- sample_metadata %>%
dplyr::filter(Extraction == "Zymo") %>%
droplevels()
# Fix the order in counts
# MOVE THIS TO AN EARLIER STEP IN THE WORKFLOW
#counts_wo_drex <- counts_wo_drex[, rownames(metadata_wo_drex)]2.3.2 Prepare annotation file
The annotation file provided was produced using Prodigal and lacks information that will be required later in the analysis like gene names, locus tags and Entrez IDs (required to run kegga() from limma package).
To add the missing information the BSGatlas GFF file for B.subtilis str 168 was downloaded from BSGatlas
# Use this if you want to use BSGatlas.gff annotation
# Load GFF
gff_bsg <- import("data/BSGatlas_v1.1.gff")
# Keep only gene features
genes_bsg <- gff_bsg[mcols(gff_bsg)$type == "gene"]
get_first <- function(x) {
sapply(x, function(v) {
if (length(v) == 0) NA_character_ else as.character(v[1])
})
}
get_last <- function(x) {
sapply(x, function(v) {
if (length(v) == 0) NA_character_ else as.character(v[length(v)])
})
}
bsg_annotation <- tibble(
seqnames = as.character(seqnames(genes_bsg)),
start = start(genes_bsg),
end = end(genes_bsg),
strand = as.character(strand(genes_bsg)),
bsg_id = mcols(genes_bsg)$ID,
gene = get_last(mcols(genes_bsg)$Name),
locus_tag = toupper(get_first(mcols(genes_bsg)$locus_tag)),
operon_id = get_first(mcols(genes_bsg)$Parent)
)
# Cleaning of bad locus tag names
bsg_annotation <- bsg_annotation %>%
mutate(
locus_priority = case_when(
grepl("^NEW_", locus_tag) ~ 2L,
TRUE ~ 1L
)
) %>%
group_by(bsg_id) %>%
arrange(locus_priority, locus_tag, .by_group = TRUE) %>%
slice(1) %>%
ungroup() %>%
select(-locus_priority)
readr::write_tsv(bsg_annotation, "data/bsg_gene_annotation.tsv")# Load the prepared refseq annotation file and join with the original annotation file
bsg_annotation <- read_tsv("data/bsg_gene_annotation.tsv")
complete_gene_annotation <- gene_annotations %>%
dplyr::rename(ID_prodigal = gene) %>%
left_join(bsg_annotation,
by = c("start", "end", "strand"))# Here I check if the two gff files are consistent, they differ in the gene name of 572 entries and the locus_tag of 3
# I decided to work with the BSG atlas gff as it is more refined for the B. subtilis, the species of interest in this work.
gff_refseq <- import("data/genomic_RefSeq.gff")
gff_bsg <- import("data/BSGatlas_v1.1.gff")
genes_refseq <- gff_refseq[mcols(gff_refseq)$type == "gene"]
genes_bsg <- gff_bsg[mcols(gff_bsg)$type == "gene"]
get_first <- function(x) {
sapply(x, function(v) {
if (length(v) == 0) NA_character_ else as.character(v[1])
})
}
get_last <- function(x) {
sapply(x, function(v) {
if (length(v) == 0) NA_character_ else as.character(v[length(v)])
})
}
df_refseq <- tibble(
start = start(genes_refseq),
end = end(genes_refseq),
strand = as.character(strand(genes_refseq)),
gene_name_refseq = get_last(mcols(genes_refseq)$Name),
locus_tag_refseq = toupper(get_first(mcols(genes_refseq)$locus_tag))
)
df_bsg <- tibble(
start = start(genes_bsg),
end = end(genes_bsg),
strand = as.character(strand(genes_bsg)),
gene_name_bsg = get_last(mcols(genes_bsg)$Name),
locus_tag_bsg = toupper(get_first(mcols(genes_bsg)$locus_tag)),
bsg_id = mcols(genes_bsg)$ID
)
conversion_table <- df_bsg %>%
inner_join(df_refseq, by = c("start", "end", "strand"))# Use this if you want to use RefSeq.gff annotation
# RefSeq GFF file for B.subtilis str 168 can be downloaded from [NCBI](https://www.ncbi.nlm.nih.gov/datasets/taxonomy/224308/)
# Load GFF
gff_file <- "data/genomic_RefSeq.gff"
gff <- rtracklayer::import(gff_file)
gene_gff <- gff[gff$type == "gene"]
gene_df <- as.data.frame(gene_gff)
# Dbxref is a CharacterList
gene_df$Dbxref <- as.character(gene_df$Dbxref)
# Extract only the GeneID
gene_df$GeneID <- str_extract(gene_df$Dbxref, "GeneID:\\d+")
gene_df$GeneID <- str_remove(gene_df$GeneID, "GeneID:")
refseq_annotation <- gene_df %>%
dplyr::select(
seqnames,
start,
end,
strand,
gene,
locus_tag,
GeneID
)
readr::write_tsv(refseq_annotation, "data/refseq_gene_annotation.tsv")
# Load the prepared refseq annotation file and join with the original annotation file
refseq_annotation <- read_tsv("data/refseq_gene_annotation.tsv")
complete_gene_annotation <- gene_annotations %>%
dplyr::rename(ID_prodigal = gene) %>%
left_join(refseq_annotation,
by = c("start", "end", "strand"))