Chapter 2 Prepare data

2.1 Bioinformatics

2.1.1 Get data

mkdir genome
wget -P genome https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/009/045/GCF_000009045.1_ASM904v1/GCF_000009045.1_ASM904v1_genomic.fna.gz

mkdir reads
# fetch data from ERDA

2.1.2 Run annotation and profiling

drakkar annotating -b genome
drakkar expressing -b genome -r reads

2.1.3 Experimental design

This project consists of 40 RNAseq samples. Twenty samples were collected at 3 h and the remaining 20 at 24 h. The experiment includes one control group (C) and three treatment groups, in which bacteria were exposed to 5 mg/mL of the following minerals: goethite (G), hematite (H) and mica (M).

Each condition includes 4 biological replicates for which RNA was extracted using the ZymoBIOMICS RNA Kit, and one replicate for which RNA was extracted using the DREX protocol.

Experimental design

2.1.4 Get working files

gene_annotations.tsv.xz
gene_counts.tsv.xz

2.2 Load data

Samples have been split in different lanes during sequencing so gene_counts.tsv.xz has more columns that what is expected for the conditions of the experiments (40 different samples).

read_counts <- read_tsv(
  "data/gene_counts.tsv.xz",
  comment = "#",          # skip featureCounts metadata
  col_names = TRUE,       # header is present
  show_col_types = FALSE  # silence messages
) %>%
  dplyr::rename(gene = Geneid) %>% 
  pivot_longer(!c(gene,Chr,Start,End,Strand,Length),names_to="library",values_to = "counts") %>% 
  mutate(sample = str_extract(library, "^[^_]+")) %>% 
  group_by(gene,Chr,Start,End,Strand,Length,sample) %>% 
  summarise(counts=sum(counts)) %>% 
  pivot_wider(c(gene,Chr,Start,End,Strand,Length), names_from = "sample", values_from = "counts")

gene_annotations <- read_tsv("data/gene_annotations.tsv.xz") %>%
    dplyr::rename(gene=1)

sample_metadata <- read_csv("data/sample_metadata.csv")

sample_metadata <- sample_metadata %>%
  mutate(timepoint = str_extract(`Sample ID`, "\\d+$"),
         mineral_code = substr(`Sample ID`, 3, 3))

2.3 Prepare data

Preliminary processing of the data to prepare it for visualizations

read_counts <- read_counts %>% 
  pivot_longer(!c(gene,Chr,Start,End,Strand,Length),names_to="sample",values_to = "counts") %>%
  inner_join(sample_metadata, by="sample") %>%
  mutate(ID = paste(`Sample ID`, str_sub(Extraction, 1, 1), sep = "_")) %>%
  dplyr::select(gene, ID, counts) %>%
  pivot_wider(c(gene), names_from = "ID", values_from = "counts") %>%
  column_to_rownames(var = "gene")

sample_metadata <- sample_metadata %>%
  mutate(ID = paste(`Sample ID`, str_sub(Extraction, 1, 1), sep = "_")) %>%
  column_to_rownames(var = "ID")


# Make sure order is the same as in metadata
read_counts <- read_counts[, rownames(sample_metadata)]

2.3.1 Prepare data subsets

Subset data that will be used in the following analysis

# Filter out DREX samples
counts_wo_drex <- read_counts %>%
  dplyr::select(-ends_with("D"))

metadata_wo_drex <- sample_metadata %>%
  dplyr::filter(Extraction == "Zymo") %>%
  droplevels()

# Fix the order in counts
# MOVE THIS TO AN EARLIER STEP IN THE WORKFLOW
#counts_wo_drex <- counts_wo_drex[, rownames(metadata_wo_drex)]

2.3.2 Prepare annotation file

The annotation file provided was produced using Prodigal and lacks information that will be required later in the analysis like gene names, locus tags and Entrez IDs (required to run kegga() from limma package).

To add the missing information the BSGatlas GFF file for B.subtilis str 168 was downloaded from BSGatlas

# Use this if you want to use BSGatlas.gff annotation

# Load GFF
gff_bsg <- import("data/BSGatlas_v1.1.gff")

# Keep only gene features
genes_bsg <- gff_bsg[mcols(gff_bsg)$type == "gene"]


get_first <- function(x) {
  sapply(x, function(v) {
    if (length(v) == 0) NA_character_ else as.character(v[1])
  })
}

get_last <- function(x) {
  sapply(x, function(v) {
    if (length(v) == 0) NA_character_ else as.character(v[length(v)])
  })
}

bsg_annotation <- tibble(
  seqnames = as.character(seqnames(genes_bsg)),
  start = start(genes_bsg),
  end = end(genes_bsg),
  strand = as.character(strand(genes_bsg)),
  bsg_id = mcols(genes_bsg)$ID,
  gene = get_last(mcols(genes_bsg)$Name),
  locus_tag = toupper(get_first(mcols(genes_bsg)$locus_tag)),
  operon_id = get_first(mcols(genes_bsg)$Parent)
)

# Cleaning of bad locus tag names
bsg_annotation <- bsg_annotation %>%
  mutate(
    locus_priority = case_when(
      grepl("^NEW_", locus_tag) ~ 2L,
      TRUE ~ 1L
    )
  ) %>%
  group_by(bsg_id) %>%
  arrange(locus_priority, locus_tag, .by_group = TRUE) %>%
  slice(1) %>%
  ungroup() %>%
  select(-locus_priority)


readr::write_tsv(bsg_annotation, "data/bsg_gene_annotation.tsv")

# Load the prepared refseq annotation file and join with the original annotation file
bsg_annotation <- read_tsv("data/bsg_gene_annotation.tsv")

complete_gene_annotation <- gene_annotations %>%
  dplyr::rename(ID_prodigal = gene) %>%
  left_join(bsg_annotation,
            by = c("start", "end", "strand"))

# Here I check if the two gff files are consistent, they differ in the gene name of 572 entries and the locus_tag of 3
# I decided to work with the BSG atlas gff as it is more refined for the B. subtilis, the species of interest in this work.

gff_refseq <- import("data/genomic_RefSeq.gff")
gff_bsg <- import("data/BSGatlas_v1.1.gff")

genes_refseq <- gff_refseq[mcols(gff_refseq)$type == "gene"]
genes_bsg <- gff_bsg[mcols(gff_bsg)$type == "gene"]

get_first <- function(x) {
  sapply(x, function(v) {
    if (length(v) == 0) NA_character_ else as.character(v[1])
  })
}

get_last <- function(x) {
  sapply(x, function(v) {
    if (length(v) == 0) NA_character_ else as.character(v[length(v)])
  })
}

df_refseq <- tibble(
  start = start(genes_refseq),
  end = end(genes_refseq),
  strand = as.character(strand(genes_refseq)),
  gene_name_refseq = get_last(mcols(genes_refseq)$Name),
  locus_tag_refseq = toupper(get_first(mcols(genes_refseq)$locus_tag))
)

df_bsg <- tibble(
  start = start(genes_bsg),
  end = end(genes_bsg),
  strand = as.character(strand(genes_bsg)),
  gene_name_bsg = get_last(mcols(genes_bsg)$Name),
  locus_tag_bsg = toupper(get_first(mcols(genes_bsg)$locus_tag)),
  bsg_id = mcols(genes_bsg)$ID
)

conversion_table <- df_bsg %>%
  inner_join(df_refseq, by = c("start", "end", "strand"))

# Use this if you want to use RefSeq.gff annotation
# RefSeq GFF file for B.subtilis str 168 can be downloaded from [NCBI](https://www.ncbi.nlm.nih.gov/datasets/taxonomy/224308/)

# Load GFF
gff_file <- "data/genomic_RefSeq.gff"
gff <- rtracklayer::import(gff_file)

gene_gff <- gff[gff$type == "gene"]

gene_df <- as.data.frame(gene_gff)

# Dbxref is a CharacterList
gene_df$Dbxref <- as.character(gene_df$Dbxref)

# Extract only the GeneID
gene_df$GeneID <- str_extract(gene_df$Dbxref, "GeneID:\\d+")
gene_df$GeneID <- str_remove(gene_df$GeneID, "GeneID:")

refseq_annotation <- gene_df %>%
  dplyr::select(
    seqnames,
    start,
    end,
    strand,
    gene,
    locus_tag,
    GeneID
  )

readr::write_tsv(refseq_annotation, "data/refseq_gene_annotation.tsv")

# Load the prepared refseq annotation file and join with the original annotation file
refseq_annotation <- read_tsv("data/refseq_gene_annotation.tsv")

complete_gene_annotation <- gene_annotations %>%
  dplyr::rename(ID_prodigal = gene) %>%
  left_join(refseq_annotation,
            by = c("start", "end", "strand"))