Chapter 3 Metatranscriptomic data preparation
3.1 Load data
3.1.1 Metatranscriptomic gene expression counts
if(!file.exists("data/gene_expressions.tsv.xz")){
download.file(
url = 'https://sid.erda.dk/share_redirect/Bd8UfDO2D6/gene_expressions.tsv.xz',
destfile = 'data/gene_expressions.tsv.xz', method = 'curl'
)
}
gene_expr <-
read_tsv("data/gene_expressions.tsv.xz") %>%
select(1:129) %>%
rename(mag_name = 'MAG') %>%
relocate(mag_name)
# Chunk analysis to sets of 100 MAGs
mags <- sort(unique(gene_expr$mag_name))
gene_expr_1 <- gene_expr[gene_expr$mag_name %in% mags[c(1:100)],c(2:129)]
gene_expr_2 <- gene_expr[gene_expr$mag_name %in% mags[c(101:200)],c(2:129)]
gene_expr_3 <- gene_expr[gene_expr$mag_name %in% mags[c(201:300)],c(2:129)]
gene_expr_4 <- gene_expr[gene_expr$mag_name %in% mags[c(301:400)],c(2:129)]
gene_expr_5 <- gene_expr[gene_expr$mag_name %in% mags[c(401:500)],c(2:129)]
gene_expr_6 <- gene_expr[gene_expr$mag_name %in% mags[c(501:600)],c(2:129)]
gene_expr_7 <- gene_expr[gene_expr$mag_name %in% mags[c(601:700)],c(2:129)]
gene_expr_8 <- gene_expr[gene_expr$mag_name %in% mags[c(701:825)],c(2:129)]
save(gene_expr_1, file = "data/gene_expr_1.Rdata")
save(gene_expr_2, file = "data/gene_expr_2.Rdata")
save(gene_expr_3, file = "data/gene_expr_3.Rdata")
save(gene_expr_4, file = "data/gene_expr_4.Rdata")
save(gene_expr_5, file = "data/gene_expr_5.Rdata")
save(gene_expr_6, file = "data/gene_expr_6.Rdata")
save(gene_expr_7, file = "data/gene_expr_7.Rdata")
save(gene_expr_8, file = "data/gene_expr_8.Rdata")
rm(list = ls())3.1.2 Distillation
mag_ann <-
read_tsv("data/gene_annotations.tsv.xz") %>%
mutate(gene_length = end_position - start_position)
# Chunk analysis
load("data/gene_expr_1.Rdata")
distq_1 <- distillq(gene_expr_1, mag_ann, GIFT_db2,
genecol = 1, genomecol = 2, annotcol = c(9, 10, 19))
save(distilled_caecum_1, file = "data/distilled_caecum_1.Rdata")
rm(gene_expr_1, distilled_caecum_1)
load("data/gene_expr_2.Rdata")
distq_2 <- distillq(gene_expr_2, mag_ann, GIFT_db2,
genecol = 1, genomecol = 2, annotcol = c(9, 10, 19))
save(distilled_caecum_2, file = "data/distilled_caecum_2.Rdata")
rm(gene_expr_2, distilled_caecum_2)
load("data/gene_expr_3.Rdata")
distq_3 <- distillq(gene_expr_3, mag_ann, GIFT_db2,
genecol = 1, genomecol = 2, annotcol = c(9, 10, 19))
save(distilled_caecum_3, file = "results/tables/distilled_caecum_3.Rdata")
rm(gene_expr_3, distilled_caecum_3)
load("data/gene_expr_4.Rdata")
distq_4 <- distillq(gene_expr_4, mag_ann, GIFT_db2,
genecol = 1, genomecol = 2, annotcol = c(9, 10, 19))
save(distilled_caecum_4, file = "results/tables/distilled_caecum_4.Rdata")
rm(gene_expr_4, distilled_caecum_4)
load("data/gene_expr_5.Rdata")
distq_5 <- distillq(gene_expr_5, mag_ann, GIFT_db2,
genecol = 1, genomecol = 2, annotcol = c(9, 10, 19))
save(distilled_caecum_5, file = "results/tables/distilled_caecum_5.Rdata")
rm(gene_expr_5, distilled_caecum_5)
load("data/gene_expr_6.Rdata")
distq_6 <- distillq(gene_expr_6, mag_ann, GIFT_db2,
genecol = 1, genomecol = 2, annotcol = c(9, 10, 19))
save(distilled_caecum_6, file = "results/tables/distilled_caecum_6.Rdata")
rm(gene_expr_6, distilled_caecum_6)
load("data/gene_expr_7.Rdata")
distq_7 <- distillq(gene_expr_7, mag_ann, GIFT_db2,
genecol = 1, genomecol = 2, annotcol = c(9, 10, 12))
save(distilled_caecum_7, file = "results/tables/distilled_caecum_7.Rdata")
rm(gene_expr_7, distilled_caecum_7)
load("results/tables/gene_expr_8.Rdata")
distq_8 <- distillq(gene_expr_8, mag_ann, GIFT_db2,
genecol = 1, genomecol = 2, annotcol = c(9, 10, 12))
save(distilled_caecum_8, file = "results/tables/distilled_caecum_8.Rdata")
rm(gene_expr_8, distilled_caecum_8)3.1.3 Combine metatranscriptomic GIFT expression counts
load("data/distilled_caecum_1.Rdata")
load("data/distilled_caecum_2.RData")
load("data/distilled_caecum_3.RData")
load("data/distilled_caecum_4.RData")
load("data/distilled_caecum_5.RData")
load("data/distilled_caecum_6.RData")
load("data/distilled_caecum_7.RData")
load("data/distilled_caecum_8.RData")
expr_counts_raw <- c(distilled_expression_caecum_1, distilled_expression_caecum_2,
distilled_expression_caecum_3, distilled_expression_caecum_4,
distilled_expression_caecum_5, distilled_expression_caecum_6,
distilled_expression_caecum_7, distilled_expression_caecum_8)
rm(distilled_expression_caecum_1, distilled_expression_caecum_2,
distilled_expression_caecum_3, distilled_expression_caecum_4,
distilled_expression_caecum_5, distilled_expression_caecum_6,
distilled_expression_caecum_7, distilled_expression_caecum_8)
# Change MAG IDs for a standardised code
ena_to_mag_id <- read_tsv("data/ena_to_mag_id.tsv")
mag_name <- names(expr_counts_raw)
names(expr_counts_raw) <- ena_to_mag_id$mag_id[match(mag_name, ena_to_mag_id$mag_name)]
# Correct animal codes
expr_counts <- lapply(expr_counts_raw, function(x) {
rownames(x) <- gsub("F1a", "", rownames(x))
x
})