#sample size and treatment variables
n1 = 15
n2 = 30
n3 = 60
n4 = 120
t1 = 1
t2 = 2
t3 = 3
t4 = 4
################################################################################
### individual assembly
df_sum_total_ind <- df_sum_total %>%
filter(strategy == "single-coverage") %>%
mutate(sample_size = n2,
n_treatments = t1,
time_total_s = time_total_s / 5)
#this creates sample_size 30, with time_total_s = 150 / 5 = 30
indiv <- tibble(
strategy = "single-coverage",
sample_size = c(n1, n3, n4, n1, n2, n3, n4, n1, n2, n3, n4, n1, n2, n3, n4),
n_treatments = c(t1, t1, t1, t2, t2, t2, t2, t3, t3, t3, t3, t4, t4, t4, t4)
) %>%
bind_rows(., df_sum_total_ind) %>%
mutate(time_total_s = (df_sum_total_ind$time_total_s / 30) * sample_size * n_treatments)
#note df_sum_total_ind$time_total_s / 30 = estimated time per sample.
fig_b <- indiv %>%
ggplot(aes(x = sample_size, y = time_total_s / 3600,
group = n_treatments, colour = n_treatments)) +
geom_line() +
geom_point() +
scale_x_continuous(breaks = c(n1, n2, n3, n4)) +
theme_classic() +
theme(axis.title = element_text(face = "bold", size = 14),
axis.text = element_text(size = 14),
axis.title.x = element_blank()) +
ylab("Time (hours)") +
ggtitle("Single-coverage binning")
################################################################################
### coassembly
df_sum_total_coa <- df_sum_total %>%
filter(str_detect(strategy, "^coassembly"))
#Pull coassembly job times
coassembly_df <- df_filt %>%
filter(str_detect(strategy, "^coassembly"))
#Get mean times across coassembly strategies
#obviously more samples = longer assemblies, but this is just a simple estimate
#note: divide by n samples in coassembly
mean_coassembly_t <- coassembly_df %>%
filter(job == "Assembly" & strategy == "coassembly_timepoint_all") %>%
pull(time_s) %>%
mean() / 30
mean_coassembly_ct <- coassembly_df %>%
filter(job == "Assembly" & strategy == "coassembly_timepoint_cage") %>%
pull(time_s) %>%
mean() / 5
mean_coassembly = (mean_coassembly_t + mean_coassembly_ct) / 2
#Do same for other jobs
mean_mapping = coassembly_df %>%
filter(job == "assembly_mapping") %>%
pull(time_s) %>%
mean()
mean_binning_t <- coassembly_df %>%
filter(job == "binning" & strategy == "coassembly_timepoint_all") %>%
pull(time_s) %>%
mean() / 30
mean_binning_ct <- coassembly_df %>%
filter(job == "binning" & strategy == "coassembly_timepoint_cage") %>%
pull(time_s) %>%
mean() / 5
mean_binning = (mean_binning_t + mean_binning_ct) / 2
mean_refinement_t <- coassembly_df %>%
filter(job == "refinement" & strategy == "coassembly_timepoint_all") %>%
pull(time_s) %>%
mean() / 30
mean_refinement_ct <- coassembly_df %>%
filter(job == "refinement" & strategy == "coassembly_timepoint_cage") %>%
pull(time_s) %>%
mean() / 5
mean_refinement = (mean_refinement_t + mean_refinement_ct) / 2
#create tibble with estimatess
coa <- tibble(
strategy = "coassembly",
sample_size = c(n1, n2, n3, n4, n1, n2, n3, n4, n1, n2, n3, n4, n1, n2, n3, n4),
n_treatments = c(t1, t1, t1, t1, t2, t2, t2, t2, t3, t3, t3, t3, t4, t4, t4, t4)
) %>%
mutate(time_total_s = (mean_coassembly * sample_size * n_treatments) + (mean_mapping * sample_size * n_treatments) + (mean_binning * sample_size * n_treatments) + (mean_refinement * sample_size * n_treatments))
fig_c <- coa %>%
ggplot(aes(x = sample_size, y = time_total_s / 3600,
group = n_treatments, colour = n_treatments)) +
geom_line() +
geom_point() +
scale_x_continuous(breaks = c(n1, n2, n3, n4)) +
theme_classic() +
theme(axis.title = element_text(face = "bold", size = 14),
axis.text = element_text(size = 14),
axis.title.y = element_blank(),
axis.title.x = element_blank()) +
ggtitle("Coassembly")
################################################################################
### multi-coverage binning
df_cob_cage_treat <- df_filt %>%
filter(strategy == "multi-coverage_timepoint_cage")
#mean time for assembly?
mean_assembly <- df_cob_cage_treat %>%
filter(job == "Assembly") %>%
pull(time_s) %>%
mean()
#mean time for mapping?
mean_mapping <- df_cob_cage_treat %>%
filter(job == "assembly_mapping") %>%
pull(time_s) %>%
mean()
#mean time for binning?
mean_binning <- df_cob_cage_treat %>%
filter(job == "binning") %>%
pull(time_s) %>%
mean()
#mean time for refinement?
mean_refinement <- df_cob_cage_treat %>%
filter(job == "refinement") %>%
pull(time_s) %>%
mean()
cob <- tibble(
strategy = "multi-coverage binning",
sample_size = c(n1, n2, n3, n4, n1, n2, n3, n4, n1, n2, n3, n4, n1, n2, n3, n4),
n_treatments = c(t1, t1, t1, t1, t2, t2, t2, t2, t3, t3, t3, t3, t4, t4, t4, t4)
) %>%
mutate(time_total_s = (mean_assembly * sample_size * n_treatments) + (mean_mapping * (sample_size^2) * n_treatments) + (mean_binning * sample_size * n_treatments) + (mean_refinement * sample_size * n_treatments))
#scaling is sample_size, so sample_size^2 -- e.g. 2 samples = 2 * 2 mappings, 3 samples = 3 * 3 mappings, etc.
fig_d <- cob %>%
ggplot(aes(x = sample_size, y = time_total_s / 3600,
group = n_treatments, colour = n_treatments)) +
geom_line() +
geom_point() +
scale_x_continuous(breaks = c(n1, n2, n3, n4)) +
theme_classic() +
theme(axis.text = element_text(size = 14),
axis.title = element_text(size = 14, face = "bold")) +
xlab("Number of samples") +
ylab("Time (hours)") +
ggtitle("Multi-coverage binning")
################################################################################
### multi-split
df_sum_total_ms <- df_sum_total %>%
filter(str_detect(strategy, "^multi-split"))
df_ms_cage_treat <- df_filt %>%
filter(strategy == "multi-split_timepoint_cage")
#mean time for assembly?
mean_assembly <- df_ms_cage_treat %>%
filter(job == "Assembly") %>%
pull(time_s) %>%
mean()
#How does checkm scale?
checkm_t <- df_filt %>%
filter(job == "checkm" & strategy == "multi-split_timepoint_all")
checkm_ct <- df_filt %>%
filter(job == "checkm" & strategy == "multi-split_timepoint_cage")
treat <- mean(checkm_t$time_s)
cage_treat <- mean(checkm_ct$time_s)
#per sample?
treat_ps <- treat / 30
cage_treat_ps <- cage_treat / 5
#pretty similar, so for simplicity (and given it's a fraction of the pipeline's time, let's assume the mean of the two)
mean_checkm <- (treat_ps + cage_treat_ps) / 2
#How does vamb_multisplit scale?
vamb_multisplit_t <- df_filt %>%
filter(job == "vamb_multisplit" & strategy == "multi-split_timepoint_all")
vamb_multisplit_ct <- df_filt %>%
filter(job == "vamb_multisplit" & strategy == "multi-split_timepoint_cage")
treat <- mean(vamb_multisplit_t$time_s)
cage_treat <- mean(vamb_multisplit_ct$time_s)
#per sample?
treat_ps <- treat / 30
cage_treat_ps <- cage_treat / 5
#pretty similar, so for simplicity (and given it's a fraction of the pipeline's time, let's assume the mean of the two)
mean_vamb_multisplit <- (treat_ps + cage_treat_ps) / 2
#How does assembly_mapping scale?
mapping_t <- df_filt %>%
filter(job == "assembly_mapping" & strategy == "multi-split_timepoint_all")
mapping_ct <- df_filt %>%
filter(job == "assembly_mapping" & strategy == "multi-split_timepoint_cage")
treat <- mean(mapping_t$time_s)
cage_treat <- mean(mapping_ct$time_s)
# more samples combined = larger reference = longer mapping time
# Find scaling exponent (note, treat has 30 samples, cage_treat 5)
alpha <- log(treat / cage_treat) / log(30 / 5)
# Calculate a proportionality constant (choose cage_treat arbitrarily)
k <- cage_treat / (5) ^ alpha
# Function to estimate time for 1 sample to map to combined references
estimate_time <- function(n) {
k * (n) ^ alpha
}
# Estimation for 15, 30, 60, and 120 samples combined
n_values <- c(15, 30, 60, 120)
mapping_estimated_times <- sapply(n_values, estimate_time)
# Create tibble with estimates
ms <- tibble(
strategy = "multi-split",
sample_size = c(n1, n2, n3, n4, n1, n2, n3, n4, n1, n2, n3, n4, n1, n2, n3, n4),
n_treatments = c(t1, t1, t1, t1, t2, t2, t2, t2, t3, t3, t3, t3, t4, t4, t4, t4)
) %>%
mutate(time_total_s = (mean_assembly * sample_size * n_treatments) + (mapping_estimated_times * sample_size * n_treatments) + (mean_vamb_multisplit * sample_size * n_treatments) + (mean_checkm * sample_size * n_treatments))
fig_e <- ms %>%
ggplot(aes(x = sample_size, y = time_total_s / 3600,
group = n_treatments, colour = n_treatments)) +
geom_line() +
geom_point() +
scale_x_continuous(breaks = c(n1, n2, n3, n4)) +
theme_classic() +
theme(axis.title = element_text(face = "bold", size = 14),
axis.text = element_text(size = 14),
axis.title.y = element_blank(),) +
xlab("Number of samples") +
ggtitle("Multi-split")