Chapter 3 Data statistics

load("resources/data.Rdata")

3.1 Sequencing reads statistics

data_stats$raw_reads %>% sum()
[1] 387468658
data_stats$raw_reads %>% mean()
[1] 19373433
data_stats$raw_reads %>% sd()
[1] 6509982

3.2 DNA fractions

#Per sample type
data_stats %>%
    mutate(mapped_host_perc=mapped_SceUnd/trimmed_reads) %>%
    left_join(., sample_metadata, by = join_by(sample == sample)) %>%
    group_by(type) %>%
    summarise(mean=mean(mapped_host_perc),sd=sd(mapped_host_perc))
# A tibble: 2 × 3
  type     mean     sd
  <chr>   <dbl>  <dbl>
1 cloaca 0.688  0.0254
2 feces  0.0480 0.0557
#Overall
data_stats %>%
    mutate(mapped_perc=mapped_mags/trimmed_reads) %>%
    summarise(mean=mean(mapped_perc),sd=sd(mapped_perc))
# A tibble: 1 × 2
   mean    sd
  <dbl> <dbl>
1 0.294 0.292
#Per sample type
data_stats %>%
    mutate(mapped_perc=mapped_mags/trimmed_reads) %>%
    left_join(., sample_metadata, by = join_by(sample == sample)) %>%
    group_by(type) %>%
    summarise(mean=mean(mapped_perc),sd=sd(mapped_perc))
# A tibble: 2 × 3
  type     mean      sd
  <chr>   <dbl>   <dbl>
1 cloaca 0.0152 0.00906
2 feces  0.573  0.0849 
data_stats %>%
  left_join(., sample_metadata, by = join_by(sample == sample)) %>%
  lmerTest::lmer(mapped_SceUnd ~ type + (1 | individual), data = ., REML = FALSE) %>%
  summary()
Warning in as_lmerModLT(model, devfun): Model may not have converged with 1 eigenvalue close to zero: 5.8e-12
Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: mapped_SceUnd ~ type + (1 | individual)
   Data: .

     AIC      BIC   logLik deviance df.resid 
   662.4    666.4   -327.2    654.4       16 

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-1.53104 -0.32590 -0.02621  0.19221  2.31623 

Random effects:
 Groups     Name        Variance  Std.Dev.
 individual (Intercept) 3.180e+12 1783257 
 Residual               6.856e+12 2618430 
Number of obs: 20, groups:  individual, 10

Fixed effects:
              Estimate Std. Error         df t value Pr(>|t|)    
(Intercept)  1.340e+07  1.002e+06  2.307e+01   13.37 2.37e-12 ***
typefeces   -1.240e+07  1.171e+06  1.172e+26  -10.59  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
          (Intr)
typefeces -0.584
data_stats %>%
  mutate(
    low_quality = raw_reads - trimmed_reads,
    unmapped_reads = trimmed_reads - mapped_SceUnd - mapped_mags
  ) %>%
  select(sample, low_quality, mapped_SceUnd, mapped_mags, unmapped_reads) %>%
  pivot_longer(-sample) %>%
  separate(col = "sample", into = c("sample", "tissue"), sep = "\\.", remove = FALSE) %>%
  mutate(name=factor(name,levels=c("low_quality","mapped_SceUnd","unmapped_reads","mapped_mags"))) %>%
  mutate(sample=factor(sample,levels=c("Sg1","Sg2","Sg3","Sg4","Sg5","Sg6","Sg7","Sg8","Sg9","Sg10"))) %>%
  ggplot(aes(x = sample, y = value, fill = name)) +
      geom_bar(stat = "identity", position = "fill") +
      scale_fill_manual(name="Sequence type",
                    breaks=c("low_quality","mapped_SceUnd","unmapped_reads","mapped_mags"),
                    labels=c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
                    values=c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c"))+
      facet_wrap(~tissue, scales = "free", labeller = labeller(tissue = c(cloaca="Cloaca",feces="Feces"))) +
      theme_minimal() +
      labs(y="DNA sequence fraction",x="Samples")

3.3 Recovered microbial fraction

data_stats %>%
  mutate(
    unmapped_reads = trimmed_reads - mapped_SceUnd - mapped_mags,
    mag_proportion = mapped_mags / (mapped_mags + unmapped_reads),
    singlem_read_fraction = singlem_read_fraction
  ) %>%
  select(sample, mag_proportion, singlem_read_fraction) %>%
  mutate(
    mag_proportion = if_else(singlem_read_fraction == 0, 0, mag_proportion),
    singlem_read_fraction = if_else(singlem_read_fraction == 0, NA, singlem_read_fraction),
    singlem_read_fraction = if_else(singlem_read_fraction < mag_proportion, NA, singlem_read_fraction),
    singlem_read_fraction = if_else(singlem_read_fraction > 1, 1, singlem_read_fraction)
  ) %>%
  pivot_longer(-sample, names_to = "proportion", values_to = "value") %>%
  mutate(
    proportion = factor(
      proportion,
      levels = c("mag_proportion", "singlem_read_fraction")
    )
  ) %>%
  separate(col = "sample", into = c("sample", "tissue"), sep = "\\.", remove = FALSE) %>%
  mutate(sample=factor(sample,levels=rev(c("Sg1","Sg2","Sg3","Sg4","Sg5","Sg6","Sg7","Sg8","Sg9","Sg10")))) %>%
  ggplot(aes(x = value, y = sample, color = proportion)) +
      geom_line(aes(group = sample), color = "#f8a538") +
      geom_point() +
      scale_color_manual(name="Proportion",
                    breaks=c("mag_proportion","singlem_read_fraction"),
                    labels=c("Recovered","Estimated"),
                    values=c("#52e1e8", "#876b53"))+
      facet_wrap(tissue~., scales = "free", labeller = labeller(tissue = c(cloaca="Cloaca",feces="Feces"))) +
      theme_minimal() +
      labs(y = "Samples", x = "Prokaryotic fraction") +
      scale_x_continuous(limits = c(0, 1)) +
      theme(
        axis.text.x = element_text( angle = 90, vjust = 0.5, hjust = 1, size = 6),
        legend.position = "right"
      )

3.3.1 Domain-adjusted mapping rate (DAMR)

data_stats %>%
  mutate(
    unmapped_reads = trimmed_reads - mapped_SceUnd - mapped_mags,
    mag_proportion = mapped_mags / (mapped_mags + unmapped_reads),
    singlem_read_fraction = singlem_read_fraction
  ) %>%
  mutate(damr=pmin(1, mag_proportion/singlem_read_fraction)) %>%
  select(sample,damr) %>%
  separate_wider_delim(sample,".",names=c("sample", "type")) %>%
  group_by(type) %>%
  summarise(mean=mean(damr),sd=sd(damr)) %>%
  tt()
tinytable_la5ixvwaya4qxouk2dwh
type mean sd
cloaca 0.7730484 0.1249452
feces 0.8350923 0.1097052