Chapter 1 Introduction

1.1 Prepare the R environment

1.1.1 Environment

To reproduce all the analyses locally, clone this repository in your computer using:

RStudio > New Project > Version Control > Git

And indicating the following git repository:

https://github.com/alberdilab/mediterranean_podarcis.git

Once the R project has been created, follow the instructions and code chunks shown in this webbook.

1.1.2 Libraries

The following R packages are required for the data analysis.

# Base
library(R.utils)
library(knitr)
library(tidyverse)
library(devtools)
library(tinytable)
library(rairtable)

# For tree handling
library(ape)
library(phyloseq)
library(phytools)

# For plotting
library(ggplot2)
library(ggrepel)
library(ggpubr)
library(ggnewscale)
library(gridExtra)
library(ggtreeExtra)
library(ggtree)
library(ggh4x)

# For statistics
library(spaa)
library(vegan)
library(Rtsne)
library(geiger)
library(hilldiv2)
library(distillR)
library(broom.mixed)
#library(lmerTest)
library(Hmsc)
library(corrplot)

# Data preparation

1.2 Podarcis filfolensis (PF)

1.2.0.1 Sample metadata

sample_metadata_pf <- read_tsv("data/pf/DMB0159_metadata.tsv.gz") %>%
    rename(sample=1)

1.2.0.2 Genome metadata

genome_metadata_pf <- read_tsv("data/pf/DMB0159_mag_info.tsv.gz") %>%
    rename(length=mag_size)

1.2.0.3 Read counts

read_counts_pf <- read_tsv("data/pf/DMB0159_counts.tsv.gz") %>%
    rename(genome=1) %>%
    select(all_of(c("genome",sample_metadata_pf$sample))) %>% # sort samples
    arrange(match(genome,genome_metadata_pf$genome)) # sort genomes

1.2.0.4 Genome base hits

genome_coverage_pf <- read_tsv("data/pf/DMB0159_coverage.tsv.gz") %>%
    rename(genome=1) %>%
    select(all_of(c("genome",sample_metadata_pf$sample))) %>% # sort samples
    arrange(match(genome,genome_metadata_pf$genome)) # sort genomes

1.2.0.5 Genome tree

genome_tree_pf <- read_tree("data/pf/DMB0159.tree")
genome_tree_pf$tip.label <- str_replace_all(genome_tree_pf$tip.label,"'", "") #remove single quotes in MAG names
genome_tree_pf <- keep.tip(genome_tree_pf, tip=genome_metadata_pf$genome) # keep only MAG tips

1.2.0.6 Genome annotations

Downloading individual annotation files from ERDA using information in Airtable and writing them to a single compressed table takes a while. The following chunk only needs to be run once, to generate the genome_annotations table that is saved in the data directory. Note that the airtable connection requires a personal access token.

airtable("MAGs", "appWbHBNLE6iAsMRV") %>% #get base ID from Airtable browser URL
  read_airtable(., fields = c("ID","mag_name","number_genes","anno_url"), id_to_col = TRUE) %>% #get 3 columns from MAGs table
  filter(mag_name %in% paste0(genome_metadata_pf$genome,".fa")) %>% #filter by MAG name
  filter(number_genes > 0) %>% #genes need to exist
  select(anno_url) %>% #list MAG annotation urls
  pull() %>%
  read_tsv() %>% #load all tables
  rename(gene=1, genome=2, contig=3) %>% #rename first 3 columns
  write_tsv(file="data/pf/genome_annotations.tsv.xz") #write to overall compressed file
genome_annotations_pf <- read_tsv("data/pf/genome_annotations.tsv.xz") %>%
    rename(gene=1, genome=2, contig=3)

1.2.0.7 Create working objects

Transform the original data files into working objects for downstream analyses.

1.2.0.8 Filter reads by coverage

min_coverage=0.3
read_counts_filt_pf <- genome_coverage_pf %>%
  mutate(across(where(is.numeric), ~ ifelse(. > min_coverage, 1, 0))) %>%
  mutate(across(-1, ~ . * read_counts_pf[[cur_column()]])) 

1.2.0.9 Transform reads into genome counts

readlength=150
genome_counts_pf <- read_counts_pf %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata_pf$length / readlength) ))
readlength=150
genome_counts_filt_pf <- read_counts_filt_pf %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata_pf$length / readlength) ))

1.2.0.10 Distill annotations into GIFTs

genome_gifts_pf <- distill(genome_annotations_pf,GIFT_db,genomecol=2,annotcol=c(9,10,19), verbosity=F)

1.2.0.11 Prepare color scheme

AlberdiLab projects use unified color schemes developed for the Earth Hologenome Initiative, to facilitate figure interpretation.

phylum_colors_pf <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    right_join(genome_metadata_pf, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree_pf$tip.label)) %>%
    select(phylum, colors) %>% 
    unique() %>%
    arrange(phylum) %>%
    pull(colors, name=phylum)

1.2.0.12 Wrap working objects

All working objects are wrapped into a single Rdata object to facilitate downstream usage.

save(sample_metadata_pf, 
     genome_metadata_pf, 
     read_counts_pf, 
     genome_counts_pf, 
     genome_counts_filt_pf, 
     genome_tree_pf,
     genome_gifts_pf, 
     phylum_colors_pf,
     file = "data/data_podarcis_filfolensis.Rdata")

1.3 Podarcis gaigeae (PG)

1.3.0.1 Sample metadata

sample_metadata_pg <- read_tsv("data/pg/DMB0160_metadata.tsv.gz") %>%
    rename(sample=1)

1.3.0.2 Genome metadata

genome_metadata_pg <- read_tsv("data/pg/DMB0160_mag_info.tsv.gz") %>%
    rename(length=mag_size)

1.3.0.3 Read counts

read_counts_pg <- read_tsv("data/pg/DMB0160_counts.tsv.gz") %>%
    rename(genome=1) %>%
    select(all_of(c("genome",sample_metadata_pg$sample))) %>% # sort samples
    arrange(match(genome,genome_metadata_pg$genome)) # sort genomes

1.3.0.4 Genome base hits

genome_coverage_pg <- read_tsv("data/pg/DMB0160_coverage.tsv.gz") %>%
    rename(genome=1) %>%
    select(all_of(c("genome",sample_metadata_pg$sample))) %>% # sort samples
    arrange(match(genome,genome_metadata_pg$genome)) # sort genomes

1.3.0.5 Genome tree

genome_tree_pg <- read_tree("data/pg/DMB0160.tree")
genome_tree_pg$tip.label <- str_replace_all(genome_tree_pg$tip.label,"'", "") #remove single quotes in MAG names
genome_tree_pg <- keep.tip(genome_tree_pg, tip=genome_metadata_pg$genome) # keep only MAG tips

1.3.0.6 Genome annotations

Downloading individual annotation files from ERDA using information in Airtable and writing them to a single compressed table takes a while. The following chunk only needs to be run once, to generate the genome_annotations table that is saved in the data directory. Note that the airtable connection requires a personal access token.

airtable("MAGs", "appWbHBNLE6iAsMRV") %>% #get base ID from Airtable browser URL
  read_airtable(., fields = c("ID","mag_name","number_genes","anno_url"), id_to_col = TRUE) %>% #get 3 columns from MAGs table
  filter(mag_name %in% paste0(genome_metadata_pg$genome,".fa")) %>% #filter by MAG name
  filter(number_genes > 0) %>% #genes need to exist
  select(anno_url) %>% #list MAG annotation urls
  pull() %>%
  read_tsv() %>% #load all tables
  rename(gene=1, genome=2, contig=3) %>% #rename first 3 columns
  write_tsv(file="data/pg/genome_annotations.tsv.xz") #write to overall compressed file
genome_annotations_pg <- read_tsv("data/pg/genome_annotations.tsv.xz") %>%
    rename(gene=1, genome=2, contig=3)

1.3.0.7 Create working objects

Transform the original data files into working objects for downstream analyses.

1.3.0.8 Filter reads by coverage

min_coverage=0.3
read_counts_filt_pg <- genome_coverage_pg %>%
  mutate(across(where(is.numeric), ~ ifelse(. > min_coverage, 1, 0))) %>%
  mutate(across(-1, ~ . * read_counts_pg[[cur_column()]])) 

1.3.0.9 Transform reads into genome counts

readlength=150
genome_counts_pg <- read_counts_pg %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata_pg$length / readlength) ))
readlength=150
genome_counts_filt_pg <- read_counts_filt_pg %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata_pg$length / readlength) ))

1.3.0.10 Distill annotations into GIFTs

genome_gifts_pg <- distill(genome_annotations_pg,GIFT_db,genomecol=2,annotcol=c(9,10,19), verbosity=F)

1.3.0.11 Prepare color scheme

AlberdiLab projects use unified color schemes developed for the Earth Hologenome Initiative, to facilitate figure interpretation.

phylum_colors_pg <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    right_join(genome_metadata_pg, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree_pg$tip.label)) %>%
    select(phylum, colors) %>% 
    unique() %>%
    arrange(phylum) %>%
    pull(colors, name=phylum)

1.3.0.12 Wrap working objects

All working objects are wrapped into a single Rdata object to facilitate downstream usage.

save(sample_metadata_pg, 
     genome_metadata_pg, 
     read_counts_pg, 
     genome_counts_pg, 
     genome_counts_filt_pg, 
     genome_tree_pg,
     genome_gifts_pg, 
     phylum_colors_pg,
     file = "data/data_podarcis_gaigeae.Rdata")

1.4 Podarcis milensis (PM)

1.4.0.1 Sample metadata

sample_metadata_pm <- read_tsv("data/pm/DMB0158_metadata.tsv.gz") %>%
    rename(sample=1)

1.4.0.2 Genome metadata

genome_metadata_pm <- read_tsv("data/pm/DMB0158_mag_info.tsv.gz") %>%
    rename(length=mag_size)

1.4.0.3 Read counts

read_counts_pm <- read_tsv("data/pm/DMB0158_counts.tsv.gz") %>%
    rename(genome=1) %>%
    select(all_of(c("genome",sample_metadata_pm$sample))) %>% # sort samples
    arrange(match(genome,genome_metadata_pm$genome)) # sort genomes

1.4.0.4 Genome base hits

genome_coverage_pm <- read_tsv("data/pm/DMB0158_coverage.tsv.gz") %>%
    rename(genome=1) %>%
    select(all_of(c("genome",sample_metadata_pm$sample))) %>% # sort samples
    arrange(match(genome,genome_metadata_pm$genome)) # sort genomes

1.4.0.5 Genome tree

genome_tree_pm <- read_tree("data/pm/DMB0158.tree")
genome_tree_pm$tip.label <- str_replace_all(genome_tree_pm$tip.label,"'", "") #remove single quotes in MAG names
genome_tree_pm <- keep.tip(genome_tree_pm, tip=genome_metadata_pm$genome) # keep only MAG tips

1.4.0.6 Genome annotations

Downloading individual annotation files from ERDA using information in Airtable and writing them to a single compressed table takes a while. The following chunk only needs to be run once, to generate the genome_annotations table that is saved in the data directory. Note that the airtable connection requires a personal access token.

airtable("MAGs", "appWbHBNLE6iAsMRV") %>% #get base ID from Airtable browser URL
  read_airtable(., fields = c("ID","mag_name","number_genes","anno_url"), id_to_col = TRUE) %>% #get 3 columns from MAGs table
  filter(mag_name %in% paste0(genome_metadata_pm$genome,".fa")) %>% #filter by MAG name
  filter(number_genes > 0) %>% #genes need to exist
  select(anno_url) %>% #list MAG annotation urls
  pull() %>%
  read_tsv() %>% #load all tables
  rename(gene=1, genome=2, contig=3) %>% #rename first 3 columns
  write_tsv(file="data/pm/genome_annotations.tsv.xz") #write to overall compressed file
genome_annotations_pm <- read_tsv("data/pm/genome_annotations.tsv.xz") %>%
    rename(gene=1, genome=2, contig=3)

1.4.0.7 Create working objects

Transform the original data files into working objects for downstream analyses.

1.4.0.8 Filter reads by coverage

min_coverage=0.3
read_counts_filt_pm <- genome_coverage_pm %>%
  mutate(across(where(is.numeric), ~ ifelse(. > min_coverage, 1, 0))) %>%
  mutate(across(-1, ~ . * read_counts_pm[[cur_column()]])) 

1.4.0.9 Transform reads into genome counts

readlength=150
genome_counts_pm <- read_counts_pm %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata_pm$length / readlength) ))
readlength=150
genome_counts_filt_pm <- read_counts_filt_pm %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata_pm$length / readlength) ))

1.4.0.10 Distill annotations into GIFTs

genome_gifts_pm <- distill(genome_annotations_pm,GIFT_db,genomecol=2,annotcol=c(9,10,19), verbosity=F)

1.4.0.11 Prepare color scheme

AlberdiLab projects use unified color schemes developed for the Earth Hologenome Initiative, to facilitate figure interpretation.

phylum_colors_pm <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    right_join(genome_metadata_pm, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree_pm$tip.label)) %>%
    select(phylum, colors) %>% 
    unique() %>%
    arrange(phylum) %>%
    pull(colors, name=phylum)

1.4.0.12 Wrap working objects

All working objects are wrapped into a single Rdata object to facilitate downstream usage.

save(sample_metadata_pm, 
     genome_metadata_pm, 
     read_counts_pm, 
     genome_counts_pm, 
     genome_counts_filt_pm, 
     genome_tree_pm,
     genome_gifts_pm, 
     phylum_colors_pm,
     file = "data/data_podarcis_milensis.Rdata")

1.5 Podarcis pityusensis (PP)

1.5.0.1 Sample metadata

sample_metadata_pp <- read_tsv("data/pp/DMB0161_metadata.tsv.gz") %>%
    rename(sample=1)

1.5.0.2 Genome metadata

genome_metadata_pp <- read_tsv("data/pp/DMB0161_mag_info.tsv.gz") %>%
    rename(length=mag_size)

1.5.0.3 Read counts

read_counts_pp <- read_tsv("data/pp/DMB0161_counts.tsv.gz") %>%
    rename(genome=1) %>%
    select(all_of(c("genome",sample_metadata_pp$sample))) %>% # sort samples
    arrange(match(genome,genome_metadata_pp$genome)) # sort genomes

1.5.0.4 Genome base hits

genome_coverage_pp <- read_tsv("data/pp/DMB0161_coverage.tsv.gz") %>%
    rename(genome=1) %>%
    select(all_of(c("genome",sample_metadata_pp$sample))) %>% # sort samples
    arrange(match(genome,genome_metadata_pp$genome)) # sort genomes

1.5.0.5 Genome tree

genome_tree_pp <- read_tree("data/pp/DMB0161.tree")
genome_tree_pp$tip.label <- str_replace_all(genome_tree_pp$tip.label,"'", "") #remove single quotes in MAG names
genome_tree_pp <- keep.tip(genome_tree_pp, tip=genome_metadata_pp$genome) # keep only MAG tips

1.5.0.6 Genome annotations

Downloading individual annotation files from ERDA using information in Airtable and writing them to a single compressed table takes a while. The following chunk only needs to be run once, to generate the genome_annotations table that is saved in the data directory. Note that the airtable connection requires a personal access token.

airtable("MAGs", "appWbHBNLE6iAsMRV") %>% #get base ID from Airtable browser URL
  read_airtable(., fields = c("ID","mag_name","number_genes","anno_url"), id_to_col = TRUE) %>% #get 3 columns from MAGs table
  filter(mag_name %in% paste0(genome_metadata_pp$genome,".fa")) %>% #filter by MAG name
  filter(number_genes > 0) %>% #genes need to exist
  select(anno_url) %>% #list MAG annotation urls
  pull() %>%
  read_tsv() %>% #load all tables
  rename(gene=1, genome=2, contig=3) %>% #rename first 3 columns
  write_tsv(file="data/pp/genome_annotations.tsv.xz") #write to overall compressed file
genome_annotations_pp <- read_tsv("data/pp/genome_annotations.tsv.xz") %>%
    rename(gene=1, genome=2, contig=3)

1.5.0.7 Create working objects

Transform the original data files into working objects for downstream analyses.

1.5.0.8 Filter reads by coverage

min_coverage=0.3
read_counts_filt_pp <- genome_coverage_pp %>%
  mutate(across(where(is.numeric), ~ ifelse(. > min_coverage, 1, 0))) %>%
  mutate(across(-1, ~ . * read_counts_pp[[cur_column()]])) 

1.5.0.9 Transform reads into genome counts

readlength=150
genome_counts_pp <- read_counts_pp %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata_pp$length / readlength) ))
readlength=150
genome_counts_filt_pp <- read_counts_filt_pp %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata_pp$length / readlength) ))

1.5.0.10 Distill annotations into GIFTs

genome_gifts_pp <- distill(genome_annotations_pp,GIFT_db,genomecol=2,annotcol=c(9,10,19), verbosity=F)

1.5.0.11 Prepare color scheme

AlberdiLab projects use unified color schemes developed for the Earth Hologenome Initiative, to facilitate figure interpretation.

phylum_colors_pp <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    right_join(genome_metadata_pp, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree_pp$tip.label)) %>%
    select(phylum, colors) %>% 
    unique() %>%
    arrange(phylum) %>%
    pull(colors, name=phylum)

1.5.0.12 Wrap working objects

All working objects are wrapped into a single Rdata object to facilitate downstream usage.

save(sample_metadata_pp, 
     genome_metadata_pp, 
     read_counts_pp, 
     genome_counts_pp, 
     genome_counts_filt_pp, 
     genome_tree_pp,
     genome_gifts_pp, 
     phylum_colors_pp,
     file = "data/data_podarcis_pityusensis.Rdata")

1.6 All

1.6.0.1 Sample metadata

sample_metadata_all <- read_tsv("data/all/DMB0162_metadata.tsv.gz") %>%
    rename(sample=1) %>%
    mutate(
      population = case_when(
        species == "Podarcis filfolensis" ~ str_replace(population,"_","_pf_"),
        species == "Podarcis gaigeae" ~ str_replace(population,"_","_pg_"),
        species == "Podarcis milensis" ~ str_replace(population,"_","_pm_"),
        species == "Podarcis pityusensis" ~ str_replace(population,"_","_pp_"),
        TRUE ~ NA))

1.6.0.2 Genome metadata

genome_metadata_all <- read_tsv("data/all/DMB0162_mag_info.tsv.gz") %>%
    rename(length=mag_size)

1.6.0.3 Read counts

read_counts_all <- read_tsv("data/all/DMB0162_counts.tsv.gz") %>%
    rename(genome=1) %>%
    select(all_of(c("genome",sample_metadata_all$sample))) %>% # sort samples
    arrange(match(genome,genome_metadata_all$genome)) # sort genomes

1.6.0.4 Genome base hits

genome_coverage_all <- read_tsv("data/all/DMB0162_coverage.tsv.gz") %>%
    rename(genome=1) %>%
    select(all_of(c("genome",sample_metadata_all$sample))) %>% # sort samples
    arrange(match(genome,genome_metadata_all$genome)) # sort genomes

1.6.0.5 Genome tree

genome_tree_all <- read_tree("data/all/DMB0162.tree")
genome_tree_all$tip.label <- str_replace_all(genome_tree_all$tip.label,"'", "") #remove single quotes in MAG names
genome_tree_all <- keep.tip(genome_tree_all, tip=genome_metadata_all$genome) # keep only MAG tips

1.6.0.6 Genome annotations

genome_annotations_all <- read_tsv(c("data/pf/genome_annotations.tsv.xz",
                                   "data/pg/genome_annotations.tsv.xz",
                                   "data/pm/genome_annotations.tsv.xz",
                                   "data/pp/genome_annotations.tsv.xz")) %>%
    rename(gene=1, genome=2, contig=3)

1.6.0.7 Create working objects

Transform the original data files into working objects for downstream analyses.

1.6.0.8 Filter reads by coverage

min_coverage=0.3
read_counts_filt_all <- genome_coverage_all %>%
  mutate(across(where(is.numeric), ~ ifelse(. > min_coverage, 1, 0))) %>%
  mutate(across(-1, ~ . * read_counts_all[[cur_column()]])) 

1.6.0.9 Transform reads into genome counts

readlength=150
genome_counts_all <- read_counts_all %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata_all$length / readlength) ))
readlength=150
genome_counts_filt_all <- read_counts_filt_all %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata_all$length / readlength) ))

1.6.0.10 Distill annotations into GIFTs

genome_gifts_all <- distill(genome_annotations_all,GIFT_db,genomecol=2,annotcol=c(9,10,19), verbosity=F)

1.6.0.11 Prepare color scheme

AlberdiLab projects use unified color schemes developed for the Earth Hologenome Initiative, to facilitate figure interpretation.

phylum_colors_all <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    right_join(genome_metadata_all, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree_all$tip.label)) %>%
    select(phylum, colors) %>% 
    unique() %>%
    arrange(phylum) %>%
    pull(colors, name=phylum)

1.6.0.12 Wrap working objects

All working objects are wrapped into a single Rdata object to facilitate downstream usage.

save(sample_metadata_all, 
     genome_metadata_all, 
     read_counts_all, 
     genome_counts_all, 
     genome_counts_filt_all, 
     genome_tree_all,
     genome_gifts_all, 
     phylum_colors_all,
     file = "data/data_podarcis_all.Rdata")

  1. University of Copenhagen, ↩︎