---
title: "Core SNP maximum-likelihood tree with metadata"
date: "`r Sys.time()`"
format:
html:
embed-resources: true
---
## Initial setup
```{r}
#| label: setup
#| warning: false
suppressPackageStartupMessages (library (knitr))
suppressPackageStartupMessages (library (tidyverse))
suppressPackageStartupMessages (library (configr))
suppressPackageStartupMessages (library (ggpubr))
suppressPackageStartupMessages (library (here))
suppressPackageStartupMessages (library (ggnewscale))
suppressPackageStartupMessages (library (ape))
suppressPackageStartupMessages (library (treeio))
suppressPackageStartupMessages (library (ggtree))
suppressPackageStartupMessages (library (ggbreak))
suppressPackageStartupMessages (library (ggtreeExtra))
suppressPackageStartupMessages (library (scales))
## visualized phylogenetic tree with metadata
rm (list = ls ())
source ("https://raw.githubusercontent.com/lakhanp1/omics_utils/main/RScripts/utils.R" )
source ("scripts/utils/config_functions.R" )
source ("scripts/utils/phylogeny_functions.R" )
source ("scripts/utils/heatmap_utils.R" )
################################################################################
set.seed (124 )
confs <- prefix_config_paths (
conf = suppressWarnings (configr:: read.config (file = "project_config.yaml" )),
dir = "."
)
treeMethod <- "core_snp_ml" #ani_upgma, kmer_nj
pangenome <- confs$ data$ pangenomes$ pectobacterium.v2$ name
panConf <- confs$ data$ pangenomes[[pangenome]]
outDir <- confs$ analysis$ phylogeny$ core_snp_ml$ path
outPrefix <- paste (outDir, "/" , treeMethod, sep = "" )
```
## Import tree data and required metadata
```{r}
sampleInfo <- get_metadata (file = panConf$ files$ metadata, genus = confs$ genus)
sampleInfoList <- as.list_metadata (
df = sampleInfo, sampleId, sampleName, SpeciesName, strain, nodeLabs, genomeId
)
speciesOrder <- suppressMessages (
readr:: read_tsv (confs$ analysis$ phylogeny[[treeMethod]]$ files$ species_order)
)
metDf <- suppressMessages (
readr:: read_csv (panConf$ db$ metrics$ files$ per_genome, col_names = T)
) %>%
dplyr:: rename_all (
.funs = ~ stringr:: str_replace_all (., " \\ W+" , "_" )
) %>%
dplyr:: rename_all (
.funs = ~ stringr:: str_replace_all (., "_+$" , "" )
) %>%
dplyr:: select (
Genome, Gene_count, mRNA_count, CDS_count, tRNA_count, rRNA_count,
Homology_groups, Singletons, GC_content
) %>%
dplyr:: mutate (genomeId = paste ("g_" , Genome, sep = "" )) %>%
dplyr:: select (- Genome)
## add species order factor levels to SpeciesName column
sampleInfo %<>% dplyr:: mutate (
SpeciesName = forcats:: fct_relevel (SpeciesName, !!! speciesOrder$ SpeciesName)
) %>%
dplyr:: left_join (
y = metDf, by = "genomeId"
) %>%
dplyr:: group_by (SpeciesName) %>%
dplyr:: mutate (
mRNA_med_diff = mRNA_count - median (mRNA_count),
genome_size_med_diff = length - median (length),
) %>%
dplyr:: ungroup ()
```
## Plot the tree
```{r}
## read tree
rawTree <- import_tree (
confs$ analysis$ phylogeny[[treeMethod]]$ files$ tree_rooted
)
## add data to tree
treeTbl <- tidytree:: as_tibble (rawTree) %>%
dplyr:: full_join (y = sampleInfo, by = c ("label" = "genomeId" )) %>%
treeio:: as.treedata ()
```
### Explore the tree topology
```{r out.width='100%', fig.height=18}
#| out-width: '100%'
#| fig-height: 15
rootNode <- rootnode (treeTbl)
#
# scale_color_binned(
# scale_name = "stepsn",
# palette = function(x) c("green", "red", "black"),
# breaks = c(70, 90),
# limits = c(0, 100),
# na.value = "black",
# show.limits = TRUE,
# guide = "colorsteps"
# ) +
# ggnewscale::new_scale_color()
pt_tree <- ggtree:: ggtree (tr = treeTbl, color= "black" , size = 0.7 ) +
theme_tree2 () +
ggtree:: geom_point2 (
mapping = aes (
subset = ! isTip & node != rootNode,
color = cut (bootstrap, c (0 , 70 , 90 , 100 ))),
shape = 16 , size = 2
) +
scale_color_manual (
values = c ("blue" , "#f781bf" , "black" ),
guide = guide_legend (override.aes = list (size = 6 ), order = 1 ),
name = 'Bootstrap %' ,
breaks = c ('(0,70]' , '(70,90]' ),
labels = expression (BP < "70%" , "70%" <= BP * " < 90%" ),
na.value = "transparent"
) +
ggnewscale:: new_scale_fill ()
```
One of the branch is very long and hence trimming its length to 0.5 for visualization
purpose.
```{r}
longestBranch <- which (pt_tree$ data$ x == max (pt_tree$ data$ x))
longestBranchTip <- pt_tree$ data$ label[longestBranch]
trimLongestBranchTo <- 0.47
pt_tree$ data$ x[longestBranch] <- trimLongestBranchTo
```
### Representative species phylogenetic tree
```{r}
# speciesDend <- ape::keep.tip(
# phy = ape::ladderize(rawTree@phylo), tip = representativeSpecies
# ) %>%
# ape::chronos() %>%
# ape::as.hclust.phylo() %>%
# as.dendrogram()
speciesPhy <- representative_genomes_tree (phy = rawTree@ phylo, metadata = sampleInfo) %>%
ape:: ladderize (right = FALSE )
orderConstraints <- dplyr:: left_join (
x = speciesOrder,
y = dplyr:: left_join (
x = tibble:: tibble (genomeId = speciesPhy$ tip.label),
y = dplyr:: select (sampleInfo, genomeId, SpeciesName),
by = "genomeId"
),
by = "SpeciesName"
) %>%
dplyr:: arrange (- dplyr:: row_number ())
speciesPhy <- ape:: rotateConstr (
phy = speciesPhy, constraint = orderConstraints$ genomeId
)
# ape::plotBreakLongEdges(speciesPhy)
# ape::axisPhylo()
speciesTree <- ggtree:: ggtree (speciesPhy, size = 2 , ladderize = FALSE ) %<+%
dplyr:: select (sampleInfo, genomeId, everything ()) +
ggtree:: xlim (NA , 1 ) +
ggtree:: geom_tiplab (
mapping = aes (label = SpeciesName),
align = TRUE , size = 5 , fontface = "italic"
) +
geom_treescale (
x = 0 , y = nrow (speciesOrder) * 0.9 ,
fontsize = 4 , linesize = 2
)
speciesTree$ data$ x[which (speciesTree$ data$ label == longestBranchTip)] <- trimLongestBranchTo
ggsave (
filename = paste (outPrefix, ".representative_species_tree.pdf" , sep = "" ),
plot = speciesTree,
width = 6 , height = 8
)
speciesTree
```
### View tree with metadata
```{r}
# theme for legend adjustment
theme_legend <- ggplot2:: theme (
legend.justification = c (0 , 1 ),
legend.box = "vertical" ,
legend.text = element_text (size = 16 ),
legend.title = element_text (size = 18 , face = "bold" )
)
# theme for ggplot
pt_theme <- ggplot2:: theme_bw () +
theme (
axis.text.y = element_blank (),
panel.grid = element_blank (),
panel.background = element_blank (),
panel.border = element_blank (),
axis.ticks.y = element_blank (),
axis.title = element_blank ()
) +
theme_legend
pt_tree2 <- pt_tree +
theme_tree () +
geom_treescale (
x = 0 , y = nrow (sampleInfo) * 0.5 ,
fontsize = 8 , linesize = 2 , offset = 4
) +
# layout_fan(angle = 10) +
scale_y_continuous (expand= c (0 , 10 )) +
ggnewscale:: new_scale_color () +
ggtree:: geom_tiplab (
mapping = aes (label = NA ),
align = TRUE , linesize = 0.25
) +
ggnewscale:: new_scale_color () +
theme_legend
spKeyDf <- get_species_key_data (
genomes = sampleInfo$ genomeId, speciesInfo = sampleInfo, type = 'wide'
) %>%
tibble:: as_tibble (rownames = "genomeId" ) %>%
tidyr:: pivot_longer (
cols = - genomeId,
names_to = 'sp' , values_to = "spCol"
) %>%
dplyr:: mutate (sp = forcats:: fct_relevel (sp, !!! speciesOrder$ SpeciesName))
# species key
pt_spKey <- dplyr:: rename (spKeyDf, label = genomeId) %>%
ggplot2:: ggplot (mapping = aes (x = sp, y = label)) +
ggplot2:: geom_tile (
mapping = aes (fill = spCol)
) +
scale_fill_manual (
values = c ("0" = "grey95" , "1" = "black" ),
guide = "none"
) +
pt_theme +
theme (
axis.text.x = element_blank (),
axis.ticks = element_blank ()
)
# blackleg pcr
pt_pcr <- dplyr:: select (sampleInfo, label = genomeId, virulence_pcr) %>%
ggplot2:: ggplot (mapping = aes (x = "pcr" , y = label)) +
ggplot2:: geom_tile (mapping = aes (fill = virulence_pcr)) +
# ggplot2::geom_point(mapping = aes(color = virulence_pcr), size = 1) +
scale_fill_manual (
name = "PCR BL diagnosis" ,
values = c ("red" , "green" ),
breaks = c ("positive" , "negative" ),
na.value = alpha ("white" , 0 ),
guide = guide_legend (override.aes = list (size = 6 ), order = 2 )
) +
pt_theme +
theme (
axis.text = element_blank (), axis.ticks = element_blank ()
)
# blackleg field trial
pt_bl <- dplyr:: select (sampleInfo, label = genomeId, virulence) %>%
ggplot2:: ggplot (mapping = aes (x = "pcr" , y = label)) +
ggplot2:: geom_tile (mapping = aes (fill = virulence)) +
# ggplot2::geom_point(mapping = aes(color = virulence), size = 1, shape = 17) +
scale_fill_manual (
name= 'Field trial' ,
values = c ("red" , "green" , "black" ),
breaks = c ("virulent" , "avirulent" , "inconclusive" ),
label = c ("BL-causing" , "BL non-causing" , "Inconclusive" ),
na.value = alpha ("white" , 0 ),
guide = guide_legend (override.aes = list (size = 6 ), order = 3 )
) +
pt_theme +
theme (
axis.text = element_blank (), axis.ticks = element_blank ()
)
# continent
pt_loc <- dplyr:: select (sampleInfo, label = genomeId, continent) %>%
ggplot2:: ggplot (mapping = aes (x = "location" , y = label)) +
ggplot2:: geom_tile (mapping = aes (fill = continent)) +
scale_fill_manual (
name = "Continent of isolation" ,
values = c ("#E6AC00" , "#D12E96" , "#5E8B3A" , "#AA741D" , "#A5E7E1" , "#244360" ),
na.value = alpha ("white" , 1 ),
guide = guide_legend (override.aes = list (size = 6 ), order = 4 ),
na.translate = FALSE
) +
pt_theme +
theme (
axis.text = element_blank (), axis.ticks = element_blank ()
)
# genome size
pt_len <- dplyr:: select (sampleInfo, label = genomeId, length) %>%
ggplot2:: ggplot (mapping = aes (x = length - median (length), y = label)) +
ggplot2:: geom_bar (fill = "white" , color = "black" , stat = "identity" ) +
scale_x_continuous (
breaks = scales:: breaks_pretty (n = 3 ),
labels = scales:: label_number (scale_cut = c (kb = 1000 , mb = 1000000 ))
) +
pt_theme +
theme (
axis.text.x = element_text (angle = 90 , hjust = 1 , vjust = 0.5 , size = 16 )
)
# mRNA count difference with species median
pt_gene_count <- dplyr:: select (sampleInfo, label = genomeId, mRNA_count) %>%
ggplot2:: ggplot (mapping = aes (x = mRNA_count - median (mRNA_count), y = label)) +
ggplot2:: geom_bar (fill = "white" , color = "black" , stat = "identity" ) +
scale_x_continuous (
breaks = scales:: breaks_pretty (n = 3 )
) +
pt_theme +
theme (
axis.text.x = element_text (angle = 90 , hjust = 1 , vjust = 0.5 , size = 16 )
)
# %GC
pt_gc <- dplyr:: select (sampleInfo, label = genomeId, GC_content) %>%
ggplot2:: ggplot (mapping = aes (x = GC_content, y = label)) +
ggplot2:: geom_bar (fill = "white" , color = "black" , stat = "identity" ) +
ggplot2:: coord_cartesian (xlim = c (50 , NA )) +
scale_x_continuous (
n.breaks = 2 ,
breaks = scales:: breaks_pretty (n = 1 )
) +
pt_theme +
theme (
axis.text.x = element_text (angle = 90 , hjust = 1 , vjust = 0.5 , size = 16 )
)
pt_tree3 <- pt_spKey %>% aplot:: insert_left (pt_tree2, width = 6 ) %>%
aplot:: insert_right (pt_pcr, width = 0.2 ) %>%
aplot:: insert_right (pt_bl, width = 0.2 ) %>%
aplot:: insert_right (pt_loc, width = 0.2 ) %>%
# aplot::insert_right(pt_len, width = 0.8) %>%
aplot:: insert_right (pt_gene_count, width = 0.8 ) %>%
aplot:: insert_right (pt_gc, width = 0.5 )
ggsave (
filename = paste (outPrefix, ".tree_plot.pdf" , sep = "" ), plot = pt_tree3,
width = 15 , height = 16
)
```
:::{.scrolling_y}
```{r}
#| column: page-inset-right
#| fig-height: 14
#| fig-width: 14
#| out-width: '100%'
#| layout-valign: top
#| echo: false
pt_tree3
```
:::
Species key in the above figure marks the species on tree from left to right:
`r paste(speciesOrder$SpeciesName, collapse = ", ")` .