Core SNP maximum-likelihood tree with metadata

Author

Lakhansing Pardeshi

Published

July 20, 2025

Initial setup

Code
suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(configr))
suppressPackageStartupMessages(library(ggpubr))
suppressPackageStartupMessages(library(here))
suppressPackageStartupMessages(library(ggnewscale))
suppressPackageStartupMessages(library(ape))
suppressPackageStartupMessages(library(treeio))
suppressPackageStartupMessages(library(ggtree))
suppressPackageStartupMessages(library(ggbreak))
suppressPackageStartupMessages(library(ggtreeExtra))
suppressPackageStartupMessages(library(scales))

## visualized phylogenetic tree with metadata

rm(list = ls())

source("https://raw.githubusercontent.com/lakhanp1/omics_utils/main/RScripts/utils.R")
source("scripts/utils/config_functions.R")
source("scripts/utils/phylogeny_functions.R")
source("scripts/utils/heatmap_utils.R")
################################################################################
set.seed(124)

confs <- prefix_config_paths(
  conf = suppressWarnings(configr::read.config(file = "project_config.yaml")),
  dir = "."
)

treeMethod <- "core_snp_ml"     #ani_upgma, kmer_nj
pangenome <- confs$data$pangenomes$pectobacterium.v2$name
panConf <- confs$data$pangenomes[[pangenome]]

outDir <- confs$analysis$phylogeny$core_snp_ml$path
outPrefix <- paste(outDir, "/", treeMethod, sep = "")

Import tree data and required metadata

Code
sampleInfo <- get_metadata(file = panConf$files$metadata, genus = confs$genus)

sampleInfoList <- as.list_metadata(
  df = sampleInfo, sampleId, sampleName, SpeciesName, strain, nodeLabs, genomeId 
)

speciesOrder <- suppressMessages(
  readr::read_tsv(confs$analysis$phylogeny[[treeMethod]]$files$species_order)
)

metDf <- suppressMessages(
  readr::read_csv(panConf$db$metrics$files$per_genome, col_names = T)
) %>% 
  dplyr::rename_all(
    .funs = ~ stringr::str_replace_all(., "\\W+", "_")
  ) %>% 
  dplyr::rename_all(
    .funs = ~ stringr::str_replace_all(., "_+$", "")
  ) %>% 
  dplyr::select(
    Genome, Gene_count, mRNA_count, CDS_count, tRNA_count, rRNA_count,
    Homology_groups, Singletons, GC_content
  ) %>% 
  dplyr::mutate(genomeId = paste("g_", Genome, sep = "")) %>% 
  dplyr::select(-Genome)
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Code
## add species order factor levels to SpeciesName column
sampleInfo %<>% dplyr::mutate(
  SpeciesName = forcats::fct_relevel(SpeciesName, !!!speciesOrder$SpeciesName)
) %>% 
  dplyr::left_join(
    y = metDf, by = "genomeId"
  ) %>% 
  dplyr::group_by(SpeciesName) %>% 
  dplyr::mutate(
    mRNA_med_diff = mRNA_count - median(mRNA_count),
    genome_size_med_diff = length - median(length),
  ) %>% 
  dplyr::ungroup()

Plot the tree

Code
## read tree
rawTree <- import_tree(
  confs$analysis$phylogeny[[treeMethod]]$files$tree_rooted
)

## add data to tree
treeTbl <- tidytree::as_tibble(rawTree) %>%
  dplyr::full_join(y = sampleInfo, by = c("label" = "genomeId")) %>%
  treeio::as.treedata()

Explore the tree topology

Code
rootNode <- rootnode(treeTbl) 

# 
# scale_color_binned(
#   scale_name = "stepsn",
#   palette = function(x) c("green", "red", "black"),
#   breaks = c(70, 90),
#   limits = c(0, 100),
#   na.value = "black",
#   show.limits = TRUE,
#   guide = "colorsteps"
# ) +
# ggnewscale::new_scale_color()

pt_tree <- ggtree::ggtree(tr = treeTbl, color="black", size = 0.7) +
  theme_tree2() +
  ggtree::geom_point2(
    mapping = aes(
      subset = !isTip & node != rootNode, 
      color = cut(bootstrap, c(0, 70, 90, 100))),
    shape = 16, size = 2
    
  ) +
  scale_color_manual(
    values = c("blue", "#f781bf", "black"),
    guide = guide_legend(override.aes = list(size = 6), order = 1),
    name = 'Bootstrap %', 
    breaks = c('(0,70]', '(70,90]'), 
    labels = expression(BP < "70%", "70%" <= BP * " < 90%"),
    na.value = "transparent"
  ) +
  ggnewscale::new_scale_fill()

One of the branch is very long and hence trimming its length to 0.5 for visualization purpose.

Code
longestBranch <- which(pt_tree$data$x == max(pt_tree$data$x))
longestBranchTip <- pt_tree$data$label[longestBranch]
trimLongestBranchTo <- 0.47
pt_tree$data$x[longestBranch] <- trimLongestBranchTo

Representative species phylogenetic tree

Code
# speciesDend <- ape::keep.tip(
#   phy = ape::ladderize(rawTree@phylo), tip = representativeSpecies
# ) %>%
#   ape::chronos() %>%
#   ape::as.hclust.phylo() %>%
#   as.dendrogram()

speciesPhy <- representative_genomes_tree(phy = rawTree@phylo, metadata = sampleInfo) %>% 
  ape::ladderize(right = FALSE)

orderConstraints <- dplyr::left_join(
  x = speciesOrder,
  y = dplyr::left_join(
    x = tibble::tibble(genomeId = speciesPhy$tip.label),
    y = dplyr::select(sampleInfo, genomeId, SpeciesName),
    by = "genomeId"
  ),
  by = "SpeciesName"
) %>% 
  dplyr::arrange(-dplyr::row_number())

speciesPhy <- ape::rotateConstr(
  phy = speciesPhy, constraint = orderConstraints$genomeId
)

# ape::plotBreakLongEdges(speciesPhy)
# ape::axisPhylo()


speciesTree <- ggtree::ggtree(speciesPhy, size = 2, ladderize = FALSE) %<+%
  dplyr::select(sampleInfo, genomeId, everything()) +
  ggtree::xlim(NA, 1) +
  ggtree::geom_tiplab(
    mapping = aes(label = SpeciesName),
    align = TRUE, size = 5, fontface = "italic"
  ) +
  geom_treescale(
    x = 0, y = nrow(speciesOrder) * 0.9,
    fontsize = 4, linesize = 2
  )

speciesTree$data$x[which(speciesTree$data$label == longestBranchTip)] <- trimLongestBranchTo

ggsave(
  filename = paste(outPrefix, ".representative_species_tree.pdf", sep = ""),
  plot = speciesTree,
  width = 6, height = 8
)

speciesTree

View tree with metadata

Code
# theme for legend adjustment
theme_legend <- ggplot2::theme(
  legend.justification = c(0, 1),
  legend.box = "vertical",
  legend.text = element_text(size = 16),
  legend.title = element_text(size = 18, face = "bold")
)

# theme for ggplot
pt_theme <- ggplot2::theme_bw() +
  theme(
    axis.text.y = element_blank(),
    panel.grid = element_blank(),
    panel.background = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title = element_blank()
  ) +
  theme_legend

pt_tree2 <- pt_tree +
  theme_tree() +
  geom_treescale(
    x = 0, y = nrow(sampleInfo) * 0.5,
    fontsize = 8, linesize = 2, offset = 4
  ) +
  # layout_fan(angle = 10) +
  scale_y_continuous(expand=c(0, 10)) +
  ggnewscale::new_scale_color() +
  ggtree::geom_tiplab(
    mapping = aes(label = NA),
    align = TRUE, linesize = 0.25
  )  +
  ggnewscale::new_scale_color() +
  theme_legend
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
Code
spKeyDf <- get_species_key_data(
  genomes = sampleInfo$genomeId, speciesInfo = sampleInfo, type = 'wide'
) %>% 
  tibble::as_tibble(rownames = "genomeId") %>% 
  tidyr::pivot_longer(
    cols = -genomeId,
    names_to = 'sp', values_to = "spCol"
  ) %>% 
  dplyr::mutate(sp = forcats::fct_relevel(sp, !!!speciesOrder$SpeciesName))



# species key
pt_spKey <- dplyr::rename(spKeyDf, label = genomeId) %>% 
  ggplot2::ggplot(mapping = aes(x = sp, y = label)) +
  ggplot2::geom_tile(
    mapping = aes(fill = spCol)
  ) +
  scale_fill_manual(
    values = c("0" = "grey95", "1" = "black"),
    guide = "none"
  ) +
  pt_theme +
  theme(
    axis.text.x = element_blank(),
    axis.ticks = element_blank()
  )

# blackleg pcr
pt_pcr <- dplyr::select(sampleInfo, label = genomeId, virulence_pcr) %>% 
  ggplot2::ggplot(mapping = aes(x = "pcr", y = label)) +
  ggplot2::geom_tile(mapping = aes(fill = virulence_pcr)) +
  # ggplot2::geom_point(mapping = aes(color = virulence_pcr), size = 1) +
  scale_fill_manual(
    name = "PCR BL diagnosis",
    values = c("red", "green"),
    breaks = c("positive", "negative"),
    na.value = alpha("white", 0),
    guide = guide_legend(override.aes = list(size = 6), order = 2)
  ) +
  pt_theme +
  theme(
    axis.text = element_blank(), axis.ticks = element_blank()
  )

# blackleg field trial
pt_bl <- dplyr::select(sampleInfo, label = genomeId, virulence) %>% 
  ggplot2::ggplot(mapping = aes(x = "pcr", y = label)) +
  ggplot2::geom_tile(mapping = aes(fill = virulence)) +
  # ggplot2::geom_point(mapping = aes(color = virulence), size = 1, shape = 17) +
  scale_fill_manual(
    name='Field trial',
    values = c("red", "green", "black"),
    breaks = c("virulent", "avirulent", "inconclusive"),
    label = c("BL-causing", "BL non-causing", "Inconclusive"),
    na.value = alpha("white", 0),
    guide = guide_legend(override.aes = list(size = 6), order = 3)
  ) +
  pt_theme +
  theme(
    axis.text = element_blank(), axis.ticks = element_blank()
  )

# continent
pt_loc <- dplyr::select(sampleInfo, label = genomeId, continent) %>% 
  ggplot2::ggplot(mapping = aes(x = "location", y = label)) +
  ggplot2::geom_tile(mapping = aes(fill = continent)) +
  scale_fill_manual(
    name = "Continent of isolation",
    values = c("#E6AC00", "#D12E96", "#5E8B3A", "#AA741D", "#A5E7E1", "#244360"),
    na.value = alpha("white", 1),
    guide = guide_legend(override.aes = list(size = 6), order = 4),
    na.translate = FALSE
  ) +
  pt_theme +
  theme(
    axis.text = element_blank(), axis.ticks = element_blank()
  )

# genome size
pt_len <- dplyr::select(sampleInfo, label = genomeId, length) %>% 
  ggplot2::ggplot(mapping = aes(x = length - median(length), y = label)) +
  ggplot2::geom_bar(fill = "white", color = "black", stat = "identity") +
  scale_x_continuous(
    breaks = scales::breaks_pretty(n = 3),
    labels = scales::label_number(scale_cut = c(kb = 1000, mb = 1000000))
  ) +
  pt_theme +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 16)
  )

# mRNA count difference with species median
pt_gene_count <- dplyr::select(sampleInfo, label = genomeId, mRNA_count) %>% 
  ggplot2::ggplot(mapping = aes(x = mRNA_count - median(mRNA_count), y = label)) +
  ggplot2::geom_bar(fill = "white", color = "black", stat = "identity") +
  scale_x_continuous(
    breaks = scales::breaks_pretty(n = 3)
  ) +
  pt_theme +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 16)
  )

# %GC
pt_gc <- dplyr::select(sampleInfo, label = genomeId, GC_content) %>% 
  ggplot2::ggplot(mapping = aes(x = GC_content, y = label)) +
  ggplot2::geom_bar(fill = "white", color = "black", stat = "identity") +
  ggplot2::coord_cartesian(xlim = c(50, NA)) +
  scale_x_continuous(
    n.breaks = 2,
    breaks = scales::breaks_pretty(n = 1)
  ) +
  pt_theme +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 16)
  )


pt_tree3 <- pt_spKey %>% aplot::insert_left(pt_tree2, width = 6) %>% 
  aplot::insert_right(pt_pcr, width = 0.2) %>% 
  aplot::insert_right(pt_bl, width = 0.2) %>% 
  aplot::insert_right(pt_loc, width = 0.2) %>% 
  # aplot::insert_right(pt_len, width = 0.8) %>% 
  aplot::insert_right(pt_gene_count, width = 0.8) %>%
  aplot::insert_right(pt_gc, width = 0.5)

ggsave(
  filename = paste(outPrefix, ".tree_plot.pdf", sep = ""), plot = pt_tree3,
  width = 15, height = 16
)

Species key in the above figure marks the species on tree from left to right: P. brasiliense, P. polaris, P. parvum, P. quasiaquaticum, P. aquaticum, P. versatile, P. carotovorum, P. odoriferum, P. actinidiae, P. parmentieri, P. wasabiae, P. punjabense, P. polonicum, P. atrosepticum, P. peruviense, P. zantedeschiae, P. betavasculorum, P. aroidearum, Pectobacterium sp. CFBP8739, P. colocasium, P. fontis, P. cacticida.

Back to top