Analyze tail fiber loci haplotypes

Author

Lakhansing Pardeshi

Published

July 20, 2025

Initial setup

Code
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(ape))
suppressPackageStartupMessages(library(treeio))
suppressPackageStartupMessages(library(ggtree))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(ComplexHeatmap))
suppressPackageStartupMessages(library(org.Pectobacterium.spp.pan.eg.db))

rm(list = ls())

source("https://raw.githubusercontent.com/lakhanp1/omics_utils/main/RScripts/utils.R")
source("scripts/utils/config_functions.R")
source("scripts/utils/phylogeny_functions.R")
source("scripts/utils/homology_groups.R")
source("scripts/utils/heatmap_utils.R")
################################################################################
set.seed(124)

confs <- prefix_config_paths(
  conf = suppressWarnings(configr::read.config(file = "project_config.yaml")),
  dir = "."
)

file_haplotypes <- confs$analysis$ctv$tfl$files$hg_regions

outDir <- dirname(file_haplotypes)

pangenome <- confs$data$pangenomes$pectobacterium.v2$name
panConf <- confs$data$pangenomes[[pangenome]]
panOrgDb <- org.Pectobacterium.spp.pan.eg.db
treeMethod <- "core_snp_ml"     #ani_upgma, kmer_nj

Import data

Code
sampleInfo <- get_metadata(file = panConf$files$metadata, genus = confs$genus)

sampleInfoList <- as.list_metadata(
  df = sampleInfo, sampleId, sampleName, SpeciesName, strain, nodeLabs, genomeId 
)

rawTree <- import_tree(
  confs$analysis$phylogeny[[treeMethod]]$files$tree_rooted,
  phylo = TRUE
)

speciesOrder <- suppressMessages(
  readr::read_tsv(confs$analysis$phylogeny[[treeMethod]]$files$species_order)
) %>% 
  dplyr::mutate(order = n():1)

Import tail regions and distance tables

Build a pairwise tail loci comparison table

Code
regionHaplotypes <- suppressMessages(readr::read_tsv(file_haplotypes)) %>% 
  dplyr::left_join(
    y = dplyr::select(
      sampleInfo, genomeId, SpeciesName, geo_loc_country, collection_year,
      collection_year, nodepath.kmer_nj, host, isolation_source
    ),
    by = "genomeId"
  ) %>% 
  dplyr::add_count(haplotype) %>% 
  dplyr::arrange(desc(n), haplotype, SpeciesName) %>% 
  dplyr::mutate(
    haplotype = forcats::as_factor(haplotype)
  )

readr::write_tsv(
  regionHaplotypes,
  file = paste(outDir, "/haplotypes_phy_data.tab", sep = "")
)

haplotypes <- dplyr::count(
  regionHaplotypes, haplotype, hgs, sort = TRUE
)

Are there any tail loci shared between species?

Code
# generate upset plot
cm <- ComplexHeatmap::make_comb_mat(
  split(regionHaplotypes$haplotype, f = regionHaplotypes$SpeciesName)
)

hgCombTable <- table(regionHaplotypes$haplotype)
combMetadata <- purrr::imap_dfr(
  .x = ComplexHeatmap::comb_degree(cm),
  .f = function(d, n){
    tibble::tibble(
      comb = n,
      n_species = d,
      n_genomes = sum(hgCombTable[extract_comb(m = cm, comb_name = n)]),
      haplotype = paste(extract_comb(m = cm, comb_name = n), collapse = ";")
    )
  }
) %>% 
  dplyr::left_join(haplotypes, by = "haplotype") |> 
  dplyr::mutate(
    comb = forcats::fct_relevel(
      .f = comb, !!ComplexHeatmap::comb_name(cm)
    )
  ) |> 
  dplyr::arrange(comb)

ht_upset <- ComplexHeatmap::UpSet(
  m = cm,
  column_title = "Carotovoricin tail fiber locus similarity across species",
  row_names_gp = gpar(fontface = "italic"),
  column_title_gp = gpar(fontface = "bold", fontsize = 16),
  set_order = order(
    dplyr::pull(speciesOrder, order, name = SpeciesName)[set_name(cm)],
    decreasing = TRUE
  ),
  # comb_order = order(comb_degree(cm), comb_size(cm), decreasing = TRUE),
  right_annotation = upset_right_annotation(cm, add_numbers = TRUE),
  top_annotation = ComplexHeatmap::HeatmapAnnotation(
    "Number of\ngenomes" = anno_barplot(
      x = dplyr::pull(combMetadata, var = n_genomes, name = "comb"),
      border = FALSE, add_numbers = TRUE,
      gp = gpar(fill = "black"), height = unit(1.5, "cm")
    ),
    "Number of\nshared tail\nfiber locus" = anno_barplot(
      x = comb_size(cm), 
      border = FALSE, 
      gp = gpar(fill = "black"), 
      height = unit(1.5, "cm")
    ),
    annotation_name_side = "left", 
    annotation_name_rot = 0,
    annotation_name_gp = gpar(fontface = "bold"),
    gap = unit(4, "mm")
  )
)

ht_upset <- ht_upset +
  ComplexHeatmap::rowAnnotation(
    "Number of\ngenomes" = ComplexHeatmap::anno_barplot(
      x = dplyr::count(regionHaplotypes, SpeciesName) %>% 
        dplyr::pull(var = n, name = SpeciesName),
      border = FALSE,
      add_numbers = TRUE,
      gp = gpar(fill = "black"), width = unit(2, "cm")
    ),
    gap = unit(3, "mm")
  )
quartz_off_screen 
                2 

Show the raw combination data.

Visualize tail-fiber locus haplotypes with the phylogeny

Generate core phylogeny plot and species key heatmap.

Code
pt_phy <- ggtree_with_species(
  phy = rawTree, metadata = sampleInfo,
  genomes = c(regionHaplotypes$genomeId, confs$analysis$phylogeny$outgroup_genome),
  trim_branch = confs$analysis$phylogeny[[treeMethod]]$cut_branch
)

Generate tail-fiber locus haplotype presence-absence heatmap.

Code
# TFL haplotypes plot
hap_df <- dplyr::select(regionHaplotypes, genomeId, haplotype, n) %>% 
  dplyr::filter(n > 1) %>% 
  dplyr::mutate(hap = "y")

theme_haplotypes <- theme_bw() +
  theme(
    panel.grid.major.y = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    legend.title = element_text(size = 18, face = "bold")
  )

pt_hap <- dplyr::select(regionHaplotypes, label = genomeId, haplotype, n) %>%
  dplyr::filter(n > 1) %>%
  dplyr::mutate(hap = "y") %>%
  ggplot2::ggplot(mapping = aes(x = haplotype, y = label)) +
  geom_tile(mapping = aes(fill = hap), width = 0.9) +
  scale_fill_manual(
    name = NULL,
    values = c("y" = "#FDA532"),
    na.value = alpha("white", 1),
    guide = NULL,
    na.translate = FALSE
  ) +
  theme_haplotypes

pt_all <- pt_phy$species_key |> 
  aplot::insert_left(pt_phy$tree, width = 3) |> 
  aplot::insert_right(pt_hap, width = 4)
quartz_off_screen 
                2 

Species order from left to right on species key heatmap: P. brasiliense, P. polaris, P. parvum, P. aquaticum, P. quasiaquaticum, P. versatile, P. carotovorum, P. odoriferum, P. actinidiae, P. parmentieri, P. wasabiae, P. punjabense, P. polonicum, P. zantedeschiae, P. peruviense, P. aroidearum, Pectobacterium sp. CFBP8739, P. colocasium, P. fontis, P. cacticida

Use the TFL signature present in diverse species to select MSA candidates.

Code
egTfl <- dplyr::arrange(combMetadata, desc(n_species)) %>% 
  dplyr::slice(1:3) %>% 
  dplyr::pull(haplotype) %>% 
  stringr::str_split(pattern = ";") %>% 
  unlist()

msaGenomes <- dplyr::filter(regionHaplotypes, haplotype %in% !!egTfl) %>%
  dplyr::group_by(haplotype, SpeciesName) %>% 
  dplyr::slice(1L) %>% 
  dplyr::select(regionId, genomeId, haplotype, SpeciesName, everything())


# # type strain genomes
# msaGenomes <- union(
#   msaGenomes,
#   c("g_158", "g_446", "g_66", "g_222", "g_296", "g_442", "g_8",
#     "g_38", "g_273", "g_259", "g_305", "g_378", "g_428", "g_248",
#     "g_449", "g_54", "g_116", "g_423", "g_375", "g_381")
# )

paste(msaGenomes$genomeId, collapse = ",")
[1] "g_279,g_425,g_299,g_377,g_100,g_106,g_331,g_249,g_125,g_221,g_53,g_278,g_324,g_103"
Back to top