CTV HGT using Mash distance for variable and constant locus

Author

Lakhansing Pardeshi

Published

July 20, 2025

Pangenome analysis of the carotovoricin (CTV) region shows a constant and variable loci. The tail fiber loci (TFL) is the most variable locus followed by the tail tape measure protein. Below, we explore this variable loci at the pangenome scale.

Initial setup

Code
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(ape))
suppressPackageStartupMessages(library(treeio))
suppressPackageStartupMessages(library(ggtree))
suppressPackageStartupMessages(library(ggbeeswarm))
suppressPackageStartupMessages(library(ggrastr))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(patchwork))
suppressPackageStartupMessages(library(ComplexHeatmap))
suppressPackageStartupMessages(library(org.Pectobacterium.spp.pan.eg.db))

rm(list = ls())

source("https://raw.githubusercontent.com/lakhanp1/omics_utils/main/RScripts/utils.R")
source("scripts/utils/config_functions.R")
source("scripts/utils/phylogeny_functions.R")
source("scripts/utils/homology_groups.R")
source("scripts/utils/heatmap_utils.R")
################################################################################
set.seed(124)

confs <- prefix_config_paths(
  conf = suppressWarnings(configr::read.config(file = "project_config.yaml")),
  dir = "."
)

outDir <- paste(confs$analysis$ctv$tfl_hgt$path)

pangenome <- confs$data$pangenomes$pectobacterium.v2$name
panConf <- confs$data$pangenomes[[pangenome]]
panOrgDb <- org.Pectobacterium.spp.pan.eg.db
prophageLenCutoff <- confs$analysis$prophages$cutoff_length
treeMethod <- "core_snp_ml" # ani_upgma, kmer_nj

Import data

Code
sampleInfo <- get_metadata(file = panConf$files$metadata, genus = confs$genus)

sampleInfoList <- as.list_metadata(
  df = sampleInfo, sampleId, sampleName, SpeciesName, strain, nodeLabs, genomeId
)

rawTree <- import_tree(
  confs$analysis$phylogeny[[treeMethod]]$files$tree_rooted,
  phylo = TRUE
)

rawTree <- ape::ladderize(rawTree, right = FALSE)

speciesOrder <- suppressMessages(
  readr::read_tsv(confs$analysis$phylogeny[[treeMethod]]$files$species_order)
) %>%
  dplyr::mutate(order = n():1)

distCophen <- tibble::as_tibble(x = cophenetic.phylo(rawTree), rownames = "r1_genomeId") %>%
  tidyr::pivot_longer(
    cols = -r1_genomeId,
    names_to = "r2_genomeId", values_to = "cophenetic"
  )

Import tail regions and distance tables

Build a pairwise tail loci comparison table

Code
tailRegions <- suppressMessages(
  readr::read_tsv(confs$analysis$ctv$tfl$files$hg_regions)
) %>%
  dplyr::left_join(
    y = dplyr::select(
      sampleInfo, genomeId, SpeciesName, geo_loc_country, collection_year,
      collection_year, nodepath.kmer_nj, host, isolation_source
    ),
    by = "genomeId"
  )

# make unique combinations for ctv regions
tailCombs <- tidyr::expand_grid(
  r1 = tailRegions$regionId, r2 = tailRegions$regionId
) %>%
  dplyr::filter(r1 != r2) %>%
  dplyr::mutate(
    pair = paste(pmin(r1, r2), pmax(r1, r2), sep = ",")
  ) %>%
  dplyr::distinct(pair, .keep_all = TRUE) %>%
  dplyr::select(-pair) %>%
  dplyr::left_join(
    y = dplyr::select(tailRegions, regionId, r1_tail = haplotype),
    by = c("r1" = "regionId")
  ) %>%
  dplyr::left_join(
    y = dplyr::select(tailRegions, regionId, r2_tail = haplotype),
    by = c("r2" = "regionId")
  ) %>%
  dplyr::mutate(
    tail_loci = dplyr::if_else(
      r1_tail == r2_tail, "same", "different"
    )
  )

Import Mash distance

Code
tailMash <- suppressMessages(
  readr::read_tsv(
    file = confs$analysis$ctv$tfl$files$mash_dist,
    col_names = c("r1", "r2", "mash_tail", "mash_pval_tail", "hash_tail")
  )
)

conservedMash <- suppressMessages(
  readr::read_tsv(
    file = confs$analysis$ctv$conserved$files$mash_dist,
    col_names = c("r1", "r2", "mash_conserv", "mash_pval_conserved", "hash_conserved")
  )
)

Import Jaccard distance from dashing tool

Code
import_dashing_matrix <- function(file) {
  dashingDist <- suppressMessages(readr::read_tsv(file = file)) %>%
    dplyr::rename(g1 = "##Names") %>%
    dplyr::rename_with(
      .fn = ~ stringr::str_replace(
        string = .x, pattern = ".*/(g_\\d+\\..*)\\.fasta", replacement = "\\1"
      )
    ) %>%
    dplyr::mutate(
      g1 = stringr::str_replace(
        string = g1, pattern = ".*/(g_\\d+\\..*)\\.fasta", replacement = "\\1"
      )
    ) %>%
    tidyr::pivot_longer(cols = -g1, names_to = "g2", values_to = "distance") %>%
    dplyr::filter(distance != "-") %>%
    dplyr::mutate(
      distance = as.numeric(distance)
    )
  
  return(
    dplyr::bind_rows(
      dplyr::rename(dashingDist, r1 = g1, r2 = g2),
      dplyr::rename(dashingDist, r1 = g2, r2 = g1)
    )
  )
}

tailDash <- import_dashing_matrix(confs$analysis$ctv$tfl$files$dashing_dist) %>%
  dplyr::rename(dash_tail = distance)

conservedDash <- import_dashing_matrix(confs$analysis$ctv$conserved$files$dashing_dist) %>%
  dplyr::rename(dash_conserv = distance)

Combine the data with pairwise tail loci in the pangenome

Code
ctvDist <- dplyr::left_join(
  x = tailCombs, y = tailMash, by = c("r1", "r2")
) %>%
  dplyr::left_join(y = conservedMash, by = c("r1", "r2")) %>%
  dplyr::left_join(y = tailDash, by = c("r1", "r2")) %>%
  dplyr::left_join(y = conservedDash, by = c("r1", "r2")) %>%
  tidyr::separate_wider_delim(
    cols = c(r1, r2), delim = ".",
    names = c("genomeId", "species"), names_sep = "_"
  ) %>%
  dplyr::filter(dplyr::if_all(.cols = starts_with("dist_"), .fns = ~ !is.na(.x))) %>%
  dplyr::select(-starts_with("hash_"))

Analyze Mash distance matrix for all TFLs

Code
combinedDist <- dplyr::left_join(
  x = ctvDist, y = distCophen, by = c("r1_genomeId", "r2_genomeId")
) %>%
  dplyr::mutate(
    species_pair = dplyr::if_else(
      r1_species == r2_species, "intra-species", "inter-species"
    ),
    tail_loci = forcats::fct_relevel(tail_loci, "same"),
    species_pair = forcats::fct_relevel(species_pair, "intra-species")
  ) %>%
  dplyr::filter(
    dplyr::if_all(.cols = c(mash_tail, mash_conserv), .fns = ~ !is.na(.x))
  )

readr::write_tsv(
  x = combinedDist,
  file = paste(outDir, "/conserved_tfl_loci_distances.tab", sep = "")
)

distLongDf <- tidyr::pivot_longer(
  data = dplyr::select(combinedDist, !(contains("_pval") | starts_with("dash"))),
  cols = c(mash_tail, mash_conserv),
  names_to = "locus",
  values_to = "mash"
)

Visualize the distribution of cophenetic distances

Code
dens_cophn <- ggplot2::ggplot(
  data = combinedDist,
  mapping = aes(x = cophenetic)
) +
  ggplot2::geom_density() +
  ggplot2::labs(
    title = "Density distribution of cophenetic(core-phylo)"
  ) +
  theme_bw(base_size = 16)

dens_cophn

Compare Mash distances distributions between conserved and tail fiber loci

Shapiro-Wilk test to check if the Mash distances are normally distributed:

Code
shapiro.test(x = sample(combinedDist$mash_tail, size = 5000))

    Shapiro-Wilk normality test

data:  sample(combinedDist$mash_tail, size = 5000)
W = 0.92166, p-value < 2.2e-16
Code
shapiro.test(x = sample(combinedDist$mash_conserv, size = 5000))

    Shapiro-Wilk normality test

data:  sample(combinedDist$mash_conserv, size = 5000)
W = 0.94976, p-value < 2.2e-16

The Mash distances are not normally distributed. Therefore, non-parametric pairwise Wilcoxon test is used to compare Mash distance distributions of TFL and conserved loci.

Code
pairwise.wilcox.test(x = distLongDf$mash, g = distLongDf$locus)

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  distLongDf$mash and distLongDf$locus 

          mash_conserv
mash_tail <2e-16      

P value adjustment method: holm 
Code
pt_distribution <- ggplot2::ggplot(
  data = distLongDf,
  mapping = aes(x = locus, y = mash)
) +
  ggrastr::rasterize(ggbeeswarm::geom_quasirandom(), dpi = 300) +
  ggpubr::stat_compare_means(
    # method = "wilcox.test", paired = TRUE,
    label.y.npc = 0.9, label.x.npc = "center", vjust = -1, hjust = 0.5, size = 6
  ) +
  labs(y = "Mash distance") +
  scale_x_discrete(
    labels = c("mash_tail" = "TFL", "mash_conserv" = "Conserved")
  ) +
  guides(colour = guide_legend(override.aes = list(size = 5))) +
  theme_bw(base_size = 20) +
  theme(
    # axis.text.x = element_text(angle = 45, hjust = 1, face = "bold.italic"),
    axis.title.x = element_blank()
  ) 

Compare cophenetic distances of core SNP tree with inter-genome Mash distances for CTV loci. Use conserved locus for control and compare with TFL locus.

Code
scatter_ctvPhylo <- ggplot2::ggplot(
  data = dplyr::arrange(distLongDf, desc(tail_loci), species_pair),
  mapping = aes(
    x = cophenetic, y = mash
    # x = cophenetic, y = dash,
  )
) +
  ggplot2::geom_point(
    alpha = 1, size = 2,
    mapping = aes(color = tail_loci, shape = species_pair)
  ) +
  ggpubr::stat_cor(label.x.npc = 0, label.y.npc = 1, hjust = 0) +
  ggplot2::labs(
    x = "dist.cophenetic(core-phylo)",
  ) +
  ggplot2::scale_shape_manual(
    values = c("intra-species" = 1, "inter-species" = 17),
    name = "Species pairs"
  ) +
  ggplot2::scale_color_manual(
    values = c("same" = "#E69F00", "different" = "black"),
    name = "Tail locus pair"
  ) +
  ggplot2::facet_wrap(
    facets = ~locus, nrow = 1,
    labeller = ggplot2::as_labeller(
      x = c("mash_tail" = "TFL", "mash_conserv" = "Conserved")
    )
  ) +
  ggplot2::theme_bw(base_size = 18) +
  ggplot2::theme(
    legend.position = "bottom",
    legend.justification = c(0, 1),
    panel.grid = element_blank()
  )

Compare the pairwise genomic Mash distance between conserved CTV loci vs TFL loci.

Code
scatter_tailConserved <- ggplot2::ggplot(
  data = dplyr::arrange(combinedDist, desc(tail_loci), species_pair)
) +
  ggplot2::geom_point(
    mapping = aes(
      x = mash_conserv, y = mash_tail,
      # x = dash_conserv, y = dash_tail,
      color = tail_loci, shape = species_pair
    ),
    alpha = 1, size = 2
  ) +
  scale_x_continuous(expand = expansion(0.01)) +
  scale_y_continuous(expand = expansion(0.01)) +
  # geom_abline(slope = 1, color = "red", linetype = "dashed", linewidth = 1) +
  ggplot2::labs(
    x = "dist.Mash (conserved locus)",
    y = "dist.Mash (tail locus)"
  ) +
  ggplot2::scale_shape_manual(
    values = c("intra-species" = 1, "inter-species" = 17),
    name = "Species pairs"
  ) +
  ggplot2::scale_color_manual(
    values = c("same" = "#E69F00", "different" = "black"),
    name = "Tail locus pair"
  ) +
  ggplot2::theme_bw(base_size = 18) +
  ggplot2::theme(
    legend.position = c(0.01, 0.99),
    legend.justification = c(0, 1),
    # plot.margin = unit(c(0, 0, 0, 0), units = "cm"),
    panel.grid = element_blank()
  )

# sideTheme <- theme_void() +
#   theme(
#     legend.position = "none",
#     # plot.margin = unit(c(0, 0, 0, 0), units = "cm"),
#     panel.background = element_rect(color = "black", linewidth = 1)
#   )
#
# xplot <-  ggplot2::ggplot(data = combinedDist) +
#   ggplot2::geom_density(
#     mapping = aes(x = mash_conserv, linetype = species_pair), linewidth = 1
#   ) +
#   scale_x_continuous(expand = expansion(0.01)) +
#   scale_linetype_manual(
#     values = c("intra-species" = 1, "inter-species" = 2)
#   ) +
#   sideTheme
#
# yplot <- ggplot2::ggplot(data = combinedDist) +
#   ggplot2::geom_density(
#     mapping = aes(y = mash_tail, linetype = species_pair), linewidth = 1
#   ) +
#   scale_y_continuous(expand = expansion(0.01)) +
#   scale_linetype_manual(
#     values = c("intra-species" = 2, "inter-species" = 1)
#   ) +
#   sideTheme
#
# ggpubr::ggarrange(
#   xplot, NULL, scatter_tailConserved, yplot,
#   ncol = 2, nrow = 2, align = "hv",
#   widths = c(5, 1), heights = c(1, 5)
# )

Inter-species TFLs with smaller Mash distance:

Process conserved and TFL region phylogenetic trees

Tree generated from the region upstream of carotovoricin cluster locus.

Code
upstreamTree <- treeio::read.newick(
  file.path(confs$analysis$ctv$tfl_hgt$path, "iqtree", "genomeset_1_upstream_core.msa.fasta.treefile"),
  node.label = "support"
)

pt_upstream_tree <- ggtree::ggtree(tr = upstreamTree, size = 1.5) +
  scale_x_continuous(expand = expansion(mult = c(0.02, 0.2))) +
  geom_treescale(
    x = 0, y = length(upstreamTree@phylo$tip.label) * 0.8,
    fontsize = 8, linesize = 2, offset = 0.5
  ) +
  ggtree::geom_tiplab(
    mapping = aes(label = label),
    size = 5, align = TRUE, linesize = 1
  ) +
  ggtree::geom_nodelab(
    mapping = aes(label = support),
    node = "internal", size = 5, hjust = 1.3, vjust = -1, color = "red"
  )

Conserved locus tree:

Code
conservedTree <- treeio::read.newick(
  file.path(confs$analysis$ctv$tfl_hgt$path, "iqtree", "genomeset_1_conserved.msa.fasta.treefile"),
  node.label = "support"
) |> 
  dplyr::left_join(
    y = dplyr::select(tailRegions, label = regionId, haplotype),
    by = "label"
  )


pt_conserved_tree <- ggtree::ggtree(tr = conservedTree, size = 1.5) +
  scale_x_continuous(expand = expansion(mult = c(0.02, 0.3))) +
  ggtree::geom_treescale(
    x = 0, y = length(conservedTree@phylo$tip.label) * 0.8,
    fontsize = 5, linesize = 2, offset = 0.5
  ) +
  ggtree::geom_tiplab(
    mapping = aes(label = label),
    size = 3, align = TRUE, linesize = 0.7
  ) +
  ggtree::geom_nodelab(
    mapping = aes(label = support),
    node = "internal", size = 3, hjust = 1.3, vjust = -1, color = "red"
  ) +
  ggtree::geom_tippoint(mapping = aes(color = haplotype), size = 2) +
  theme(legend.position = "bottom")

# pt_conserved_tree

Tail-fiber locus tree:

Code
tflTree <- treeio::read.newick(
  file.path(confs$analysis$ctv$tfl_hgt$path, "iqtree", "genomeset_1_tfl.msa.fasta.treefile"),
  node.label = "support"
) |> 
  dplyr::left_join(
    y = dplyr::select(tailRegions, label = regionId, haplotype),
    by = "label"
  )

pt_tfl_tree <- ggtree::ggtree(tr = tflTree, size = 1) +
  scale_x_reverse(expand = expansion(mult = c(0.3, 0.02))) +
  ggtree::geom_treescale(
    x = 0, y = length(tflTree@phylo$tip.label) * 0.8,
    fontsize = 5, linesize = 2, offset = 0.5
  ) +
  ggtree::geom_tiplab(
    mapping = aes(label = label),
    size = 3, align = TRUE, linesize = 0.7, hjust = 1
  ) +
  ggtree::geom_nodelab(
    mapping = aes(label = support),
    node = "internal", size = 3, hjust = -0.5, vjust = -1, color = "red"
  ) +
  ggtree::geom_tippoint(mapping = aes(color = haplotype), size = 2) +
  theme(legend.position = "bottom")

# pt_tfl_tree

Combine the conserved and TFL trees for comparison

Back to top