Prophage clustering using syntenic Jaccard index

Author

Lakhansing Pardeshi

Published

July 20, 2025


This script cluster prophages using syntenic Jaccard index distance.

Initial setup

Code
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(here))
suppressPackageStartupMessages(library(magrittr))
suppressPackageStartupMessages(library(igraph))
suppressPackageStartupMessages(library(apcluster))
suppressPackageStartupMessages(library(dendextend))
suppressPackageStartupMessages(library(ComplexHeatmap))
suppressPackageStartupMessages(library(ape))
suppressPackageStartupMessages(library(logger))
suppressPackageStartupMessages(library(configr))

# cluster prophages to get representative phages

rm(list = ls())

source("https://raw.githubusercontent.com/lakhanp1/omics_utils/main/RScripts/utils.R")
source("scripts/utils/config_functions.R")
source("scripts/utils/homology_groups.R")
source("scripts/utils/heatmap_utils.R")
source("scripts/utils/compare_hg_sets.R")
################################################################################
set.seed(124)

confs <- prefix_config_paths(
    conf = suppressWarnings(configr::read.config(file = "project_config.yaml")),
    dir = "."
)

pangenome <- confs$data$pangenomes$pectobacterium.v2$name
panConf <- confs$data$pangenomes[[pangenome]]
prophageLenCutoff <- confs$analysis$prophages$cutoff_length

outDir <- confs$analysis$prophages$preprocessing$path

colorList <- list(
    jaccard = list(
        breaks = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.97, 1),
        colors = viridisLite::viridis(n = 13, option = "magma")
    ),
    mash = list(
        breaks = c(0, 0.03, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1),
        colors = viridisLite::viridis(n = 13, option = "magma")
    )
)

Import data

Code
sampleInfo <- get_metadata(file = panConf$files$metadata, genus = confs$genus)

sampleInfoList <- as.list_metadata(
    df = sampleInfo, sampleId, sampleName, SpeciesName, strain, nodeLabs, genomeId
)

prophagePool <- suppressMessages(
    readr::read_tsv(confs$analysis$prophages$preprocessing$files$prophage_pool)
) %>%
    dplyr::select(
        prophage_id, fragments, nFragments, parent, prophage_length, nHg, genomeId,
        completeness, checkv_quality, taxonomy
    )

fragmentedPhages <- dplyr::filter(prophagePool, nFragments > 1)
unfragPhages <- dplyr::filter(prophagePool, nFragments == 1) %>%
    dplyr::select(-parent)


simDf <- suppressMessages(
    readr::read_tsv(confs$analysis$prophages$preprocessing$files$pair_comparison)
)

mashMat <- suppressMessages(
    readr::read_tsv(confs$analysis$prophages$preprocessing$files$mash_dist)
) %>%
    tibble::column_to_rownames(var = "prophage_id") %>%
    as.matrix()

mashMat <- mashMat[unfragPhages$prophage_id, unfragPhages$prophage_id]

mashUpgma <- ape::read.tree(confs$analysis$prophages$preprocessing$files$mash_hclust)

Import HGs for raw prophages stored locally and map them to consolidated prophages

Code
# read prophage HGs stored locally
rawProHgs <- suppressMessages(
    readr::read_tsv(confs$analysis$prophages$preprocessing$files$raw_prophage_hg)
) %>%
    dplyr::select(prophage_id = id, hgs) %>%
    dplyr::mutate(
        hgs = stringr::str_split(hgs, ";")
    )

# merge the HGs from multiple prophage fragments when they are merged
prophageHgs <- dplyr::select(prophagePool, prophage_id, fragments) %>%
    dplyr::mutate(fragments = stringr::str_split(fragments, ";")) %>%
    tidyr::unnest(fragments) %>%
    dplyr::left_join(y = rawProHgs, by = c("fragments" = "prophage_id")) %>%
    dplyr::summarize(
        hgs = list(unique(unlist(hgs))),
        .by = prophage_id
    )

Total prophages to be clustered: 1369

Process syntenic Jaccard similarity and MASH distance data

Code
# process data
phageCmpDf <- dplyr::bind_rows(
    simDf,
    dplyr::rename(simDf, p2 = phage1, p1 = phage2) %>%
        dplyr::rename(phage1 = p1, phage2 = p2)
)

allPhageJaccardMat <- tidyr::pivot_wider(
    phageCmpDf,
    id_cols = phage1,
    names_from = phage2,
    values_from = jaccardIndex
) %>%
    tibble::column_to_rownames(var = "phage1") %>%
    as.matrix()
Important

To remove the noise while clustering exclude:

  • fragmented prophages
  • all the prophages that show syntenic Jaccard index 0.5 or lower against other prophages
Code
jaccardMat <- allPhageJaccardMat[unfragPhages$prophage_id, unfragPhages$prophage_id]

if (all(is.na(diag(jaccardMat)))) {
    diag(jaccardMat) <- 1
}

if (!isSymmetric(jaccardMat)) {
    stop("pairwise Jaccard index matrix is not symmetric")
}

# convert to distance matrix
jacDistMat <- max(jaccardMat) - jaccardMat

quantile(jaccardMat, c(0, 0.25, 0.5, 0.75, seq(0.9, 0.99, by = 0.01), 0.995, 1))
    0%    25%    50%    75%    90%    91%    92%    93%    94%    95%    96% 
0.0000 0.0000 0.0000 0.0000 0.2647 0.6296 0.6800 0.7200 0.7391 0.7500 0.7826 
   97%    98%    99%  99.5%   100% 
0.8000 0.8261 0.9000 0.9524 1.0000 
Code
breakPoint <- 0.5
tempJcMat <- jaccardMat
diag(tempJcMat) <- NA
maxJc <- matrixStats::rowMaxs(tempJcMat, na.rm = TRUE, useNames = TRUE)

noisyNodes <- which(maxJc <= breakPoint)
trimmedNodes <- which(maxJc > breakPoint)
trimmedJaccardMat <- jaccardMat[unname(trimmedNodes), unname(trimmedNodes)]

Hierarchical clustering of Jaccard similarity

A heatmap showing the syntenic Jaccard index between the singleton nodes identified above and the remaining nodes that will be used for clustering.

Code
ht_noise <- Heatmap(
    matrix = jaccardMat[unname(noisyNodes), unname(trimmedNodes)],
    name = "noisy_jaccard",
    column_title = "selected nodes for clustering",
    row_title = "excluded singleton prophages",
    col = circlize::colorRamp2(
        breaks = colorList$mash$breaks, colors = colorList$mash$colors
    ),
    heatmap_legend_param = list(
        direction = "horizontal", legend_width = unit(5, "cm")
    ),
    use_raster = TRUE, raster_quality = 3,
    show_row_names = FALSE, show_column_names = FALSE
)

ComplexHeatmap::draw(
    ht_noise,
    column_title = "Syntenic Jaccard index",
    column_title_gp = gpar(fontsize = 16, fontface = "bold"),
    heatmap_legend_side = "bottom"
)

Important

Noisy nodes identified above will be excluded from the hierarchical clustering of the prophages based on syntenic Jaccard index. Later, these noisy nodes will be added as singletons to the clusters identified by hclust.

Code
phageHc <- hclust(
    as.dist(jacDistMat[unname(trimmedNodes), unname(trimmedNodes)]),
    method = "complete"
)
# plot(phageHc, hang = -1)

# hclust(as.dist(jacDistMat), method = "ward.D2") %>%
#   as.dendrogram() %>%
#   dendextend::ladderize() %>%
#   plot(horiz = TRUE)

phageDend <- as.dendrogram(phageHc) %>%
    dendextend::ladderize()

phageDend %>%
    dendextend::get_nodes_attr("height") %>%
    hist()

Code
# dendextend::cutree(as.dendrogram(blacl), h = 0.8) %>% table()

phagePhylo <- ape::as.phylo(phageDend)
Code
rev(phageDend) %>%
    plot(
        horiz = TRUE,
        main = "Hierarchical clustering of syntenic Jaccard distance"
    )

Visualize the clusters with data

Syntenic Jaccard distance

Important

The heatmap below excludes obvious singletons i.e. noisy nodes identified above (n = 318). Thus this heatmap shows comparison of 1001 prophages.

Code
ht_jaccard <- plot_species_ANI_heatmap(
    mat = jaccardMat[phagePhylo$tip.label, phagePhylo$tip.label],
    phy = phagePhylo,
    name = "jaccard",
    column_title = "Syntenic Jaccard index",
    col = circlize::colorRamp2(
        breaks = colorList$jaccard$breaks, colors = colorList$jaccard$colors
    ),
    show_column_dend = TRUE, column_dend_height = unit(3, "cm"),
    heatmap_legend_param = list(
        direction = "horizontal", legend_width = unit(5, "cm")
    ),
    use_raster = TRUE, raster_quality = 2
)

Add MASH distance heatmap

Code
ht_mash <- plot_species_ANI_heatmap(
    mat = mashMat[phagePhylo$tip.label, phagePhylo$tip.label],
    phy = phagePhylo,
    name = "mash",
    column_title = "MASH distance",
    show_column_dend = FALSE,
    col = circlize::colorRamp2(
        breaks = colorList$mash$breaks, colors = colorList$mash$colors
    ),
    heatmap_legend_param = list(
        direction = "horizontal", legend_width = unit(5, "cm")
    ),
    use_raster = TRUE, raster_quality = 3
)

htList <- ht_jaccard + ht_mash
quartz_off_screen 
                2 

Cut tree to generate clusters

Important

Here, the noisy nodes will be added as singletons to the prophage clusters.

Code
treeCut <- dendextend::cutree(tree = phageDend, h = 0.66)

phageGroups <- tibble::enframe(
    treeCut,
    name = "prophage_id", value = "phage_grp"
) %>%
    dplyr::bind_rows(
        tibble::tibble(
            prophage_id = names(noisyNodes),
            phage_grp = (1:length(noisyNodes)) + length(unique(treeCut))
        )
    ) %>%
    dplyr::mutate(
        phage_grp = paste("phage_grp_", phage_grp, sep = "")
    ) %>%
    dplyr::left_join(unfragPhages, by = "prophage_id")

Finally, add the 50 fragmented prophages which could be mapped to a prophage in the pool to the respective clusters of their best matching parent prophage.

Code
fragmentsToAdd <- dplyr::left_join(
    x = fragmentedPhages,
    y = dplyr::select(phageGroups, prophage_id, phage_grp),
    by = c("parent" = "prophage_id")
) %>%
    dplyr::select(-parent)

phageGroups <- dplyr::bind_rows(phageGroups, fragmentsToAdd) %>%
    tidyr::replace_na(list(nFragments = 1)) %>%
    dplyr::mutate(
        fragments = dplyr::if_else(is.na(fragments), prophage_id, fragments)
    ) %>%
    dplyr::select(prophage_id, phage_grp, fragments, nFragments, everything())

Cluster representatives are determined based on the mean Jaccard index of cluster members against all members. The cluster member with highest mean Jaccard index against the cluster members is selected as a cluster representative.

Cluster representatives are determined using the following criteria:

  • Completeness of the prophages in the cluster determined by checkV (higher -> better)
  • mean Jaccard index of the prophage against cluster members

Prophages in the cluster are ranked based on these two criteria and the best prophage is selected as the cluster representative.

Code
# get cluster roots
clusterRoots <- split(x = phageGroups$prophage_id, f = phageGroups$phage_grp) %>%
    # .[c("phage_grp_114", "phage_grp_12", "phage_grp_131", "phage_grp_132", "phage_grp_173", "phage_grp_174")] %>%
    purrr::map_dfr(
        .f = function(x) {
            # root -> highest mean Jaccard index across group
            if (length(x) == 1) {
                rm <- setNames(object = 1, nm = x)
            } else {
                subJc <- allPhageJaccardMat[x, x]
                diag(subJc) <- NA

                rm <- matrixStats::rowMeans2(subJc, na.rm = TRUE, useNames = TRUE) %>%
                    sort(decreasing = TRUE) %>%
                    round(digits = 3)
            }

            return(
                tibble::tibble(
                    prophage_id = names(rm),
                    mean_grp_sim = rm,
                    grp_size = length(x),
                )
            )
        }
    )

Combine the clusters with cluster roots.

Code
rootedClusters <- dplyr::left_join(
    phageGroups, clusterRoots,
    by = "prophage_id"
) %>%
    dplyr::group_by(phage_grp) %>%
    dplyr::arrange(
        dplyr::desc(completeness),
        dplyr::desc(prophage_length),
        dplyr::desc(mean_grp_sim),
        .by_group = TRUE
    ) %>%
    dplyr::mutate(is_root = 1:n()) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(
        is_root = dplyr::if_else(is_root == 1, 1, 0)
    ) %>%
    dplyr::left_join(y = prophageHgs, by = "prophage_id")


representatives <- dplyr::filter(rootedClusters, is_root == 1)

rootedClusters %<>% dplyr::left_join(
    y = dplyr::select(representatives, phage_grp, root_id = prophage_id),
    by = "phage_grp"
)

Clustering statistics

Total prophages used for clustering: 1369

Total unique homology groups from prophages before clustering: 4801

Total prophage clusters: 436

Total unique homology groups of prophage representatives after clustering: 4493

Code
phageClusters <- dplyr::select(rootedClusters, -hgs) %>%
    dplyr::arrange(
        dplyr::desc(grp_size), phage_grp, desc(completeness), desc(mean_grp_sim)
    ) %>%
    dplyr::left_join(
        y = dplyr::select(
            sampleInfo, genomeId, sampleId, SpeciesName, nodepath.kmer_upgma,
            geo_loc_country, host, isolation_source, env_broad_scale, collection_year
        ),
        by = "genomeId"
    )

readr::write_tsv(
    phageClusters,
    file = confs$analysis$prophages$files$clusters
)
Note

What is the variation in genome ANI of the phages in the same group?

If the clustering is good, the representatives prophages should not have high syntenic Jaccard index against other representative prophages.

Code
repJacMat <- jaccardMat[representatives$prophage_id, representatives$prophage_id]

diag(repJacMat) <- NA

matrixStats::rowMaxs(repJacMat, useNames = TRUE, na.rm = TRUE) %>%
    sort(decreasing = TRUE) %>%
    head()
g_142.vir_2 g_327.vir_6 g_233.vir_4 g_150.vir_2   g_3.vir_3   g_4.vir_3 
     0.5333      0.5333      0.5167      0.5167      0.5091      0.5091 

Draw figures again with representatives

Representative prophages syntenic Jaccard index heatmap

Code
repDend <- hclust(
    as.dist(
        jacDistMat[representatives$prophage_id, representatives$prophage_id]
    ),
    method = "complete"
) %>%
    as.dendrogram() %>%
    dendextend::ladderize()

ht_jaccardRep <- plot_species_ANI_heatmap(
    mat = jaccardMat[representatives$prophage_id, representatives$prophage_id],
    phy = repDend,
    name = "jaccard",
    column_title = "Syntenic Jaccard index for representative prophages",
    col = circlize::colorRamp2(
        breaks = colorList$jaccard$breaks, colors = colorList$jaccard$colors
    ),
    use_raster = TRUE, raster_quality = 2,
    heatmap_legend_param = list(
        direction = "horizontal", legend_width = unit(5, "cm")
    )
)

ht_mashRep <- plot_species_ANI_heatmap(
    mat = mashMat[representatives$prophage_id, representatives$prophage_id],
    phy = repDend,
    name = "mash",
    column_title = "MASH distance",
    col = circlize::colorRamp2(
        breaks = colorList$mash$breaks, colors = colorList$mash$colors
    ),
    use_raster = TRUE, raster_quality = 2,
    heatmap_legend_param = list(
        direction = "horizontal", legend_width = unit(5, "cm")
    )
)

htListRep <- ht_jaccardRep + ht_mashRep

Visualize the final dendrogram

All the 436 representatives which includes noisy nodes (n = 318) and cluster representatives (n = 118) obtained by clustering 1001 prophages, are visualized in this dendrogram. The later 118 nodes are colored in red.

Code
rev(repDend) %>%
    dendextend::set(
        what = "by_labels_branches_col",
        value = setdiff(representatives$prophage_id, names(noisyNodes))
    ) %>%
    dendextend::set("labels_cex", 0.5) %>%
    plot(
        horiz = TRUE,
        main = "Cluster representatives dendrogram"
    )

Back to top