---
title: "Prophage clustering using syntenic Jaccard index"
date: "`r Sys.Date()`"
format:
html:
embed-resources: true
df-print: paged
knitr:
opts_chunk:
fig.height: 7
---
***
This script cluster prophages using syntenic Jaccard index distance.
## Initial setup
```{r}
#| label: setup
#| warning: false
suppressPackageStartupMessages (library (tidyverse))
suppressPackageStartupMessages (library (here))
suppressPackageStartupMessages (library (magrittr))
suppressPackageStartupMessages (library (igraph))
suppressPackageStartupMessages (library (apcluster))
suppressPackageStartupMessages (library (dendextend))
suppressPackageStartupMessages (library (ComplexHeatmap))
suppressPackageStartupMessages (library (ape))
suppressPackageStartupMessages (library (logger))
suppressPackageStartupMessages (library (configr))
# cluster prophages to get representative phages
rm (list = ls ())
source ("https://raw.githubusercontent.com/lakhanp1/omics_utils/main/RScripts/utils.R" )
source ("scripts/utils/config_functions.R" )
source ("scripts/utils/homology_groups.R" )
source ("scripts/utils/heatmap_utils.R" )
source ("scripts/utils/compare_hg_sets.R" )
################################################################################
set.seed (124 )
confs <- prefix_config_paths (
conf = suppressWarnings (configr:: read.config (file = "project_config.yaml" )),
dir = "."
)
pangenome <- confs$ data$ pangenomes$ pectobacterium.v2$ name
panConf <- confs$ data$ pangenomes[[pangenome]]
prophageLenCutoff <- confs$ analysis$ prophages$ cutoff_length
outDir <- confs$ analysis$ prophages$ preprocessing$ path
colorList <- list (
jaccard = list (
breaks = c (0 , 0.1 , 0.2 , 0.3 , 0.4 , 0.5 , 0.6 , 0.7 , 0.8 , 0.9 , 0.95 , 0.97 , 1 ),
colors = viridisLite:: viridis (n = 13 , option = "magma" )
),
mash = list (
breaks = c (0 , 0.03 , 0.05 , 0.1 , 0.15 , 0.2 , 0.3 , 0.4 , 0.5 , 0.6 , 0.7 , 0.8 , 1 ),
colors = viridisLite:: viridis (n = 13 , option = "magma" )
)
)
```
## Import data
```{r}
sampleInfo <- get_metadata (file = panConf$ files$ metadata, genus = confs$ genus)
sampleInfoList <- as.list_metadata (
df = sampleInfo, sampleId, sampleName, SpeciesName, strain, nodeLabs, genomeId
)
prophagePool <- suppressMessages (
readr:: read_tsv (confs$ analysis$ prophages$ preprocessing$ files$ prophage_pool)
) %>%
dplyr:: select (
prophage_id, fragments, nFragments, parent, prophage_length, nHg, genomeId,
completeness, checkv_quality, taxonomy
)
fragmentedPhages <- dplyr:: filter (prophagePool, nFragments > 1 )
unfragPhages <- dplyr:: filter (prophagePool, nFragments == 1 ) %>%
dplyr:: select (- parent)
simDf <- suppressMessages (
readr:: read_tsv (confs$ analysis$ prophages$ preprocessing$ files$ pair_comparison)
)
mashMat <- suppressMessages (
readr:: read_tsv (confs$ analysis$ prophages$ preprocessing$ files$ mash_dist)
) %>%
tibble:: column_to_rownames (var = "prophage_id" ) %>%
as.matrix ()
mashMat <- mashMat[unfragPhages$ prophage_id, unfragPhages$ prophage_id]
mashUpgma <- ape:: read.tree (confs$ analysis$ prophages$ preprocessing$ files$ mash_hclust)
```
Import HGs for raw prophages stored locally and map them to consolidated prophages
```{r}
# read prophage HGs stored locally
rawProHgs <- suppressMessages (
readr:: read_tsv (confs$ analysis$ prophages$ preprocessing$ files$ raw_prophage_hg)
) %>%
dplyr:: select (prophage_id = id, hgs) %>%
dplyr:: mutate (
hgs = stringr:: str_split (hgs, ";" )
)
# merge the HGs from multiple prophage fragments when they are merged
prophageHgs <- dplyr:: select (prophagePool, prophage_id, fragments) %>%
dplyr:: mutate (fragments = stringr:: str_split (fragments, ";" )) %>%
tidyr:: unnest (fragments) %>%
dplyr:: left_join (y = rawProHgs, by = c ("fragments" = "prophage_id" )) %>%
dplyr:: summarize (
hgs = list (unique (unlist (hgs))),
.by = prophage_id
)
```
Total prophages to be clustered: `r nrow(prophagePool)`
## Process syntenic Jaccard similarity and MASH distance data
```{r}
# process data
phageCmpDf <- dplyr:: bind_rows (
simDf,
dplyr:: rename (simDf, p2 = phage1, p1 = phage2) %>%
dplyr:: rename (phage1 = p1, phage2 = p2)
)
allPhageJaccardMat <- tidyr:: pivot_wider (
phageCmpDf,
id_cols = phage1,
names_from = phage2,
values_from = jaccardIndex
) %>%
tibble:: column_to_rownames (var = "phage1" ) %>%
as.matrix ()
```
:::{.callout-important}
To remove the noise while clustering exclude:
- fragmented prophages
- all the prophages that show syntenic Jaccard index 0.5 or lower
against other prophages
:::
```{r}
jaccardMat <- allPhageJaccardMat[unfragPhages$ prophage_id, unfragPhages$ prophage_id]
if (all (is.na (diag (jaccardMat)))) {
diag (jaccardMat) <- 1
}
if (! isSymmetric (jaccardMat)) {
stop ("pairwise Jaccard index matrix is not symmetric" )
}
# convert to distance matrix
jacDistMat <- max (jaccardMat) - jaccardMat
quantile (jaccardMat, c (0 , 0.25 , 0.5 , 0.75 , seq (0.9 , 0.99 , by = 0.01 ), 0.995 , 1 ))
```
```{r}
breakPoint <- 0.5
tempJcMat <- jaccardMat
diag (tempJcMat) <- NA
maxJc <- matrixStats:: rowMaxs (tempJcMat, na.rm = TRUE , useNames = TRUE )
noisyNodes <- which (maxJc <= breakPoint)
trimmedNodes <- which (maxJc > breakPoint)
trimmedJaccardMat <- jaccardMat[unname (trimmedNodes), unname (trimmedNodes)]
```
## Hierarchical clustering of Jaccard similarity
A heatmap showing the syntenic Jaccard index between the singleton nodes
identified above and the remaining nodes that will be used for clustering.
```{r}
#| column: page
#| fig-height: 7
#| fig-width: 13
#| out-width: '100%'
#| layout-valign: top
ht_noise <- Heatmap (
matrix = jaccardMat[unname (noisyNodes), unname (trimmedNodes)],
name = "noisy_jaccard" ,
column_title = "selected nodes for clustering" ,
row_title = "excluded singleton prophages" ,
col = circlize:: colorRamp2 (
breaks = colorList$ mash$ breaks, colors = colorList$ mash$ colors
),
heatmap_legend_param = list (
direction = "horizontal" , legend_width = unit (5 , "cm" )
),
use_raster = TRUE , raster_quality = 3 ,
show_row_names = FALSE , show_column_names = FALSE
)
ComplexHeatmap:: draw (
ht_noise,
column_title = "Syntenic Jaccard index" ,
column_title_gp = gpar (fontsize = 16 , fontface = "bold" ),
heatmap_legend_side = "bottom"
)
```
:::{.callout-important}
Noisy nodes identified above will be excluded from the hierarchical clustering
of the prophages based on syntenic Jaccard index. Later, these noisy nodes will
be added as singletons to the clusters identified by `hclust` .
:::
```{r}
phageHc <- hclust (
as.dist (jacDistMat[unname (trimmedNodes), unname (trimmedNodes)]),
method = "complete"
)
# plot(phageHc, hang = -1)
# hclust(as.dist(jacDistMat), method = "ward.D2") %>%
# as.dendrogram() %>%
# dendextend::ladderize() %>%
# plot(horiz = TRUE)
phageDend <- as.dendrogram (phageHc) %>%
dendextend:: ladderize ()
phageDend %>%
dendextend:: get_nodes_attr ("height" ) %>%
hist ()
# dendextend::cutree(as.dendrogram(blacl), h = 0.8) %>% table()
phagePhylo <- ape:: as.phylo (phageDend)
```
```{r}
#| fig-height: 10
#| fig-width: 7
#| out-width: '100%'
#| layout-valign: top
rev (phageDend) %>%
plot (
horiz = TRUE ,
main = "Hierarchical clustering of syntenic Jaccard distance"
)
```
## Visualize the clusters with data
### Syntenic Jaccard distance
:::{.callout-important}
The heatmap below excludes obvious singletons i.e. noisy nodes identified above
(n = `r length(noisyNodes)` ). Thus this heatmap shows comparison of
`r length(trimmedNodes)` prophages.
:::
```{r}
ht_jaccard <- plot_species_ANI_heatmap (
mat = jaccardMat[phagePhylo$ tip.label, phagePhylo$ tip.label],
phy = phagePhylo,
name = "jaccard" ,
column_title = "Syntenic Jaccard index" ,
col = circlize:: colorRamp2 (
breaks = colorList$ jaccard$ breaks, colors = colorList$ jaccard$ colors
),
show_column_dend = TRUE , column_dend_height = unit (3 , "cm" ),
heatmap_legend_param = list (
direction = "horizontal" , legend_width = unit (5 , "cm" )
),
use_raster = TRUE , raster_quality = 2
)
```
```{r}
#| echo: false
#| column: page
#| fig-height: 8
#| fig-width: 10
#| out-width: '100%'
#| layout-valign: top
ComplexHeatmap:: draw (
ht_jaccard,
heatmap_legend_side = "bottom"
)
```
### Add MASH distance heatmap
```{r}
ht_mash <- plot_species_ANI_heatmap (
mat = mashMat[phagePhylo$ tip.label, phagePhylo$ tip.label],
phy = phagePhylo,
name = "mash" ,
column_title = "MASH distance" ,
show_column_dend = FALSE ,
col = circlize:: colorRamp2 (
breaks = colorList$ mash$ breaks, colors = colorList$ mash$ colors
),
heatmap_legend_param = list (
direction = "horizontal" , legend_width = unit (5 , "cm" )
),
use_raster = TRUE , raster_quality = 3
)
htList <- ht_jaccard + ht_mash
```
```{r}
#| echo: false
#| column: page
#| fig-height: 8
#| fig-width: 15
#| out-width: '200%'
#| layout-valign: top
pdf (
file = file.path (outDir, "prophage_jaccard_clustering.pdf" ),
width = 15 , height = 8
)
ComplexHeatmap:: draw (
htList,
main_heatmap = "jaccard" ,
row_dend_side = "left" ,
heatmap_legend_side = "bottom"
)
dev.off ()
ComplexHeatmap:: draw (
htList,
main_heatmap = "jaccard" ,
row_dend_side = "left" ,
heatmap_legend_side = "bottom"
)
```
## Cut tree to generate clusters
:::{.callout-important}
Here, the noisy nodes will be added as singletons to the prophage clusters.
:::
```{r}
treeCut <- dendextend:: cutree (tree = phageDend, h = 0.66 )
phageGroups <- tibble:: enframe (
treeCut,
name = "prophage_id" , value = "phage_grp"
) %>%
dplyr:: bind_rows (
tibble:: tibble (
prophage_id = names (noisyNodes),
phage_grp = (1 : length (noisyNodes)) + length (unique (treeCut))
)
) %>%
dplyr:: mutate (
phage_grp = paste ("phage_grp_" , phage_grp, sep = "" )
) %>%
dplyr:: left_join (unfragPhages, by = "prophage_id" )
```
Finally, add the `r nrow(fragmentedPhages)` fragmented prophages which could be
mapped to a prophage in the pool to the respective clusters of their best
matching parent prophage.
```{r}
fragmentsToAdd <- dplyr:: left_join (
x = fragmentedPhages,
y = dplyr:: select (phageGroups, prophage_id, phage_grp),
by = c ("parent" = "prophage_id" )
) %>%
dplyr:: select (- parent)
phageGroups <- dplyr:: bind_rows (phageGroups, fragmentsToAdd) %>%
tidyr:: replace_na (list (nFragments = 1 )) %>%
dplyr:: mutate (
fragments = dplyr:: if_else (is.na (fragments), prophage_id, fragments)
) %>%
dplyr:: select (prophage_id, phage_grp, fragments, nFragments, everything ())
```
Cluster representatives are determined based on the mean Jaccard index of cluster
members against all members. The cluster member with highest mean Jaccard index
against the cluster members is selected as a cluster representative.
Cluster representatives are determined using the following criteria:
- Completeness of the prophages in the cluster determined by checkV (higher -> better)
- mean Jaccard index of the prophage against cluster members
Prophages in the cluster are ranked based on these two criteria and the best
prophage is selected as the cluster representative.
```{r}
# get cluster roots
clusterRoots <- split (x = phageGroups$ prophage_id, f = phageGroups$ phage_grp) %>%
# .[c("phage_grp_114", "phage_grp_12", "phage_grp_131", "phage_grp_132", "phage_grp_173", "phage_grp_174")] %>%
purrr:: map_dfr (
.f = function (x) {
# root -> highest mean Jaccard index across group
if (length (x) == 1 ) {
rm <- setNames (object = 1 , nm = x)
} else {
subJc <- allPhageJaccardMat[x, x]
diag (subJc) <- NA
rm <- matrixStats:: rowMeans2 (subJc, na.rm = TRUE , useNames = TRUE ) %>%
sort (decreasing = TRUE ) %>%
round (digits = 3 )
}
return (
tibble:: tibble (
prophage_id = names (rm),
mean_grp_sim = rm,
grp_size = length (x),
)
)
}
)
```
Combine the clusters with cluster roots.
```{r}
rootedClusters <- dplyr:: left_join (
phageGroups, clusterRoots,
by = "prophage_id"
) %>%
dplyr:: group_by (phage_grp) %>%
dplyr:: arrange (
dplyr:: desc (completeness),
dplyr:: desc (prophage_length),
dplyr:: desc (mean_grp_sim),
.by_group = TRUE
) %>%
dplyr:: mutate (is_root = 1 : n ()) %>%
dplyr:: ungroup () %>%
dplyr:: mutate (
is_root = dplyr:: if_else (is_root == 1 , 1 , 0 )
) %>%
dplyr:: left_join (y = prophageHgs, by = "prophage_id" )
representatives <- dplyr:: filter (rootedClusters, is_root == 1 )
rootedClusters %<>% dplyr:: left_join (
y = dplyr:: select (representatives, phage_grp, root_id = prophage_id),
by = "phage_grp"
)
```
## Clustering statistics
Total prophages used for clustering: `r nrow(rootedClusters)`
Total unique homology groups from prophages before clustering:
`r length(unique(unlist(rootedClusters$hgs)))`
Total prophage clusters: `r nrow(representatives)`
Total unique homology groups of prophage representatives after clustering:
`r length(unique(unlist(representatives$hgs)))`
```{r}
phageClusters <- dplyr:: select (rootedClusters, - hgs) %>%
dplyr:: arrange (
dplyr:: desc (grp_size), phage_grp, desc (completeness), desc (mean_grp_sim)
) %>%
dplyr:: left_join (
y = dplyr:: select (
sampleInfo, genomeId, sampleId, SpeciesName, nodepath.kmer_upgma,
geo_loc_country, host, isolation_source, env_broad_scale, collection_year
),
by = "genomeId"
)
readr:: write_tsv (
phageClusters,
file = confs$ analysis$ prophages$ files$ clusters
)
```
:::{.callout-note}
What is the variation in genome ANI of the phages in the same group?
:::
If the clustering is good, the representatives prophages should not have high
syntenic Jaccard index against other representative prophages.
```{r}
repJacMat <- jaccardMat[representatives$ prophage_id, representatives$ prophage_id]
diag (repJacMat) <- NA
matrixStats:: rowMaxs (repJacMat, useNames = TRUE , na.rm = TRUE ) %>%
sort (decreasing = TRUE ) %>%
head ()
```
## Draw figures again with representatives
### Representative prophages syntenic Jaccard index heatmap
```{r}
repDend <- hclust (
as.dist (
jacDistMat[representatives$ prophage_id, representatives$ prophage_id]
),
method = "complete"
) %>%
as.dendrogram () %>%
dendextend:: ladderize ()
ht_jaccardRep <- plot_species_ANI_heatmap (
mat = jaccardMat[representatives$ prophage_id, representatives$ prophage_id],
phy = repDend,
name = "jaccard" ,
column_title = "Syntenic Jaccard index for representative prophages" ,
col = circlize:: colorRamp2 (
breaks = colorList$ jaccard$ breaks, colors = colorList$ jaccard$ colors
),
use_raster = TRUE , raster_quality = 2 ,
heatmap_legend_param = list (
direction = "horizontal" , legend_width = unit (5 , "cm" )
)
)
ht_mashRep <- plot_species_ANI_heatmap (
mat = mashMat[representatives$ prophage_id, representatives$ prophage_id],
phy = repDend,
name = "mash" ,
column_title = "MASH distance" ,
col = circlize:: colorRamp2 (
breaks = colorList$ mash$ breaks, colors = colorList$ mash$ colors
),
use_raster = TRUE , raster_quality = 2 ,
heatmap_legend_param = list (
direction = "horizontal" , legend_width = unit (5 , "cm" )
)
)
htListRep <- ht_jaccardRep + ht_mashRep
```
```{r}
#| echo: false
#| column: page
#| fig-height: 8
#| fig-width: 15
#| out-width: '200%'
#| layout-valign: top
ComplexHeatmap:: draw (
htListRep,
main_heatmap = "jaccard" ,
row_dend_side = "left" ,
heatmap_legend_side = "bottom"
)
```
:::{.scrolling_y}
### Visualize the final dendrogram
All the `r nrow(representatives)` representatives which includes noisy nodes
(n = `r length(noisyNodes)` ) and cluster representatives (n = `r max(treeCut)` )
obtained by clustering `r length(trimmedNodes)` prophages, are visualized in this
dendrogram. The later `r max(treeCut)` nodes are colored in red.
```{r}
#| column: page
#| fig-height: 30
#| fig-width: 10
#| out-width: '100%'
#| layout-valign: top
rev (repDend) %>%
dendextend:: set (
what = "by_labels_branches_col" ,
value = setdiff (representatives$ prophage_id, names (noisyNodes))
) %>%
dendextend:: set ("labels_cex" , 0.5 ) %>%
plot (
horiz = TRUE ,
main = "Cluster representatives dendrogram"
)
```
:::