---
title: "CTV HGT using Mash distance for variable and constant locus"
date: "`r Sys.time()`"
format:
html:
embed-resources: true
df-print: paged
knitr:
opts_chunk:
fig.height: 7
---
Pangenome analysis of the carotovoricin (CTV) region shows a constant and
variable loci. The tail fiber loci (TFL) is the most variable locus followed by
the tail tape measure protein. Below, we explore this variable loci at the
pangenome scale.
## Initial setup
```{r}
#| label: setup
#| warning: false
suppressPackageStartupMessages (library (tidyverse))
suppressPackageStartupMessages (library (ape))
suppressPackageStartupMessages (library (treeio))
suppressPackageStartupMessages (library (ggtree))
suppressPackageStartupMessages (library (ggbeeswarm))
suppressPackageStartupMessages (library (ggrastr))
suppressPackageStartupMessages (library (cowplot))
suppressPackageStartupMessages (library (patchwork))
suppressPackageStartupMessages (library (ComplexHeatmap))
suppressPackageStartupMessages (library (org.Pectobacterium.spp.pan.eg.db))
rm (list = ls ())
source ("https://raw.githubusercontent.com/lakhanp1/omics_utils/main/RScripts/utils.R" )
source ("scripts/utils/config_functions.R" )
source ("scripts/utils/phylogeny_functions.R" )
source ("scripts/utils/homology_groups.R" )
source ("scripts/utils/heatmap_utils.R" )
################################################################################
set.seed (124 )
confs <- prefix_config_paths (
conf = suppressWarnings (configr:: read.config (file = "project_config.yaml" )),
dir = "."
)
outDir <- paste (confs$ analysis$ ctv$ tfl_hgt$ path)
pangenome <- confs$ data$ pangenomes$ pectobacterium.v2$ name
panConf <- confs$ data$ pangenomes[[pangenome]]
panOrgDb <- org.Pectobacterium.spp.pan.eg.db
prophageLenCutoff <- confs$ analysis$ prophages$ cutoff_length
treeMethod <- "core_snp_ml" # ani_upgma, kmer_nj
```
## Import data
```{r}
sampleInfo <- get_metadata (file = panConf$ files$ metadata, genus = confs$ genus)
sampleInfoList <- as.list_metadata (
df = sampleInfo, sampleId, sampleName, SpeciesName, strain, nodeLabs, genomeId
)
rawTree <- import_tree (
confs$ analysis$ phylogeny[[treeMethod]]$ files$ tree_rooted,
phylo = TRUE
)
rawTree <- ape:: ladderize (rawTree, right = FALSE )
speciesOrder <- suppressMessages (
readr:: read_tsv (confs$ analysis$ phylogeny[[treeMethod]]$ files$ species_order)
) %>%
dplyr:: mutate (order = n (): 1 )
distCophen <- tibble:: as_tibble (x = cophenetic.phylo (rawTree), rownames = "r1_genomeId" ) %>%
tidyr:: pivot_longer (
cols = - r1_genomeId,
names_to = "r2_genomeId" , values_to = "cophenetic"
)
```
## Import tail regions and distance tables
Build a pairwise tail loci comparison table
```{r}
tailRegions <- suppressMessages (
readr:: read_tsv (confs$ analysis$ ctv$ tfl$ files$ hg_regions)
) %>%
dplyr:: left_join (
y = dplyr:: select (
sampleInfo, genomeId, SpeciesName, geo_loc_country, collection_year,
collection_year, nodepath.kmer_nj, host, isolation_source
),
by = "genomeId"
)
# make unique combinations for ctv regions
tailCombs <- tidyr:: expand_grid (
r1 = tailRegions$ regionId, r2 = tailRegions$ regionId
) %>%
dplyr:: filter (r1 != r2) %>%
dplyr:: mutate (
pair = paste (pmin (r1, r2), pmax (r1, r2), sep = "," )
) %>%
dplyr:: distinct (pair, .keep_all = TRUE ) %>%
dplyr:: select (- pair) %>%
dplyr:: left_join (
y = dplyr:: select (tailRegions, regionId, r1_tail = haplotype),
by = c ("r1" = "regionId" )
) %>%
dplyr:: left_join (
y = dplyr:: select (tailRegions, regionId, r2_tail = haplotype),
by = c ("r2" = "regionId" )
) %>%
dplyr:: mutate (
tail_loci = dplyr:: if_else (
r1_tail == r2_tail, "same" , "different"
)
)
```
Import Mash distance
```{r}
tailMash <- suppressMessages (
readr:: read_tsv (
file = confs$ analysis$ ctv$ tfl$ files$ mash_dist,
col_names = c ("r1" , "r2" , "mash_tail" , "mash_pval_tail" , "hash_tail" )
)
)
conservedMash <- suppressMessages (
readr:: read_tsv (
file = confs$ analysis$ ctv$ conserved$ files$ mash_dist,
col_names = c ("r1" , "r2" , "mash_conserv" , "mash_pval_conserved" , "hash_conserved" )
)
)
```
Import Jaccard distance from `dashing` tool
```{r}
import_dashing_matrix <- function (file) {
dashingDist <- suppressMessages (readr:: read_tsv (file = file)) %>%
dplyr:: rename (g1 = "##Names" ) %>%
dplyr:: rename_with (
.fn = ~ stringr:: str_replace (
string = .x, pattern = ".*/(g_ \\ d+ \\ ..*) \\ .fasta" , replacement = " \\ 1"
)
) %>%
dplyr:: mutate (
g1 = stringr:: str_replace (
string = g1, pattern = ".*/(g_ \\ d+ \\ ..*) \\ .fasta" , replacement = " \\ 1"
)
) %>%
tidyr:: pivot_longer (cols = - g1, names_to = "g2" , values_to = "distance" ) %>%
dplyr:: filter (distance != "-" ) %>%
dplyr:: mutate (
distance = as.numeric (distance)
)
return (
dplyr:: bind_rows (
dplyr:: rename (dashingDist, r1 = g1, r2 = g2),
dplyr:: rename (dashingDist, r1 = g2, r2 = g1)
)
)
}
tailDash <- import_dashing_matrix (confs$ analysis$ ctv$ tfl$ files$ dashing_dist) %>%
dplyr:: rename (dash_tail = distance)
conservedDash <- import_dashing_matrix (confs$ analysis$ ctv$ conserved$ files$ dashing_dist) %>%
dplyr:: rename (dash_conserv = distance)
```
Combine the data with pairwise tail loci in the pangenome
```{r}
ctvDist <- dplyr:: left_join (
x = tailCombs, y = tailMash, by = c ("r1" , "r2" )
) %>%
dplyr:: left_join (y = conservedMash, by = c ("r1" , "r2" )) %>%
dplyr:: left_join (y = tailDash, by = c ("r1" , "r2" )) %>%
dplyr:: left_join (y = conservedDash, by = c ("r1" , "r2" )) %>%
tidyr:: separate_wider_delim (
cols = c (r1, r2), delim = "." ,
names = c ("genomeId" , "species" ), names_sep = "_"
) %>%
dplyr:: filter (dplyr:: if_all (.cols = starts_with ("dist_" ), .fns = ~ ! is.na (.x))) %>%
dplyr:: select (- starts_with ("hash_" ))
```
## Analyze Mash distance matrix for all TFLs
```{r}
combinedDist <- dplyr:: left_join (
x = ctvDist, y = distCophen, by = c ("r1_genomeId" , "r2_genomeId" )
) %>%
dplyr:: mutate (
species_pair = dplyr:: if_else (
r1_species == r2_species, "intra-species" , "inter-species"
),
tail_loci = forcats:: fct_relevel (tail_loci, "same" ),
species_pair = forcats:: fct_relevel (species_pair, "intra-species" )
) %>%
dplyr:: filter (
dplyr:: if_all (.cols = c (mash_tail, mash_conserv), .fns = ~ ! is.na (.x))
)
readr:: write_tsv (
x = combinedDist,
file = paste (outDir, "/conserved_tfl_loci_distances.tab" , sep = "" )
)
distLongDf <- tidyr:: pivot_longer (
data = dplyr:: select (combinedDist, ! (contains ("_pval" ) | starts_with ("dash" ))),
cols = c (mash_tail, mash_conserv),
names_to = "locus" ,
values_to = "mash"
)
```
### Visualize the distribution of cophenetic distances
```{r}
dens_cophn <- ggplot2:: ggplot (
data = combinedDist,
mapping = aes (x = cophenetic)
) +
ggplot2:: geom_density () +
ggplot2:: labs (
title = "Density distribution of cophenetic(core-phylo)"
) +
theme_bw (base_size = 16 )
dens_cophn
```
### Compare Mash distances distributions between conserved and tail fiber loci
Shapiro-Wilk test to check if the Mash distances are normally distributed:
```{r}
shapiro.test (x = sample (combinedDist$ mash_tail, size = 5000 ))
```
```{r}
shapiro.test (x = sample (combinedDist$ mash_conserv, size = 5000 ))
```
The Mash distances are not normally distributed. Therefore, non-parametric
pairwise Wilcoxon test is used to compare Mash distance distributions
of TFL and conserved loci.
```{r}
pairwise.wilcox.test (x = distLongDf$ mash, g = distLongDf$ locus)
```
```{r}
pt_distribution <- ggplot2:: ggplot (
data = distLongDf,
mapping = aes (x = locus, y = mash)
) +
ggrastr:: rasterize (ggbeeswarm:: geom_quasirandom (), dpi = 300 ) +
ggpubr:: stat_compare_means (
# method = "wilcox.test", paired = TRUE,
label.y.npc = 0.9 , label.x.npc = "center" , vjust = - 1 , hjust = 0.5 , size = 6
) +
labs (y = "Mash distance" ) +
scale_x_discrete (
labels = c ("mash_tail" = "TFL" , "mash_conserv" = "Conserved" )
) +
guides (colour = guide_legend (override.aes = list (size = 5 ))) +
theme_bw (base_size = 20 ) +
theme (
# axis.text.x = element_text(angle = 45, hjust = 1, face = "bold.italic"),
axis.title.x = element_blank ()
)
```
```{r}
#| echo: false
ggsave (
filename = paste (outDir, "/conserved_vs_tfl_mash_distribution.pdf" , sep = "" ),
plot = pt_distribution, width = 8 , height = 8
)
pt_distribution
```
Compare cophenetic distances of core SNP tree with inter-genome Mash distances
for CTV loci. Use conserved locus for control and compare with TFL locus.
```{r}
scatter_ctvPhylo <- ggplot2:: ggplot (
data = dplyr:: arrange (distLongDf, desc (tail_loci), species_pair),
mapping = aes (
x = cophenetic, y = mash
# x = cophenetic, y = dash,
)
) +
ggplot2:: geom_point (
alpha = 1 , size = 2 ,
mapping = aes (color = tail_loci, shape = species_pair)
) +
ggpubr:: stat_cor (label.x.npc = 0 , label.y.npc = 1 , hjust = 0 ) +
ggplot2:: labs (
x = "dist.cophenetic(core-phylo)" ,
) +
ggplot2:: scale_shape_manual (
values = c ("intra-species" = 1 , "inter-species" = 17 ),
name = "Species pairs"
) +
ggplot2:: scale_color_manual (
values = c ("same" = "#E69F00" , "different" = "black" ),
name = "Tail locus pair"
) +
ggplot2:: facet_wrap (
facets = ~ locus, nrow = 1 ,
labeller = ggplot2:: as_labeller (
x = c ("mash_tail" = "TFL" , "mash_conserv" = "Conserved" )
)
) +
ggplot2:: theme_bw (base_size = 18 ) +
ggplot2:: theme (
legend.position = "bottom" ,
legend.justification = c (0 , 1 ),
panel.grid = element_blank ()
)
```
:::{.scrolling_x}
```{r}
#| fig-height: 8
#| fig-width: 12
#| out-width: '100%'
#| layout-valign: top
#| column: page-right
#| echo: false
ggsave (
filename = paste (outDir, "/conserved_vs_tfl_scatter.pdf" , sep = "" ),
plot = scatter_ctvPhylo, width = 14 , height = 8
)
scatter_ctvPhylo
```
:::
Compare the pairwise genomic Mash distance between conserved CTV loci vs TFL loci.
```{r}
scatter_tailConserved <- ggplot2:: ggplot (
data = dplyr:: arrange (combinedDist, desc (tail_loci), species_pair)
) +
ggplot2:: geom_point (
mapping = aes (
x = mash_conserv, y = mash_tail,
# x = dash_conserv, y = dash_tail,
color = tail_loci, shape = species_pair
),
alpha = 1 , size = 2
) +
scale_x_continuous (expand = expansion (0.01 )) +
scale_y_continuous (expand = expansion (0.01 )) +
# geom_abline(slope = 1, color = "red", linetype = "dashed", linewidth = 1) +
ggplot2:: labs (
x = "dist.Mash (conserved locus)" ,
y = "dist.Mash (tail locus)"
) +
ggplot2:: scale_shape_manual (
values = c ("intra-species" = 1 , "inter-species" = 17 ),
name = "Species pairs"
) +
ggplot2:: scale_color_manual (
values = c ("same" = "#E69F00" , "different" = "black" ),
name = "Tail locus pair"
) +
ggplot2:: theme_bw (base_size = 18 ) +
ggplot2:: theme (
legend.position = c (0.01 , 0.99 ),
legend.justification = c (0 , 1 ),
# plot.margin = unit(c(0, 0, 0, 0), units = "cm"),
panel.grid = element_blank ()
)
# sideTheme <- theme_void() +
# theme(
# legend.position = "none",
# # plot.margin = unit(c(0, 0, 0, 0), units = "cm"),
# panel.background = element_rect(color = "black", linewidth = 1)
# )
#
# xplot <- ggplot2::ggplot(data = combinedDist) +
# ggplot2::geom_density(
# mapping = aes(x = mash_conserv, linetype = species_pair), linewidth = 1
# ) +
# scale_x_continuous(expand = expansion(0.01)) +
# scale_linetype_manual(
# values = c("intra-species" = 1, "inter-species" = 2)
# ) +
# sideTheme
#
# yplot <- ggplot2::ggplot(data = combinedDist) +
# ggplot2::geom_density(
# mapping = aes(y = mash_tail, linetype = species_pair), linewidth = 1
# ) +
# scale_y_continuous(expand = expansion(0.01)) +
# scale_linetype_manual(
# values = c("intra-species" = 2, "inter-species" = 1)
# ) +
# sideTheme
#
# ggpubr::ggarrange(
# xplot, NULL, scatter_tailConserved, yplot,
# ncol = 2, nrow = 2, align = "hv",
# widths = c(5, 1), heights = c(1, 5)
# )
```
```{r}
#| fig-height: 8
#| fig-width: 10
#| out-width: '100%'
#| layout-valign: top
#| echo: false
scatter_tailConserved
```
Inter-species TFLs with smaller Mash distance:
```{r}
#| tbl-cap: Inter-species TFLs with smaller Mash distance
#| echo: false
#| out-height: 200px
dplyr:: filter (
combinedDist, mash_tail < 0.1 , cophenetic > 0.14 ,
species_pair == "inter-species"
) %>%
dplyr:: count (r1_tail, r2_tail) %>%
dplyr:: arrange (desc (n)) %>%
DT:: datatable (
rownames = FALSE ,
filter = "top" ,
class = "compact hover" ,
extensions = c ("KeyTable" , "Scroller" , "Select" , "SearchBuilder" , "FixedColumns" ),
options = list (
autoWidth = FALSE ,
dom = "Qlfrtip" ,
scrollX = TRUE ,
fixedColumns = list (leftColumns = 2 ),
keys = TRUE ,
scroller = TRUE ,
scrollY = 400
),
selection = "none"
)
```
## Process conserved and TFL region phylogenetic trees
Tree generated from the region upstream of carotovoricin cluster locus.
```{r}
upstreamTree <- treeio:: read.newick (
file.path (confs$ analysis$ ctv$ tfl_hgt$ path, "iqtree" , "genomeset_1_upstream_core.msa.fasta.treefile" ),
node.label = "support"
)
pt_upstream_tree <- ggtree:: ggtree (tr = upstreamTree, size = 1.5 ) +
scale_x_continuous (expand = expansion (mult = c (0.02 , 0.2 ))) +
geom_treescale (
x = 0 , y = length (upstreamTree@ phylo$ tip.label) * 0.8 ,
fontsize = 8 , linesize = 2 , offset = 0.5
) +
ggtree:: geom_tiplab (
mapping = aes (label = label),
size = 5 , align = TRUE , linesize = 1
) +
ggtree:: geom_nodelab (
mapping = aes (label = support),
node = "internal" , size = 5 , hjust = 1.3 , vjust = - 1 , color = "red"
)
```
Conserved locus tree:
```{r}
conservedTree <- treeio:: read.newick (
file.path (confs$ analysis$ ctv$ tfl_hgt$ path, "iqtree" , "genomeset_1_conserved.msa.fasta.treefile" ),
node.label = "support"
) |>
dplyr:: left_join (
y = dplyr:: select (tailRegions, label = regionId, haplotype),
by = "label"
)
pt_conserved_tree <- ggtree:: ggtree (tr = conservedTree, size = 1.5 ) +
scale_x_continuous (expand = expansion (mult = c (0.02 , 0.3 ))) +
ggtree:: geom_treescale (
x = 0 , y = length (conservedTree@ phylo$ tip.label) * 0.8 ,
fontsize = 5 , linesize = 2 , offset = 0.5
) +
ggtree:: geom_tiplab (
mapping = aes (label = label),
size = 3 , align = TRUE , linesize = 0.7
) +
ggtree:: geom_nodelab (
mapping = aes (label = support),
node = "internal" , size = 3 , hjust = 1.3 , vjust = - 1 , color = "red"
) +
ggtree:: geom_tippoint (mapping = aes (color = haplotype), size = 2 ) +
theme (legend.position = "bottom" )
# pt_conserved_tree
```
Tail-fiber locus tree:
```{r}
tflTree <- treeio:: read.newick (
file.path (confs$ analysis$ ctv$ tfl_hgt$ path, "iqtree" , "genomeset_1_tfl.msa.fasta.treefile" ),
node.label = "support"
) |>
dplyr:: left_join (
y = dplyr:: select (tailRegions, label = regionId, haplotype),
by = "label"
)
pt_tfl_tree <- ggtree:: ggtree (tr = tflTree, size = 1 ) +
scale_x_reverse (expand = expansion (mult = c (0.3 , 0.02 ))) +
ggtree:: geom_treescale (
x = 0 , y = length (tflTree@ phylo$ tip.label) * 0.8 ,
fontsize = 5 , linesize = 2 , offset = 0.5
) +
ggtree:: geom_tiplab (
mapping = aes (label = label),
size = 3 , align = TRUE , linesize = 0.7 , hjust = 1
) +
ggtree:: geom_nodelab (
mapping = aes (label = support),
node = "internal" , size = 3 , hjust = - 0.5 , vjust = - 1 , color = "red"
) +
ggtree:: geom_tippoint (mapping = aes (color = haplotype), size = 2 ) +
theme (legend.position = "bottom" )
# pt_tfl_tree
```
Combine the conserved and TFL trees for comparison
```{r}
#| fig-height: 7
#| fig-width: 12
#| out-width: '100%'
#| layout-valign: top
#| column: page-right
#| echo: false
pt_compare <- pt_conserved_tree + pt_tfl_tree
ggsave (
filename = paste (outDir, "/conserved_vs_tfl_tree.pdf" , sep = "" ),
plot = pt_compare, width = 14 , height = 7
)
pt_compare
```