---
title: "Analyze tail fiber loci haplotypes"
date: "`r Sys.time()`"
knitr:
opts_chunk:
fig.height: 7
---
## Initial setup
```{r}
#| label: setup
#| warning: false
suppressPackageStartupMessages (library (tidyverse))
suppressPackageStartupMessages (library (ape))
suppressPackageStartupMessages (library (treeio))
suppressPackageStartupMessages (library (ggtree))
suppressPackageStartupMessages (library (cowplot))
suppressPackageStartupMessages (library (ComplexHeatmap))
suppressPackageStartupMessages (library (org.Pectobacterium.spp.pan.eg.db))
rm (list = ls ())
source ("https://raw.githubusercontent.com/lakhanp1/omics_utils/main/RScripts/utils.R" )
source ("scripts/utils/config_functions.R" )
source ("scripts/utils/phylogeny_functions.R" )
source ("scripts/utils/homology_groups.R" )
source ("scripts/utils/heatmap_utils.R" )
################################################################################
set.seed (124 )
confs <- prefix_config_paths (
conf = suppressWarnings (configr:: read.config (file = "project_config.yaml" )),
dir = "."
)
file_haplotypes <- confs$ analysis$ ctv$ tfl$ files$ hg_regions
outDir <- dirname (file_haplotypes)
pangenome <- confs$ data$ pangenomes$ pectobacterium.v2$ name
panConf <- confs$ data$ pangenomes[[pangenome]]
panOrgDb <- org.Pectobacterium.spp.pan.eg.db
treeMethod <- "core_snp_ml" #ani_upgma, kmer_nj
```
## Import data
```{r}
sampleInfo <- get_metadata (file = panConf$ files$ metadata, genus = confs$ genus)
sampleInfoList <- as.list_metadata (
df = sampleInfo, sampleId, sampleName, SpeciesName, strain, nodeLabs, genomeId
)
rawTree <- import_tree (
confs$ analysis$ phylogeny[[treeMethod]]$ files$ tree_rooted,
phylo = TRUE
)
speciesOrder <- suppressMessages (
readr:: read_tsv (confs$ analysis$ phylogeny[[treeMethod]]$ files$ species_order)
) %>%
dplyr:: mutate (order = n (): 1 )
```
## Import tail regions and distance tables
Build a pairwise tail loci comparison table
```{r}
regionHaplotypes <- suppressMessages (readr:: read_tsv (file_haplotypes)) %>%
dplyr:: left_join (
y = dplyr:: select (
sampleInfo, genomeId, SpeciesName, geo_loc_country, collection_year,
collection_year, nodepath.kmer_nj, host, isolation_source
),
by = "genomeId"
) %>%
dplyr:: add_count (haplotype) %>%
dplyr:: arrange (desc (n), haplotype, SpeciesName) %>%
dplyr:: mutate (
haplotype = forcats:: as_factor (haplotype)
)
readr:: write_tsv (
regionHaplotypes,
file = paste (outDir, "/haplotypes_phy_data.tab" , sep = "" )
)
haplotypes <- dplyr:: count (
regionHaplotypes, haplotype, hgs, sort = TRUE
)
```
## Are there any tail loci shared between species?
```{r}
# generate upset plot
cm <- ComplexHeatmap:: make_comb_mat (
split (regionHaplotypes$ haplotype, f = regionHaplotypes$ SpeciesName)
)
hgCombTable <- table (regionHaplotypes$ haplotype)
combMetadata <- purrr:: imap_dfr (
.x = ComplexHeatmap:: comb_degree (cm),
.f = function (d, n){
tibble:: tibble (
comb = n,
n_species = d,
n_genomes = sum (hgCombTable[extract_comb (m = cm, comb_name = n)]),
haplotype = paste (extract_comb (m = cm, comb_name = n), collapse = ";" )
)
}
) %>%
dplyr:: left_join (haplotypes, by = "haplotype" ) |>
dplyr:: mutate (
comb = forcats:: fct_relevel (
.f = comb, !! ComplexHeatmap:: comb_name (cm)
)
) |>
dplyr:: arrange (comb)
ht_upset <- ComplexHeatmap:: UpSet (
m = cm,
column_title = "Carotovoricin tail fiber locus similarity across species" ,
row_names_gp = gpar (fontface = "italic" ),
column_title_gp = gpar (fontface = "bold" , fontsize = 16 ),
set_order = order (
dplyr:: pull (speciesOrder, order, name = SpeciesName)[set_name (cm)],
decreasing = TRUE
),
# comb_order = order(comb_degree(cm), comb_size(cm), decreasing = TRUE),
right_annotation = upset_right_annotation (cm, add_numbers = TRUE ),
top_annotation = ComplexHeatmap:: HeatmapAnnotation (
"Number of \n genomes" = anno_barplot (
x = dplyr:: pull (combMetadata, var = n_genomes, name = "comb" ),
border = FALSE , add_numbers = TRUE ,
gp = gpar (fill = "black" ), height = unit (1.5 , "cm" )
),
"Number of \n shared tail \n fiber locus" = anno_barplot (
x = comb_size (cm),
border = FALSE ,
gp = gpar (fill = "black" ),
height = unit (1.5 , "cm" )
),
annotation_name_side = "left" ,
annotation_name_rot = 0 ,
annotation_name_gp = gpar (fontface = "bold" ),
gap = unit (4 , "mm" )
)
)
ht_upset <- ht_upset +
ComplexHeatmap:: rowAnnotation (
"Number of \n genomes" = ComplexHeatmap:: anno_barplot (
x = dplyr:: count (regionHaplotypes, SpeciesName) %>%
dplyr:: pull (var = n, name = SpeciesName),
border = FALSE ,
add_numbers = TRUE ,
gp = gpar (fill = "black" ), width = unit (2 , "cm" )
),
gap = unit (3 , "mm" )
)
```
```{r}
#| fig-height: 8
#| fig-width: 8
#| out-width: '100%'
#| layout-valign: top
#| echo: false
pdf (file = paste (outDir, "/haplotype_species_upset.pdf" , sep = "" ), width = 8 , height = 6 )
ht_upset
dev.off ()
ht_upset
```
Show the raw combination data.
```{r}
#| echo: false
#| tbl-cap: TFL HG combinations across species
DT:: datatable (
data = combMetadata,
rownames = FALSE ,
filter = "top" ,
class = 'compact hover' ,
extensions = c ('KeyTable' , 'Scroller' , 'Select' , 'SearchBuilder' , 'FixedColumns' ),
options = list (
autoWidth = FALSE ,
dom = 'Qlfrtip' ,
scrollX = TRUE ,
fixedColumns = list (leftColumns = 2 ),
keys = TRUE ,
scroller = TRUE ,
scrollY = 300
),
selection = 'none'
)
```
## Visualize tail-fiber locus haplotypes with the phylogeny
Generate core phylogeny plot and species key heatmap.
```{r}
pt_phy <- ggtree_with_species (
phy = rawTree, metadata = sampleInfo,
genomes = c (regionHaplotypes$ genomeId, confs$ analysis$ phylogeny$ outgroup_genome),
trim_branch = confs$ analysis$ phylogeny[[treeMethod]]$ cut_branch
)
```
Generate tail-fiber locus haplotype presence-absence heatmap.
```{r}
# TFL haplotypes plot
hap_df <- dplyr:: select (regionHaplotypes, genomeId, haplotype, n) %>%
dplyr:: filter (n > 1 ) %>%
dplyr:: mutate (hap = "y" )
theme_haplotypes <- theme_bw () +
theme (
panel.grid.major.y = element_blank (),
axis.text = element_blank (),
axis.ticks = element_blank (),
axis.title = element_blank (),
legend.title = element_text (size = 18 , face = "bold" )
)
pt_hap <- dplyr:: select (regionHaplotypes, label = genomeId, haplotype, n) %>%
dplyr:: filter (n > 1 ) %>%
dplyr:: mutate (hap = "y" ) %>%
ggplot2:: ggplot (mapping = aes (x = haplotype, y = label)) +
geom_tile (mapping = aes (fill = hap), width = 0.9 ) +
scale_fill_manual (
name = NULL ,
values = c ("y" = "#FDA532" ),
na.value = alpha ("white" , 1 ),
guide = NULL ,
na.translate = FALSE
) +
theme_haplotypes
pt_all <- pt_phy$ species_key |>
aplot:: insert_left (pt_phy$ tree, width = 3 ) |>
aplot:: insert_right (pt_hap, width = 4 )
```
```{r}
#| fig-height: 8
#| fig-width: 10
#| out-width: '100%'
#| layout-valign: top
#| echo: false
#| column: page-right
pdf (file = paste (outDir, "/tfl_haplotype_phylo.pdf" , sep = "" ), width = 10 , height = 10 )
pt_all
dev.off ()
pt_all
```
Species order from left to right on species key heatmap:
`r paste(pt_phy$species_order, sep = "; ")`
## Use the TFL signature present in diverse species to select MSA candidates.
```{r}
egTfl <- dplyr:: arrange (combMetadata, desc (n_species)) %>%
dplyr:: slice (1 : 3 ) %>%
dplyr:: pull (haplotype) %>%
stringr:: str_split (pattern = ";" ) %>%
unlist ()
msaGenomes <- dplyr:: filter (regionHaplotypes, haplotype %in% !! egTfl) %>%
dplyr:: group_by (haplotype, SpeciesName) %>%
dplyr:: slice (1 L) %>%
dplyr:: select (regionId, genomeId, haplotype, SpeciesName, everything ())
# # type strain genomes
# msaGenomes <- union(
# msaGenomes,
# c("g_158", "g_446", "g_66", "g_222", "g_296", "g_442", "g_8",
# "g_38", "g_273", "g_259", "g_305", "g_378", "g_428", "g_248",
# "g_449", "g_54", "g_116", "g_423", "g_375", "g_381")
# )
paste (msaGenomes$ genomeId, collapse = "," )
```
```{r}
#| tbl-cap: representatives used to align and compare tree
#| out-height: "200px"
#| echo: false
DT:: datatable (
data = msaGenomes,
rownames = FALSE ,
filter = "top" ,
class = 'compact hover' ,
extensions = c ('KeyTable' , 'Scroller' , 'Select' , 'SearchBuilder' , 'FixedColumns' ),
options = list (
autoWidth = FALSE ,
dom = 'Qlfrtip' ,
scrollX = TRUE ,
fixedColumns = list (leftColumns = 2 ),
keys = TRUE ,
scroller = TRUE ,
scrollY = 600
),
selection = 'none'
)
```