Pangenome homology groups summary

Author

Lakhansing Pardeshi

Published

July 20, 2025

Initial setup

Code
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(configr))
suppressPackageStartupMessages(library(ggpubr))
suppressPackageStartupMessages(library(here))
suppressPackageStartupMessages(library(matrixStats))
suppressPackageStartupMessages(library(org.Pectobacterium.spp.pan.eg.db))

rm(list = ls())

source("https://raw.githubusercontent.com/lakhanp1/omics_utils/main/RScripts/utils.R")
source("https://raw.githubusercontent.com/lakhanp1/omics_utils/main/GO_enrichment/enrichment_functions.R")
source("scripts/utils/config_functions.R")
source("scripts/utils/phylogeny_functions.R")
source("scripts/utils/homology_groups.R")
################################################################################
set.seed(124)

confs <- prefix_config_paths(
    conf = suppressWarnings(configr::read.config(file = "project_config.yaml")),
    dir = "."
)

pangenome <- confs$data$pangenomes$pectobacterium.v2$name
panConf <- confs$data$pangenomes[[pangenome]]

analysisName <- "homology_groups"
outDir <- confs$analysis$homology_groups$path
outPrefix <- file.path(outDir, analysisName)
panOrgDb <- org.Pectobacterium.spp.pan.eg.db

Import data

Code
sampleInfo <- get_metadata(file = panConf$files$metadata, genus = confs$genus)

sampleInfoList <- as.list_metadata(
    df = sampleInfo, sampleId, sampleName, SpeciesName, strain, nodeLabs, genomeId
)

spOrder <- suppressMessages(
    readr::read_tsv(confs$analysis$phylogeny$core_snp_ml$files$species_order)
)

hgTable <- AnnotationDbi::select(
    x = panOrgDb, keys = keys(panOrgDb),
    columns = c("GID", "genomeId", "class")
) %>%
    dplyr::rename(hgId = GID) %>%
    dplyr::count(hgId, genomeId, name = "nGenes")
'select()' returned 1:many mapping between keys and columns
Code
hgMeta <- AnnotationDbi::select(
    x = panOrgDb, keys = keys(panOrgDb),
    columns = c("GID", "class")
) %>%
    dplyr::rename(hgId = GID) %>%
    dplyr::mutate(
        class = dplyr::if_else(
            condition = class == "core & single copy orthologous",
            true = "core", false = class
        )
    )
'select()' returned 1:1 mapping between keys and columns
Code
## pangenome version HG stats
grpStats <- suppressMessages(readr::read_tsv(file.path(outDir, "group_stats.tab"))) %>%
    dplyr::mutate(
        data = forcats::as_factor(data),
        group = forcats::fct_relevel(group, "core", "accessory", "unique")
    )

## binary matrix for homology_group PAV
hgPavMat <- homology_groups_mat(pandb = panOrgDb, type = "pav")
'select()' returned 1:many mapping between keys and columns

Species level homology group statistics

Code
spNames <- dplyr::count(sampleInfo, SpeciesName) %>%
    # dplyr::filter(n >= 20) %>%
    dplyr::pull(SpeciesName)

sppGrpStats <- NULL
sppGrpGo <- NULL

# matrixStats::colSums2(x = hgPavMat, useNames = T) %>%
#   tibble::enframe(name = "hgId", value = "nGenomes") %>%
#   dplyr::mutate(
#     class = dplyr::case_when(
#       nGenomes == !!nrow(hgPavMat) ~ "core",
#       nGenomes == 1 ~ "unique",
#       nGenomes < !!nrow(hgPavMat) & nGenomes > 1 ~ "accessory"
#     )
#   ) %>%
#   dplyr::count(class)

## get species wise core, accessory, unique group stats and GO
for (sp in spNames) {
    spGenomes <- dplyr::filter(sampleInfo, SpeciesName == .env$sp) %>%
        dplyr::pull(genomeId)

    cat(sp, ": ", length(spGenomes), "\n")

    hgSum <- matrixStats::colSums2(
        x = hgPavMat, useNames = T,
        rows = which(rownames(hgPavMat) %in% spGenomes)
    ) %>%
        tibble::enframe(name = "hgId", value = "nGenomes") %>%
        dplyr::filter(nGenomes != 0) %>%
        dplyr::mutate(
            subpan_class = dplyr::case_when(
                nGenomes == !!length(spGenomes) ~ "core",
                nGenomes == 1 ~ "unique",
                nGenomes < !!length(spGenomes) & nGenomes > 1 ~ "accessory"
            )
        ) %>%
        dplyr::left_join(
            y = dplyr::rename(hgMeta, pangenome_class = class),
            by = "hgId"
        )

    ## group stats
    sppGrpStats <- dplyr::count(hgSum, pangenome_class, subpan_class, name = "count") %>%
        dplyr::mutate(
            SpeciesName = .env$sp, nHgs = !!nrow(hgSum),
            n_genomes = length(spGenomes)
        ) %>%
        dplyr::bind_rows(sppGrpStats)
}
P. actinidiae :  5 
P. aquaticum :  7 
P. aroidearum :  33 
P. atrosepticum :  16 
P. betavasculorum :  2 
P. brasiliense :  138 
P. cacticida :  1 
P. carotovorum :  33 
P. colocasium :  1 
P. fontis :  1 
P. odoriferum :  21 
P. parmentieri :  37 
P. parvum :  10 
P. peruviense :  4 
P. polaris :  25 
P. polonicum :  3 
P. punjabense :  9 
P. quasiaquaticum :  8 
P. versatile :  93 
P. wasabiae :  3 
P. zantedeschiae :  3 
Pectobacterium sp. CFBP8739 :  1 

Proportion of genus pangenome HGs in the species pangenome

Code
hgStats <- dplyr::count(hgMeta, class, name = "count") %>%
    dplyr::mutate(
        SpeciesName = "Pangenome",
        nHgs = sum(count),
        pangenome_class = class,
        subpan_class = class,
        n_genomes = nrow(hgPavMat)
    ) %>%
    dplyr::bind_rows(sppGrpStats) %>%
    dplyr::select(SpeciesName, n_genomes, pangenome_class, subpan_class, count, nHgs) %>%
    dplyr::mutate(
        fraction = round(count / nHgs, digits = 3),
        dplyr::across(
            .cols = c(subpan_class, pangenome_class),
            .fns = ~ forcats::fct_relevel(.x, "core", "accessory", "unique")
        ),
        SpeciesName = forcats::fct_relevel(SpeciesName, "Pangenome", !!!spOrder$SpeciesName)
    ) %>%
    dplyr::arrange(desc(n_genomes), SpeciesName, pangenome_class, subpan_class)

readr::write_tsv(
    x = hgStats, file = confs$analysis$homology_groups$files$spp_group_stats
)

Plot the data

Code
species_to_show <- dplyr::count(sampleInfo, SpeciesName) %>%
    dplyr::filter(n >= 20) %>%
    dplyr::arrange(desc(n)) %>%
    dplyr::mutate(
        SpeciesName = forcats::as_factor(SpeciesName)
    )

pt_stats <- dplyr::left_join(
    x = species_to_show, y = hgStats, by = "SpeciesName"
) %>%
    ggplot() +
    geom_bar(
        mapping = aes(
            x = count, y = SpeciesName,
            fill = forcats::fct_rev(subpan_class)
        ),
        stat = "sum", position = position_dodge(), width = 0.8
    ) +
    scale_fill_manual(
        values = c(
            confs$colors$core, confs$colors$accessory, confs$colors$unique
        ),
        breaks = c("core", "accessory", "unique")
    ) +
    scale_x_continuous(expand = expansion(mult = c(0, 0.01))) +
    theme_bw(base_size = 16) +
    theme(
        panel.grid = element_blank(),
        axis.title = element_blank(),
        axis.text = element_text(face = "bold"),
        axis.text.y = element_text(face = "bold.italic"),
        legend.position = "bottom",
        legend.title = element_blank()
    )


ggsave(
    plot = pt_stats, filename = file.path(outDir, "pangenome_spp_stats.pdf"),
    width = 6, height = 4
)

pt_stats

Homology group counts

Compare pangenome versions

Code
pt_hgStats <- ggplot2::ggplot(grpStats) +
    geom_bar(
        mapping = aes(x = data, y = count, fill = forcats::fct_rev(group)),
        stat = "identity",
        position = position_stack()
    ) +
    scale_y_continuous(expand = expansion(add = c(0, 100))) +
    scale_fill_manual(
        name = NULL,
        values = c(
            "core" = confs$colors$core, "accessory" = confs$colors$accessory,
            "unique" = confs$colors$unique
        )
    ) +
    theme_bw(base_size = 24) +
    theme(
        panel.grid = element_blank(),
        legend.position = "bottom",
        axis.title = element_blank(),
        axis.text.x = element_text(face = "italic")
    )

ggsave(filename = file.path(outDir, "hg_cmp_stats.pdf"), plot = pt_hgStats, width = 6, height = 6)

pt_hgStats

Pangenome version comparison

Heap’s law alpha for multiple species in pangenome

Code
heaps <- suppressMessages(readr::read_tsv(file.path(outDir, "heaps_law.tab"))) %>%
    dplyr::filter(species != "pangenome") %>%
    dplyr::mutate(
        complete = "complete",
        species = forcats::fct_relevel(species, !!!levels(species_to_show$SpeciesName))
    )

pt_alpha <- ggplot(data = heaps) +
    geom_bar(
        mapping = aes(y = species, x = alpha),
        fill = "black", stat = "identity", width = 0.8
    ) +
    scale_x_continuous(expand = expansion(mult = c(0, 0.01))) +
    theme_bw(base_size = 16) +
    theme(
        panel.grid = element_blank(),
        axis.title = element_blank(),
        axis.text = element_text(face = "bold"),
        axis.text.y = element_text(face = "bold.italic")
    )


ggsave(
    plot = pt_alpha, filename = file.path(outDir, "pangenome_spp_alpha.pdf"),
    width = 5, height = 4
)

pt_alpha

Heaps law alpha

Species level GO enrichment results

Code
panGo <- suppressMessages(
    readr::read_tsv(confs$analysis$homology_groups$files$spp_group_go)
) %>%
    dplyr::filter(SpeciesName == "pangenome")

ptDf <- dplyr::group_by(panGo, class) %>%
    dplyr::arrange(pvalue, .by_group = TRUE) %>%
    dplyr::slice(1:8) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(
        class = forcats::fct_relevel(class, "core", "accessory", "unique")
    )

pt_go <- enrichment_bar(df = ptDf, title = "pangenome GO", colorCol = "class")

pt_go <- pt_go +
    scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +
    scale_x_discrete(labels = label_wrap(60)) +
    scale_fill_manual(
        values = c(
            "core" = confs$colors$core, "accessory" = confs$colors$accessory,
            "unique" = confs$colors$unique
        )
    )

ggsave(
    plot = pt_go, filename = file.path(outDir, "pangenome_GO.pdf"),
    width = 8, height = 6
)

pt_go

Homology group GO enrichment
Back to top