Exploratory analysis of pangenome metadata

Author

Lakhansing Pardeshi

Published

July 20, 2025

Here, various exploratory analysis figures are generated for P. brasiliense population metadata as well as genome assembly metadata.

Initial setup

Code
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(configr))
suppressPackageStartupMessages(library(ggpubr))
suppressPackageStartupMessages(library(ggdist))
suppressPackageStartupMessages(library(ggpattern))
suppressPackageStartupMessages(library(here))
suppressPackageStartupMessages(library(skimr))
suppressPackageStartupMessages(library(DT))

## pangenome data summary and comparison

rm(list = ls())

source("https://raw.githubusercontent.com/lakhanp1/omics_utils/main/RScripts/utils.R")
source("scripts/utils/config_functions.R")
################################################################################
set.seed(124)

confs <- prefix_config_paths(
    conf = suppressWarnings(configr::read.config(file = "project_config.yaml")),
    dir = "."
)

outDir <- confs$analysis$summary$path

pangenome <- confs$data$pangenomes$pectobacterium.v2$name
panConf <- confs$data$pangenomes[[pangenome]]

Import data

Code
sampleMeta <- suppressMessages(
    readr::read_tsv(confs$data$reference_data$files$metadata)
)

panMetrix <- suppressMessages(
    readr::read_csv(panConf$db$metrics$files$per_genome)
) %>%
    dplyr::rename_all(
        .funs = ~ stringr::str_replace_all(
            ., c("\\s+" = "_", "%" = "per", "(\\(|\\))" = "")
        )
    ) %>%
    dplyr::select(
        Genome, Gene_count, mRNA_count, rRNA_count, tRNA_count,
        Singletons, Homology_groups,
        GC_per = GC_content_per
    ) %>%
    dplyr::mutate(Genome = as.character(Genome))
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Code
panMeta <- get_metadata(file = panConf$files$metadata, genus = confs$genus) %>%
    dplyr::left_join(y = panMetrix, by = "Genome") %>%
    dplyr::mutate(
        pbr = dplyr::if_else(
            SpeciesName == "P. brasiliense", SpeciesName, "other"
        ),
        geo_loc_country = dplyr::if_else(
            geo_loc_country == "Netherlands", geo_loc_country, "other", missing = "other"
        ),
        virType = dplyr::case_when(
            virulence == "virulent" & virulence_pcr == "positive" ~ "TP",
            virulence == "virulent" & virulence_pcr == "negative" ~ "FN",
            virulence == "avirulent" & virulence_pcr == "positive" ~ "FP",
            virulence == "avirulent" & virulence_pcr == "negative" ~ "TN",
            TRUE ~ "unknown"
        ),
        pbr = forcats::fct_relevel(pbr, "other", "P. brasiliense"),
        geo_loc_country = forcats::fct_relevel(geo_loc_country, "Netherlands"),
        virType = forcats::fct_relevel(virType, "TP", "FN", "TN", "FP", "unknown")
    ) %>%
    dplyr::left_join(
        y = dplyr::select(sampleMeta, sampleId, AssemblyStatus),
        by = "sampleId"
    )

Species wise genome statistics

Code
pt_speciesCount <- ggplot2::ggplot(
    data = panMeta
) +
    geom_bar(
        mapping = aes(
            y = forcats::fct_infreq(SpeciesName),
            fill = AssemblyStatus
        )
    ) +
    labs(
        x = "#genomes",
        y = NULL,
        title = "# of genomes"
    ) +
    scale_x_continuous(expand = expansion(add = c(0, 2))) +
    ggplot2::scale_fill_viridis_d(option = "C") +
    theme_bw(base_size = 20) +
    theme(
        axis.text.y = element_text(face = "bold.italic", size = 14, color = "black"),
        axis.text.x = element_text(face = "bold", size = 16, color = "black"),
        plot.title = element_text(face = "bold", size = 18, color = "black"),
        axis.title = element_text(face = "bold", size = 16, color = "black"),
        legend.position = "bottom",
        panel.grid = element_blank()
    )

Explore P. brasiliense population metadata

P. brasiliense collection in the Netherlands

Code
(
    pt_pbrTimeline <- dplyr::filter(
        panMeta, geo_loc_country == "Netherlands", !is.na(collection_year)
    ) %>%
        ggplot2::ggplot() +
        geom_histogram(
            mapping = aes(x = collection_year, fill = forcats::fct_rev(pbr)),
            binwidth = 2, color = "black"
        ) +
        geom_vline(xintercept = 2015, color = "blue", linewidth = 1, linetype = "dashed") +
        scale_fill_manual(
            values = c("P. brasiliense" = "black", "other" = "grey90")
        ) +
        # labs(
        #   title = "P. brasiliense collection after 2015 in the Netherlands"
        # ) +
        scale_x_continuous(expand = expansion(add = 0)) +
        scale_y_continuous(expand = expansion(add = c(0, 5))) +
        theme_bw(base_size = 20) +
        theme(
            legend.position = c(0.02, 0.9),
            legend.justification = c(0, 1),
            legend.key.size = unit(1, "cm"),
            legend.text = element_text(size = 20),
            legend.title = element_blank(),
            axis.title = element_blank(),
            plot.margin = margin(1, 1, 1, 1, "cm"),
            panel.grid = element_blank()
        )
)

P. brasiliense collection after 2015 in the Netherlands

Appearance of new P. brasiliense virulent isolates in the Netherlands

Code
#|
(
    pt_fnPbr <- dplyr::filter(
        panMeta, geo_loc_country == "Netherlands", pbr == "P. brasiliense"
    ) %>%
        ggplot2::ggplot() +
        ggpattern::geom_histogram_pattern(
            mapping = aes(x = collection_year, fill = virType, pattern_density = virType),
            binwidth = 1,
            pattern = "stripe", pattern_fill = "black",
            # pattern_density = 0.6,
            color = "black", pattern_spacing = 0.02, pattern_colour = "black"
        ) +
        scale_fill_manual(
            values = c(
                "TP" = "red", "FP" = alpha("green", 1),
                "TN" = "green", "FN" = alpha("red", 1), "unknown" = "grey"
            )
        ) +
        scale_pattern_density_manual(
            values = c(
                "TP" = 0, "FP" = 0.5, "TN" = 0,
                "FN" = 0.5, "unknown" = 0
            )
        ) +
        # labs(
        #   title = "FN-Pbr emergence in the Netherlands"
        # ) +
        scale_x_continuous(expand = expansion(add = 0)) +
        scale_y_continuous(expand = expansion(add = c(0, 2))) +
        theme_bw(base_size = 20) +
        theme(
            legend.position = c(0.01, 0.9),
            legend.justification = c(0, 1),
            legend.key.size = unit(1, "cm"),
            legend.text = element_text(size = 20),
            legend.title = element_blank(),
            axis.title = element_blank(),
            panel.grid = element_blank()
        )
)
Warning: Removed 2 rows containing non-finite values (`stat_bin()`).

FN-Pbr emergence in the Netherlands
Warning: Removed 2 rows containing non-finite values (`stat_bin()`).

Explore P. brasiliense genome metdata

Prepare plotting data

Code
plotDf <- dplyr::select(
    panMeta, Genome, source, sampleId, SpeciesName, geo_loc_country, virulence,
    virulence_pcr, length, GC_per, N50, L50,
    Gene_count, mRNA_count, tRNA_count, rRNA_count,
    Homology_groups, Singletons
) %>%
    dplyr::filter(SpeciesName == "P. brasiliense") %>%
    tidyr::pivot_longer(
        cols = -c(
            Genome, sampleId, source, SpeciesName, geo_loc_country,
            virulence, virulence_pcr
        ),
        names_to = "field",
        values_to = "value"
    ) %>%
    dplyr::mutate(
        field = forcats::fct_relevel(
            field, "length", "Gene_count", "mRNA_count", "tRNA_count", "rRNA_count"
        )
    )

Do P. brasiliense from the Netherlands have more genes?

Here, we compare the mRNA count of P. brasiliense genomes between the isolates from the Netherlands and rest of the world. A good quality genome can have more mRNA and to control for this, we also compare the N50 values between the same groups.

Code
(
    pt_loc <- plotDf %>%
        # dplyr::filter(field %in% c("length")) %>%
        # dplyr::filter(field %in% c("mRNA_count", "length", "N50", "N90", "L50", "L90")) %>%
        dplyr::filter(field %in% c("mRNA_count", "N50")) %>%
        # dplyr::filter(field %in% c("mRNA_count", "tRNA_count", "rRNA_count", "N50")) %>%
        ggplot2::ggplot(
            mapping = aes(x = forcats::fct_rev(geo_loc_country), y = value)
        ) +
        geom_boxplot(width = .5, outlier.shape = NA, alpha = 0, linewidth = 1) +
        ggbeeswarm::geom_quasirandom(size = 3) +
        ggpubr::stat_compare_means(label.y.npc = "top", vjust = -1) +
        scale_y_continuous(
            labels = scales::label_comma(scale_cut = c(Mb = 1000000, Gb = 1000000000)),
            expand = expansion(mult = c(0.1))
        ) +
        facet_wrap(
            facets = ~field, nrow = 1, scales = "free_y",
            labeller = ggplot2::labeller(
                field = c("mRNA_count" = "Gene count", "length" = "Genome size", "N50" = "N50")
            )
        ) +
        # labs(
        #   title = "Pbr isolates from the Netherlands have more protein coding genes"
        # ) +
        theme_bw(base_size = 20) +
        theme(
            axis.title = element_blank(),
            axis.text.x = element_text(angle = 45, hjust = 1),
            legend.position = "bottom",
            panel.grid = element_blank()
        )
)

Pbr isolates from the Netherlands have more protein coding genes

Do virulent P. brasiliense isolates have more mRNAs?

Code
(
    pt_vir <- dplyr::filter(plotDf, virulence %in% c("virulent", "avirulent")) %>%
        dplyr::filter(field %in% c("mRNA_count", "length", "N50")) %>%
        ggplot2::ggplot(mapping = aes(x = virulence, y = value)) +
        ggplot2::geom_boxplot(width = .5, outlier.shape = NA, alpha = 0, linewidth = 1) +
        ggbeeswarm::geom_quasirandom(
            mapping = aes(color = virulence),
            size = 3
        ) +
        ggpubr::stat_compare_means(label.y.npc = "top", vjust = -1) +
        ggplot2::scale_x_discrete(
            labels = c("virulent" = "BL causing", "avirulent" = "BL non-causing")
        ) +
        ggplot2::scale_color_manual(
            name = "Blackleg phenotype",
            values = c("red", "black"),
            breaks = c("virulent", "avirulent"),
            label = c("BL-causing", "BL non-causing"),
            guide = guide_legend(override.aes = list(size = 6))
        ) +
        ggplot2::scale_y_continuous(
            labels = scales::label_comma(scale_cut = c(Mb = 1000000, Gb = 1000000000)),
            expand = expansion(mult = c(0.1))
        ) +
        ggplot2::facet_wrap(
            facets = ~field, nrow = 1, scales = "free_y",
            labeller = ggplot2::labeller(
                field = c("mRNA_count" = "Gene count", "length" = "Genome size", "N50" = "N50")
            )
        ) +
        ggplot2::theme_bw(base_size = 22) +
        ggplot2::theme(
            axis.title = element_blank(),
            # axis.text.x = element_text(angle = 45, hjust = 1),
            axis.text.x = element_blank(),
            legend.position = "bottom",
            panel.grid = element_blank()
        )
)

Virulent Pbr isolates have more protein coding genes than avirulent
Back to top