Pectobacterium pangenome
Setup
#!/usr/bin/env bash
shopt -s expand_aliases
source ~/.bash_aliases
# set -e
# set -u
set -o pipefail
# source scripts/utils/setup_analysis.sh 'pectobacterium.v2'
source scripts/utils/setup_analysis.sh $@
if [ -z ${pan_db+x} ];
then
echo "\$pan_db is unset"
error_exit 1
fi
source $TOOLS_PATH/miniconda3/etc/profile.d/conda.sh
conda activate pantools_master
export PANTOOLS="$PANTOOLS_4_1"
#export PANTOOLS="$PANTOOLS_DEV"
######################################################################
Construct pangenome
TMPDIR="$LOCAL_DIR_PATH/tmp"
[ ! -d ${TMPDIR} ] && mkdir -p ${TMPDIR}
[ -d ${TMPDIR}/pantools ] && rm -rd ${TMPDIR}/pantools
[ ! -d ${TMPDIR}/pantools ] && mkdir -p ${TMPDIR}/pantools
process_start build_pangenome_parallel
$PANTOOLS build_pangenome --threads 40 --scratch-directory ${TMPDIR}/pantools \
--num-db-writer-threads 10 --cache-size 25000000 \
${pan_db} $PANGENOME_DIR/genomes_fa.list
error_exit $?
cp -rp ${pan_db} $PANGENOME_DIR/backup/${PANGENOME_NAME}.DB.raw
## add annotations
export PANTOOLS="$PANTOOLS_DEV"
process_start add_annotations
$PANTOOLS add_annotations --connect ${pan_db} $PANGENOME_DIR/genomes_gff3.list
error_exit $?
export PANTOOLS="$PANTOOLS_4_1"
## add phenotypes
process_start add_phenotypes
$PANTOOLS remove_phenotype ${pan_db}
$PANTOOLS add_phenotypes ${pan_db} $PANGENOME_DIR/genomes_metadata.csv
error_exit $?
Add InterProScan and COG annotations
## InterProScan
process_start add_InterProScan_annotations
$PANTOOLS add_functions ${pan_db} $PANGENOME_DIR/functional_annotations.txt
error_exit $?
## COG
process_start add_COG_annotations
$PANTOOLS add_functions ${pan_db} $PANGENOME_DIR/eggnog_annotations.txt
error_exit $?
cp -rp ${pan_db} $PANGENOME_DIR/backup/${PANGENOME_NAME}.DB.fn
######################################################################
Grouping
cp -rp $PANGENOME_DIR/backup/${PANGENOME_NAME}.DB.backup_fn ${pan_db}
## BUSCO
process_start busco_protein
$PANTOOLS busco_protein -t 20 --busco10 enterobacterales_odb10 ${pan_db}
error_exit $?
optimal_grouping
using subset of genomes or type strains
optimal_grouping
at the Pangenome level is time consuming. An alternative approch using subset of genomes or only type strains was tried and the results were compared with the pangenome scale optimal_grouping
.
This script will generate a two column TSV file where first column is a set identifier and second column include comma separated genome identifers.
typeStrain 374,96,385,386,379,388,116,375,377,347,250,335,338,265,256,32,269,387,266
rand_001 451,375,343,346,220,403,251,228,431,248,116,440,383,322,306,351,430,15,14,314
rand_002 451,375,374,224,32,360,252,226,442,248,116,266,386,385,306,387,205,15,265,92
rand_003 451,375,374,334,279,391,103,445,442,248,116,266,386,27,305,351,393,15,265,21
rand_004 451,375,374,88,96,193,251,235,431,248,116,440,386,124,335,387,294,15,423,123
.
.
.
An example to run optimal grouping on subset of genomes in the pangenome.
## type strains
bash ./scripts/build/grouping_subsets_process.sh "typeStrain" "374,96,385,386,379,388,116,375,377,347,250,335,338,265,256,32,269,387,266"
Use GNU parallel to run optimal grouping on all random sets.
## run `optimal_grouping` on these random subsets
nohup \
# sed -n '1,6!p' analysis/03_pangenome_pecto_v2/subset_optimal_group/subsets_conf.tab | \
cat analysis/03_pangenome_pecto_v2/subset_optimal_group/subsets_conf.tab | \
parallel --colsep '\t' --jobs 10 --keep-order --workdir $PWD --halt soon,fail=1 \
--load 100% --results logs/pantools/sub_opt_group/{1} \
--joblog logs/pantools/sub_opt_group/sub_opt_group.log \
bash ./scripts/build/grouping_subsets_process.sh {1} {2} \
>>logs/pantools/sub_opt_group/nohup.out 2>&1 &
Summarize the results by plotting in R
Grouping at the pangenome level
## grouping with relaxation 4 setting
process_start group_v4
nice $PANTOOLS group -t 30 --relaxation 4 ${pan_db}
error_exit $?
cp -rp ${pan_db} $PANGENOME_DIR/backup/${PANGENOME_NAME}.DB.grp
## optimized grouping
process_start optimal_grouping
nice $PANTOOLS optimal_grouping -t 30 ${pan_db} ${pan_db}/busco/enterobacterales_odb10
error_exit $?
Rscript ${pan_db}/optimal_grouping/optimal_grouping.R
$PANTOOLS grouping_overview ${pan_db}
## fix a grouping setting
$PANTOOLS change_grouping -v 4 ${pan_db}
$PANTOOLS grouping_overview ${pan_db}
cp -rp ${pan_db} $PANGENOME_DIR/backup/${PANGENOME_NAME}.DB.opt_grp
######################################################################
Pangenome exploration
## extract the metrics from pangenome
process_start extract_pangenome_metrics
$PANTOOLS metrics ${pan_db}
error_exit $?
gene classification
core-unique variation w.r.t. cutoffs
core and unique genes
soft core (95%) and cloud (5%) genes
## gene classification: soft core and cloud
process_start gene_classification_softcore_cloud
$PANTOOLS gene_classification --core-threshold 95 --unique-threshold 5 ${pan_db}
error_exit $?
## Gene distance tree
Rscript ${pan_db}/gene_classification/gene_distance_tree.R
mv ${pan_db}/gene_classification ${pan_db}/gene_classification.95.5
Extract information for all homology groups
k-mer classification
core and unique kmers
soft core (95%) and cloud (5%) kmers
## K-mer classification: soft core and cloud
process_start kmer_classification_softcore_cloud
$PANTOOLS k_mer_classification --core-threshold 95 --unique-threshold 5 ${pan_db}
error_exit $?
Rscript ${pan_db}/kmer_classification/genome_kmer_distance_tree.R
mv ${pan_db}/kmer_classification ${pan_db}/kmer_classification.95.5
Pangenome structure
Use all genomes to determine pangenome structure
## Pangenome structure: genes
process_start pangenome_structure_gene
$PANTOOLS pangenome_structure -t 20 ${pan_db}
error_exit $?
Rscript ${pan_db}/pangenome_size/gene/pangenome_growth.R
Rscript ${pan_db}/pangenome_size/gene/gains_losses_median_or_average.R
Rscript ${pan_db}/pangenome_size/gene/gains_losses_median_and_average.R
Rscript ${pan_db}/pangenome_size/gene/heaps_law.R
mv ${pan_db}/pangenome_size/gene ${pan_db}/pangenome_size/gene.pangenome
## Pangenome structure: kmer
process_start pangenome_structure_kmer
$PANTOOLS pangenome_structure -t 20 --kmer ${pan_db}
error_exit $?
Rscript ${pan_db}/pangenome_size/kmer/pangenome_growth.R
######################################################################
Functional classification
core and unique functional classification
softcore (95%) and cloud (5%) functional classification
## functional_classification: softcore and cloud
process_start functional_classification_softcore_cloud
$PANTOOLS functional_classification --core-threshold 95 --unique-threshold 5 ${pan_db}
error_exit $?
mv ${pan_db}/function/functional_classification ${pan_db}/function/functional_classification.95.5
function overview
## function_overview
process_start function_overview
$PANTOOLS function_overview ${pan_db}
error_exit $?
Rscript ${pan_db}/cog_per_class.R
## GO enrichment for core, accessory and unique homology groups
for grp in core accessory unique
do
process_start GO_enrichment:$grp
$PANTOOLS go_enrichment -H ${pan_db}/gene_classification.100.0/${grp}_groups.csv ${pan_db}
error_exit $?
mv ${pan_db}/function/go_enrichment ${pan_db}/function/go_enrichment.100.0.${grp}
done
## GO enrichment for soft-core, accessory and cloud homology groups
for grp in core accessory unique
do
process_start GO_enrichment:$grp
$PANTOOLS go_enrichment -H ${pan_db}/gene_classification.95.5/${grp}_groups.csv ${pan_db}
error_exit $?
mv ${pan_db}/function/go_enrichment ${pan_db}/function/go_enrichment.95.5.${grp}
done
#######################################################################
MSA for all homology groups
Phylogeny analysis
## SNP tree using core gene SNPs
process_start core_phylogeny
cp -rp ${pan_db}/gene_classification.100.0 ${pan_db}/gene_classification
$PANTOOLS core_phylogeny -t 20 --clustering-mode ML ${pan_db}
error_exit $?
rm -r ${pan_db}/gene_classification
## running_job: leunissen
mkdir ${pan_db}/core_snp_tree/ML_tree
nohup nice iqtree -T 40 -s data/pangenomes/pectobacterium.v2/backup/pectobacterium.v2.DB.msa/core_snp_tree/informative.fasta -redo -B 1000 \
--prefix data/pangenomes/pectobacterium.v2/backup/pectobacterium.v2.DB.msa/core_snp_tree/ML_tree/informative.fasta \
>> logs/v2_pecto/core_tree.log 2>&1 &
nohup nice iqtree -T 40 -s informative.fasta -B 1000 --prefix ML_tree/informative.fasta \
>> core_tree.log 2>&1 &
## run IQ-tree with a specific model
nohup iqtree -T 30 -s ${pan_db}/core_snp_tree/informative.fasta -redo -B 1000 \
-m GTR+F+ASC --prefix ${pan_db}/core_snp_tree/informative.fasta.GTR_F_ASC \
> logs/v2_pecto/iqtree_GTR_F_ASC.log 2>&1 &
######################################################################
Build pangenome org-db
object
Extract pangenome data from Neo4j database using Python script.
Create org.db
AnnotationHub like object for the pangenome for easier exploratory analysis in R.
Combine protein sequences
Annotate proteins with PHROG annotations for bacteriophages
Remove redundency in pan-proteome for efficiency
conda activate mmseq2
mkdir data/pangenomes/pectobacterium.v2/proteins_db/proteins_combined.mmseq_db
MMSEQ_DB="data/pangenomes/pectobacterium.v2/proteins_db/proteins_combined.mmseq_db/proteins_combined.mmseq_db"
# create mmseq db for pan-proteome
mmseqs createdb data/pangenomes/pectobacterium.v2/proteins_db/proteins_combined.faa \
${MMSEQ_DB} > \
logs/mmseq_db.log 2>&1
mkdir data/pangenomes/pectobacterium.v2/proteins_db/proteins_nr_hash
## create non-redundant protein set
mmseqs clusthash ${MMSEQ_DB} \
data/pangenomes/pectobacterium.v2/proteins_db/proteins_nr_hash/nr_hash \
--min-seq-id 1 --threads 32
MMSEQ_NR_CLUSTER_DB="data/pangenomes/pectobacterium.v2/proteins_db/proteins_nr.clust/proteins_nr.clust"
mmseqs clust ${MMSEQ_DB} \
data/pangenomes/pectobacterium.v2/proteins_db/proteins_nr_hash/nr_hash \
${MMSEQ_NR_CLUSTER_DB}
## create TSV file with cluster representatives
mmseqs createtsv \
${MMSEQ_DB} ${MMSEQ_DB} ${MMSEQ_NR_CLUSTER_DB} \
data/pangenomes/pectobacterium.v2/proteins_db/proteins_nr.clust/proteins_nr.clust.tsv
MMSEQ_NR_SUB_DB=data/pangenomes/pectobacterium.v2/proteins_db/proteins_nr.clust/proteins_rep.db
## create a sub-db of only cluster representative sequences
mmseqs createsubdb ${MMSEQ_NR_CLUSTER_DB} ${MMSEQ_DB} ${MMSEQ_NR_SUB_DB}
## Extract representative sequence
mmseqs convert2fasta ${MMSEQ_NR_SUB_DB} \
data/pangenomes/pectobacterium.v2/proteins_db/proteins_nr.clust/proteins_rep.faa
Search PHROGS against the representative proteins using MMSeq2
conda activate mmseq2
# search pan-proteome using PHROG profiles
mmseqs search ${TOOLS_PATH}/phrog_profiles/phrogs_mmseqs_db/phrogs_profile_db \
${MMSEQ_NR_SUB_DB} \
data/phrog_annotation/phrog_nr_proteome_result \
./tmp -s 7 --threads 64 > logs/mmseq_search.log 2>&1
# create TSV results file
mmseqs createtsv ${TOOLS_PATH}/phrog_profiles/phrogs_mmseqs_db/phrogs_profile_db \
${MMSEQ_NR_SUB_DB} data/phrog_annotation/phrog_nr_proteome_result \
data/phrog_annotation/phrog_nr_proteome_result.tsv