Fig. S22 - Pangenome analysis

Author

Benjamin Doran

Published

January 17, 2025

Julia Setup
using DrWatson
@quickactivate projectdir()
using CSV, DataFrames, DataFramesMeta
using Chain
using StatsBase
using FreqTables
using Downloads: request, download

Main - Multispecies pangenome analysis

ddir = projectdir("_research", "BBpangenome") |> mkpath
fullfasta_dir = joinpath(ddir, "fasta_all")
baseurl = "https://sra-download.ncbi.nlm.nih.gov/traces";
biobankdf = CSV.read(datadir("exp_raw","BB669","BB669_rowmeta.tsv"), DataFrame; delim="\t", normalizenames=true);
keptspecies = @chain biobankdf.Species begin
    countmap
    sort(byvalue=true, rev=true)
    filter(x -> x[2] >= (20), _)
    filter(x -> x[1] != "unclassified", _)
end
OrderedCollections.OrderedDict{String, Int64} with 11 entries:
  "Phocaeicola vulgatus"         => 88
  "[Ruminococcus] gnavus"        => 41
  "Bacteroides thetaiotaomicron" => 35
  "Anaerostipes hadrus"          => 31
  "Bacteroides uniformis"        => 27
  "Blautia luti"                 => 24
  "Bifidobacterium breve"        => 24
  "Coprococcus comes"            => 23
  "Dorea formicigenerans"        => 22
  "Blautia wexlerae"             => 21
  "[Eubacterium] rectale"        => 20
strvardf = biobankdf[biobankdf.Species .∈ Ref(keys(keptspecies)), :];
strvardf_downsampled = @chain strvardf begin
    @orderby(:Species, :Donor, :Strain_ID)
    @groupby(:Species, :Donor)
    @transform(
        $(nrow => :nstrains_per_species_donor),
        $(eachindex => :nth_species_donor_strain),
        :species_donor = :Species .* "_" .* :Donor
    )
    @rsubset(
        :nstrains_per_species_donor >= 3,
        :nth_species_donor_strain <= 3,
        # :nth_species_strain <= 9,
        # :nth_donor <= 3,
    )
    @orderby(:Species, :Donor, :nth_species_donor_strain)
    @groupby(:Species)
    @transform($(eachindex => :nth_species_strain))
    @rsubset(
        :nth_species_strain <= 9,
    )
    @orderby(:Species, :Donor, :nth_species_donor_strain)
    @rsubset(
        # drop species that don't have 3 donors with 3 strain level replicates
        :Species  [
            "Dorea formicigenerans",
            "Blautia luti",
            "Bifidobacterium breve",
        ],
    )
    @rtransform(
        :Accession_ID = first(:Accession, 7) .* "1.1",
    )
    @rtransform(
        :Label = join([last(split(:Species, " ")), :Strain_ID, :Accession_ID], '_')
    )
end

## Get resource urls for each strain
resource_urls = map(strvardf_downsampled.Accession_ID) do accid
    resource_str = joinpath(accid[1:2], accid[3:4], accid[5:6], accid[1:end-2], accid * ".fsa_nt.gz")
    for rsrc_hub in ["wgs01", "wgs04", "wgs02", "wgs03"]
        url = joinpath(baseurl, rsrc_hub, "wgs_aux", resource_str)
        req = request(url)
        if req.status == 200
            return url
        end
    end
    return
end
@assert all(!isnothing, resource_urls)
strvardf_downsampled.NCBI_resource_url = resource_urls;
CSV.write(joinpath(ddir, "strvar_downsampled.tsv"), strvardf_downsampled; delim="\t")
"/Users/bend/projects/Doran_etal_2023/_research/BBpangenome/strvar_downsampled.tsv"
first(strvardf_downsampled, 5)
5×28 DataFrame
Row Strain_ID Accession Phylum Class Order Family Genus Species Donor NCBI_Phylum NCBI_Class NCBI_Order NCBI_Family NCBI_Genus NCBI_Species GTDB_Phylum GTDB_Class GTDB_Order GTDB_Family GTDB_Genus GTDB_Species nstrains_per_species_donor nth_species_donor_strain species_donor nth_species_strain Accession_ID Label NCBI_resource_url
String15 String15 String15 String31 String31 String31 String31 String String7 String15 String31 String31 String31 String31 String String31 String31 String31 String31 String31 String Int64 Int64 String Int64 String String String
1 DFI.4.141 JAJCMT000000000 Firmicutes Clostridia Eubacteriales Lachnospiraceae Anaerostipes Anaerostipes hadrus DFI.4 Firmicutes Clostridia Eubacteriales Lachnospiraceae Anaerostipes Anaerostipes hadrus Firmicutes_A Clostridia Lachnospirales Lachnospiraceae Anaerostipes Anaerostipes hadrus 4 1 Anaerostipes hadrus_DFI.4 1 JAJCMT01.1 hadrus_DFI.4.141_JAJCMT01.1 https://sra-download.ncbi.nlm.nih.gov/traces/wgs03/wgs_aux/JA/JC/MT/JAJCMT01/JAJCMT01.1.fsa_nt.gz
2 DFI.4.155 JAJCRN000000000 Firmicutes Clostridia Clostridiales Lachnospiraceae Anaerostipes Anaerostipes hadrus DFI.4 Firmicutes Clostridia Eubacteriales Lachnospiraceae Anaerostipes Anaerostipes hadrus Firmicutes_A Clostridia Lachnospirales Lachnospiraceae Anaerostipes Anaerostipes hadrus 4 2 Anaerostipes hadrus_DFI.4 2 JAJCRN01.1 hadrus_DFI.4.155_JAJCRN01.1 https://sra-download.ncbi.nlm.nih.gov/traces/wgs03/wgs_aux/JA/JC/RN/JAJCRN01/JAJCRN01.1.fsa_nt.gz
3 DFI.4.26 JAJCMP000000000 Firmicutes Clostridia Eubacteriales Lachnospiraceae Anaerostipes Anaerostipes hadrus DFI.4 Firmicutes Clostridia Eubacteriales Lachnospiraceae Anaerostipes Anaerostipes hadrus Firmicutes_A Clostridia Lachnospirales Lachnospiraceae Anaerostipes Anaerostipes hadrus 4 3 Anaerostipes hadrus_DFI.4 3 JAJCMP01.1 hadrus_DFI.4.26_JAJCMP01.1 https://sra-download.ncbi.nlm.nih.gov/traces/wgs04/wgs_aux/JA/JC/MP/JAJCMP01/JAJCMP01.1.fsa_nt.gz
4 MSK.10.20 JAJBMG000000000 Firmicutes Clostridia Eubacteriales Lachnospiraceae Anaerostipes Anaerostipes hadrus MSK.10 Firmicutes Clostridia Eubacteriales Lachnospiraceae Anaerostipes Anaerostipes hadrus Firmicutes_A Clostridia Lachnospirales Lachnospiraceae Anaerostipes Anaerostipes hadrus 4 1 Anaerostipes hadrus_MSK.10 4 JAJBMG01.1 hadrus_MSK.10.20_JAJBMG01.1 https://sra-download.ncbi.nlm.nih.gov/traces/wgs01/wgs_aux/JA/JB/MG/JAJBMG01/JAJBMG01.1.fsa_nt.gz
5 MSK.10.27 JAAIQV000000000 Firmicutes Clostridia Eubacteriales Lachnospiraceae Anaerostipes Anaerostipes hadrus MSK.10 Firmicutes Clostridia Eubacteriales Lachnospiraceae Anaerostipes Anaerostipes hadrus Firmicutes_A Clostridia Lachnospirales Lachnospiraceae Anaerostipes Anaerostipes hadrus 4 2 Anaerostipes hadrus_MSK.10 5 JAAIQV01.1 hadrus_MSK.10.27_JAAIQV01.1 https://sra-download.ncbi.nlm.nih.gov/traces/wgs01/wgs_aux/JA/AI/QV/JAAIQV01/JAAIQV01.1.fsa_nt.gz
download.(strvardf_downsampled.NCBI_resource_url, joinpath.(fullfasta_dir, strvardf_downsampled.Label .* ".fsa_nt.gz"))
72-element Vector{String}:
 "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_DFI.4.141_JAJCMT01.1.fsa_nt.gz"
 "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_DFI.4.155_JAJCRN01.1.fsa_nt.gz"
 "/Users/bend/projects/Doran_etal" ⋯ 43 bytes ⋯ "s_DFI.4.26_JAJCMP01.1.fsa_nt.gz"
 "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.10.20_JAJBMG01.1.fsa_nt.gz"
 "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.10.27_JAAIQV01.1.fsa_nt.gz"
 "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.10.31_JAAIQU01.1.fsa_nt.gz"
 "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.11.19_JAAIQS01.1.fsa_nt.gz"
 "/Users/bend/projects/Doran_etal" ⋯ 43 bytes ⋯ "s_MSK.11.2_JAAIQR01.1.fsa_nt.gz"
 "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.11.20_JAAIQQ01.1.fsa_nt.gz"
 "/Users/bend/projects/Doran_etal" ⋯ 54 bytes ⋯ "_DFI.1.128_JAJCQT01.1.fsa_nt.gz"
 ⋮
 "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.15.10_JAAIRX01.1.fsa_nt.gz"
 "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.15.18_JAAIRW01.1.fsa_nt.gz"
 "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.15.32_JAAIRV01.1.fsa_nt.gz"
 "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.22.24_JAAIRN01.1.fsa_nt.gz"
 "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.22.91_JAJBPM01.1.fsa_nt.gz"
 "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.22.96_JAJBPJ01.1.fsa_nt.gz"
 "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.23.10_JAJBPH01.1.fsa_nt.gz"
 "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.23.17_JAJBON01.1.fsa_nt.gz"
 "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.23.18_JAJBOM01.1.fsa_nt.gz"
# uncompress for prokka
for f in readdir(fullfasta_dir, join=true)
    run(`gunzip $f`)
end

Bacteroidaceae - prokka

## Prokka
prokka_base_dir = joinpath(ddir, "bacteroides_prokka") |> mkpath
prokka_gff = joinpath(ddir, "bacteroides_prokka_gff") |> mkpath;
Bacteroidaceae_df = @rsubset(strvardf_downsampled, :Family == "Bacteroidaceae");
# 48 minutes per 27 samples
@eachrow Bacteroidaceae_df begin
    smp = :Label
    genera = :Genus
    outdir = joinpath("/data/bacteroides_prokka", smp)
    command = `docker run --rm -v $ddir:/data staphb/prokka prokka --force --kingdom Bacteria --genus $genera --locustag $smp --outdir $outdir /data/fasta_all/$(smp).fsa_nt`
    # @show command
    run(command)
end;
for dir in readdir(prokka_base_dir)
    cp(joinpath(prokka_base_dir, dir, "PROKKA_10152024.gff"), joinpath(prokka_gff, dir * ".gff"))
end

Bacteroidaceae - Roary

Roury on nine P. vulgatus strains, 3 strain replicates from 3 different donors.

run(`
docker run --rm -v $ddir\:/data staphb/roary roary -f ./roary_vulgatus -e -n -v $(
"/data/bacteroides_prokka_gff/" .* filter(x -> contains(x, r"(vulgatus)"), readdir(prokka_gff))
)
`)

Roury on nine P. vulgatus strains, 3 strain replicates from 3 different donors

Plus 9 strains of B. thetaiotaomicron (3 strain replicates from 3 different donors)

run(`
docker run --rm -v $ddir\:/data staphb/roary roary -f ./roary_vulgatus_theta -e -n -v $(
"/data/bacteroides_prokka_gff/" .* filter(x -> contains(x, r"(vulgatus)|(theta)"), readdir(prokka_gff))
)
`)

Roury on nine P. vulgatus strains, 3 strain replicates from 3 different donors

Plus 9 strains of B. thetaiotaomicron (3 strain replicates from 3 different donors)

Plus 9 strains of B. uniformis (3 strain replicates from 3 different donors)

# took 30 minutes
run(`
docker run --rm -v $ddir\:/data staphb/roary roary -f ./roary_vulgatus_theta_uniformis -e -n -v $(
"/data/bacteroides_prokka_gff/" .* filter(x -> contains(x, r"(vulgatus)|(theta)|(uniformis)"), readdir(prokka_gff))
)
`)

Tree analysis

using SpectralInference
using NewickTree
using StatsPlots

getlims(x) = extrema(x) .|> abs |> maximum |> x->(-x, x)

function center_rooting!(orig_tree)
    leaves = getleaves(orig_tree)
    centered_node = argmin(prewalk(orig_tree)) do node
        mean(patristic_distance.(Ref(node), leaves).^2)
    end
    newtree = NewickTree.set_outgroup!(centered_node)
    return newtree
end
function center_rooting(tree)
    tmp_tree = deepcopy(tree)
    center_rooting!(tmp_tree)
end
center_rooting (generic function with 1 method)
pdir = plotsdir("roary_pangenome") |> mkpath
"/Users/bend/projects/Doran_etal_2023/plots/roary_pangenome"
vtree = readnw(readline(joinpath(ddir, "roary_vulgatus", "accessory_binary_genes.fa.newick")))
vulgatus_names = getleafnames(vtree)
as_polytomy!(n -> NewickTree.support(n) < 0.5, vtree)
vtree = (readnw  nwstr)(NewickTree.extract(vtree, vulgatus_names))
# vtree = center_rooting(vtree)
# SpectralInference.ladderize!(vtree)

v_pij = patristic_distances(vtree)
color_lims = (0, maximum(v_pij))

tplot = plot(vtree,
    fs=6,
    rightmargin=7Plots.cm,
    size=(500, 500),
    title="roary_vulgatus",
    xlims = color_lims,
)

hplot = heatmap(v_pij;
    cmap=:viridis,
    ratio=1,
)
lyt = @layout [a{0.25w} b]
plot(tplot, hplot, layout=lyt, size=(800, 400), link=:y, clims=color_lims)
savefig(joinpath(pdir, "roary_v_plots.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/roary_pangenome/roary_v_plots.pdf"
vt_tree = readnw(readline(joinpath(ddir, "roary_vulgatus_theta", "accessory_binary_genes.fa.newick")))
as_polytomy!(n -> NewickTree.support(n) < 0.5, vt_tree)
vt_tree = (readnw  nwstr)(NewickTree.extract(vt_tree, vulgatus_names))
vt_pij = patristic_distances(vt_tree)
color_lims = (0, maximum(v_pij))
# vt_tree = center_rooting(vt_tree)
# SpectralInference.ladderize!(vt_tree)
tplot = plot(vt_tree;
    fs=6, 
    rightmargin=7Plots.cm, 
    title="roary_vulgatus_theta",
    xlims = color_lims,
)

hplot = heatmap(vt_pij; 
    cmap=:viridis,
    ratio=1,
)
lyt = @layout [a{0.25w} b]
plot(tplot, hplot, layout=lyt, size=(800, 400), link=:y, clims=color_lims)
savefig(joinpath(pdir, "roary_vt_plots.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/roary_pangenome/roary_vt_plots.pdf"
vtu_tree = readnw(readline(joinpath(ddir, "roary_vulgatus_theta_uniformis", "accessory_binary_genes.fa.newick")))
as_polytomy!(n -> NewickTree.support(n) < 0.5, vtu_tree)
vtu_tree = (readnw  nwstr)(NewickTree.extract(vtu_tree, vulgatus_names))
vtu_pij = patristic_distances(vtu_tree)
color_lims = (0, maximum(v_pij))

tplot = plot(vtu_tree;
    fs=6,
    rightmargin=7Plots.cm,
    title="roary_vulgatus_theta_uniformis",
    xlims = (0, maximum(v_pij)),
)
hplot = heatmap(vtu_pij;
    cmap=:viridis,
    ratio=1,
)
lyt = @layout [a{0.25w} b]
plot(tplot, hplot, layout=lyt, size=(800, 400), link=:y, clims=color_lims)
savefig(joinpath(pdir, "roary_vtu_plots.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/roary_pangenome/roary_vtu_plots.pdf"

Full trees

v_tree = readnw(readline(joinpath(ddir, "roary_vulgatus", "accessory_binary_genes.fa.newick")))
vt_tree = readnw(readline(joinpath(ddir, "roary_vulgatus_theta", "accessory_binary_genes.fa.newick")))
vtu_tree = readnw(readline(joinpath(ddir, "roary_vulgatus_theta_uniformis", "accessory_binary_genes.fa.newick")))
(((thetaiotaomicron_DFI.2.13_JAHOON01.1:0.0,thetaiotaomicron_DFI.2.69_JAHOOB01.1:0.0):5.0e-9,(((uniformis_DFI.1.173_JAJCQC01.1:5.0e-9,(uniformis_DFI.1.135_JAHOOY01.1:5.0e-9,uniformis_DFI.1.247_JAHOOT01.1:0.000500301)0.887:0.000500002)1.0:0.603367675,(((uniformis_MSK.16.39_JAHOKS01.1:0.0,uniformis_MSK.16.44_JAHOKO01.1:0.0):5.0e-9,uniformis_MSK.16.46_JAHOKN01.1:0.000249885)1.0:0.277643945,(uniformis_MSK.18.25_JAHOIE01.1:0.0,uniformis_MSK.18.30_JAHOID01.1:0.0,uniformis_MSK.18.33_JAHOIC01.1:0.0):5.0e-9)0.994:5.0e-9)1.0:0.603392367,(((vulgatus_DFI.1.126_JAJCQV01.1:0.0,vulgatus_DFI.1.127_JAJCQU01.1:0.0,vulgatus_DFI.1.133_JAJCQR01.1:0.0):0.008662128,vulgatus_DFI.3.23_JAJCMZ01.1:0.000147566)1.0:0.021838157,((vulgatus_MSK.16.10_JAHOLK01.1:0.000247821,(vulgatus_MSK.16.26_JAHOLB01.1:0.0,vulgatus_MSK.16.2_JAHOLF01.1:0.0):0.000502492)1.0:0.020982885,(vulgatus_DFI.3.15_JAJCNC01.1:5.0e-9,vulgatus_DFI.3.65_JAJCMX01.1:0.001000293)0.96:0.002430647)0.987:0.005639065)0.208:5.0e-9)1.0:0.070612547)1.0:0.011688911,(thetaiotaomicron_DFI.4.151_JAHONL01.1:5.0e-9,(thetaiotaomicron_DFI.4.108_JAHONP01.1:0.000500452,thetaiotaomicron_DFI.4.133_JAHONN01.1:0.000500396)0.841:0.000249888)0.997:0.002972506,(thetaiotaomicron_DFI.2.27_JAHOOK01.1:0.000249958,(thetaiotaomicron_DFI.1.169_JAJCQF01.1:5.0e-9,(thetaiotaomicron_DFI.1.128_JAJCQT01.1:0.000249958,thetaiotaomicron_DFI.1.150_JAHOOX01.1:5.0e-9)0.769:0.000249979)0.468:6.0e-9)1.0:0.015528138);
ps = []
for tree in [v_tree, vt_tree,vtu_tree]
    ladderize!(tree)
    p = plot(treeb)
    push!(ps, p)
end
plot(ps..., layout=grid(1,3), size=(1200, 400), rightmargin=4Plots.cm)
savefig(joinpath(pdir, "full_pangenome_trees.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/roary_pangenome/full_pangenome_trees.pdf"