Julia Setup
using DrWatson
@quickactivate projectdir()
using CSV, DataFrames, DataFramesMeta
using Chain
using StatsBase
using FreqTables
using Downloads: request, download
using DrWatson
@quickactivate projectdir()
using CSV, DataFrames, DataFramesMeta
using Chain
using StatsBase
using FreqTables
using Downloads: request, download
= projectdir("_research", "BBpangenome") |> mkpath
ddir = joinpath(ddir, "fasta_all")
fullfasta_dir = "https://sra-download.ncbi.nlm.nih.gov/traces"; baseurl
= CSV.read(datadir("exp_raw","BB669","BB669_rowmeta.tsv"), DataFrame; delim="\t", normalizenames=true); biobankdf
= @chain biobankdf.Species begin
keptspecies
countmapsort(byvalue=true, rev=true)
filter(x -> x[2] >= (20), _)
filter(x -> x[1] != "unclassified", _)
end
OrderedCollections.OrderedDict{String, Int64} with 11 entries:
"Phocaeicola vulgatus" => 88
"[Ruminococcus] gnavus" => 41
"Bacteroides thetaiotaomicron" => 35
"Anaerostipes hadrus" => 31
"Bacteroides uniformis" => 27
"Blautia luti" => 24
"Bifidobacterium breve" => 24
"Coprococcus comes" => 23
"Dorea formicigenerans" => 22
"Blautia wexlerae" => 21
"[Eubacterium] rectale" => 20
= biobankdf[biobankdf.Species .∈ Ref(keys(keptspecies)), :]; strvardf
= @chain strvardf begin
strvardf_downsampled @orderby(:Species, :Donor, :Strain_ID)
@groupby(:Species, :Donor)
@transform(
$(nrow => :nstrains_per_species_donor),
$(eachindex => :nth_species_donor_strain),
:species_donor = :Species .* "_" .* :Donor
)@rsubset(
:nstrains_per_species_donor >= 3,
:nth_species_donor_strain <= 3,
# :nth_species_strain <= 9,
# :nth_donor <= 3,
)@orderby(:Species, :Donor, :nth_species_donor_strain)
@groupby(:Species)
@transform($(eachindex => :nth_species_strain))
@rsubset(
:nth_species_strain <= 9,
)@orderby(:Species, :Donor, :nth_species_donor_strain)
@rsubset(
# drop species that don't have 3 donors with 3 strain level replicates
:Species ∉ [
"Dorea formicigenerans",
"Blautia luti",
"Bifidobacterium breve",
],
)@rtransform(
:Accession_ID = first(:Accession, 7) .* "1.1",
)@rtransform(
:Label = join([last(split(:Species, " ")), :Strain_ID, :Accession_ID], '_')
)end
## Get resource urls for each strain
= map(strvardf_downsampled.Accession_ID) do accid
resource_urls = joinpath(accid[1:2], accid[3:4], accid[5:6], accid[1:end-2], accid * ".fsa_nt.gz")
resource_str for rsrc_hub in ["wgs01", "wgs04", "wgs02", "wgs03"]
= joinpath(baseurl, rsrc_hub, "wgs_aux", resource_str)
url = request(url)
req if req.status == 200
return url
end
end
return
end
@assert all(!isnothing, resource_urls)
= resource_urls; strvardf_downsampled.NCBI_resource_url
write(joinpath(ddir, "strvar_downsampled.tsv"), strvardf_downsampled; delim="\t") CSV.
"/Users/bend/projects/Doran_etal_2023/_research/BBpangenome/strvar_downsampled.tsv"
first(strvardf_downsampled, 5)
Row | Strain_ID | Accession | Phylum | Class | Order | Family | Genus | Species | Donor | NCBI_Phylum | NCBI_Class | NCBI_Order | NCBI_Family | NCBI_Genus | NCBI_Species | GTDB_Phylum | GTDB_Class | GTDB_Order | GTDB_Family | GTDB_Genus | GTDB_Species | nstrains_per_species_donor | nth_species_donor_strain | species_donor | nth_species_strain | Accession_ID | Label | NCBI_resource_url |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
String15 | String15 | String15 | String31 | String31 | String31 | String31 | String | String7 | String15 | String31 | String31 | String31 | String31 | String | String31 | String31 | String31 | String31 | String31 | String | Int64 | Int64 | String | Int64 | String | String | String | |
1 | DFI.4.141 | JAJCMT000000000 | Firmicutes | Clostridia | Eubacteriales | Lachnospiraceae | Anaerostipes | Anaerostipes hadrus | DFI.4 | Firmicutes | Clostridia | Eubacteriales | Lachnospiraceae | Anaerostipes | Anaerostipes hadrus | Firmicutes_A | Clostridia | Lachnospirales | Lachnospiraceae | Anaerostipes | Anaerostipes hadrus | 4 | 1 | Anaerostipes hadrus_DFI.4 | 1 | JAJCMT01.1 | hadrus_DFI.4.141_JAJCMT01.1 | https://sra-download.ncbi.nlm.nih.gov/traces/wgs03/wgs_aux/JA/JC/MT/JAJCMT01/JAJCMT01.1.fsa_nt.gz |
2 | DFI.4.155 | JAJCRN000000000 | Firmicutes | Clostridia | Clostridiales | Lachnospiraceae | Anaerostipes | Anaerostipes hadrus | DFI.4 | Firmicutes | Clostridia | Eubacteriales | Lachnospiraceae | Anaerostipes | Anaerostipes hadrus | Firmicutes_A | Clostridia | Lachnospirales | Lachnospiraceae | Anaerostipes | Anaerostipes hadrus | 4 | 2 | Anaerostipes hadrus_DFI.4 | 2 | JAJCRN01.1 | hadrus_DFI.4.155_JAJCRN01.1 | https://sra-download.ncbi.nlm.nih.gov/traces/wgs03/wgs_aux/JA/JC/RN/JAJCRN01/JAJCRN01.1.fsa_nt.gz |
3 | DFI.4.26 | JAJCMP000000000 | Firmicutes | Clostridia | Eubacteriales | Lachnospiraceae | Anaerostipes | Anaerostipes hadrus | DFI.4 | Firmicutes | Clostridia | Eubacteriales | Lachnospiraceae | Anaerostipes | Anaerostipes hadrus | Firmicutes_A | Clostridia | Lachnospirales | Lachnospiraceae | Anaerostipes | Anaerostipes hadrus | 4 | 3 | Anaerostipes hadrus_DFI.4 | 3 | JAJCMP01.1 | hadrus_DFI.4.26_JAJCMP01.1 | https://sra-download.ncbi.nlm.nih.gov/traces/wgs04/wgs_aux/JA/JC/MP/JAJCMP01/JAJCMP01.1.fsa_nt.gz |
4 | MSK.10.20 | JAJBMG000000000 | Firmicutes | Clostridia | Eubacteriales | Lachnospiraceae | Anaerostipes | Anaerostipes hadrus | MSK.10 | Firmicutes | Clostridia | Eubacteriales | Lachnospiraceae | Anaerostipes | Anaerostipes hadrus | Firmicutes_A | Clostridia | Lachnospirales | Lachnospiraceae | Anaerostipes | Anaerostipes hadrus | 4 | 1 | Anaerostipes hadrus_MSK.10 | 4 | JAJBMG01.1 | hadrus_MSK.10.20_JAJBMG01.1 | https://sra-download.ncbi.nlm.nih.gov/traces/wgs01/wgs_aux/JA/JB/MG/JAJBMG01/JAJBMG01.1.fsa_nt.gz |
5 | MSK.10.27 | JAAIQV000000000 | Firmicutes | Clostridia | Eubacteriales | Lachnospiraceae | Anaerostipes | Anaerostipes hadrus | MSK.10 | Firmicutes | Clostridia | Eubacteriales | Lachnospiraceae | Anaerostipes | Anaerostipes hadrus | Firmicutes_A | Clostridia | Lachnospirales | Lachnospiraceae | Anaerostipes | Anaerostipes hadrus | 4 | 2 | Anaerostipes hadrus_MSK.10 | 5 | JAAIQV01.1 | hadrus_MSK.10.27_JAAIQV01.1 | https://sra-download.ncbi.nlm.nih.gov/traces/wgs01/wgs_aux/JA/AI/QV/JAAIQV01/JAAIQV01.1.fsa_nt.gz |
download.(strvardf_downsampled.NCBI_resource_url, joinpath.(fullfasta_dir, strvardf_downsampled.Label .* ".fsa_nt.gz"))
72-element Vector{String}: "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_DFI.4.141_JAJCMT01.1.fsa_nt.gz" "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_DFI.4.155_JAJCRN01.1.fsa_nt.gz" "/Users/bend/projects/Doran_etal" ⋯ 43 bytes ⋯ "s_DFI.4.26_JAJCMP01.1.fsa_nt.gz" "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.10.20_JAJBMG01.1.fsa_nt.gz" "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.10.27_JAAIQV01.1.fsa_nt.gz" "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.10.31_JAAIQU01.1.fsa_nt.gz" "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.11.19_JAAIQS01.1.fsa_nt.gz" "/Users/bend/projects/Doran_etal" ⋯ 43 bytes ⋯ "s_MSK.11.2_JAAIQR01.1.fsa_nt.gz" "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.11.20_JAAIQQ01.1.fsa_nt.gz" "/Users/bend/projects/Doran_etal" ⋯ 54 bytes ⋯ "_DFI.1.128_JAJCQT01.1.fsa_nt.gz" ⋮ "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.15.10_JAAIRX01.1.fsa_nt.gz" "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.15.18_JAAIRW01.1.fsa_nt.gz" "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.15.32_JAAIRV01.1.fsa_nt.gz" "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.22.24_JAAIRN01.1.fsa_nt.gz" "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.22.91_JAJBPM01.1.fsa_nt.gz" "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.22.96_JAJBPJ01.1.fsa_nt.gz" "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.23.10_JAJBPH01.1.fsa_nt.gz" "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.23.17_JAJBON01.1.fsa_nt.gz" "/Users/bend/projects/Doran_etal" ⋯ 44 bytes ⋯ "_MSK.23.18_JAJBOM01.1.fsa_nt.gz"
# uncompress for prokka
for f in readdir(fullfasta_dir, join=true)
run(`gunzip $f`)
end
## Prokka
= joinpath(ddir, "bacteroides_prokka") |> mkpath
prokka_base_dir = joinpath(ddir, "bacteroides_prokka_gff") |> mkpath; prokka_gff
= @rsubset(strvardf_downsampled, :Family == "Bacteroidaceae"); Bacteroidaceae_df
# 48 minutes per 27 samples
@eachrow Bacteroidaceae_df begin
= :Label
smp = :Genus
genera = joinpath("/data/bacteroides_prokka", smp)
outdir = `docker run --rm -v $ddir:/data staphb/prokka prokka --force --kingdom Bacteria --genus $genera --locustag $smp --outdir $outdir /data/fasta_all/$(smp).fsa_nt`
command # @show command
run(command)
end;
for dir in readdir(prokka_base_dir)
cp(joinpath(prokka_base_dir, dir, "PROKKA_10152024.gff"), joinpath(prokka_gff, dir * ".gff"))
end
Roury on nine P. vulgatus strains, 3 strain replicates from 3 different donors.
run(`
--rm -v $ddir\:/data staphb/roary roary -f ./roary_vulgatus -e -n -v $(
docker run "/data/bacteroides_prokka_gff/" .* filter(x -> contains(x, r"(vulgatus)"), readdir(prokka_gff))
)`)
Roury on nine P. vulgatus strains, 3 strain replicates from 3 different donors
Plus 9 strains of B. thetaiotaomicron (3 strain replicates from 3 different donors)
run(`
--rm -v $ddir\:/data staphb/roary roary -f ./roary_vulgatus_theta -e -n -v $(
docker run "/data/bacteroides_prokka_gff/" .* filter(x -> contains(x, r"(vulgatus)|(theta)"), readdir(prokka_gff))
)`)
Roury on nine P. vulgatus strains, 3 strain replicates from 3 different donors
Plus 9 strains of B. thetaiotaomicron (3 strain replicates from 3 different donors)
Plus 9 strains of B. uniformis (3 strain replicates from 3 different donors)
# took 30 minutes
run(`
--rm -v $ddir\:/data staphb/roary roary -f ./roary_vulgatus_theta_uniformis -e -n -v $(
docker run "/data/bacteroides_prokka_gff/" .* filter(x -> contains(x, r"(vulgatus)|(theta)|(uniformis)"), readdir(prokka_gff))
)`)
using SpectralInference
using NewickTree
using StatsPlots
getlims(x) = extrema(x) .|> abs |> maximum |> x->(-x, x)
function center_rooting!(orig_tree)
= getleaves(orig_tree)
leaves = argmin(prewalk(orig_tree)) do node
centered_node mean(patristic_distance.(Ref(node), leaves).^2)
end
= NewickTree.set_outgroup!(centered_node)
newtree return newtree
end
function center_rooting(tree)
= deepcopy(tree)
tmp_tree center_rooting!(tmp_tree)
end
center_rooting (generic function with 1 method)
= plotsdir("roary_pangenome") |> mkpath pdir
"/Users/bend/projects/Doran_etal_2023/plots/roary_pangenome"
= readnw(readline(joinpath(ddir, "roary_vulgatus", "accessory_binary_genes.fa.newick")))
vtree = getleafnames(vtree)
vulgatus_names as_polytomy!(n -> NewickTree.support(n) < 0.5, vtree)
= (readnw ∘ nwstr)(NewickTree.extract(vtree, vulgatus_names))
vtree # vtree = center_rooting(vtree)
# SpectralInference.ladderize!(vtree)
= patristic_distances(vtree)
v_pij = (0, maximum(v_pij))
color_lims
= plot(vtree,
tplot =6,
fs=7Plots.cm,
rightmargin=(500, 500),
size="roary_vulgatus",
title= color_lims,
xlims
)
= heatmap(v_pij;
hplot =:viridis,
cmap=1,
ratio
)= @layout [a{0.25w} b]
lyt plot(tplot, hplot, layout=lyt, size=(800, 400), link=:y, clims=color_lims)
savefig(joinpath(pdir, "roary_v_plots.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/roary_pangenome/roary_v_plots.pdf"
= readnw(readline(joinpath(ddir, "roary_vulgatus_theta", "accessory_binary_genes.fa.newick")))
vt_tree as_polytomy!(n -> NewickTree.support(n) < 0.5, vt_tree)
= (readnw ∘ nwstr)(NewickTree.extract(vt_tree, vulgatus_names))
vt_tree = patristic_distances(vt_tree)
vt_pij = (0, maximum(v_pij))
color_lims # vt_tree = center_rooting(vt_tree)
# SpectralInference.ladderize!(vt_tree)
= plot(vt_tree;
tplot =6,
fs=7Plots.cm,
rightmargin="roary_vulgatus_theta",
title= color_lims,
xlims
)
= heatmap(vt_pij;
hplot =:viridis,
cmap=1,
ratio
)= @layout [a{0.25w} b]
lyt plot(tplot, hplot, layout=lyt, size=(800, 400), link=:y, clims=color_lims)
savefig(joinpath(pdir, "roary_vt_plots.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/roary_pangenome/roary_vt_plots.pdf"
= readnw(readline(joinpath(ddir, "roary_vulgatus_theta_uniformis", "accessory_binary_genes.fa.newick")))
vtu_tree as_polytomy!(n -> NewickTree.support(n) < 0.5, vtu_tree)
= (readnw ∘ nwstr)(NewickTree.extract(vtu_tree, vulgatus_names))
vtu_tree = patristic_distances(vtu_tree)
vtu_pij = (0, maximum(v_pij))
color_lims
= plot(vtu_tree;
tplot =6,
fs=7Plots.cm,
rightmargin="roary_vulgatus_theta_uniformis",
title= (0, maximum(v_pij)),
xlims
)= heatmap(vtu_pij;
hplot =:viridis,
cmap=1,
ratio
)= @layout [a{0.25w} b]
lyt plot(tplot, hplot, layout=lyt, size=(800, 400), link=:y, clims=color_lims)
savefig(joinpath(pdir, "roary_vtu_plots.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/roary_pangenome/roary_vtu_plots.pdf"
= readnw(readline(joinpath(ddir, "roary_vulgatus", "accessory_binary_genes.fa.newick")))
v_tree = readnw(readline(joinpath(ddir, "roary_vulgatus_theta", "accessory_binary_genes.fa.newick")))
vt_tree = readnw(readline(joinpath(ddir, "roary_vulgatus_theta_uniformis", "accessory_binary_genes.fa.newick"))) vtu_tree
(((thetaiotaomicron_DFI.2.13_JAHOON01.1:0.0,thetaiotaomicron_DFI.2.69_JAHOOB01.1:0.0):5.0e-9,(((uniformis_DFI.1.173_JAJCQC01.1:5.0e-9,(uniformis_DFI.1.135_JAHOOY01.1:5.0e-9,uniformis_DFI.1.247_JAHOOT01.1:0.000500301)0.887:0.000500002)1.0:0.603367675,(((uniformis_MSK.16.39_JAHOKS01.1:0.0,uniformis_MSK.16.44_JAHOKO01.1:0.0):5.0e-9,uniformis_MSK.16.46_JAHOKN01.1:0.000249885)1.0:0.277643945,(uniformis_MSK.18.25_JAHOIE01.1:0.0,uniformis_MSK.18.30_JAHOID01.1:0.0,uniformis_MSK.18.33_JAHOIC01.1:0.0):5.0e-9)0.994:5.0e-9)1.0:0.603392367,(((vulgatus_DFI.1.126_JAJCQV01.1:0.0,vulgatus_DFI.1.127_JAJCQU01.1:0.0,vulgatus_DFI.1.133_JAJCQR01.1:0.0):0.008662128,vulgatus_DFI.3.23_JAJCMZ01.1:0.000147566)1.0:0.021838157,((vulgatus_MSK.16.10_JAHOLK01.1:0.000247821,(vulgatus_MSK.16.26_JAHOLB01.1:0.0,vulgatus_MSK.16.2_JAHOLF01.1:0.0):0.000502492)1.0:0.020982885,(vulgatus_DFI.3.15_JAJCNC01.1:5.0e-9,vulgatus_DFI.3.65_JAJCMX01.1:0.001000293)0.96:0.002430647)0.987:0.005639065)0.208:5.0e-9)1.0:0.070612547)1.0:0.011688911,(thetaiotaomicron_DFI.4.151_JAHONL01.1:5.0e-9,(thetaiotaomicron_DFI.4.108_JAHONP01.1:0.000500452,thetaiotaomicron_DFI.4.133_JAHONN01.1:0.000500396)0.841:0.000249888)0.997:0.002972506,(thetaiotaomicron_DFI.2.27_JAHOOK01.1:0.000249958,(thetaiotaomicron_DFI.1.169_JAJCQF01.1:5.0e-9,(thetaiotaomicron_DFI.1.128_JAJCQT01.1:0.000249958,thetaiotaomicron_DFI.1.150_JAHOOX01.1:5.0e-9)0.769:0.000249979)0.468:6.0e-9)1.0:0.015528138);
= []
ps for tree in [v_tree, vt_tree,vtu_tree]
ladderize!(tree)
= plot(treeb)
p push!(ps, p)
end
plot(ps..., layout=grid(1,3), size=(1200, 400), rightmargin=4Plots.cm)
savefig(joinpath(pdir, "full_pangenome_trees.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/roary_pangenome/full_pangenome_trees.pdf"