using DrWatson
@quickactivate projectdir()
using SpectralInference
using NewickTree
using HypothesisTests
using MultipleTesting: adjust, Bonferroni, BenjaminiHochberg
using CSV, DataFrames
using StatsBase
using Muon
using Random: shuffle, seed!
using CategoricalArrays: cut, recode!
using LaTeXStrings
using Images
using StatsPlots
theme(:default, grid=false, label=false, tickdir=:out)
include(srcdir("helpers.jl"))
= datadir("exp_pro", "BB669") |> mkpath;
bbdir = plotsdir("strain_variation_tree_and_tests") |> mkpath;
pdir = projectdir("_research", "strain_variation_wilcoxon_tests") |> mkpath
rdir
= CSV.read(datadir("exp_raw", "BB669", "subsettreecolors.csv"), DataFrame)
speciescolordf = Dict(k => v for (k, v) in zip(speciescolordf.species_name, speciescolordf.color)); species_color_dict
Fig. 5 - Interpreting New Strain Relationships
Fig. S11 - Statistical tests through the spectral tree
Given that we are able to create a spectral tree that respects taxonomic relationships and even captures sub-species groupings that corresponds with host environment, we were interested in understanding what genomic signatures were describing these sub-species groupings. However, because these sub-species groups a specifically defined by these genomic features, assumptions of independence typical to statistical tests are violated. Therefore we use a more stringent test to determine whether a given OGG is differentially abundant by using how likely it is for any of the features we are looking at to be significant before deciding if each individual OGG feature is significant.
Specifically to determine whether a given OGG is differentially abundant in a statistically significant way across two groups (‘group A’ and ‘group B’) that arise from a common node in the Spectral Tree, we first generate a ‘reference p-value’ and a ‘null distribution’. The reference p-value signifies the difference in \(OGG_i\) across the two groups (Wilcoxon test). To generate a null distribution of p-values, we create 100 permutations of the distributions between groups A and B that randomly shuffles entries in the two groups. For all OGGs within the Spectral Tree, we compute a distribution of the most significant p-values across all permutations. The Q-value for a given OGG is computed as the number of p-values in the null distribution that are less than the reference p-value (\(P\)) relative to the total number of p-values in the null distribution (equation).
\[Q = \frac{N_{p_{perm}} < N_{p_{ref}}}{N_{p_{perm}}}\]
Filtering to species that have > 20 strain replicates
= readh5ad(datadir("exp_raw", "UP7047", "2020_02_UP7047.h5ad"))
uniprot = readh5mu(datadir("exp_raw", "BB669", "BB669.h5mu")) biobank
┌ Warning: Cannot join columns with the same name because var_names are intersecting.
└ @ Muon /Users/bend/.julia/packages/Muon/UKjAF/src/mudata.jl:367
MuData object 669 ✕ 21475
└ metabolites_foldchange
AnnData object 669 ✕ 50
└ oggs
AnnData object 669 ✕ 11248
└ UPorder_oggs
AnnData object 669 ✕ 10177
= uniprot.X[:, :]
upmtx = svd(upmtx)
upusv = biobank["UPorder_oggs"].X[:,:]
bbmtx size(bbmtx)
(669, 10177)
= sort(filter(x-> last(x) >= 20, countmap(biobank.obs.Species)), byvalue=true, rev=true)
keptspecies filter!(!=("unclassified"), keptspecies)
# species => number of strains belonging to that species
OrderedCollections.OrderedDict{String, Int64} with 12 entries:
"Phocaeicola vulgatus" => 88
"[Ruminococcus] gnavus" => 41
"Bacteroides thetaiotaomicron" => 35
"Anaerostipes hadrus" => 31
"Bacteroides uniformis" => 27
"unclassified" => 26
"Blautia luti" => 24
"Bifidobacterium breve" => 24
"Coprococcus comes" => 23
"Dorea formicigenerans" => 22
"Blautia wexlerae" => 21
"[Eubacterium] rectale" => 20
I had previously calculated which strains belonged to these species when I saved the biobank dataset
= bbmtx[biobank.obs.kept_species.==1,:];
strvarmtx @show size(strvarmtx)
= biobank.obs[biobank.obs.kept_species.==1, :]
strvarobs = strvarobs.Strain_ID
strvar_obsnames
# infer hierarchical relationships of just the species with
# greater than 20 strain replicates
= projectinLSV(strvarmtx, upusv)
uhat = spectraldistances(uhat, upusv.S; alpha=1.5, q=.75) ./ size(uniprot, 2)
dij = UPGMA_tree(dij)
strvartree_hc = readnw(SpectralInference.newickstring(strvartree_hc, strvar_obsnames))
strvartree # ladderize!(strvartree)
## write out tree
open(joinpath(bbdir, "strvar-spitree.nw"), "w") do io
writenw(io, strvartree)
end
size(strvarmtx) = (356, 10177)
12058
# plot cladogram
= UPGMA_tree(dij)
strvartree_hc = SpectralInference.newickstring(strvartree_hc, strvarobs.Strain_ID)
subsettreestring = readnw(subsettreestring);
subsettree plot(strvartree_hc,
=(800, 900),
size=.5,
lw=true,
yflip=true,
xmirror=:none,
xticks=(:y, :x),
permute=false,
grid=:none,
tickdirection=5.5Plots.cm,
rightmargin="",
label=:grid,
framestyle
)
# plot annotation rectangles
rectangle(w, h, x, y) = Shape(x .+ [0,w,w,0], y .+ [0,0,h,h])
= strvarobs.Species[strvartree_hc.order]
speciesvector = findall(speciesvector[begin:(end-1)] .!= speciesvector[2:end])[Not([10, 11, 12, 13, 14,15])]
breaks = [(s, e) for (s,e) in zip(vcat([0],breaks), vcat(breaks, [length(speciesvector)]))];
edges = [rectangle(2,(e-s),0,s+.5) for (s,e) in zip(vcat([0],breaks), vcat(breaks, [length(speciesvector)]))];
rects = permutedims(speciescolordf.color[indexin(speciesvector[first.(edges).+1], speciescolordf.species_name)]);
rectspeciescolors = plot!(permutedims(rects), fill=0.35, lw=0, c=rectspeciescolors, label="")
fancy_treeplot
# species
for (i, (k,v)) in enumerate(zip(speciescolordf.species_name, speciescolordf.color))
= median(findall(==(k), speciesvector))
yval annotate!(-0.1, yval, text(k, 9, :left, v))
end
annotate!(-0.1, 356, text("Blautia luti (2)", 9, :left, species_color_dict["Blautia luti"]))
fancy_treeplot
Statistically test clades at each node in tree
# functions to compare differences
function cliffs_d(a, b)
ans = 0.
for i in a, j in b
ans += sign(i - j)
end
ans / (length(a) * length(b))
end
function log2FC(a, b)
mean(log2.(a .+ 1)) - mean(log2.(b .+ 1))
end
log2FC (generic function with 1 method)
# significance tests across full strain variation tree
seed!(123456789)
= strvartree
tree
= last(maximum(getheights(tree)))
maxtreeheight = getleaves(tree)
leaves = name.(leaves);
leafnames = indexin(leafnames, strvar_obsnames);
treeorder
= biobank["oggs"].X[:,:][biobank.obs.kept_species.==1,:]
ogg_mtx = vec(mapslices(c->std(c) > 0, ogg_mtx, dims=1))
ogg_mask = ogg_mtx[treeorder, ogg_mask]
ogg_mtx @show size(ogg_mtx)
= biobank["oggs"].var_names[ogg_mask]
ogg_names
= vec(mean(ogg_mtx .> 0, dims=1))
ogg_freqs
= strvarobs[treeorder, :]
taxon_info = join.(eachrow(strvarobs[treeorder, [:Species, :Donor]]), " ")
taxon_info.species_donor
= getleafnames(tree)
isolate_ids = DataFrame()
testresults
for node in prewalk(tree)
# if group size would be less than 1 skip node
isleaf(node) || any(isleaf.(node.children))) && continue
(
# find comparison groups
= indexin(getleafnames(node.children[1]), isolate_ids)
grp1_idx = indexin(getleafnames(node.children[2]), isolate_ids)
grp2_idx
= mapslices(ogg_mtx, dims=1) do column
pvals pvalue(MannWhitneyUTest(column[grp1_idx], column[grp2_idx]))
end |> vec
# make null
seed!(1234)
= map(1:100) do i
null_qs = length(grp1_idx)
n = length(grp2_idx)
m = shuffle(vcat(grp1_idx, grp2_idx))
perm_idx = mapslices(ogg_mtx, dims=1) do column_perm
qs = pvalue(MannWhitneyUTest(column_perm[perm_idx[1:n]], column_perm[perm_idx[n+1:end]]))
q end
minimum(qs)
end
= mapslices(ogg_mtx, dims=1) do column
qvals = pvalue(MannWhitneyUTest(column[grp1_idx], column[grp2_idx]))
pval mean(null_qs .< pval)
end |> vec
= mapslices(ogg_mtx, dims=1) do column
effects abs(mean(column[grp1_idx]) - mean(column[grp2_idx]) / std(column))
end |> vec
= mapslices(ogg_mtx, dims=1) do column
log_effects = log2.(column.+1)
logcol abs(mean(logcol[grp1_idx]) - mean(logcol[grp2_idx]) / std(logcol))
end |> vec
= mapslices(ogg_mtx, dims=1) do column
log2FCs log2FC(column[grp1_idx], column[grp2_idx])
end |> vec
= mapslices(ogg_mtx, dims=1) do column
cliffs_ds cliffs_d(column[grp1_idx], column[grp2_idx])
end |> vec
= mapslices(ogg_mtx, dims=1) do column
grp1_prp_exp mean(column[grp1_idx] .> 0)
end |> vec
= mapslices(ogg_mtx, dims=1) do column
grp2_prp_exp mean(column[grp2_idx] .> 0)
end |> vec
= vcat(testresults,
testresults DataFrame(
:nodeids => id(node),
:nodeheight => NewickTree.height(node),
:nodedepth => maxtreeheight - NewickTree.height(node),
:grp1_N => length(grp1_idx),
:grp2_N => length(grp2_idx),
:grp1_phylum_mode => mode(taxon_info.Phylum[grp1_idx]),
:grp2_phylum_mode => mode(taxon_info.Phylum[grp2_idx]),
:grp1_species_mode => mode(taxon_info.Species[grp1_idx]),
:grp2_species_mode => mode(taxon_info.Species[grp2_idx]),
:grp1_species_donor_mode => mode(taxon_info.species_donor[grp1_idx]),
:grp2_species_donor_mode => mode(taxon_info.species_donor[grp2_idx]),
:ogg_name => ogg_names,
:ogg_idx => collect(axes(ogg_mtx, 2)),
:ogg_freqs => ogg_freqs,
:grp1_prp_exp => grp1_prp_exp,
:grp2_prp_exp => grp2_prp_exp,
:effectsize => effects,
:logeffectsize => log_effects,
:log2FC => log2FCs,
:cliffs_d => cliffs_ds,
:pvals => pvals,
:qvals => qvals,
)
)end
"pval_BH"] .= adjust(testresults.pvals, BenjaminiHochberg());
testresults[!, "pval_Bon"] .= adjust(testresults.pvals, Bonferroni());
testresults[!, "qval_BH"] .= adjust(testresults.qvals, BenjaminiHochberg());
testresults[!,
= cut(testresults.nodedepth, [0, 2, 4, 5])
tree_discretization recode!(tree_discretization,
"[4, 5)"=>"phylum level",
"[2, 4)"=>"species level",
"[0, 2)"=>"strain level")
"tree_level"] = tree_discretization
testresults[!, testresults;
size(ogg_mtx) = (356, 5449)
countmap(testresults.tree_level)
Dict{CategoricalArrays.CategoricalValue{String, UInt32}, Int64} with 3 entries:
"species level" => 59939
"strain level" => 457716
"phylum level" => 5449
write(joinpath(rdir, "full_ogg_wilcoxon_testresults_on_big10species.csv"), testresults)
CSV.|>
testresults df->filter(:pval_BH => <(0.05), df) |>
->CSV.write(joinpath(rdir, "significant_BH_ogg_wilcoxon_testresults_on_big10species.csv"), df)
df|>
testresults df->filter(:pval_Bon => <(0.05), df) |>
->CSV.write(joinpath(rdir, "significant_Bon_ogg_wilcoxon_testresults_on_big10species.csv"), df)
df|>
testresults df->filter(:qval_BH => <(0.05), df) |>
->CSV.write(joinpath(rdir, "significant_Qbh_ogg_wilcoxon_testresults_on_big10species.csv"), df) df
"/Users/bend/projects/Doran_etal_2023/_research/strain_variation_wilcoxon_tests/significant_Qbh_ogg_wilcoxon_testresults_on_big10species.csv"
Filtering to most significant results
= CSV.read(joinpath(rdir, "full_ogg_wilcoxon_testresults_on_big10species.csv"), DataFrame); testresults
= filter(:qval_BH => <(0.05), testresults) |>
sigtestresults -> filter(:logeffectsize => >(1), df) |>
df -> filter(:effectsize => >(1), df) |>
df -> filter(:log2FC => x->abs.(x) .> 1 , df); df
@show sigtestresults.ogg_name |> unique |> length
@show sigtestresults.nodeids |> unique |> length
@show sum(sigtestresults.tree_level .== "phylum level")
@show sum(sigtestresults.tree_level .== "species level")
@show sum(sigtestresults.tree_level .== "strain level");
(sigtestresults.ogg_name |> unique) |> length = 1530
(sigtestresults.nodeids |> unique) |> length = 66
sum(sigtestresults.tree_level .== "phylum level") = 415
sum(sigtestresults.tree_level .== "species level") = 1117
sum(sigtestresults.tree_level .== "strain level") = 1070
# how many OGGs are still duplicated?
|>
sigtestresults -> groupby(df, :ogg_name) |>
df -> subset(df, :log2FC => x -> abs.(x) .== maximum(abs.(x))) |>
df -> countmap(df.ogg_name) |>
df -> sum(values(cm) .> 1) cm
27
# for these remaining OGGs select the shallowest significant example
= sigtestresults |>
uniqueOGGresults -> groupby(df, :ogg_name) |>
df -> subset(df, :log2FC => x -> abs.(x) .== maximum(abs.(x))) |>
df -> groupby(df, :ogg_name) |>
df -> subset(df, :nodeheight => x -> x .== minimum(x));
df @show nrow(uniqueOGGresults) == length(unique(uniqueOGGresults.ogg_name))
@show sum(uniqueOGGresults.tree_level .== "phylum level")
@show sum(uniqueOGGresults.tree_level .== "species level")
@show sum(uniqueOGGresults.tree_level .== "strain level");
nrow(uniqueOGGresults) == length(unique(uniqueOGGresults.ogg_name)) = true
sum(uniqueOGGresults.tree_level .== "phylum level") = 273
sum(uniqueOGGresults.tree_level .== "species level") = 676
sum(uniqueOGGresults.tree_level .== "strain level") = 581
write(joinpath(rdir, "unique_sig_ogg_wilcoxon_testresults_on_big10species.csv"), uniqueOGGresults) CSV.
"/Users/bend/projects/Doran_etal_2023/_research/strain_variation_wilcoxon_tests/unique_sig_ogg_wilcoxon_testresults_on_big10species.csv"
Phylum level significant orthologs
= uniqueOGGresults |>
ogg_plotting_order df->subset(df, :tree_level => x-> x.==("phylum level"), :grp1_N => x-> x.>=3, :grp2_N => x-> x.>=3) |>
df->subset(df, :nodeids => (x->get.(Ref(countmap(x)), x, 0) .> 10)) |>
df->sort(df, [:nodeids, :log2FC]);
= heatmap(log2.(ogg_mtx[:, ogg_plotting_order.ogg_idx] .+ 1),
hmapplot # hmapplot = heatmap(log2.(ogg_mtx[:, uniqueOGGresults.ogg_idx]),
="Phylum level tests; " * L"Q_{BH} < 0.05, |\log_2 FC| > 1",
title=false,
yticks# xticks=(1:nrow(uniqueOGGresults), uniqueOGGresults.ogg_name),
=[1, findall(diff(ogg_plotting_order.nodeids) .!= 0)..., nrow(ogg_plotting_order)],
xticks=45,
xrotation=(.5,356.5),
ylims=:grid,
framestyle=(1:4, string.(2 .^ 1:4)),
cticks=cgrad(:bilbao, rev=true),
c=0Plots.mm,
leftmargin
);
rectangle(w, h, x, y) = Shape(x .+ [0,w,w,0], y .+ [0,0,h,h])
= getleafnames(strvartree)
leafnames = map(unique(ogg_plotting_order.nodeids)) do nid
oggrects = filter(x-> id(x)==(nid), prewalk(strvartree))[1]
node = indexin(getleafnames(node), leafnames)
idx_taxa = filter(:nodeids => ==(nid), ogg_plotting_order)
noderesults = indexin(noderesults.ogg_name, ogg_plotting_order.ogg_name)
idx_oggs = minimum(idx_oggs)
x = minimum(idx_taxa)
y = maximum(idx_oggs) - minimum(idx_oggs)
width = maximum(idx_taxa) - minimum(idx_taxa)
height rectangle(width, height, x, y)
end
plot!(permutedims(oggrects), fill=0.0, lw=1.5, linecolor="red", label="")
= @layout [a{.2w} b]
layout plot(plot(fancy_treeplot, title="SPI tree"), hmapplot; layout, xrotation=45, size=(1000,600), link=:y, legend=:none, margin=3Plots.Measures.mm)
savefig(joinpath(pdir, "significantOGGS_pernode_phylumlevel_treeandheatmap_coloredboxes.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/strain_variation_tree_and_tests/significantOGGS_pernode_phylumlevel_treeandheatmap_coloredboxes.pdf"
Species level significant orthologs
= uniqueOGGresults |>
ogg_plotting_order df->subset(df, :tree_level => x-> x.==("species level"), :grp1_N => x-> x.>=3, :grp2_N => x-> x.>=3) |>
df->subset(df, :nodeids => (x->get.(Ref(countmap(x)), x, 0) .> 10)) |>
df->sort(df, [:nodeids, :log2FC]);
= heatmap(log2.(ogg_mtx[:, ogg_plotting_order.ogg_idx] .+ 1),
hmapplot # hmapplot = heatmap(log2.(ogg_mtx[:, uniqueOGGresults.ogg_idx]),
="Species level tests; " * L"Q_{BH} < 0.05, |\log_2 FC| > 1",
title=false,
yticks# xticks=(1:nrow(uniqueOGGresults), uniqueOGGresults.ogg_name),
=[1, findall(diff(ogg_plotting_order.nodeids) .!= 0)..., nrow(ogg_plotting_order)],
xticks=45,
xrotation=(.5,356.5),
ylims=:grid,
framestyle=(1:4, string.(2 .^ 1:4)),
cticks=cgrad(:bilbao, rev=true),
c=0Plots.mm,
leftmargin
);
rectangle(w, h, x, y) = Shape(x .+ [0,w,w,0], y .+ [0,0,h,h])
= getleafnames(strvartree)
leafnames = map(unique(ogg_plotting_order.nodeids)) do nid
oggrects = filter(x-> id(x)==(nid), prewalk(strvartree))[1]
node = indexin(getleafnames(node), leafnames)
idx_taxa = filter(:nodeids => ==(nid), ogg_plotting_order)
noderesults = indexin(noderesults.ogg_name, ogg_plotting_order.ogg_name)
idx_oggs = minimum(idx_oggs)
x = minimum(idx_taxa)
y = maximum(idx_oggs) - minimum(idx_oggs)
width = maximum(idx_taxa) - minimum(idx_taxa)
height rectangle(width, height, x, y)
end
plot!(permutedims(oggrects), fill=0.0, lw=1.5, linecolor="aqua", label="")
= @layout [a{.2w} b]
layout plot(plot(fancy_treeplot, title="SPI tree"), hmapplot; layout, xrotation=45, size=(1000,600), link=:y, legend=:none, margin=3Plots.Measures.mm)
savefig(joinpath(pdir, "significantOGGS_pernode_specieslevel_treeandheatmap_coloredboxes.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/strain_variation_tree_and_tests/significantOGGS_pernode_specieslevel_treeandheatmap_coloredboxes.pdf"
Strain level significant orthologs
= uniqueOGGresults |>
ogg_plotting_order df->subset(df, :tree_level => x-> x.==("strain level"), :grp1_N => x-> x.>=3, :grp2_N => x-> x.>=3) |>
df->subset(df, :nodeids => (x->get.(Ref(countmap(x)), x, 0) .> 10)) |>
df->sort(df, [:nodeids, :log2FC]);
= heatmap(log2.(ogg_mtx[:, ogg_plotting_order.ogg_idx] .+ 1),
hmapplot # hmapplot = heatmap(log2.(ogg_mtx[:, uniqueOGGresults.ogg_idx]),
="Strain level tests; " * L"Q_{BH} < 0.05, |\log_2 FC| > 1",
title=false,
yticks# xticks=(1:nrow(uniqueOGGresults), uniqueOGGresults.ogg_name),
=[1, findall(diff(ogg_plotting_order.nodeids) .!= 0)..., nrow(ogg_plotting_order)],
xticks=45,
xrotation=(.5,356.5),
ylims=:grid,
framestyle=(1:4, string.(2 .^ 1:4)),
cticks=cgrad(:bilbao, rev=true),
c=0Plots.mm,
leftmargin
);
rectangle(w, h, x, y) = Shape(x .+ [0,w,w,0], y .+ [0,0,h,h])
= getleafnames(strvartree)
leafnames = map(unique(ogg_plotting_order.nodeids)) do nid
oggrects = filter(x-> id(x)==(nid), prewalk(strvartree))[1]
node = indexin(getleafnames(node), leafnames)
idx_taxa = filter(:nodeids => ==(nid), ogg_plotting_order)
noderesults = indexin(noderesults.ogg_name, ogg_plotting_order.ogg_name)
idx_oggs = minimum(idx_oggs)
x = minimum(idx_taxa)
y = maximum(idx_oggs) - minimum(idx_oggs)
width = maximum(idx_taxa) - minimum(idx_taxa)
height rectangle(width, height, x, y)
end
plot!(permutedims(oggrects), fill=0.0, lw=1.5, linecolor="lightgreen", label="")
# vline!(findall(diff(ogg_plotting_order.nodeids) .> 0) .- .5, lw=1, c=:black, label="")
= @layout [a{.2w} b]
layout plot(plot(fancy_treeplot, title="SPI tree"), hmapplot; layout, xrotation=45, size=(1000,600), link=:y, legend=:none, margin=3Plots.Measures.mm)
savefig(joinpath(pdir, "significantOGGS_pernode_strainlevel_treeandheatmap_coloredboxes.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/strain_variation_tree_and_tests/significantOGGS_pernode_strainlevel_treeandheatmap_coloredboxes.pdf"
zoom-ins of strain level significant orthologs
### Strain level zoomin P. vulgatus
|> unique
ogg_plotting_order.nodeids = uniqueOGGresults |>
ogg_plotting_order df->subset(df, :tree_level => x-> x.==("strain level"), :grp1_N => x-> x.>=3, :grp2_N => x-> x.>=3) |>
df->subset(df, :nodeids => (x->get.(Ref(countmap(x)), x, 0) .> 10)) |>
df->sort(df, [:nodeids, :log2FC]) |>
df->filter(:nodeids => ==(136), df)
= 136
nid = filter(x-> id(x)==(nid), prewalk(strvartree))[1]
node = indexin(getleafnames(node), leafnames)
idx_taxa
= heatmap(log2.(ogg_mtx[idx_taxa, ogg_plotting_order.ogg_idx] .+ 1),
hmapplot # hmapplot = heatmap(log2.(ogg_mtx[:, uniqueOGGresults.ogg_idx]),
="P. vulgatus (zoomin); " * L"Q_{BH} < 0.05, |\log_2 FC| > 1",
title=false,
yticks=(1:nrow(ogg_plotting_order), ogg_plotting_order.ogg_name),
xticks# xtickfontsize=1,
=90,
xrotation# ylims=(.5,356.5),
=:out,
tickdirection=:grid,
framestyle=(1:4, string.(2 .^ 1:4)),
cticks=(0, 6.1),
clims=cgrad(:bilbao, rev=true),
c=(550,550),
size=0Plots.mm,
leftmargin=false
grid )
### Strain level zoomin C. comes
|> unique
ogg_plotting_order.nodeids = uniqueOGGresults |>
ogg_plotting_order df->subset(df, :tree_level => x-> x.==("strain level"), :grp1_N => x-> x.>=3, :grp2_N => x-> x.>=3) |>
df->subset(df, :nodeids => (x->get.(Ref(countmap(x)), x, 0) .> 10)) |>
df->sort(df, [:nodeids, :log2FC]) |>
df->filter(:nodeids => ==(523), df)
= 523
nid = filter(x-> id(x)==(nid), prewalk(strvartree))[1]
node = indexin(getleafnames(node), leafnames)
idx_taxa
= heatmap(log2.(ogg_mtx[idx_taxa, ogg_plotting_order.ogg_idx] .+ 1),
hmapplot ="C. comes (zoomin); " * L"Q_{BH} < 0.05, |\log_2 FC| > 1",
title=false,
yticks=(1:nrow(ogg_plotting_order), ogg_plotting_order.ogg_name),
xticks=90,
xrotation=:out,
tickdirection=:grid,
framestyle=(1:4, string.(2 .^ 1:4)),
cticks=(0, 6.1),
clims=cgrad(:bilbao, rev=true),
c=(550,550),
size=0Plots.mm,
leftmargin=false
grid )
Main - Explore differences within E. rectale
= "MSK.16.22"
targetid = getleaves(strvartree)
strvar_leaves = strvar_leaves[findfirst(n->name(n) == targetid, strvar_leaves)]
basenode = readnw(nwstr(nthparent(basenode, 3)))
subtree = indexin(getleafnames(subtree),strvarobs.Strain_ID)
rectale_idxs
= strvarobs.Donor[rectale_idxs]
tiplabels = Dict(k=>v for (k,v) in zip(getleafnames(subtree), tiplabels))
idmapping rename_treeleaves!(subtree, idmapping)
= Dict(
rectale_donor_dict "MSK.22"=>"#2B3990",
"MSK.17"=>"#6682C1",
"MSK.16"=>"#FBB040",
"MSK.13"=>"#c9a964",
"MSK.9"=>"#A97C50",
)
= map(prewalk(subtree)) do node; rectale_donor_dict
rectale_linecolors if length(unique(getleafnames(node))) == 1
get(rectale_donor_dict, mode(getleafnames(node)), :grey)
else
:grey
end
end |> x->x[2:end] |> permutedims
plot(subtree,
=rectale_linecolors,
c# alpha=.5,
=3, fs=2,
lw=(400,600),
size=2Plots.cm,
rightmargin=10Plots.mm,
leftmargin="E. rectale",
ylabel=:grid,
framestyle=false,
ticks
)# donor annotations
for (i, (k,v)) in enumerate(rectale_donor_dict)
= findall(==(k), getleafnames(subtree))
yvals = median(yvals)
yval annotate!(1.8, yval, text("$k\n(n=$(length(yvals)))", 9, :center, v))
end
= plot!() p1
= uniqueOGGresults |>
ogg_plotting_order # select test at root of rectale sub tree
df->subset(df, :nodeids => x-> x.==(id(nthparent(basenode, 3)))) |>
df->select(df, [:nodeids, :log2FC, :grp1_N, :grp2_N, :ogg_name, :ogg_idx]) |>
# add in descriptions of each ogg
df->leftjoin(df, uniprot.var[:, [:og, :description]], on = :ogg_name => :og) |>
# sort oggs based on abundence per group
df->sort(df, [:nodeids, :log2FC]) |>
# get shorter descriptions for lables
df->coalesce.(df, "") |>
df->transform(df,
:description => ByRow(x->first(replace.(x, r"\(.*\)"=>""), 75)) => :ogg_labels
);
# more abundant in MSK.17 and MSK.22
subset(ogg_plotting_order,
:log2FC => ByRow(<(0)),
:description => ByRow(!=(""))
)
Row | nodeids | log2FC | grp1_N | grp2_N | ogg_name | ogg_idx | description | ogg_labels |
---|---|---|---|---|---|---|---|---|
Int64 | Float64 | Int64 | Int64 | String15 | Int64 | String | String | |
1 | 433 | -2.52345 | 7 | 13 | COG1345 | 3757 | Required for morphogenesis and for the elongation of the flagellar filament by facilitating polymerization of the flagellin monomers at the tip of growing filament. Forms a capping structure, which prevents flagellin subunits (transported through the central channel of the flagellum) from leaking out without polymerization at the distal end | Required for morphogenesis and for the elongation of the flagellar filament |
2 | 433 | -2.26897 | 7 | 13 | COG1344 | 3756 | bacterial-type flagellum-dependent cell motility | bacterial-type flagellum-dependent cell motility |
3 | 433 | -2.03848 | 7 | 13 | COG5444 | 5401 | nuclease activity | nuclease activity |
4 | 433 | -2.03047 | 7 | 13 | COG5001 | 5338 | cyclic-guanylate-specific phosphodiesterase activity | cyclic-guanylate-specific phosphodiesterase activity |
5 | 433 | -2.0 | 7 | 13 | COG3210 | 4625 | domain, Protein | domain, Protein |
6 | 433 | -1.94674 | 7 | 13 | COG3547 | 4769 | Transposase (IS116 IS110 IS902 family) | Transposase |
7 | 433 | -1.8723 | 7 | 13 | COG4495 | 5124 | Domain of unknown function (DUF4176) | Domain of unknown function |
8 | 433 | -1.74526 | 7 | 13 | COG1442 | 3821 | lipopolysaccharide 3-alpha-galactosyltransferase activity | lipopolysaccharide 3-alpha-galactosyltransferase activity |
9 | 433 | -1.58496 | 7 | 13 | COG4786 | 5238 | Flagellar basal body rod | Flagellar basal body rod |
10 | 433 | -1.50843 | 7 | 13 | COG5279 | 5367 | protein involved in cytokinesis, contains TGc (transglutaminase protease-like) domain | protein involved in cytokinesis, contains TGc domain |
11 | 433 | -1.38462 | 7 | 13 | COG1776 | 4031 | chemotaxis | chemotaxis |
12 | 433 | -1.35998 | 7 | 13 | COG2944 | 4514 | sequence-specific DNA binding | sequence-specific DNA binding |
13 | 433 | -1.29925 | 7 | 13 | COG0842 | 3475 | Transport permease protein | Transport permease protein |
14 | 433 | -1.29925 | 7 | 13 | COG1516 | 3869 | flagellar protein fliS | flagellar protein fliS |
15 | 433 | -1.22499 | 7 | 13 | COG0835 | 3469 | chemotaxis | chemotaxis |
16 | 433 | -1.22499 | 7 | 13 | COG2186 | 4263 | Transcriptional regulator | Transcriptional regulator |
17 | 433 | -1.22499 | 7 | 13 | COG2198 | 4271 | Histidine kinase | Histidine kinase |
# more abundant in MSK.9, MSK.13, and MSK.16
subset(ogg_plotting_order,
:log2FC => ByRow(>(0)),
:description => ByRow(!=(""))
)
Row | nodeids | log2FC | grp1_N | grp2_N | ogg_name | ogg_idx | description | ogg_labels |
---|---|---|---|---|---|---|---|---|
Int64 | Float64 | Int64 | Int64 | String15 | Int64 | String | String | |
1 | 433 | 1.16713 | 7 | 13 | COG5015 | 5347 | Pyridoxamine 5'-phosphate oxidase | Pyridoxamine 5'-phosphate oxidase |
2 | 433 | 1.32193 | 7 | 13 | COG3534 | 4763 | alpha-L-arabinofuranosidase | alpha-L-arabinofuranosidase |
3 | 433 | 1.64425 | 7 | 13 | COG3378 | 4691 | Phage plasmid primase P4 family | Phage plasmid primase P4 family |
4 | 433 | 1.92997 | 7 | 13 | COG0286 | 2997 | site-specific DNA-methyltransferase (adenine-specific) activity | site-specific DNA-methyltransferase activity |
5 | 433 | 2.99151 | 7 | 13 | COG0732 | 3379 | type I restriction modification DNA specificity domain | type I restriction modification DNA specificity domain |
= ogg_mtx[indexin(strvarobs.Strain_ID, getleafnames(strvartree)), ogg_plotting_order.ogg_idx][rectale_idxs, :]
pltmtx = heatmap(log2.(pltmtx .+ 1),
p2 =false,
yticks=(1:nrow(ogg_plotting_order), ogg_plotting_order.ogg_labels),
xticks=:out,
tickdirection=(1:4, string.(2 .^ 1:4)),
cticks=cgrad(:bilbao, rev=true),
c=0Plots.mm,
leftmargin=4Plots.cm,
rightmargin=1.5Plots.cm,
topmargin=false,
grid=:box,
framestyle=true,
xmirror=45,
xrotation
);
plot(p1, p2, layout=@layout([a{.25w} b]), link=:y, size=(1000, 800))
Validation experiment
We noted that groups of strains from donors 22 and 17 had many orthologs related to the flagellum, that were absent from donors 9, 13, and 16. Conversely donors 9, 13, and 16 had orthologs related to viral (bacteriophage) infection. It suggested that in donors 9, 13, and 16 there had been an infection of populations that caused these strains to lose their flagellum and thus motility, likely to become more resistant to this bacteriophage.
We just wanted to test whether these strains where indeed non-motile, to verify that we were detecting accurate and functional differences between these strains. So we grew a sample of these strain in BHIS media, took pictures at the 24 hour mark and a time-series of optical density measurements (OD600).
plot motility assay
= CSV.read(datadir("exp_raw", "OD600_motility", "earlygrowth_OD600.tsv"), DataFrame, delim="\t") |>
earlydf -> stack(df, 2:5) |>
df -> rename(df, :variable => :time, :value => :OD600);
df = earlydf |>
earlydf -> transform(df,
df :ID => (x->join.(first.(split.(x, "."), 2), ".")) => :msk_id,
:ID => (x->last.(split.(x, "."))) => :replicate,
);
= CSV.read(datadir("exp_raw", "OD600_motility", "postvortex_OD600.tsv"), DataFrame, delim="\t") |>
vortexdf -> stack(df, 2:8) |>
df -> rename(df, :variable => :time, :value => :OD600);
df = vortexdf |>
vortexdf -> transform(df,
df :ID => (x->join.(first.(split.(x, "."), 2), ".")) => :msk_id,
:ID => (x->last.(split.(x, "."))) => :replicate,
);= vortexdf |>
vortexpltdf df->groupby(df, [:time, :msk_id]) |>
df->combine(df,
:OD600 => mean,
:OD600 => std,
);
= vcat(earlydf, vortexdf)
fulldf # fulldf = filter(:msk_id => x->startswith(x, r"(9|17|22|neg)"), fulldf)
= fulldf |>
fullpltdf df->groupby(df, [:time, :msk_id]) |>
df->combine(df,
:OD600 => mean,
:OD600 => std,
);.= replace(fullpltdf.OD600_std, NaN => 0.); # set correct std for single negative control
fullpltdf.OD600_std
=Dict(
cmap"neg" => :grey,
"9.13" => "#A97C50",
"9.15" => "#A97C50",
"16.22" => "#FBB040",
"17.70" => "#6682C1",
"17.78" => "#6682C1",
"22.92" => "#2B3990"
)
plot(
="OD600",
ylabel=:outerright,
legend=45,
xrotation=5Plots.Measures.mm,
margin=(700,300),
size=10Plots.mm,
bottommargins
)@df fulldf scatter!(
:time, :OD600,
=:ID,
group=[cmap[id] for id in :msk_id],
c="", markersize=3, markerstrokewidth=0,
label
)vline!([4], c=:black, lw=.5, linestyle=:dash)
@df fullpltdf plot!(
:time, :OD600_mean,
=:msk_id, ribbon=:OD600_std,
group=[cmap[id] for id in :msk_id],
c=.25,
fillalpha
)annotate!(4.8, .27, text("vortex"))
savefig(joinpath(pdir, "motility_experiment_plot.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/strain_variation_tree_and_tests/motility_experiment_plot.pdf"
= load(datadir("exp_raw", "OD600_motility", "swimmers_after24h.jpg"))
img = floor.(Int, quantile(1:size(img,1), [0.2, 0.77])) |> x->x[1]:x[2]
ysize = floor.(Int, quantile(1:size(img,2), [0.15, 0.73])) |> x->x[1]:x[2]
xsize img[ysize, xsize]
= load(datadir("exp_raw", "OD600_motility", "nonswimmers_after24h.jpg"))
img = floor.(Int, quantile(1:size(img,1), [0.2, 0.77])) |> x->x[1]:x[2]
ysize = floor.(Int, quantile(1:size(img,2), [0.2, 0.71])) |> x->x[1]:x[2]
xsize img[ysize, xsize]
How conserved are these features?
The flagellum is generally highly conserved, it is remarkable that phage seemed to be involved with deleting such seemingly important machinery from the organism.
Thus, we examined the conservation pattern of the 12 annotated gene groups that were absent in E. rectale strains isolated from donors 9, 13, 16 but present in strains isolated from donors 17 and 22 across the entire Spectral Tree
= readh5ad(datadir("exp_raw", "UP7047", "2020_02_UP7047.h5ad"))
uniprot = uniprot.X[:,:];
upmtx = readnw(readline(projectdir("_research", "UP7047_neighborjoined_spitree", "2020_02_UP7047-supporttree_50pct.nw")));
uptree = getleaves(uptree);
leafnodes = indexin(getleafnames(uptree), uniprot.obs_names); treeorder
= "rectale"
species_name = uniprot.obs[contains.(uniprot.obs.Species, species_name), [:Kingdom, :Phylum, :Class, :Order, :Family, :Genus, :Species]]
foundtaxa @info "string: '$(species_name)' finds $(nrow(foundtaxa)) taxa:\n$(foundtaxa)"
┌ Info: string: 'rectale' finds 1 taxa:
│ 1×7 DataFrame
│ Row │ Kingdom Phylum Class Order Family Genus Species
│ │ String String String String String String String
│ ─────┼─────────────────────────────────────────────────────────────────────────────────────────────────
│ 1 │ Bacteria Firmicutes Clostridia Clostridiales Lachnospiraceae [Eubacterium] rectale
└ @ Main /Users/bend/projects/Doran_etal_2023/notebooks/jl_notebook_cell_df34fa98e69747e1a8f8a730347b8e2f_Y101sZmlsZQ==.jl:3
= subset(ogg_plotting_order,
test_oggs :description => ByRow(!=("")), # drop unannotated oggs
:description => ByRow(!=("domain, Protein")), # drop unannotated oggs
:log2FC => ByRow(<(0)), # drop oggs increased in the presence of phage
).ogg_nameindexin(sort(test_oggs), uniprot.var_names), [:og, :description]] uniprot.var[
Row | og | description |
---|---|---|
String | String | |
1 | COG0835 | chemotaxis |
2 | COG0842 | Transport permease protein |
3 | COG1344 | bacterial-type flagellum-dependent cell motility |
4 | COG1345 | Required for morphogenesis and for the elongation of the flagellar filament by facilitating polymerization of the flagellin monomers at the tip of growing filament. Forms a capping structure, which prevents flagellin subunits (transported through the central channel of the flagellum) from leaking out without polymerization at the distal end |
5 | COG1442 | lipopolysaccharide 3-alpha-galactosyltransferase activity |
6 | COG1516 | flagellar protein fliS |
7 | COG1776 | chemotaxis |
8 | COG2186 | Transcriptional regulator |
9 | COG2198 | Histidine kinase |
10 | COG2944 | sequence-specific DNA binding |
11 | COG3547 | Transposase (IS116 IS110 IS902 family) |
12 | COG4495 | Domain of unknown function (DUF4176) |
13 | COG4786 | Flagellar basal body rod |
14 | COG5001 | cyclic-guanylate-specific phosphodiesterase activity |
15 | COG5279 | protein involved in cytokinesis, contains TGc (transglutaminase protease-like) domain |
16 | COG5444 | nuclease activity |
# oggs from E. rectale test that are in Uniprot
= indexin(test_oggs, uniprot.var_names);
chosen_oggs_idx # make presence/absence matix `subset_ogg_mtx`
= upmtx[:, chosen_oggs_idx] .> 0;
subset_ogg_mtx = uniprot.var.og[chosen_oggs_idx]
chosen_oggs @show chosen_oggs;
chosen_oggs = ["COG1345", "COG1344", "COG5444", "COG5001", "COG3547", "COG4495", "COG1442", "COG4786", "COG5279", "COG1776", "COG2944", "COG0842", "COG1516", "COG0835", "COG2186", "COG2198"]
# collect parent nodes back to pseudo-root of uniprot tree
= findall(contains.(uniprot.obs.Species[treeorder], species_name))[1]
species_node_idx = leafnodes[species_node_idx]
speciesnode = network_distance(uptree, speciesnode)
depthofspecies = []
speciesparents = speciesnode
node for i in 1:depthofspecies
= parent(node)
node push!(speciesparents, node)
end
= length.(getleafnames.(speciesparents))
cladesizes @info "# of taxa per clade = $(cladesizes)"
┌ Info: # of taxa per clade = [3, 10, 23, 101, 216, 265, 380, 600, 733, 1022, 1065, 2376, 6709, 7047]
└ @ Main /Users/bend/projects/Doran_etal_2023/notebooks/jl_notebook_cell_df34fa98e69747e1a8f8a730347b8e2f_Y104sZmlsZQ==.jl:12
# only 1 E. rectale in uniprot
sum(==("Agathobacter"), uniprot.obs.Genus)
1
Local subtree of E. rectale in Uniprot tree
= readnw(nwstr(speciesparents[3]))
subset_tree for leaf in getleaves(subset_tree)
setname!(leaf, only(uniprot.obs.Species[uniprot.obs.Proteome_ID .== name(leaf)]) )
NewickTree.end
plot(subset_tree, rightmargin=6Plots.cm, fs=9, size=(500,600))
Larger subtree of E. rectale in Uniprot tree
= readnw(nwstr(speciesparents[4]))
subset_tree for leaf in getleaves(subset_tree)
setname!(leaf, only(uniprot.obs.Species[uniprot.obs.Proteome_ID .== name(leaf)]) )
NewickTree.end
plot(subset_tree, rightmargin=4Plots.cm, fs=5, size=(400,600))
= projectinLSV(bbmtx, upusv); bblsvs
= occursin.("rectale", biobank.obs.Species); rectale_mask
= spectraldistances([upusv.U; bblsvs[rectale_mask, :]], upusv.S, alpha=1.5, q=0.75); joined_dij
= joined_dij[(size(upusv.U,1)+1):end, 1:(end-sum(rectale_mask))]
rectale_dij = first.(sortperm.(eachrow(rectale_dij)), 5); rectale_upnns
hcat(biobank.obs[rectale_mask, [:Strain_ID, :NCBI_Species]],
DataFrame(getindex.(Ref(uniprot.obs.Species), rectale_upnns) |> stack |> permutedims, [:nn_1, :nn_2, :nn_3, :nn_4, :nn_5]),
DataFrame(;
# uniprot_nearest_neighbor_Species=uniprot.obs.Species[rectale_upnns],
# uniprot_nearest_neighbor_Phylum=uniprot.obs.Phylum[rectale_upnns],
) )
Row | Strain_ID | NCBI_Species | nn_1 | nn_2 | nn_3 | nn_4 | nn_5 |
---|---|---|---|---|---|---|---|
String | String | String | String | String | String | String | |
1 | MSK.9.13 | [Eubacterium] rectale | [Eubacterium] rectale | Aeromonas sp. w55 | Photobacterium damselae | Cyanobacteria bacterium | Bacillus sp. UFRGS-B20 |
2 | MSK.13.59 | [Eubacterium] rectale | Photobacterium damselae | Cyanobacteria bacterium | [Eubacterium] rectale | Candidatus Nasuia deltocephalinicola | Aeromonas sp. w55 |
3 | MSK.16.22 | [Eubacterium] rectale | [Eubacterium] rectale | Cyanobacteria bacterium | Candidatus Vidania fulgoroideae | Candidatus Carsonella ruddii | Bacillus sp. UFRGS-B20 |
4 | MSK.9.15 | [Eubacterium] rectale | Aeromonas sp. w55 | Photobacterium damselae | [Eubacterium] rectale | Cyanobacteria bacterium | Candidatus Tremblaya princeps |
5 | MSK.13.48 | [Eubacterium] rectale | Photobacterium damselae | Cyanobacteria bacterium | [Eubacterium] rectale | Candidatus Vidania fulgoroideae | Candidatus Nasuia deltocephalinicola |
6 | MSK.13.50 | [Eubacterium] rectale | Photobacterium damselae | Cyanobacteria bacterium | [Eubacterium] rectale | Candidatus Nasuia deltocephalinicola | Candidatus Vidania fulgoroideae |
7 | MSK.17.42 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Photobacterium damselae | Cyanobacteria bacterium | Opitutae bacterium SCGC AG-212-L18 |
8 | MSK.17.3 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Photobacterium damselae | Cyanobacteria bacterium | Opitutae bacterium SCGC AG-212-L18 |
9 | MSK.17.13 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Photobacterium damselae | Cyanobacteria bacterium | Opitutae bacterium SCGC AG-212-L18 |
10 | MSK.17.19 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Cyanobacteria bacterium | Photobacterium damselae | Opitutae bacterium SCGC AG-212-L18 |
11 | MSK.17.57 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Cyanobacteria bacterium | Photobacterium damselae | Opitutae bacterium SCGC AG-212-L18 |
12 | MSK.22.19 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Cyanobacteria bacterium | Photobacterium damselae | Candidatus Tremblaya princeps |
13 | MSK.17.70 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Photobacterium damselae | Opitutae bacterium SCGC AG-212-L18 | Cyanobacteria bacterium |
14 | MSK.17.78 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Photobacterium damselae | Opitutae bacterium SCGC AG-212-L18 | Cyanobacteria bacterium |
15 | MSK.17.79 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Cyanobacteria bacterium | Photobacterium damselae | Opitutae bacterium SCGC AG-212-L18 |
16 | MSK.22.23 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Cyanobacteria bacterium | Photobacterium damselae | Candidatus Tremblaya princeps |
17 | MSK.22.28 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Cyanobacteria bacterium | Photobacterium damselae | Candidatus Tremblaya princeps |
18 | MSK.22.51 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Cyanobacteria bacterium | Photobacterium damselae | Candidatus Tremblaya princeps |
19 | MSK.16.45 | [Eubacterium] rectale | [Eubacterium] rectale | Cyanobacteria bacterium | Candidatus Vidania fulgoroideae | Candidatus Carsonella ruddii | Bacillus sp. UFRGS-B20 |
20 | MSK.22.92 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Cyanobacteria bacterium | Photobacterium damselae | Candidatus Vidania fulgoroideae |
Conservation of motility expanding out from E. rectale
# for each node find all children and measure percentage of children where
# chosen oggs are present
= getleafnames.(speciesparents);
cladesateachtreelevel = map(cladesateachtreelevel) do clade_ids
frac_present = indexin(clade_ids, uniprot.obs_names)
cladeidx mean(subset_ogg_mtx[cladeidx, :], dims=1) |> vec
end |> stack
# check how many oggs (n=16) are fully conserved across the smallest clade (n=3)
= frac_present[:, 1] .== 1;
islocallypresent @info "# locally present = $(sum(islocallypresent)), # not locally present = $(sum(.!islocallypresent))"
= length(cladesizes); numcols
┌ Info: # locally present = 12, # not locally present = 4
└ @ Main /Users/bend/projects/Doran_etal_2023/notebooks/jl_notebook_cell_df34fa98e69747e1a8f8a730347b8e2f_Y120sZmlsZQ==.jl:11
# plot the locally present oggs up through the tree
plot(
=(1:numcols, string.(cladesizes)),
xticks=45,
xrotation="fractional coverage of taxa",
ylabel="# taxa in clade",
xlabel=5Plots.Measures.mm,
margin=(0,1), widen=true,
ylims=(600,300),
size=0:.25:1,
yticks=:bottomleft,
legend
)hline!([.25, .5, .75], c=:grey, alpha=.5, linestyle=:dash)
violin!(permutedims(1:numcols), frac_present[islocallypresent, :], alpha=.3, c="#6182ce")
dotplot!(permutedims(1:numcols), frac_present[islocallypresent, :], c="#6182ce", alpha=.5, mode=:none, )
scatter!([NaN], label=permutedims(["$(species_name) locally conserved OGG (n=$(sum(islocallypresent)))"]), c=["#6182ce"], alpha=[0.5] )
these results seemed to show that indeed the orthologs absent from phage-assoiated strains in E. rectale are present on over half of the 250 most closely related species. And, even across all bacteria, there at least 3 orthologs that are conserved across half of bacteria.