using DrWatson
@quickactivate projectdir()
using SpectralInference
using NewickTree
using HypothesisTests
using MultipleTesting: adjust, Bonferroni, BenjaminiHochberg
using CSV, DataFrames
using StatsBase
using Muon
using Random: shuffle, seed!
using CategoricalArrays: cut, recode!
using LaTeXStrings
using Images
using StatsPlots
theme(:default, grid=false, label=false, tickdir=:out)
include(srcdir("helpers.jl"))
bbdir = datadir("exp_pro", "BB669") |> mkpath;
pdir = plotsdir("strain_variation_tree_and_tests") |> mkpath;
rdir = projectdir("_research", "strain_variation_wilcoxon_tests") |> mkpath
speciescolordf = CSV.read(datadir("exp_raw", "BB669", "subsettreecolors.csv"), DataFrame)
species_color_dict = Dict(k => v for (k, v) in zip(speciescolordf.species_name, speciescolordf.color));Fig. 5 - Interpreting New Strain Relationships
Fig. S11 - Statistical tests through the spectral tree
Given that we are able to create a spectral tree that respects taxonomic relationships and even captures sub-species groupings that corresponds with host environment, we were interested in understanding what genomic signatures were describing these sub-species groupings. However, because these sub-species groups a specifically defined by these genomic features, assumptions of independence typical to statistical tests are violated. Therefore we use a more stringent test to determine whether a given OGG is differentially abundant by using how likely it is for any of the features we are looking at to be significant before deciding if each individual OGG feature is significant.
Specifically to determine whether a given OGG is differentially abundant in a statistically significant way across two groups (‘group A’ and ‘group B’) that arise from a common node in the Spectral Tree, we first generate a ‘reference p-value’ and a ‘null distribution’. The reference p-value signifies the difference in \(OGG_i\) across the two groups (Wilcoxon test). To generate a null distribution of p-values, we create 100 permutations of the distributions between groups A and B that randomly shuffles entries in the two groups. For all OGGs within the Spectral Tree, we compute a distribution of the most significant p-values across all permutations. The Q-value for a given OGG is computed as the number of p-values in the null distribution that are less than the reference p-value (\(P\)) relative to the total number of p-values in the null distribution (equation).
\[Q = \frac{N_{p_{perm}} < N_{p_{ref}}}{N_{p_{perm}}}\]
Filtering to species that have > 20 strain replicates
uniprot = readh5ad(datadir("exp_raw", "UP7047", "2020_02_UP7047.h5ad"))
biobank = readh5mu(datadir("exp_raw", "BB669", "BB669.h5mu"))┌ Warning: Cannot join columns with the same name because var_names are intersecting.
└ @ Muon /Users/bend/.julia/packages/Muon/UKjAF/src/mudata.jl:367
MuData object 669 ✕ 21475
└ metabolites_foldchange
AnnData object 669 ✕ 50
└ oggs
AnnData object 669 ✕ 11248
└ UPorder_oggs
AnnData object 669 ✕ 10177
upmtx = uniprot.X[:, :]
upusv = svd(upmtx)
bbmtx = biobank["UPorder_oggs"].X[:,:]
size(bbmtx)(669, 10177)
keptspecies = sort(filter(x-> last(x) >= 20, countmap(biobank.obs.Species)), byvalue=true, rev=true)
filter!(!=("unclassified"), keptspecies)
# species => number of strains belonging to that speciesOrderedCollections.OrderedDict{String, Int64} with 12 entries:
"Phocaeicola vulgatus" => 88
"[Ruminococcus] gnavus" => 41
"Bacteroides thetaiotaomicron" => 35
"Anaerostipes hadrus" => 31
"Bacteroides uniformis" => 27
"unclassified" => 26
"Blautia luti" => 24
"Bifidobacterium breve" => 24
"Coprococcus comes" => 23
"Dorea formicigenerans" => 22
"Blautia wexlerae" => 21
"[Eubacterium] rectale" => 20
I had previously calculated which strains belonged to these species when I saved the biobank dataset
strvarmtx = bbmtx[biobank.obs.kept_species.==1,:];
@show size(strvarmtx)
strvarobs = biobank.obs[biobank.obs.kept_species.==1, :]
strvar_obsnames = strvarobs.Strain_ID
# infer hierarchical relationships of just the species with
# greater than 20 strain replicates
uhat = projectinLSV(strvarmtx, upusv)
dij = spectraldistances(uhat, upusv.S; alpha=1.5, q=.75) ./ size(uniprot, 2)
strvartree_hc = UPGMA_tree(dij)
strvartree = readnw(SpectralInference.newickstring(strvartree_hc, strvar_obsnames))
# ladderize!(strvartree)
## write out tree
open(joinpath(bbdir, "strvar-spitree.nw"), "w") do io
writenw(io, strvartree)
endsize(strvarmtx) = (356, 10177)
12058
# plot cladogram
strvartree_hc = UPGMA_tree(dij)
subsettreestring = SpectralInference.newickstring(strvartree_hc, strvarobs.Strain_ID)
subsettree = readnw(subsettreestring);
plot(strvartree_hc,
size=(800, 900),
lw=.5,
yflip=true,
xmirror=true,
xticks=:none,
permute=(:y, :x),
grid=false,
tickdirection=:none,
rightmargin=5.5Plots.cm,
label="",
framestyle=:grid,
)
# plot annotation rectangles
rectangle(w, h, x, y) = Shape(x .+ [0,w,w,0], y .+ [0,0,h,h])
speciesvector = strvarobs.Species[strvartree_hc.order]
breaks = findall(speciesvector[begin:(end-1)] .!= speciesvector[2:end])[Not([10, 11, 12, 13, 14,15])]
edges = [(s, e) for (s,e) in zip(vcat([0],breaks), vcat(breaks, [length(speciesvector)]))];
rects = [rectangle(2,(e-s),0,s+.5) for (s,e) in zip(vcat([0],breaks), vcat(breaks, [length(speciesvector)]))];
rectspeciescolors = permutedims(speciescolordf.color[indexin(speciesvector[first.(edges).+1], speciescolordf.species_name)]);
fancy_treeplot = plot!(permutedims(rects), fill=0.35, lw=0, c=rectspeciescolors, label="")
# species
for (i, (k,v)) in enumerate(zip(speciescolordf.species_name, speciescolordf.color))
yval = median(findall(==(k), speciesvector))
annotate!(-0.1, yval, text(k, 9, :left, v))
end
annotate!(-0.1, 356, text("Blautia luti (2)", 9, :left, species_color_dict["Blautia luti"]))
fancy_treeplotStatistically test clades at each node in tree
# functions to compare differences
function cliffs_d(a, b)
ans = 0.
for i in a, j in b
ans += sign(i - j)
end
ans / (length(a) * length(b))
end
function log2FC(a, b)
mean(log2.(a .+ 1)) - mean(log2.(b .+ 1))
endlog2FC (generic function with 1 method)
# significance tests across full strain variation tree
seed!(123456789)
tree = strvartree
maxtreeheight = last(maximum(getheights(tree)))
leaves = getleaves(tree)
leafnames = name.(leaves);
treeorder = indexin(leafnames, strvar_obsnames);
ogg_mtx = biobank["oggs"].X[:,:][biobank.obs.kept_species.==1,:]
ogg_mask = vec(mapslices(c->std(c) > 0, ogg_mtx, dims=1))
ogg_mtx = ogg_mtx[treeorder, ogg_mask]
@show size(ogg_mtx)
ogg_names = biobank["oggs"].var_names[ogg_mask]
ogg_freqs = vec(mean(ogg_mtx .> 0, dims=1))
taxon_info = strvarobs[treeorder, :]
taxon_info.species_donor = join.(eachrow(strvarobs[treeorder, [:Species, :Donor]]), " ")
isolate_ids = getleafnames(tree)
testresults = DataFrame()
for node in prewalk(tree)
# if group size would be less than 1 skip node
(isleaf(node) || any(isleaf.(node.children))) && continue
# find comparison groups
grp1_idx = indexin(getleafnames(node.children[1]), isolate_ids)
grp2_idx = indexin(getleafnames(node.children[2]), isolate_ids)
pvals = mapslices(ogg_mtx, dims=1) do column
pvalue(MannWhitneyUTest(column[grp1_idx], column[grp2_idx]))
end |> vec
# make null
seed!(1234)
null_qs = map(1:100) do i
n = length(grp1_idx)
m = length(grp2_idx)
perm_idx = shuffle(vcat(grp1_idx, grp2_idx))
qs = mapslices(ogg_mtx, dims=1) do column_perm
q = pvalue(MannWhitneyUTest(column_perm[perm_idx[1:n]], column_perm[perm_idx[n+1:end]]))
end
minimum(qs)
end
qvals = mapslices(ogg_mtx, dims=1) do column
pval = pvalue(MannWhitneyUTest(column[grp1_idx], column[grp2_idx]))
mean(null_qs .< pval)
end |> vec
effects = mapslices(ogg_mtx, dims=1) do column
abs(mean(column[grp1_idx]) - mean(column[grp2_idx]) / std(column))
end |> vec
log_effects = mapslices(ogg_mtx, dims=1) do column
logcol = log2.(column.+1)
abs(mean(logcol[grp1_idx]) - mean(logcol[grp2_idx]) / std(logcol))
end |> vec
log2FCs = mapslices(ogg_mtx, dims=1) do column
log2FC(column[grp1_idx], column[grp2_idx])
end |> vec
cliffs_ds = mapslices(ogg_mtx, dims=1) do column
cliffs_d(column[grp1_idx], column[grp2_idx])
end |> vec
grp1_prp_exp = mapslices(ogg_mtx, dims=1) do column
mean(column[grp1_idx] .> 0)
end |> vec
grp2_prp_exp = mapslices(ogg_mtx, dims=1) do column
mean(column[grp2_idx] .> 0)
end |> vec
testresults = vcat(testresults,
DataFrame(
:nodeids => id(node),
:nodeheight => NewickTree.height(node),
:nodedepth => maxtreeheight - NewickTree.height(node),
:grp1_N => length(grp1_idx),
:grp2_N => length(grp2_idx),
:grp1_phylum_mode => mode(taxon_info.Phylum[grp1_idx]),
:grp2_phylum_mode => mode(taxon_info.Phylum[grp2_idx]),
:grp1_species_mode => mode(taxon_info.Species[grp1_idx]),
:grp2_species_mode => mode(taxon_info.Species[grp2_idx]),
:grp1_species_donor_mode => mode(taxon_info.species_donor[grp1_idx]),
:grp2_species_donor_mode => mode(taxon_info.species_donor[grp2_idx]),
:ogg_name => ogg_names,
:ogg_idx => collect(axes(ogg_mtx, 2)),
:ogg_freqs => ogg_freqs,
:grp1_prp_exp => grp1_prp_exp,
:grp2_prp_exp => grp2_prp_exp,
:effectsize => effects,
:logeffectsize => log_effects,
:log2FC => log2FCs,
:cliffs_d => cliffs_ds,
:pvals => pvals,
:qvals => qvals,
)
)
end
testresults[!, "pval_BH"] .= adjust(testresults.pvals, BenjaminiHochberg());
testresults[!, "pval_Bon"] .= adjust(testresults.pvals, Bonferroni());
testresults[!, "qval_BH"] .= adjust(testresults.qvals, BenjaminiHochberg());
tree_discretization = cut(testresults.nodedepth, [0, 2, 4, 5])
recode!(tree_discretization,
"[4, 5)"=>"phylum level",
"[2, 4)"=>"species level",
"[0, 2)"=>"strain level")
testresults[!, "tree_level"] = tree_discretization
testresults;size(ogg_mtx) = (356, 5449)
countmap(testresults.tree_level)Dict{CategoricalArrays.CategoricalValue{String, UInt32}, Int64} with 3 entries:
"species level" => 59939
"strain level" => 457716
"phylum level" => 5449
CSV.write(joinpath(rdir, "full_ogg_wilcoxon_testresults_on_big10species.csv"), testresults)
testresults |>
df->filter(:pval_BH => <(0.05), df) |>
df->CSV.write(joinpath(rdir, "significant_BH_ogg_wilcoxon_testresults_on_big10species.csv"), df)
testresults |>
df->filter(:pval_Bon => <(0.05), df) |>
df->CSV.write(joinpath(rdir, "significant_Bon_ogg_wilcoxon_testresults_on_big10species.csv"), df)
testresults |>
df->filter(:qval_BH => <(0.05), df) |>
df->CSV.write(joinpath(rdir, "significant_Qbh_ogg_wilcoxon_testresults_on_big10species.csv"), df)"/Users/bend/projects/Doran_etal_2023/_research/strain_variation_wilcoxon_tests/significant_Qbh_ogg_wilcoxon_testresults_on_big10species.csv"
Filtering to most significant results
testresults = CSV.read(joinpath(rdir, "full_ogg_wilcoxon_testresults_on_big10species.csv"), DataFrame);sigtestresults = filter(:qval_BH => <(0.05), testresults) |>
df -> filter(:logeffectsize => >(1), df) |>
df -> filter(:effectsize => >(1), df) |>
df -> filter(:log2FC => x->abs.(x) .> 1 , df);@show sigtestresults.ogg_name |> unique |> length
@show sigtestresults.nodeids |> unique |> length
@show sum(sigtestresults.tree_level .== "phylum level")
@show sum(sigtestresults.tree_level .== "species level")
@show sum(sigtestresults.tree_level .== "strain level");(sigtestresults.ogg_name |> unique) |> length = 1530
(sigtestresults.nodeids |> unique) |> length = 66
sum(sigtestresults.tree_level .== "phylum level") = 415
sum(sigtestresults.tree_level .== "species level") = 1117
sum(sigtestresults.tree_level .== "strain level") = 1070
# how many OGGs are still duplicated?
sigtestresults |>
df -> groupby(df, :ogg_name) |>
df -> subset(df, :log2FC => x -> abs.(x) .== maximum(abs.(x))) |>
df -> countmap(df.ogg_name) |>
cm -> sum(values(cm) .> 1)27
# for these remaining OGGs select the shallowest significant example
uniqueOGGresults = sigtestresults |>
df -> groupby(df, :ogg_name) |>
df -> subset(df, :log2FC => x -> abs.(x) .== maximum(abs.(x))) |>
df -> groupby(df, :ogg_name) |>
df -> subset(df, :nodeheight => x -> x .== minimum(x));
@show nrow(uniqueOGGresults) == length(unique(uniqueOGGresults.ogg_name))
@show sum(uniqueOGGresults.tree_level .== "phylum level")
@show sum(uniqueOGGresults.tree_level .== "species level")
@show sum(uniqueOGGresults.tree_level .== "strain level");nrow(uniqueOGGresults) == length(unique(uniqueOGGresults.ogg_name)) = true
sum(uniqueOGGresults.tree_level .== "phylum level") = 273
sum(uniqueOGGresults.tree_level .== "species level") = 676
sum(uniqueOGGresults.tree_level .== "strain level") = 581
CSV.write(joinpath(rdir, "unique_sig_ogg_wilcoxon_testresults_on_big10species.csv"), uniqueOGGresults)"/Users/bend/projects/Doran_etal_2023/_research/strain_variation_wilcoxon_tests/unique_sig_ogg_wilcoxon_testresults_on_big10species.csv"
Phylum level significant orthologs
ogg_plotting_order = uniqueOGGresults |>
df->subset(df, :tree_level => x-> x.==("phylum level"), :grp1_N => x-> x.>=3, :grp2_N => x-> x.>=3) |>
df->subset(df, :nodeids => (x->get.(Ref(countmap(x)), x, 0) .> 10)) |>
df->sort(df, [:nodeids, :log2FC]);
hmapplot = heatmap(log2.(ogg_mtx[:, ogg_plotting_order.ogg_idx] .+ 1),
# hmapplot = heatmap(log2.(ogg_mtx[:, uniqueOGGresults.ogg_idx]),
title="Phylum level tests; " * L"Q_{BH} < 0.05, |\log_2 FC| > 1",
yticks=false,
# xticks=(1:nrow(uniqueOGGresults), uniqueOGGresults.ogg_name),
xticks=[1, findall(diff(ogg_plotting_order.nodeids) .!= 0)..., nrow(ogg_plotting_order)],
xrotation=45,
ylims=(.5,356.5),
framestyle=:grid,
cticks=(1:4, string.(2 .^ 1:4)),
c=cgrad(:bilbao, rev=true),
leftmargin=0Plots.mm,
);
rectangle(w, h, x, y) = Shape(x .+ [0,w,w,0], y .+ [0,0,h,h])
leafnames = getleafnames(strvartree)
oggrects = map(unique(ogg_plotting_order.nodeids)) do nid
node = filter(x-> id(x)==(nid), prewalk(strvartree))[1]
idx_taxa = indexin(getleafnames(node), leafnames)
noderesults = filter(:nodeids => ==(nid), ogg_plotting_order)
idx_oggs = indexin(noderesults.ogg_name, ogg_plotting_order.ogg_name)
x = minimum(idx_oggs)
y = minimum(idx_taxa)
width = maximum(idx_oggs) - minimum(idx_oggs)
height = maximum(idx_taxa) - minimum(idx_taxa)
rectangle(width, height, x, y)
end
plot!(permutedims(oggrects), fill=0.0, lw=1.5, linecolor="red", label="")
layout = @layout [a{.2w} b]
plot(plot(fancy_treeplot, title="SPI tree"), hmapplot; layout, xrotation=45, size=(1000,600), link=:y, legend=:none, margin=3Plots.Measures.mm)savefig(joinpath(pdir, "significantOGGS_pernode_phylumlevel_treeandheatmap_coloredboxes.pdf"))"/Users/bend/projects/Doran_etal_2023/plots/strain_variation_tree_and_tests/significantOGGS_pernode_phylumlevel_treeandheatmap_coloredboxes.pdf"
Species level significant orthologs
ogg_plotting_order = uniqueOGGresults |>
df->subset(df, :tree_level => x-> x.==("species level"), :grp1_N => x-> x.>=3, :grp2_N => x-> x.>=3) |>
df->subset(df, :nodeids => (x->get.(Ref(countmap(x)), x, 0) .> 10)) |>
df->sort(df, [:nodeids, :log2FC]);
hmapplot = heatmap(log2.(ogg_mtx[:, ogg_plotting_order.ogg_idx] .+ 1),
# hmapplot = heatmap(log2.(ogg_mtx[:, uniqueOGGresults.ogg_idx]),
title="Species level tests; " * L"Q_{BH} < 0.05, |\log_2 FC| > 1",
yticks=false,
# xticks=(1:nrow(uniqueOGGresults), uniqueOGGresults.ogg_name),
xticks=[1, findall(diff(ogg_plotting_order.nodeids) .!= 0)..., nrow(ogg_plotting_order)],
xrotation=45,
ylims=(.5,356.5),
framestyle=:grid,
cticks=(1:4, string.(2 .^ 1:4)),
c=cgrad(:bilbao, rev=true),
leftmargin=0Plots.mm,
);
rectangle(w, h, x, y) = Shape(x .+ [0,w,w,0], y .+ [0,0,h,h])
leafnames = getleafnames(strvartree)
oggrects = map(unique(ogg_plotting_order.nodeids)) do nid
node = filter(x-> id(x)==(nid), prewalk(strvartree))[1]
idx_taxa = indexin(getleafnames(node), leafnames)
noderesults = filter(:nodeids => ==(nid), ogg_plotting_order)
idx_oggs = indexin(noderesults.ogg_name, ogg_plotting_order.ogg_name)
x = minimum(idx_oggs)
y = minimum(idx_taxa)
width = maximum(idx_oggs) - minimum(idx_oggs)
height = maximum(idx_taxa) - minimum(idx_taxa)
rectangle(width, height, x, y)
end
plot!(permutedims(oggrects), fill=0.0, lw=1.5, linecolor="aqua", label="")
layout = @layout [a{.2w} b]
plot(plot(fancy_treeplot, title="SPI tree"), hmapplot; layout, xrotation=45, size=(1000,600), link=:y, legend=:none, margin=3Plots.Measures.mm)savefig(joinpath(pdir, "significantOGGS_pernode_specieslevel_treeandheatmap_coloredboxes.pdf"))"/Users/bend/projects/Doran_etal_2023/plots/strain_variation_tree_and_tests/significantOGGS_pernode_specieslevel_treeandheatmap_coloredboxes.pdf"
Strain level significant orthologs
ogg_plotting_order = uniqueOGGresults |>
df->subset(df, :tree_level => x-> x.==("strain level"), :grp1_N => x-> x.>=3, :grp2_N => x-> x.>=3) |>
df->subset(df, :nodeids => (x->get.(Ref(countmap(x)), x, 0) .> 10)) |>
df->sort(df, [:nodeids, :log2FC]);
hmapplot = heatmap(log2.(ogg_mtx[:, ogg_plotting_order.ogg_idx] .+ 1),
# hmapplot = heatmap(log2.(ogg_mtx[:, uniqueOGGresults.ogg_idx]),
title="Strain level tests; " * L"Q_{BH} < 0.05, |\log_2 FC| > 1",
yticks=false,
# xticks=(1:nrow(uniqueOGGresults), uniqueOGGresults.ogg_name),
xticks=[1, findall(diff(ogg_plotting_order.nodeids) .!= 0)..., nrow(ogg_plotting_order)],
xrotation=45,
ylims=(.5,356.5),
framestyle=:grid,
cticks=(1:4, string.(2 .^ 1:4)),
c=cgrad(:bilbao, rev=true),
leftmargin=0Plots.mm,
);
rectangle(w, h, x, y) = Shape(x .+ [0,w,w,0], y .+ [0,0,h,h])
leafnames = getleafnames(strvartree)
oggrects = map(unique(ogg_plotting_order.nodeids)) do nid
node = filter(x-> id(x)==(nid), prewalk(strvartree))[1]
idx_taxa = indexin(getleafnames(node), leafnames)
noderesults = filter(:nodeids => ==(nid), ogg_plotting_order)
idx_oggs = indexin(noderesults.ogg_name, ogg_plotting_order.ogg_name)
x = minimum(idx_oggs)
y = minimum(idx_taxa)
width = maximum(idx_oggs) - minimum(idx_oggs)
height = maximum(idx_taxa) - minimum(idx_taxa)
rectangle(width, height, x, y)
end
plot!(permutedims(oggrects), fill=0.0, lw=1.5, linecolor="lightgreen", label="")
# vline!(findall(diff(ogg_plotting_order.nodeids) .> 0) .- .5, lw=1, c=:black, label="")
layout = @layout [a{.2w} b]
plot(plot(fancy_treeplot, title="SPI tree"), hmapplot; layout, xrotation=45, size=(1000,600), link=:y, legend=:none, margin=3Plots.Measures.mm)savefig(joinpath(pdir, "significantOGGS_pernode_strainlevel_treeandheatmap_coloredboxes.pdf"))"/Users/bend/projects/Doran_etal_2023/plots/strain_variation_tree_and_tests/significantOGGS_pernode_strainlevel_treeandheatmap_coloredboxes.pdf"
zoom-ins of strain level significant orthologs
### Strain level zoomin P. vulgatus
ogg_plotting_order.nodeids |> unique
ogg_plotting_order = uniqueOGGresults |>
df->subset(df, :tree_level => x-> x.==("strain level"), :grp1_N => x-> x.>=3, :grp2_N => x-> x.>=3) |>
df->subset(df, :nodeids => (x->get.(Ref(countmap(x)), x, 0) .> 10)) |>
df->sort(df, [:nodeids, :log2FC]) |>
df->filter(:nodeids => ==(136), df)
nid = 136
node = filter(x-> id(x)==(nid), prewalk(strvartree))[1]
idx_taxa = indexin(getleafnames(node), leafnames)
hmapplot = heatmap(log2.(ogg_mtx[idx_taxa, ogg_plotting_order.ogg_idx] .+ 1),
# hmapplot = heatmap(log2.(ogg_mtx[:, uniqueOGGresults.ogg_idx]),
title="P. vulgatus (zoomin); " * L"Q_{BH} < 0.05, |\log_2 FC| > 1",
yticks=false,
xticks=(1:nrow(ogg_plotting_order), ogg_plotting_order.ogg_name),
# xtickfontsize=1,
xrotation=90,
# ylims=(.5,356.5),
tickdirection=:out,
framestyle=:grid,
cticks=(1:4, string.(2 .^ 1:4)),
clims=(0, 6.1),
c=cgrad(:bilbao, rev=true),
size=(550,550),
leftmargin=0Plots.mm,
grid=false
)### Strain level zoomin C. comes
ogg_plotting_order.nodeids |> unique
ogg_plotting_order = uniqueOGGresults |>
df->subset(df, :tree_level => x-> x.==("strain level"), :grp1_N => x-> x.>=3, :grp2_N => x-> x.>=3) |>
df->subset(df, :nodeids => (x->get.(Ref(countmap(x)), x, 0) .> 10)) |>
df->sort(df, [:nodeids, :log2FC]) |>
df->filter(:nodeids => ==(523), df)
nid = 523
node = filter(x-> id(x)==(nid), prewalk(strvartree))[1]
idx_taxa = indexin(getleafnames(node), leafnames)
hmapplot = heatmap(log2.(ogg_mtx[idx_taxa, ogg_plotting_order.ogg_idx] .+ 1),
title="C. comes (zoomin); " * L"Q_{BH} < 0.05, |\log_2 FC| > 1",
yticks=false,
xticks=(1:nrow(ogg_plotting_order), ogg_plotting_order.ogg_name),
xrotation=90,
tickdirection=:out,
framestyle=:grid,
cticks=(1:4, string.(2 .^ 1:4)),
clims=(0, 6.1),
c=cgrad(:bilbao, rev=true),
size=(550,550),
leftmargin=0Plots.mm,
grid=false
)Main - Explore differences within E. rectale
targetid = "MSK.16.22"
strvar_leaves = getleaves(strvartree)
basenode = strvar_leaves[findfirst(n->name(n) == targetid, strvar_leaves)]
subtree = readnw(nwstr(nthparent(basenode, 3)))
rectale_idxs = indexin(getleafnames(subtree),strvarobs.Strain_ID)
tiplabels = strvarobs.Donor[rectale_idxs]
idmapping = Dict(k=>v for (k,v) in zip(getleafnames(subtree), tiplabels))
rename_treeleaves!(subtree, idmapping)
rectale_donor_dict = Dict(
"MSK.22"=>"#2B3990",
"MSK.17"=>"#6682C1",
"MSK.16"=>"#FBB040",
"MSK.13"=>"#c9a964",
"MSK.9"=>"#A97C50",
)
rectale_linecolors = map(prewalk(subtree)) do node; rectale_donor_dict
if length(unique(getleafnames(node))) == 1
get(rectale_donor_dict, mode(getleafnames(node)), :grey)
else
:grey
end
end |> x->x[2:end] |> permutedims
plot(subtree,
c=rectale_linecolors,
# alpha=.5,
lw=3, fs=2,
size=(400,600),
rightmargin=2Plots.cm,
leftmargin=10Plots.mm,
ylabel="E. rectale",
framestyle=:grid,
ticks=false,
)
# donor annotations
for (i, (k,v)) in enumerate(rectale_donor_dict)
yvals = findall(==(k), getleafnames(subtree))
yval = median(yvals)
annotate!(1.8, yval, text("$k\n(n=$(length(yvals)))", 9, :center, v))
end
p1 = plot!()ogg_plotting_order = uniqueOGGresults |>
# select test at root of rectale sub tree
df->subset(df, :nodeids => x-> x.==(id(nthparent(basenode, 3)))) |>
df->select(df, [:nodeids, :log2FC, :grp1_N, :grp2_N, :ogg_name, :ogg_idx]) |>
# add in descriptions of each ogg
df->leftjoin(df, uniprot.var[:, [:og, :description]], on = :ogg_name => :og) |>
# sort oggs based on abundence per group
df->sort(df, [:nodeids, :log2FC]) |>
# get shorter descriptions for lables
df->coalesce.(df, "") |>
df->transform(df,
:description => ByRow(x->first(replace.(x, r"\(.*\)"=>""), 75)) => :ogg_labels
);# more abundant in MSK.17 and MSK.22
subset(ogg_plotting_order,
:log2FC => ByRow(<(0)),
:description => ByRow(!=(""))
)| Row | nodeids | log2FC | grp1_N | grp2_N | ogg_name | ogg_idx | description | ogg_labels |
|---|---|---|---|---|---|---|---|---|
| Int64 | Float64 | Int64 | Int64 | String15 | Int64 | String | String | |
| 1 | 433 | -2.52345 | 7 | 13 | COG1345 | 3757 | Required for morphogenesis and for the elongation of the flagellar filament by facilitating polymerization of the flagellin monomers at the tip of growing filament. Forms a capping structure, which prevents flagellin subunits (transported through the central channel of the flagellum) from leaking out without polymerization at the distal end | Required for morphogenesis and for the elongation of the flagellar filament |
| 2 | 433 | -2.26897 | 7 | 13 | COG1344 | 3756 | bacterial-type flagellum-dependent cell motility | bacterial-type flagellum-dependent cell motility |
| 3 | 433 | -2.03848 | 7 | 13 | COG5444 | 5401 | nuclease activity | nuclease activity |
| 4 | 433 | -2.03047 | 7 | 13 | COG5001 | 5338 | cyclic-guanylate-specific phosphodiesterase activity | cyclic-guanylate-specific phosphodiesterase activity |
| 5 | 433 | -2.0 | 7 | 13 | COG3210 | 4625 | domain, Protein | domain, Protein |
| 6 | 433 | -1.94674 | 7 | 13 | COG3547 | 4769 | Transposase (IS116 IS110 IS902 family) | Transposase |
| 7 | 433 | -1.8723 | 7 | 13 | COG4495 | 5124 | Domain of unknown function (DUF4176) | Domain of unknown function |
| 8 | 433 | -1.74526 | 7 | 13 | COG1442 | 3821 | lipopolysaccharide 3-alpha-galactosyltransferase activity | lipopolysaccharide 3-alpha-galactosyltransferase activity |
| 9 | 433 | -1.58496 | 7 | 13 | COG4786 | 5238 | Flagellar basal body rod | Flagellar basal body rod |
| 10 | 433 | -1.50843 | 7 | 13 | COG5279 | 5367 | protein involved in cytokinesis, contains TGc (transglutaminase protease-like) domain | protein involved in cytokinesis, contains TGc domain |
| 11 | 433 | -1.38462 | 7 | 13 | COG1776 | 4031 | chemotaxis | chemotaxis |
| 12 | 433 | -1.35998 | 7 | 13 | COG2944 | 4514 | sequence-specific DNA binding | sequence-specific DNA binding |
| 13 | 433 | -1.29925 | 7 | 13 | COG0842 | 3475 | Transport permease protein | Transport permease protein |
| 14 | 433 | -1.29925 | 7 | 13 | COG1516 | 3869 | flagellar protein fliS | flagellar protein fliS |
| 15 | 433 | -1.22499 | 7 | 13 | COG0835 | 3469 | chemotaxis | chemotaxis |
| 16 | 433 | -1.22499 | 7 | 13 | COG2186 | 4263 | Transcriptional regulator | Transcriptional regulator |
| 17 | 433 | -1.22499 | 7 | 13 | COG2198 | 4271 | Histidine kinase | Histidine kinase |
# more abundant in MSK.9, MSK.13, and MSK.16
subset(ogg_plotting_order,
:log2FC => ByRow(>(0)),
:description => ByRow(!=(""))
)| Row | nodeids | log2FC | grp1_N | grp2_N | ogg_name | ogg_idx | description | ogg_labels |
|---|---|---|---|---|---|---|---|---|
| Int64 | Float64 | Int64 | Int64 | String15 | Int64 | String | String | |
| 1 | 433 | 1.16713 | 7 | 13 | COG5015 | 5347 | Pyridoxamine 5'-phosphate oxidase | Pyridoxamine 5'-phosphate oxidase |
| 2 | 433 | 1.32193 | 7 | 13 | COG3534 | 4763 | alpha-L-arabinofuranosidase | alpha-L-arabinofuranosidase |
| 3 | 433 | 1.64425 | 7 | 13 | COG3378 | 4691 | Phage plasmid primase P4 family | Phage plasmid primase P4 family |
| 4 | 433 | 1.92997 | 7 | 13 | COG0286 | 2997 | site-specific DNA-methyltransferase (adenine-specific) activity | site-specific DNA-methyltransferase activity |
| 5 | 433 | 2.99151 | 7 | 13 | COG0732 | 3379 | type I restriction modification DNA specificity domain | type I restriction modification DNA specificity domain |
pltmtx = ogg_mtx[indexin(strvarobs.Strain_ID, getleafnames(strvartree)), ogg_plotting_order.ogg_idx][rectale_idxs, :]
p2 = heatmap(log2.(pltmtx .+ 1),
yticks=false,
xticks=(1:nrow(ogg_plotting_order), ogg_plotting_order.ogg_labels),
tickdirection=:out,
cticks=(1:4, string.(2 .^ 1:4)),
c=cgrad(:bilbao, rev=true),
leftmargin=0Plots.mm,
rightmargin=4Plots.cm,
topmargin=1.5Plots.cm,
grid=false,
framestyle=:box,
xmirror=true,
xrotation=45,
);
plot(p1, p2, layout=@layout([a{.25w} b]), link=:y, size=(1000, 800))Validation experiment
We noted that groups of strains from donors 22 and 17 had many orthologs related to the flagellum, that were absent from donors 9, 13, and 16. Conversely donors 9, 13, and 16 had orthologs related to viral (bacteriophage) infection. It suggested that in donors 9, 13, and 16 there had been an infection of populations that caused these strains to lose their flagellum and thus motility, likely to become more resistant to this bacteriophage.
We just wanted to test whether these strains where indeed non-motile, to verify that we were detecting accurate and functional differences between these strains. So we grew a sample of these strain in BHIS media, took pictures at the 24 hour mark and a time-series of optical density measurements (OD600).
plot motility assay
earlydf = CSV.read(datadir("exp_raw", "OD600_motility", "earlygrowth_OD600.tsv"), DataFrame, delim="\t") |>
df -> stack(df, 2:5) |>
df -> rename(df, :variable => :time, :value => :OD600);
earlydf = earlydf |>
df -> transform(df,
:ID => (x->join.(first.(split.(x, "."), 2), ".")) => :msk_id,
:ID => (x->last.(split.(x, "."))) => :replicate,
);
vortexdf = CSV.read(datadir("exp_raw", "OD600_motility", "postvortex_OD600.tsv"), DataFrame, delim="\t") |>
df -> stack(df, 2:8) |>
df -> rename(df, :variable => :time, :value => :OD600);
vortexdf = vortexdf |>
df -> transform(df,
:ID => (x->join.(first.(split.(x, "."), 2), ".")) => :msk_id,
:ID => (x->last.(split.(x, "."))) => :replicate,
);
vortexpltdf = vortexdf |>
df->groupby(df, [:time, :msk_id]) |>
df->combine(df,
:OD600 => mean,
:OD600 => std,
);
fulldf = vcat(earlydf, vortexdf)
# fulldf = filter(:msk_id => x->startswith(x, r"(9|17|22|neg)"), fulldf)
fullpltdf = fulldf |>
df->groupby(df, [:time, :msk_id]) |>
df->combine(df,
:OD600 => mean,
:OD600 => std,
);
fullpltdf.OD600_std .= replace(fullpltdf.OD600_std, NaN => 0.); # set correct std for single negative control
cmap=Dict(
"neg" => :grey,
"9.13" => "#A97C50",
"9.15" => "#A97C50",
"16.22" => "#FBB040",
"17.70" => "#6682C1",
"17.78" => "#6682C1",
"22.92" => "#2B3990"
)
plot(
ylabel="OD600",
legend=:outerright,
xrotation=45,
margin=5Plots.Measures.mm,
size=(700,300),
bottommargins=10Plots.mm,
)
@df fulldf scatter!(
:time, :OD600,
group=:ID,
c=[cmap[id] for id in :msk_id],
label="", markersize=3, markerstrokewidth=0,
)
vline!([4], c=:black, lw=.5, linestyle=:dash)
@df fullpltdf plot!(
:time, :OD600_mean,
group=:msk_id, ribbon=:OD600_std,
c=[cmap[id] for id in :msk_id],
fillalpha=.25,
)
annotate!(4.8, .27, text("vortex"))savefig(joinpath(pdir, "motility_experiment_plot.pdf"))"/Users/bend/projects/Doran_etal_2023/plots/strain_variation_tree_and_tests/motility_experiment_plot.pdf"
img = load(datadir("exp_raw", "OD600_motility", "swimmers_after24h.jpg"))
ysize = floor.(Int, quantile(1:size(img,1), [0.2, 0.77])) |> x->x[1]:x[2]
xsize = floor.(Int, quantile(1:size(img,2), [0.15, 0.73])) |> x->x[1]:x[2]
img[ysize, xsize]img = load(datadir("exp_raw", "OD600_motility", "nonswimmers_after24h.jpg"))
ysize = floor.(Int, quantile(1:size(img,1), [0.2, 0.77])) |> x->x[1]:x[2]
xsize = floor.(Int, quantile(1:size(img,2), [0.2, 0.71])) |> x->x[1]:x[2]
img[ysize, xsize]How conserved are these features?
The flagellum is generally highly conserved, it is remarkable that phage seemed to be involved with deleting such seemingly important machinery from the organism.
Thus, we examined the conservation pattern of the 12 annotated gene groups that were absent in E. rectale strains isolated from donors 9, 13, 16 but present in strains isolated from donors 17 and 22 across the entire Spectral Tree
uniprot = readh5ad(datadir("exp_raw", "UP7047", "2020_02_UP7047.h5ad"))
upmtx = uniprot.X[:,:];
uptree = readnw(readline(projectdir("_research", "UP7047_neighborjoined_spitree", "2020_02_UP7047-supporttree_50pct.nw")));
leafnodes = getleaves(uptree);
treeorder = indexin(getleafnames(uptree), uniprot.obs_names);species_name = "rectale"
foundtaxa = uniprot.obs[contains.(uniprot.obs.Species, species_name), [:Kingdom, :Phylum, :Class, :Order, :Family, :Genus, :Species]]
@info "string: '$(species_name)' finds $(nrow(foundtaxa)) taxa:\n$(foundtaxa)"┌ Info: string: 'rectale' finds 1 taxa:
│ 1×7 DataFrame
│ Row │ Kingdom Phylum Class Order Family Genus Species
│ │ String String String String String String String
│ ─────┼─────────────────────────────────────────────────────────────────────────────────────────────────
│ 1 │ Bacteria Firmicutes Clostridia Clostridiales Lachnospiraceae [Eubacterium] rectale
└ @ Main /Users/bend/projects/Doran_etal_2023/notebooks/jl_notebook_cell_df34fa98e69747e1a8f8a730347b8e2f_Y101sZmlsZQ==.jl:3
test_oggs = subset(ogg_plotting_order,
:description => ByRow(!=("")), # drop unannotated oggs
:description => ByRow(!=("domain, Protein")), # drop unannotated oggs
:log2FC => ByRow(<(0)), # drop oggs increased in the presence of phage
).ogg_name
uniprot.var[indexin(sort(test_oggs), uniprot.var_names), [:og, :description]]| Row | og | description |
|---|---|---|
| String | String | |
| 1 | COG0835 | chemotaxis |
| 2 | COG0842 | Transport permease protein |
| 3 | COG1344 | bacterial-type flagellum-dependent cell motility |
| 4 | COG1345 | Required for morphogenesis and for the elongation of the flagellar filament by facilitating polymerization of the flagellin monomers at the tip of growing filament. Forms a capping structure, which prevents flagellin subunits (transported through the central channel of the flagellum) from leaking out without polymerization at the distal end |
| 5 | COG1442 | lipopolysaccharide 3-alpha-galactosyltransferase activity |
| 6 | COG1516 | flagellar protein fliS |
| 7 | COG1776 | chemotaxis |
| 8 | COG2186 | Transcriptional regulator |
| 9 | COG2198 | Histidine kinase |
| 10 | COG2944 | sequence-specific DNA binding |
| 11 | COG3547 | Transposase (IS116 IS110 IS902 family) |
| 12 | COG4495 | Domain of unknown function (DUF4176) |
| 13 | COG4786 | Flagellar basal body rod |
| 14 | COG5001 | cyclic-guanylate-specific phosphodiesterase activity |
| 15 | COG5279 | protein involved in cytokinesis, contains TGc (transglutaminase protease-like) domain |
| 16 | COG5444 | nuclease activity |
# oggs from E. rectale test that are in Uniprot
chosen_oggs_idx = indexin(test_oggs, uniprot.var_names);
# make presence/absence matix `subset_ogg_mtx`
subset_ogg_mtx = upmtx[:, chosen_oggs_idx] .> 0;
chosen_oggs = uniprot.var.og[chosen_oggs_idx]
@show chosen_oggs;chosen_oggs = ["COG1345", "COG1344", "COG5444", "COG5001", "COG3547", "COG4495", "COG1442", "COG4786", "COG5279", "COG1776", "COG2944", "COG0842", "COG1516", "COG0835", "COG2186", "COG2198"]
# collect parent nodes back to pseudo-root of uniprot tree
species_node_idx = findall(contains.(uniprot.obs.Species[treeorder], species_name))[1]
speciesnode = leafnodes[species_node_idx]
depthofspecies = network_distance(uptree, speciesnode)
speciesparents = []
node = speciesnode
for i in 1:depthofspecies
node = parent(node)
push!(speciesparents, node)
end
cladesizes = length.(getleafnames.(speciesparents))
@info "# of taxa per clade = $(cladesizes)"┌ Info: # of taxa per clade = [3, 10, 23, 101, 216, 265, 380, 600, 733, 1022, 1065, 2376, 6709, 7047]
└ @ Main /Users/bend/projects/Doran_etal_2023/notebooks/jl_notebook_cell_df34fa98e69747e1a8f8a730347b8e2f_Y104sZmlsZQ==.jl:12
# only 1 E. rectale in uniprot
sum(==("Agathobacter"), uniprot.obs.Genus)1
Local subtree of E. rectale in Uniprot tree
subset_tree = readnw(nwstr(speciesparents[3]))
for leaf in getleaves(subset_tree)
NewickTree.setname!(leaf, only(uniprot.obs.Species[uniprot.obs.Proteome_ID .== name(leaf)]) )
end
plot(subset_tree, rightmargin=6Plots.cm, fs=9, size=(500,600))Larger subtree of E. rectale in Uniprot tree
subset_tree = readnw(nwstr(speciesparents[4]))
for leaf in getleaves(subset_tree)
NewickTree.setname!(leaf, only(uniprot.obs.Species[uniprot.obs.Proteome_ID .== name(leaf)]) )
end
plot(subset_tree, rightmargin=4Plots.cm, fs=5, size=(400,600))bblsvs = projectinLSV(bbmtx, upusv);rectale_mask = occursin.("rectale", biobank.obs.Species);joined_dij = spectraldistances([upusv.U; bblsvs[rectale_mask, :]], upusv.S, alpha=1.5, q=0.75);rectale_dij = joined_dij[(size(upusv.U,1)+1):end, 1:(end-sum(rectale_mask))]
rectale_upnns = first.(sortperm.(eachrow(rectale_dij)), 5);hcat(biobank.obs[rectale_mask, [:Strain_ID, :NCBI_Species]],
DataFrame(getindex.(Ref(uniprot.obs.Species), rectale_upnns) |> stack |> permutedims, [:nn_1, :nn_2, :nn_3, :nn_4, :nn_5]),
DataFrame(;
# uniprot_nearest_neighbor_Species=uniprot.obs.Species[rectale_upnns],
# uniprot_nearest_neighbor_Phylum=uniprot.obs.Phylum[rectale_upnns],
)
)| Row | Strain_ID | NCBI_Species | nn_1 | nn_2 | nn_3 | nn_4 | nn_5 |
|---|---|---|---|---|---|---|---|
| String | String | String | String | String | String | String | |
| 1 | MSK.9.13 | [Eubacterium] rectale | [Eubacterium] rectale | Aeromonas sp. w55 | Photobacterium damselae | Cyanobacteria bacterium | Bacillus sp. UFRGS-B20 |
| 2 | MSK.13.59 | [Eubacterium] rectale | Photobacterium damselae | Cyanobacteria bacterium | [Eubacterium] rectale | Candidatus Nasuia deltocephalinicola | Aeromonas sp. w55 |
| 3 | MSK.16.22 | [Eubacterium] rectale | [Eubacterium] rectale | Cyanobacteria bacterium | Candidatus Vidania fulgoroideae | Candidatus Carsonella ruddii | Bacillus sp. UFRGS-B20 |
| 4 | MSK.9.15 | [Eubacterium] rectale | Aeromonas sp. w55 | Photobacterium damselae | [Eubacterium] rectale | Cyanobacteria bacterium | Candidatus Tremblaya princeps |
| 5 | MSK.13.48 | [Eubacterium] rectale | Photobacterium damselae | Cyanobacteria bacterium | [Eubacterium] rectale | Candidatus Vidania fulgoroideae | Candidatus Nasuia deltocephalinicola |
| 6 | MSK.13.50 | [Eubacterium] rectale | Photobacterium damselae | Cyanobacteria bacterium | [Eubacterium] rectale | Candidatus Nasuia deltocephalinicola | Candidatus Vidania fulgoroideae |
| 7 | MSK.17.42 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Photobacterium damselae | Cyanobacteria bacterium | Opitutae bacterium SCGC AG-212-L18 |
| 8 | MSK.17.3 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Photobacterium damselae | Cyanobacteria bacterium | Opitutae bacterium SCGC AG-212-L18 |
| 9 | MSK.17.13 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Photobacterium damselae | Cyanobacteria bacterium | Opitutae bacterium SCGC AG-212-L18 |
| 10 | MSK.17.19 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Cyanobacteria bacterium | Photobacterium damselae | Opitutae bacterium SCGC AG-212-L18 |
| 11 | MSK.17.57 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Cyanobacteria bacterium | Photobacterium damselae | Opitutae bacterium SCGC AG-212-L18 |
| 12 | MSK.22.19 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Cyanobacteria bacterium | Photobacterium damselae | Candidatus Tremblaya princeps |
| 13 | MSK.17.70 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Photobacterium damselae | Opitutae bacterium SCGC AG-212-L18 | Cyanobacteria bacterium |
| 14 | MSK.17.78 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Photobacterium damselae | Opitutae bacterium SCGC AG-212-L18 | Cyanobacteria bacterium |
| 15 | MSK.17.79 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Cyanobacteria bacterium | Photobacterium damselae | Opitutae bacterium SCGC AG-212-L18 |
| 16 | MSK.22.23 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Cyanobacteria bacterium | Photobacterium damselae | Candidatus Tremblaya princeps |
| 17 | MSK.22.28 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Cyanobacteria bacterium | Photobacterium damselae | Candidatus Tremblaya princeps |
| 18 | MSK.22.51 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Cyanobacteria bacterium | Photobacterium damselae | Candidatus Tremblaya princeps |
| 19 | MSK.16.45 | [Eubacterium] rectale | [Eubacterium] rectale | Cyanobacteria bacterium | Candidatus Vidania fulgoroideae | Candidatus Carsonella ruddii | Bacillus sp. UFRGS-B20 |
| 20 | MSK.22.92 | [Eubacterium] rectale | [Eubacterium] rectale | Roseburia sp. CAG:18 | Cyanobacteria bacterium | Photobacterium damselae | Candidatus Vidania fulgoroideae |
Conservation of motility expanding out from E. rectale
# for each node find all children and measure percentage of children where
# chosen oggs are present
cladesateachtreelevel = getleafnames.(speciesparents);
frac_present = map(cladesateachtreelevel) do clade_ids
cladeidx = indexin(clade_ids, uniprot.obs_names)
mean(subset_ogg_mtx[cladeidx, :], dims=1) |> vec
end |> stack
# check how many oggs (n=16) are fully conserved across the smallest clade (n=3)
islocallypresent = frac_present[:, 1] .== 1;
@info "# locally present = $(sum(islocallypresent)), # not locally present = $(sum(.!islocallypresent))"
numcols = length(cladesizes);┌ Info: # locally present = 12, # not locally present = 4
└ @ Main /Users/bend/projects/Doran_etal_2023/notebooks/jl_notebook_cell_df34fa98e69747e1a8f8a730347b8e2f_Y120sZmlsZQ==.jl:11
# plot the locally present oggs up through the tree
plot(
xticks=(1:numcols, string.(cladesizes)),
xrotation=45,
ylabel="fractional coverage of taxa",
xlabel="# taxa in clade",
margin=5Plots.Measures.mm,
ylims=(0,1), widen=true,
size=(600,300),
yticks=0:.25:1,
legend=:bottomleft,
)
hline!([.25, .5, .75], c=:grey, alpha=.5, linestyle=:dash)
violin!(permutedims(1:numcols), frac_present[islocallypresent, :], alpha=.3, c="#6182ce")
dotplot!(permutedims(1:numcols), frac_present[islocallypresent, :], c="#6182ce", alpha=.5, mode=:none, )
scatter!([NaN], label=permutedims(["$(species_name) locally conserved OGG (n=$(sum(islocallypresent)))"]), c=["#6182ce"], alpha=[0.5] )these results seemed to show that indeed the orthologs absent from phage-assoiated strains in E. rectale are present on over half of the 250 most closely related species. And, even across all bacteria, there at least 3 orthologs that are conserved across half of bacteria.