Fig. 4 - Place New Strains in Spectral Tree

Author

Benjamin Doran

Published

January 17, 2025

julia-setup
using DrWatson
@quickactivate projectdir()

using HypothesisTests
using MultipleTesting: adjust, BenjaminiHochberg, Bonferroni
using SpectralInference
using NeighborJoining
using NewickTree
using StatsBase
using CSV, DataFrames, Muon
include(srcdir("helpers.jl"))

bbtreedir = datadir("exp_pro", "BB669") |> mkpath
pdir_csbtree = plotsdir("CSB669_trees") |> mkpath
pdir_csb_micurve = plotsdir("CSB669_micurve") |> mkpath

pdir = plotsdir("figure_03") |> mkpath

using StatsPlots
theme(:default, grid=false, label=false, tickdir=:out)

using RCall
R"""
library(ape)
library(treeio)
library(ggtree)
library(ggplot2)
library(tidyverse)

setwd($(projectdir()))
""";

Main - Projecting in strains from the commensal strain bank

Know that we have a latent space of bacterial evolutionary diversity, see previous notebook. We want to know if it informs us about the relatedness of our newly sequenced gut commensal strains. Have we learned something general about biology or just the idiosyncrasies of the UniProt dataset?

To do this we leverage the fact that the UniProt latest spaces is built using SVD/PCA, which means that we can linearly project new data into that latest space.

# read in data
uniprot = readh5ad(datadir("exp_raw", "UP7047", "2020_02_UP7047.h5ad"))
biobank = readh5mu(datadir("exp_raw", "BB669", "BB669.h5mu"))
┌ Warning: Cannot join columns with the same name because var_names are intersecting.
└ @ Muon /Users/bend/.julia/packages/Muon/UKjAF/src/mudata.jl:367
MuData object 669 ✕ 21475
└ metabolites_foldchange
  AnnData object 669 ✕ 50
└ oggs
  AnnData object 669 ✕ 11248
└ UPorder_oggs
  AnnData object 669 ✕ 10177
# remake UniProt latent space
upmtx = uniprot.X[:, :]
upusv = svd(upmtx)
# get strain bank strains annotated in same way as UniProt
bbmtx = biobank["UPorder_oggs"].X[:,:]
size(bbmtx)
(669, 10177)
# project into uniprot spectral latent space
# uses ̂u = newdata * V * S_inverse
bbuhat = projectinLSV(bbmtx, upusv)
669×7047 Matrix{Float64}:
 0.00710957  0.016346   -0.0109378   0.00347082  …  -0.0620108    0.195298
 0.0101422   0.0235557  -0.0183793   0.00654043     -0.290321     0.216655
 0.00988477  0.0228117  -0.0172847   0.00538378     -0.197863     0.268172
 0.00911707  0.0213556  -0.0163378   0.00475377      0.201534     0.00658134
 0.0100938   0.0234382  -0.0181515   0.00425463      0.245223     0.123784
 0.0100751   0.0234404  -0.018099    0.00420404  …   0.247896     0.121264
 0.0126152   0.0291861  -0.0252202   0.00349912      0.204816     0.265976
 0.00982738  0.0232259  -0.0177415   0.00509255      0.25874      0.253145
 0.0130485   0.0308516  -0.0269812   0.00493428      0.270194    -0.0506157
 0.00927287  0.0215109  -0.0162395   0.00516107      0.171792     0.0400535
 ⋮                                               ⋱   ⋮           
 0.00895884  0.0207339  -0.0178657   0.0105853   …  -0.106219     0.317428
 0.00903688  0.0208161  -0.0180687   0.0106894      -0.118437     0.345223
 0.0129656   0.0307315  -0.027649    0.00338373      0.133769     0.110028
 0.00931925  0.0204614  -0.0181975   0.0106452       0.00752627   0.526419
 0.00569943  0.0126175  -0.00958138  0.00200406      0.0114675   -0.0068527
 0.0131057   0.0291628  -0.02421     0.00222497  …  -0.11391      0.220464
 0.0105281   0.0236117  -0.0181563   0.00596968     -0.209295     0.213963
 0.00931614  0.0204485  -0.0181861   0.0106346       0.0130234    0.506454
 0.00562719  0.0121077  -0.0097389   0.00218194      0.200105     0.0085347
# calcuated distance between gut commensal strains contextualized by UniProt strain diversity
dij = spectraldistances(bbuhat, upusv.S; alpha=1.5, q=.75);
treestr = NeighborJoining.newickstring(regNJ(dij), biobank.obs_names)
bbtree = readnw(treestr)
ladderize!(bbtree)
669
# write out CSB biobank tree
open(joinpath(bbtreedir, "BB669-tree.nw"), "w") do io
    writenw(io, bbtree)
end
18586
# plot tree
R"""
bb669obs = read.table($(datadir("exp_raw", "BB669", "BB669_rowmeta.tsv")), sep="\t", header=1)
bb669obs$NCBI.Family[bb669obs$NCBI.Family == "<unclassified>"] = "<unknown family>"
bb669obs$ID = bb669obs$Strain.ID
bb669tree = read.newick($(joinpath(bbtreedir, "BB669-tree.nw")))
bb669tree = as.treedata(left_join(as_tibble(bb669tree), bb669obs, by = c("label" = "Strain.ID")))
bb669tree_tbl = as_tibble(bb669tree)

mode_family = offspring(bb669tree_tbl, bb669tree_tbl$node, tiponly = FALSE, self_include = TRUE)
mode_family = unlist(lapply(mode_family, function(x) modeest::mfv1(x$NCBI.Family, na_rm=TRUE)))

bb669tree_tbl$mode_family = mode_family
bb669tree_tbl$label = paste(str_pad(bb669tree_tbl$ID, 10, side="right"), bb669tree_tbl$NCBI.Species, sep=" ")
bb669tree = as.treedata(bb669tree_tbl)
familynames = names(sort(table(bb669obs$NCBI.Family), decreasing = TRUE))

colorvalues = RColorBrewer::brewer.pal(12, name="Set3")
p = ggtree(bb669tree, 
    aes(color=mode_family), 
    size=.5,
    layout="fan", open.angle=10,
    branch.length="none",
    )  %>% rotate_tree(-90) +
    ggtitle("CSB Spectral Tree (N=669, UP context)") + 
    scale_color_manual(values=colorvalues, breaks=familynames)
ggsave(file.path($pdir_csbtree, "CSB669_SPItree.svg"), 
    p,
    height=10,
    width=10,
)
"""
show_svg(joinpath(pdir_csbtree, "CSB669_SPItree.svg"))
┌ Warning: RCall.jl: Scale for y is already present.
│ Adding another scale for y, which will replace the existing scale.
│ Coordinate system already present. Adding new coordinate system, which will
│ replace the existing one.
└ @ RCall /Users/bend/.julia/packages/RCall/0ggIQ/src/io.jl:172

Add labels to the tips

R"""
p = ggtree(bb669tree, 
    aes(color=mode_family), 
    size=.5,
    layout="fan", open.angle=10,
    branch.length="none",
    )  %>% rotate_tree(-90) +
    geom_tiplab2(aes(text=label), size=1, family="mono") +
    ggtitle("CSB Spectral Tree (N=669, UP context)") +
    scale_color_manual(values=colorvalues, breaks=familynames)
ggsave(file.path($pdir_csbtree, "CSB669_SPItree_withtiplabels.svg"), 
    p,
    height=10,
    width=10,
)
"""
show_svg(joinpath(pdir_csbtree, "CSB669_SPItree_withtiplabels.svg"))
┌ Warning: RCall.jl: Scale for y is already present.
│ Adding another scale for y, which will replace the existing scale.
│ Coordinate system already present. Adding new coordinate system, which will
│ replace the existing one.
└ @ RCall /Users/bend/.julia/packages/RCall/0ggIQ/src/io.jl:172

Sub-trees

Let’s check some local neighbors around certain taxa. Expanding out from individual taxa shows the other strain replicates of that species. And, we see sub-species groupings!

nthparent(n, i) = i < 1 ? n : nthparent(parent(n), i-1)
rename_treeleaves!(tree, idmapping) = begin
    for node in getleaves(tree)
        NewickTree.setname!(node, idmapping[name(node)])
    end
    tree
end
rename_treeleaves! (generic function with 1 method)
bbtree_leaves = getleaves(bbtree);
targetid = "DFI.1.13"
basenode = bbtree_leaves[findfirst(n->name(n) == targetid, bbtree_leaves)]
subtree = readnw(nwstr(nthparent(basenode, 3)))
tiplabels = join.(eachrow(biobank.obs[indexin(getleafnames(subtree), biobank.obs.Strain_ID), [:Strain_ID, :NCBI_Species]]), " ")
idmapping = Dict(k=>v for (k,v) in zip(getleafnames(subtree), tiplabels))
rename_treeleaves!(subtree, idmapping)
plot(subtree,
    c=:turquoise3, alpha=.5,
    lw=2, fs=7,
    size=(600,600),
    rightmargin=5Plots.cm,
    yflip=true
)
targetid = "MSK.17.12"
basenode = bbtree_leaves[findfirst(n->name(n) == targetid, bbtree_leaves)]
subtree = readnw(nwstr(nthparent(basenode, 3)))
tiplabels = join.(eachrow(biobank.obs[indexin(getleafnames(subtree), biobank.obs.Strain_ID), [:Strain_ID, :NCBI_Species]]), " ")
idmapping = Dict(k=>v for (k,v) in zip(getleafnames(subtree), tiplabels))
rename_treeleaves!(subtree, idmapping)
plot(subtree,
    c=:goldenrod, alpha=.5,
    lw=2, fs=7,
    size=(600,600),
    rightmargin=5Plots.cm,
)

One thing to notice as we view these sub trees is that they are grouping by NCBI species, but there are further sub-groupings within each species. The ID convention for our strains is “<sampling hospital>.<donor>.<strain>”. Knowing this naming convention we realized that the sub-groups were corresponding to the donor these strains were sampled from

This becomes more obvious as we color by donor id

donorcolormap = Dict(k=>v for (k,v) in zip(unique(biobank.obs.Donor), palette(:glasbey_bw_n256, length(unique(biobank.obs.Donor)))));
# sub-setting to B. uniformis strains
targetid = "MSK.17.15"
basenode = bbtree_leaves[findfirst(n->name(n) == targetid, bbtree_leaves)]
subtree = readnw(nwstr(nthparent(basenode, 1)))
subtree_ids = getleafnames(subtree)

linecolors = map(prewalk(subtree)) do node
    leafids = getleafnames(node)
    donorids = biobank.obs.Donor[indexin(leafids, biobank.obs.Strain_ID)]
    length(unique(donorids)) == 1 ? donorcolormap[mode(donorids)] : :grey
end |> x->x[2:end] |> permutedims

tiplabels = join.(eachrow(biobank.obs[indexin(getleafnames(subtree), biobank.obs.Strain_ID), [:Strain_ID, :NCBI_Species]]), " ")
idmapping = Dict(k=>v for (k,v) in zip(getleafnames(subtree), tiplabels))
rename_treeleaves!(subtree, idmapping)
treep = plot(subtree,
    c=linecolors,
    lw=2, fs=7,
    size=(600,600),
    rightmargin=5Plots.cm,
)
fullclims = getlims(biobank["metabolites_foldchange"][biobank.obs_names[biobank.obs.kept_species.==1], :].X)
met_mtx = biobank["metabolites_foldchange"][subtree_ids, ["Acetate_rel", "Butyrate_rel", "Succinate_rel"]].X
hp = heatmap(met_mtx,
    c=:bwr,
    # clims=fullclims, 
    clims=getlims(met_mtx),
    framestyle=:grid,
    yticks=false,
    xticks=(1:3, ["Acetate", "Butyrate", "Succinate"]),
    xrotation=30,
)

plot(treep, hp, layout=@layout([a{0.6w} b]), size=(600,600), link=:y)
savefig(joinpath(pdir, "buniformis_with_metabolites.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/figure_03/buniformis_with_metabolites.pdf"

What components were needed to resolve these clusters? If we plot the projections of the taxa onto each of the components, we see that differences between these sub-species clusters only really shows up after PC 10, which we can see by the beginning of banding in the heatmap.

subtree_idx = indexin(subtree_ids, biobank.obs.Strain_ID)
pltmtx = sign.(bbuhat[subtree_idx, :]) # sign of projection
# pltmtx = sign.(bbuhat[subtree_idx, :]) .* log10.(abs.(bbuhat[subtree_idx, :])) # pseudo-logscale 
# pltmtx = bbuhat[subtree_idx, :] # actual projection
xmarks = [1:9..., 10:10:90..., 100:100:900..., 1000:1000:7000...]
heatmap(pltmtx, 
    xscale=:log10, 
    c=:bwr, 
    clims=getlims(pltmtx),
    framestyle=:grid, 
    xticks=(xmarks, map(x-> (log10(x) % 1 == 0) || x==7000 ? x : "", xmarks)),
    xrotation=45,
    xmirror=true,
    xtickfontsize=7,
    yticks=(1:length(subtree_idx), subtree_ids),
    size=(1200, 500),
    margin=5Plots.mm,
    topmargin=10Plots.mm,
    gridalpha=1,
    xlabel="Principal component",
)
# sub-setting to M. gnavus strains
targetid = "MSK.22.24"
basenode = bbtree_leaves[findfirst(n->name(n) == targetid, bbtree_leaves)]
subtree = readnw(nwstr(nthparent(basenode, 9)))
subtree_ids = getleafnames(subtree)

linecolors = map(prewalk(subtree)) do node
    leafids = getleafnames(node)
    donorids = biobank.obs.Donor[indexin(leafids, biobank.obs.Strain_ID)]
    length(unique(donorids)) == 1 ? donorcolormap[mode(donorids)] : :grey
end |> x->x[2:end] |> permutedims

tiplabels = join.(eachrow(biobank.obs[indexin(getleafnames(subtree), biobank.obs.Strain_ID), [:Strain_ID, :NCBI_Species]]), " ")
idmapping = Dict(k=>v for (k,v) in zip(getleafnames(subtree), tiplabels))
rename_treeleaves!(subtree, idmapping)
treep = plot(subtree,
    c=linecolors,
    lw=2, fs=7,
    size=(600,600),
    rightmargin=5Plots.cm,
)
fullclims = getlims(biobank["metabolites_foldchange"][biobank.obs_names[biobank.obs.kept_species.==1], :].X)
met_mtx = biobank["metabolites_foldchange"][subtree_ids, ["Acetate_rel", "Butyrate_rel", "Succinate_rel"]].X
hp = heatmap(met_mtx,
    c=:bwr,
    # clims=fullclims, 
    clims=getlims(met_mtx),
    framestyle=:grid,
    yticks=false,
    xticks=(1:3, ["Acetate", "Butyrate", "Succinate"]),
    xrotation=30,
)

plot(treep, hp, layout=@layout([a{0.6w} b]), size=(600,600), link=:y)
savefig(joinpath(pdir, "mgnavus_with_metabolites.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/figure_03/mgnavus_with_metabolites.pdf"
subtree_idx = indexin(subtree_ids, biobank.obs.Strain_ID)
pltmtx = sign.(bbuhat[subtree_idx, :]) # sign of projection
# pltmtx = sign.(bbuhat[subtree_idx, :]) .* log10.(abs.(bbuhat[subtree_idx, :])) # pseudo-logscale 
# pltmtx = bbuhat[subtree_idx, :] # actual projection
xmarks = [1:9..., 10:10:90..., 100:100:900..., 1000:1000:7000...]
heatmap(pltmtx, 
    xscale=:log10, 
    c=:bwr, 
    clims=getlims(pltmtx),
    framestyle=:grid, 
    xticks=(xmarks, map(x-> (log10(x) % 1 == 0) || x==7000 ? x : "", xmarks)),
    xrotation=45,
    xmirror=true,
    xtickfontsize=7,
    yticks=(1:length(subtree_idx), subtree_ids),
    size=(1200, 500),
    margin=5Plots.mm,
    topmargin=10Plots.mm,
    gridalpha=1,
    xlabel="Principal component",
)

Mutual information across tree

Again we wan’t to look systematically over our tree, to verify that the patterns we saw in the above two examples are consistent with the general patterns in the tree.

Thus we compare grouping in our tree to both NCBI taxonomy and GTDB taxonomy.

NCBI taxonomy

NBOOTS = 50;
colnames = [:NCBI_Phylum, :NCBI_Class, :NCBI_Order, :NCBI_Family, :NCBI_Genus, :NCBI_Species, :Donor];
BB_NCBI_taxonomy = biobank.obs[:, colnames]
obs_names = biobank.obs_names.vals;
rowmask = map(eachrow(BB_NCBI_taxonomy)) do row
    !any(==("<unclassified>"), row)
end;
# takes ~ 5 min
mi_results = map(1:NBOOTS) do i
    pairedMI_across_treedepth(eachcol(BB_NCBI_taxonomy), obs_names, bbtree; mask=rowmask, bootstrap=true, ncuts=100, treecut_distancefun=network_distance)
end;
mi_results_df = DataFrame()
for (i, res) in enumerate(mi_results)
    mi_results_df = vcat(
        mi_results_df, 
        hcat(
            DataFrame(bootstrap=i, treedepth=last(res)),
            DataFrame(first(res), colnames),
            DataFrame(scaledcumsum.(first(res)), string.(colnames) .* "_scaledcumsum"),
        )
    )
end

CSV.write(joinpath(bbtreedir, "NCBI_raw_MI_values.csv"), mi_results_df)
first(mi_results_df, 5)
5×16 DataFrame
Row bootstrap treedepth NCBI_Phylum NCBI_Class NCBI_Order NCBI_Family NCBI_Genus NCBI_Species Donor NCBI_Phylum_scaledcumsum NCBI_Class_scaledcumsum NCBI_Order_scaledcumsum NCBI_Family_scaledcumsum NCBI_Genus_scaledcumsum NCBI_Species_scaledcumsum Donor_scaledcumsum
Int64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64
1 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 1 0.414141 0.00850462 0.00809241 0.00835621 0.00642135 0.000619266 9.23656e-5 0.000151046 0.00112313 0.0010973 0.00113385 0.000976177 0.000189988 4.17765e-5 0.000801146
3 1 0.828283 0.00842231 0.00833935 0.00836786 0.00637592 0.000642796 5.49416e-5 0.000110533 0.00223539 0.00222809 0.00226928 0.00194545 0.000387196 6.66263e-5 0.00138741
4 1 1.24242 0.00985023 0.0100184 0.00994812 0.00741618 0.000701392 6.94886e-5 0.000233373 0.00353622 0.00358655 0.00361914 0.00307286 0.00060238 9.80556e-5 0.00262521
5 1 1.65657 0.0100412 0.00998145 0.00932413 0.00731046 0.000887492 0.0001033 0.000168606 0.00486228 0.00494 0.00488432 0.0041842 0.000874659 0.000144778 0.00351949
pltdf = mi_results_df |>
    df -> stack(df, 10:16, [:bootstrap, :treedepth]) |> 
    df -> transform(df, :variable => ByRow(x->replace(x, "_scaledcumsum"=>"")) => identity) |>
    df -> groupby(df, [:variable, :treedepth]) |>
    df -> combine(df,
        :value => mean => :MI_mean,
        :value => std => :MI_std,
    ) |> 
    df -> sort(df, [])

taxonomylevels = permutedims(string.(colnames))
taxarankcolors = [:red :pink :orange :lightblue :green :aqua :lightgreen];

plot(title="CSB spectral tree (NCBI)", ylabel="Cumulative MI\n (density)", xlabel="Spectral Tree depth",
    legend=:outerright,
    size=(700,250),
    margin=5Plots.mm,  
)
for (tlab, tcol) in reverse(collect(zip(taxonomylevels, taxarankcolors)))
    df = filter(:variable => ==(tlab), pltdf)
    @df df plot!(:treedepth, :MI_mean, ribbon=2 .* :MI_std, label=tlab, c=tcol, lw=1)
end
plot!()

GTDB Taxonomy

NBOOTS = 50;
colnames = [:GTDB_Phylum, :GTDB_Class, :GTDB_Order, :GTDB_Family, :GTDB_Genus, :GTDB_Species, :Donor];
BB_NCBI_taxonomy = biobank.obs[:, colnames]
obs_names = biobank.obs_names.vals;
rowmask = map(eachrow(BB_NCBI_taxonomy)) do row
    !any(==("<unclassified>"), row)
end;
# takes ~ 5 min
mi_results = map(1:NBOOTS) do i
    pairedMI_across_treedepth(eachcol(BB_NCBI_taxonomy), obs_names, bbtree; mask=rowmask, bootstrap=true, ncuts=100, treecut_distancefun=network_distance)
end;
mi_results_df = DataFrame()
for (i, res) in enumerate(mi_results)
    mi_results_df = vcat(
        mi_results_df, 
        hcat(
            DataFrame(bootstrap=i, treedepth=last(res)),
            DataFrame(first(res), colnames),
            DataFrame(scaledcumsum.(first(res)), string.(colnames) .* "_scaledcumsum"),
        )
    )
end

CSV.write(joinpath(bbtreedir, "GTDB_raw_MI_values.csv"), mi_results_df)
first(mi_results_df, 5)
5×16 DataFrame
Row bootstrap treedepth GTDB_Phylum GTDB_Class GTDB_Order GTDB_Family GTDB_Genus GTDB_Species Donor GTDB_Phylum_scaledcumsum GTDB_Class_scaledcumsum GTDB_Order_scaledcumsum GTDB_Family_scaledcumsum GTDB_Genus_scaledcumsum GTDB_Species_scaledcumsum Donor_scaledcumsum
Int64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64
1 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 1 0.414141 0.00853729 0.00830565 0.00795017 0.00702883 0.000448211 3.18137e-5 0.000232687 0.00118489 0.00115355 0.00110083 0.00111744 0.000151996 1.50974e-5 0.00132657
3 1 0.828283 0.00820851 0.00839012 0.00832932 0.00641712 0.00046902 3.44598e-5 0.000162505 0.00232414 0.00231883 0.00225415 0.00213764 0.000311049 3.14506e-5 0.00225302
4 1 1.24242 0.00968021 0.00965786 0.00950944 0.0082036 0.00041402 3.02168e-5 0.00027292 0.00366766 0.00366019 0.00357089 0.00344185 0.00045145 4.57903e-5 0.00380895
5 1 1.65657 0.00987676 0.00938308 0.00932834 0.00809571 0.000429243 5.0877e-5 0.000268859 0.00503845 0.00496338 0.00486255 0.0047289 0.000597014 6.99344e-5 0.00534174
pltdf = mi_results_df |>
    df -> stack(df, 10:16, [:bootstrap, :treedepth]) |> 
    df -> transform(df, :variable => ByRow(x->replace(x, "_scaledcumsum"=>"")) => identity) |>
    df -> groupby(df, [:variable, :treedepth]) |>
    df -> combine(df,
        :value => mean => :MI_mean,
        :value => std => :MI_std,
    ) |> 
    df -> sort(df, [])

taxonomylevels = permutedims(string.(colnames))
taxarankcolors = [:red :pink :orange :lightblue :green :aqua :lightgreen];

plot(title="CSB spectral tree (GTDB)",  ylabel="Cumulative MI\n (density)", xlabel="Spectral Tree depth",
    legend=:outerright,
    size=(700,250),
    margin=5Plots.mm,  
)
for (tlab, tcol) in reverse(collect(zip(taxonomylevels, taxarankcolors)))
    df = filter(:variable => ==(tlab), pltdf)
    @df df plot!(:treedepth, :MI_mean, ribbon=2 .* :MI_std, label=tlab, c=tcol, lw=1)
end
plot!()

Relative Distances

Comparing the Spectral tree to the 16S or bac120 trees. We find that we are able to represent genomic strain level variation where neither 16S nor Bac120 annotations can.

# strain ids for 669 samples in CSB
obs_names = biobank.obs.Strain_ID;

# re-read in biobank tree (same as what is plotted above)
bbtree = readnw(readline(joinpath(bbtreedir, "BB669-tree.nw")))
as_polytomy!(n->NewickTree.support(n)<0.5, bbtree)
ladderize!(bbtree)
leafnames_spitree = getleafnames(bbtree); # same order as returned by patristic_distances
order_treespitree = indexin(leafnames_spitree, obs_names);

# read in tree built from bac120 annotations
treebac120 = readnw(readline(projectdir("_research","BB669_bac120_phyml","BB669_bac120.phy-supporttree.txt")))
as_polytomy!(n->NewickTree.support(n)<0.5, treebac120)
ladderize!(treebac120)
leafnames_bac120 = getleafnames(treebac120); # same order as returned by patristic_distances
order_treebac120 = indexin(leafnames_bac120, obs_names);

# read in tree built from 16S sequence
tree16s = readnw(readline(projectdir("_research","BB669_16S_phyml","BB669_16S.phy-supporttree.txt")))
as_polytomy!(n->NewickTree.support(n)<0.5, tree16s)
ladderize!(tree16s)
leafnames_16S = getleafnames(tree16s); # same order as returned by patristic_distances
order_tree16S = indexin(leafnames_16S, obs_names);

# 1 taxa from the CSB got dropped because we couldn't get a good annotated 16S read
# so I am collecting the sample mapping here between the taxa in the tree and
# the taxa in CSB
order_spi_tree16s = indexin(leafnames_16S, leafnames_spitree);
# likewise 5 taxa from CSB got dropped when annotating with bac120
order_spi_treebac120 = indexin(leafnames_bac120, leafnames_spitree);
bbtaxa = biobank.obs[:,[:NCBI_Phylum, :NCBI_Class, :NCBI_Order, :NCBI_Family, :NCBI_Genus, :NCBI_Species]];
taxanomicdistance = pairwise(eachrow(Matrix(bbtaxa)[:,6:-1:1])) do a, b
    x = findfirst(a .== replace(b, "<unclassified>"=>"missing"))
    x === nothing ? 6 : x-1
end;
# get patristic distances between each leaf of the spectral tree
sixPij = patristic_distances(tree16s);
bacPij = patristic_distances(treebac120);
spiPij = patristic_distances(bbtree);

# mask to collect just upper triangle of distance matrix for each dataset
uppertriangle_six = triu(trues(size(sixPij)), 1);
uppertriangle_bac = triu(trues(size(bacPij)), 1);
uppertriangle_spi = triu(trues(size(spiPij)), 1);

# get upper triangle
sixPij = sixPij[uppertriangle_six];
bacPij = bacPij[uppertriangle_bac];
spiPij = spiPij[uppertriangle_spi];

# reorder distance matrix to same order as metadata file
sixTij = taxanomicdistance[order_tree16S, order_tree16S][uppertriangle_six];
bacTij = taxanomicdistance[order_treebac120, order_treebac120][uppertriangle_bac];
spiTij = taxanomicdistance[order_treespitree, order_treespitree][uppertriangle_spi];

# scaled distances to between 0 and 1
spiPij_rel = (spiPij ./ maximum(spiPij));
bacPij_rel = (bacPij ./ maximum(bacPij));
sixPij_rel = (sixPij ./ maximum(sixPij));
plot relative density distributions
p1 = plot(
    xlims=(0,1), 
    widen=1, 
    xticks=(0:.25:1, ["0.0", "", "0.5", "", "1.0"]),
    # ylabel="same species",
    # size=(400, 250),
)
# patristic distance of pairs that are the same genus
violin!(["same\n genus"], spiPij_rel[spiTij .== 1], # normed dist
    trim=true, side=:right,
    c=:blue, fill=0, linecolor=:black, lw=1,
    label="Spectral tree",
    permute=(:x, :y),
)
violin!(["same\n genus"], bacPij_rel[bacTij .== 1], # normed dist
    trim=true, side=:right,
    c=:yellow, fill=0, linecolor=:black, lw=1,
    label="Bac120 tree",
    permute=(:x, :y),
)
# patristic distance of pairs that are the same species
violin!(["same\n species"], spiPij_rel[spiTij .== 0], # normed dist
    trim=true, side=:right,
    c=:blue, fill=0, linecolor=:black, lw=1,
    # label="Spectral Tree",
    permute=(:x, :y),
)
violin!(["same\n species"], bacPij_rel[bacTij .== 0], # normed dist
    trim=true, side=:right,
    c=:yellow, fill=0, linecolor=:black, lw=1,
    # label="16S",
    permute=(:x, :y),
)

p2 = plot(
    xlims=(0,1), 
    widen=1, 
    xticks=(0:.25:1, ["0.0", "", "0.5", "", "1.0"]),
    # ylabel="same species",
    # size=(400, 250),
)
# patristic distance of pairs that are the same genus
violin!(["same\n genus"], spiPij_rel[spiTij .== 1], # normed dist
    trim=true, side=:right,
    c=:blue, fill=0, linecolor=:black, lw=1,
    label="Spectral tree",
    permute=(:x, :y),
)
violin!(["same\n genus"], sixPij_rel[sixTij .== 1], # normed dist
    trim=true, side=:right,
    c=:orange, fill=0, linecolor=:black, lw=1,
    label="16S tree",
    permute=(:x, :y),
)
# patristic distance of pairs that are the same species
violin!(["same\n species"], spiPij_rel[spiTij .== 0], # normed dist
    trim=true, side=:right,
    c=:blue, fill=0, linecolor=:black, lw=1,
    # label="Spectral Tree",
    permute=(:x, :y),
)
violin!(["same\n species"], sixPij_rel[sixTij .== 0], # normed dist
    trim=true, side=:right,
    c=:orange, fill=0, linecolor=:black, lw=1,
    # label="16S",
    permute=(:x, :y),
)
plot(p1, p2, layout=grid(1,2), size=(800, 250), margin=3Plots.mm)
# difference in distributions is significant to below floating point precision
# same species distributions
@show pval = pvalue(MannWhitneyUTest(spiPij_rel[spiTij.==0], bacPij_rel[bacTij.==0]); tail=:both);
@show pval = pvalue(MannWhitneyUTest(spiPij_rel[spiTij.==0], sixPij_rel[sixTij.==0]); tail=:both);
# same genus but different species distributions
@show pval = pvalue(MannWhitneyUTest(spiPij_rel[spiTij.==1], bacPij_rel[bacTij.==0]); tail=:both);
@show pval = pvalue(MannWhitneyUTest(spiPij_rel[spiTij.==1], sixPij_rel[sixTij.==0]); tail=:both);
pval = pvalue(MannWhitneyUTest(spiPij_rel[spiTij .== 0], bacPij_rel[bacTij .== 0]); tail = :both) = 0.0
pval = pvalue(MannWhitneyUTest(spiPij_rel[spiTij .== 0], sixPij_rel[sixTij .== 0]); tail = :both) = 0.0
pval = pvalue(MannWhitneyUTest(spiPij_rel[spiTij .== 1], bacPij_rel[bacTij .== 0]); tail = :both) = 0.0
pval = pvalue(MannWhitneyUTest(spiPij_rel[spiTij .== 1], sixPij_rel[sixTij .== 0]); tail = :both) = 0.0
plot(xlims=(0,.3), size=(600,200), tickdir=:out)
histogram!(spiPij_rel[spiTij.==0], nbins=50, c=:blue, alpha=.5, label="Spectral tree")
histogram!(bacPij_rel[bacTij.==0], nbins=15, c=:yellow, alpha=.5, label="Bac120 tree")
histogram!(sixPij_rel[sixTij.==0], nbins=50, c=:orange, alpha=.5, label="16S tree")
p1 = histogram(bacPij_rel[bacTij.==0], nbins=0:0.01:0.35, c=:yellow, alpha=.5, label="Bac120 tree")
p2 = histogram(sixPij_rel[sixTij.==0], nbins=0:0.01:0.35, c=:orange, alpha=.5, label="16S tree")
p3 = histogram(spiPij_rel[spiTij.==0], nbins=0:0.01:0.35, c=:blue, alpha=.5, label="Spectral tree")
plot(p1, p2, p3, layout = grid(3,1), size=(500, 400), lw=0.5)
savefig(joinpath(pdir, "reative-distance-of-samespecies-pairs-histograms.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/figure_03/reative-distance-of-samespecies-pairs-histograms.pdf"

Fig. S9 - Conflation of taxonomy

# spectral distance in context of uniprot
dij = spectraldistances(bbuhat, upusv.S; alpha=1.5, q=.75) ./ size(upusv.V, 1);
# spectral distance isolated to strain bank
BBusv = svd(biobank["oggs"].X[:, :]) 
BBdij = spectraldistances(BBusv.U, BBusv.S, alpha=1.5, q=0.75) ./ size(BBusv.V, 1);
# taxonomic distance: number of levels before two strains share a taxonomic classification

bbtaxa = biobank.obs[:,[:NCBI_Phylum, :NCBI_Class, :NCBI_Order, :NCBI_Family, :NCBI_Genus, :NCBI_Species]];
# bbtaxa = biobank.obs[:,[:GTDB_Phylum, :GTDB_Class, :GTDB_Order, :GTDB_Family, :GTDB_Genus, :GTDB_Species]];

taxanomicdistance = pairwise(eachrow(Matrix(bbtaxa)[:,6:-1:1])) do a, b
    x = findfirst(a .== replace(b, "<unclassified>"=>"missing"))
    x === nothing ? 6 : x-1
end;
n = 6
labels = ["same species", "same genus", "same family", "same order", "same class", "same phylum", "same kingdom"];
plot(size=(800,600), 
    title="Comparison of UniProt to BioBank context (All 223,446 pairs)", 
    ylabel="Relative Spetral distance", 
    tickdirection=:out,
    # format=:png, dpi=200,
    margin=10Plots.Measures.mm,
    xticks=(1:n, labels[1:n]),
    xlims=(.5, n +.5),
    xrotation=45,
    legend=:outerbottom,
    grid=false,
)
xdata = squareform(taxanomicdistance)
ydata1 = squareform((dij ./ maximum(dij)))
ydata2 = squareform((BBdij ./ maximum(BBdij)))
for i in 0:n
    mask = xdata .== i
    violin!(xdata[mask].+1, ydata1[mask],
        label="",
        c=:blue, 
        side=:left,
        trim=true,
        )
    violin!(xdata[mask].+1, ydata2[mask],
        label="",
        c=:orange, 
        side=:right,
        trim=true
        )
end
allspeciesdists = ydata1[xdata.==0]
allspeciesdists2 = ydata2[xdata.==0]
scatter!([.9], [mean(allspeciesdists)], yerror=1.5std(allspeciesdists), c=:black, label="μ±1.5σ uniprot species distances", shape=:diamond)
# scatter!([1.1], [mean(allspeciesdists2)], yerror=1.5std(allspeciesdists2), c=:black,  shape=:diamond)
# hline!([mean(allspeciesdists)+std(allspeciesdists)], c=:red, linestyle=:dash)
hline!([mean(allspeciesdists)+1.5std(allspeciesdists)], c=:red, label="μ+1.5σ uniprot species distances", linestyle=:dash)
# hline!([mean(allspeciesdists2)+1.5std(allspeciesdists2)], c=:orange, linestyle=:dash)
plot!([0 0], c=[:blue :orange], label=["UniProt" "BioBank"])
annotate!(1:6, ones(6), ["n.s.", "***", "***", "***", "", "***"], fontsize=5)
plot!()
savefig(joinpath(pdir, "SPIdistUPvsSPIdistBB_splitbyNCBItaxonomy_groupedviolin.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/figure_03/SPIdistUPvsSPIdistBB_splitbyNCBItaxonomy_groupedviolin.pdf"

KS and wilcoxon test of each distribution

xdata = squareform(taxanomicdistance)
ydata1 = squareform((dij ./ maximum(dij)))
ydata2 = squareform((BBdij ./ maximum(BBdij)))
for i in 0:n
    mask = xdata .== i
    pval = pvalue(ApproximateTwoSampleKSTest(ydata1[mask], ydata2[mask]))
    @show (i, pval * 7)
end
(i, pval * 7) = (0, 1.4737693215045304e-24)
(i, pval * 7) = (1, 0.0)
(i, pval * 7) = (2, 0.0)
(i, pval * 7) = (3, 0.0)
(i, pval * 7) = (4, NaN)
(i, pval * 7) = (5, 0.0)
(i, pval * 7) = (6, 0.0)
xdata = squareform(taxanomicdistance)
ydata1 = squareform((dij ./ maximum(dij)))
ydata2 = squareform((BBdij ./ maximum(BBdij)))
for i in [0, 1, 2, 3, 5, 6]
    mask = xdata .== i
    pval = pvalue(MannWhitneyUTest(ydata1[mask], ydata2[mask]); tail=:right)
    @show (i, pval * 7)
end
(i, pval * 7) = (0, 0.4301946781397137)
(i, pval * 7) = (1, 0.0)
(i, pval * 7) = (2, 0.0)
(i, pval * 7) = (3, 0.0)
(i, pval * 7) = (5, 0.0)
(i, pval * 7) = (6, 0.0)

Conflated pairs

issameorder = taxanomicdistance .== 3;
speciesthreshold = mean(allspeciesdists)+1.5std(allspeciesdists)
0.171036523273783

There are no pairs of taxa that are from different families that are closer together than any pair of strains that belong to the same species, when we contextualize by the entire distribution of bacteria in UniProt.

BBUPisbelowthreshold = dij .< speciesthreshold;
any(issameorder .& BBUPisbelowthreshold)
false

If we constrain ourselves to just our strain bank there are pairs of taxa that are from different families that are closer together than any pair of strains that belong to the same species.

BBisbelowthreshold = BBdij .< speciesthreshold
any(issameorder .& BBisbelowthreshold)
true
incorrecttaxa = union(getindex.(findall(issameorder .& BBisbelowthreshold), 1), getindex.(findall(issameorder .& BBisbelowthreshold), 2))
orderedincorrecttaxa = incorrecttaxa[sortperm(bbtaxa[incorrecttaxa, :].NCBI_Family)];
orderedincorrecttaxa_families = bbtaxa[orderedincorrecttaxa, :NCBI_Family];
countmap(orderedincorrecttaxa_families)
Dict{String, Int64} with 6 entries:
  "Rikenellaceae"    => 9
  "Oscillospiraceae" => 1
  "Prevotellaceae"   => 14
  "Tannerellaceae"   => 19
  "Lachnospiraceae"  => 1
  "Bacteroidaceae"   => 214
fam_labs = unique(orderedincorrecttaxa_families)
fam_results = map(fam_labs) do fam
    idxs = findall(==(fam), orderedincorrecttaxa_families)
    (; pos=mean(idxs), len=length(idxs), lab=fam)
end
tick_results = filter(x->x.len > 1, fam_results)
4-element Vector{@NamedTuple{pos::Float64, len::Int64, lab::String}}:
 (pos = 107.5, len = 214, lab = "Bacteroidaceae")
 (pos = 223.5, len = 14, lab = "Prevotellaceae")
 (pos = 235.0, len = 9, lab = "Rikenellaceae")
 (pos = 249.0, len = 19, lab = "Tannerellaceae")
pltmtx = float.((issameorder .& BBisbelowthreshold)[orderedincorrecttaxa, orderedincorrecttaxa])
pltmtx[pltmtx .== 0] .= NaN
plot(
    size=(600,600), 
    ticks=(getfield.(tick_results, :pos), getfield.(tick_results, :lab)),
    xrotation=30,
)
heatmap!(pltmtx, c=[:white, :orange], ratio=1,
)
vline!([217, 230, 239], c=:black, lw=1)
hline!([217, 230, 239], c=:black, lw=1)
savefig(joinpath(pdir, "incorrecttaxa_heatmap.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/figure_03/incorrecttaxa_heatmap.pdf"