Fig. S21 - Cog categories in Spectral Tree

Author

Benjamin Doran

Published

January 17, 2025

Julia Setup
using DrWatson
@quickactivate projectdir()

using CSV, Muon, DataFrames
using Chain
using StatsBase
using SpectralInference, NewickTree
using StatsPlots
theme(:default, grid=false, label=false, tickdir=:out)

pdir = plotsdir("reviewer_plots") |> mkpath
rdir = projectdir("_research", "strain_variation_wilcoxon_tests") |> mkpath
uniprot = readh5ad(datadir("exp_raw", "UP7047", "2020_02_UP7047.h5ad"))
biobank = readh5mu(datadir("exp_raw", "BB669", "BB669.h5mu"))
┌ Warning: Cannot join columns with the same name because var_names are intersecting.
└ @ Muon /Users/bend/.julia/packages/Muon/UKjAF/src/mudata.jl:367
MuData object 669 ✕ 21475
└ metabolites_foldchange
  AnnData object 669 ✕ 50
└ oggs
  AnnData object 669 ✕ 11248
└ UPorder_oggs
  AnnData object 669 ✕ 10177
upmtx = uniprot.X[:, :]
upusv = svd(upmtx)
bbmtx = biobank["UPorder_oggs"].X[:,:]
@show size(bbmtx)
keptspecies = sort(filter(x-> last(x) >= 20, countmap(biobank.obs.Species)), byvalue=true, rev=true)
filter!(!=("unclassified"), keptspecies)
# species => number of strains belonging to that species
size(bbmtx) = (669, 10177)
OrderedCollections.OrderedDict{String, Int64} with 12 entries:
  "Phocaeicola vulgatus"         => 88
  "[Ruminococcus] gnavus"        => 41
  "Bacteroides thetaiotaomicron" => 35
  "Anaerostipes hadrus"          => 31
  "Bacteroides uniformis"        => 27
  "unclassified"                 => 26
  "Blautia luti"                 => 24
  "Bifidobacterium breve"        => 24
  "Coprococcus comes"            => 23
  "Dorea formicigenerans"        => 22
  "Blautia wexlerae"             => 21
  "[Eubacterium] rectale"        => 20

Tree plot

strvarmtx = bbmtx[biobank.obs.kept_species.==1,:];
@show size(strvarmtx)

strvarobs = biobank.obs[biobank.obs.kept_species.==1, :]
strvar_obsnames = strvarobs.Strain_ID

# infer hierarchical relationships of just the species with
# greater than 20 strain replicates
uhat = projectinLSV(strvarmtx, upusv)
dij = spectraldistances(uhat, upusv.S; alpha=1.5, q=.75) ./ size(uniprot, 2)
strvartree_hc = UPGMA_tree(dij)
strvartree = readnw(SpectralInference.newickstring(strvartree_hc, strvar_obsnames))
# ladderize!(strvartree)
size(strvarmtx) = (356, 10177)
(((((((DFI.1.247:0.003931399,DFI.1.135:0.003931399):0.0261128,DFI.1.173:0.0300442):1.102017,MSK.17.15:1.132061):0.4641554,((((((MSK.19.71:0.001116743,(MSK.19.54:0.0001738008,MSK.19.4:0.0001738008):0.0009429425):0.001027894,MSK.19.91:0.002144637):0.0006451383,MSK.19.84:0.002789775):0.9853182,(MSK.20.30:0.03613531,(MSK.20.3:0.009151378,MSK.20.79:0.009151378):0.02698393):0.9519726):0.01828051,(MSK.16.71:0.1179327,(MSK.16.7:0.06787214,(MSK.16.46:0.001618285,(MSK.16.44:0.0001595074,MSK.16.39:0.0001595074):0.001458778):0.06625386):0.05006053):0.8884558):0.1753671,((DFI.3.57:0.9677974,((MSK.18.66:0.001366842,(((MSK.18.33:2.82369e-17,MSK.18.78:2.82369e-17):0.000426245,((MSK.18.89:3.029125e-17,MSK.18.30:3.029125e-17):0.0,MSK.18.25:3.029125e-17):0.000426245):0.0002283746,MSK.18.77:0.0006546196):0.0007122219):0.0002409801,MSK.18.74:0.001607822):0.9661896):0.1983591,DFI.4.149:1.166157):0.01559899):0.414461):1.683172,((((DFI.3.23:0.1084715,((((DFI.1.133:0.002279274,DFI.1.35:0.002279274):0.0005995152,DFI.1.127:0.002878789):0.001521423,DFI.1.126:0.004400213):0.007541952,DFI.1.146:0.01194217):0.09652931):1.471801,(((DFI.5.25:0.9557872,MSK.9.7:0.9557872):0.2712638,((MSK.9.20:1.079934,(DFI.3.15:0.1813803,DFI.3.65:0.1813803):0.8985541):0.105503,((DFI.4.142:1.004294,MSK.10.8:1.004294):0.1452508,((((MSK.15.13:0.1005877,MSK.15.52:0.1005877):0.7592938,((MSK.22.66:0.03705821,((MSK.22.75:0.01576487,((MSK.22.90:0.008254969,(MSK.22.7:0.004311205,MSK.23.26:0.004311205):0.003943764):0.003581096,MSK.22.81:0.01183606):0.003928806):0.003889076,(((MSK.23.48:0.003585361,((((MSK.22.14:1.286854e-5,MSK.22.41:1.286854e-5):0.0004015048,(MSK.22.3:1.286854e-5,MSK.22.30:1.286854e-5):0.0004015048):0.0002768957,(MSK.23.53:1.93772e-17,(MSK.23.52:1.93772e-17,MSK.22.47:1.93772e-17):0.0):0.0006912691):0.00022138,(MSK.22.38:0.0004876222,MSK.22.36:0.0004876222):0.0004250269):0.002672712):0.0007826417,((MSK.23.51:0.001127075,(MSK.22.48:0.0004876222,MSK.22.21:0.0004876222):0.0006394528):0.0005693853,MSK.23.32:0.00169646):0.002671542):0.004446563,MSK.22.5:0.008814565):0.01083938):0.01740426):0.007253248,MSK.23.57:0.04431146):0.81557):0.04789299,MSK.6.33:0.9077745):0.1379388,((((MSK.16.86:0.001271842,(MSK.16.2:0.0004552147,MSK.16.26:0.0004552147):0.000816627):0.0008964462,((MSK.16.47:0.0003015461,((MSK.16.83:3.234215e-17,(MSK.16.75:3.234215e-17,(MSK.16.90:3.234215e-17,MSK.16.61:3.234215e-17):0.0):0.0):0.0001773087,MSK.16.55:0.0001773087):0.0001242374):0.0003329315,MSK.16.84:0.0006344776):0.00153381):0.004430316,MSK.16.10:0.006598604):0.9103236,((MSK.19.88:0.06874913,MSK.19.40:0.06874913):0.2166141,(((MSK.19.26:0.03619959,MSK.19.29:0.03619959):0.06432178,((MSK.19.35:0.00494406,MSK.19.85:0.00494406):0.08595237,MSK.19.93:0.09089643):0.009624929):0.09654039,(((MSK.19.95:0.004715797,MSK.19.45:0.004715797):0.06352701,(MSK.19.32:0.009199552,(MSK.19.16:0.002269505,((MSK.19.20:0.0004489371,MSK.19.21:0.0004489371):0.0009555423,MSK.19.19:0.001404479):0.0008650259):0.006930047):0.05904325):0.02281524,MSK.19.86:0.09105805):0.1060037):0.08830149):0.6315589):0.1287912):0.1038312):0.03589288):0.04161359):0.0788548,(MSK.17.64:0.03981748,((MSK.17.92:0.004235512,(((MSK.17.56:0.002300801,((MSK.17.71:0.0007187355,MSK.17.5:0.0007187355):0.0009545024,MSK.17.91:0.001673238):0.0006275626):0.0008096886,(MSK.17.87:0.001715472,MSK.17.65:0.001715472):0.001395017):0.0008522348,MSK.17.93:0.003962724):0.0002727882):0.01453939,(MSK.17.55:0.003196651,MSK.17.67:0.003196651):0.01557826):0.02104258):1.266088):0.2743667):0.05606193,(MSK.17.73:1.480551,((MSK.20.41:0.01663719,MSK.20.27:0.01663719):0.2046407,MSK.20.28:0.2212779):1.259273):0.1557837):0.09538753,(MSK.7.27:0.1047519,((MSK.8.31:0.01858135,((MSK.7.25:0.01364896,(MSK.8.9:0.009811226,MSK.7.15:0.009811226):0.003837738):0.002931416,(MSK.8.25:0.008360049,MSK.8.24:0.008360049):0.008220332):0.00200097):0.008267197,(MSK.8.30:0.01584241,MSK.8.17:0.01584241):0.01100614):0.07790337):1.62697):1.547666):0.3151586,((((DFI.4.108:0.08803098,(DFI.4.151:0.005295598,DFI.4.133:0.005295598):0.08273538):0.7546437,(((MSK.15.45:0.004183471,(MSK.14.51:0.00243926,MSK.15.48:0.00243926):0.001744211):0.5352159,MSK.18.2:0.5393994):0.111049,(MSK.5.5:0.006865608,MSK.5.9:0.006865608):0.6435827):0.1922264):0.1090774,((DFI.1.47:0.03187683,(DFI.1.229:0.02823812,(((DFI.1.170:0.00910144,DFI.1.169:0.00910144):0.01302156,(((DFI.1.239:0.004891845,DFI.1.238:0.004891845):0.002316509,DFI.1.228:0.007208354):0.004196748,((DFI.2.27:0.0074562,DFI.1.54:0.0074562):0.002341859,DFI.1.150:0.009798059):0.001607043):0.0107179):0.002934123,DFI.1.128:0.02505713):0.003180988):0.003638711):0.005067316,DFI.1.236:0.03694414):0.9148079):0.5634948,(((MSK.16.9:0.003721056,(MSK.16.57:0.002797762,MSK.16.43:0.002797762):0.0009232938):0.01352062,((MSK.16.60:0.004130997,(MSK.16.35:0.002686137,MSK.16.65:0.002686137):0.00144486):0.004142534,MSK.16.62:0.008273531):0.008968148):1.440133,(((MSK.23.85:0.0107431,(MSK.23.49:0.005975667,(MSK.23.45:0.002134073,MSK.23.25:0.002134073):0.003841595):0.004767436):0.02427398,MSK.23.42:0.03501708):1.053664,(DFI.2.13:0.002545152,DFI.2.69:0.002545152):1.086136):0.3686933):0.05787267):2.0793):0.6213968,((((((((MSK.22.91:0.006000617,MSK.23.10:0.006000617):0.01394575,MSK.23.18:0.01994636):0.06940606,(((MSK.23.71:0.007384685,((MSK.23.46:0.002136866,MSK.23.61:0.002136866):0.002871905,MSK.23.63:0.005008771):0.002375914):0.002962586,(MSK.23.55:0.006649823,MSK.23.93:0.006649823):0.003697449):0.04779344,MSK.23.56:0.05814071):0.03121171):0.8463041,MSK.22.96:0.9356565):0.5997846,(((MSK.5.17:1.355554,(MSK.15.77:1.187518,((MSK.7.28:0.008891872,MSK.7.31:0.008891872):1.12789,((((MSK.19.33:0.06707205,MSK.19.38:0.06707205):0.3834988,(MSK.17.82:0.005630039,MSK.17.63:0.005630039):0.4449408):0.4552386,(MSK.5.19:0.7976107,MSK.11.9:0.7976107):0.1081987):0.07597088,((MSK.15.59:0.004283845,(MSK.15.54:0.002501327,((MSK.15.56:0.000949914,((MSK.15.9:2.066643e-17,MSK.15.10:2.066643e-17):0.0005510349,MSK.15.18:0.0005510349):0.000398879):0.0009573024,MSK.15.58:0.001907216):0.0005941104):0.001782518):0.03148856,MSK.15.32:0.03577241):0.9460078):0.1550013):0.05073667):0.1680361):0.08524652,MSK.7.5:1.440801):0.03602388,((MSK.23.4:0.04376574,(MSK.23.20:0.04056606,MSK.23.17:0.04056606):0.003199687):0.03595454,((MSK.22.24:0.007479792,(MSK.23.41:0.003403041,MSK.23.81:0.003403041):0.004076751):0.01324146,((MSK.23.82:0.007051144,(MSK.23.92:0.002371883,MSK.23.91:0.002371883):0.004679261):0.006038249,MSK.23.60:0.01308939):0.007631856):0.05899903):1.397104):0.05861643):1.570597,(((((MSK.22.101:0.01095634,((MSK.22.107:0.002560393,(((((MSK.22.65:1.465381e-17,(MSK.22.109:1.465381e-17,MSK.22.108:1.465381e-17):0.0):0.0007052648,((MSK.22.111:0.0003952708,((MSK.22.104:1.494598e-17,MSK.22.102:1.494598e-17):0.0001087377,(MSK.22.99:1.222084e-17,MSK.22.105:1.222084e-17):0.0001087377):0.0002865331):0.00018868,MSK.22.20:0.0005839507):0.0001213141):0.0007678837,((((MSK.22.64:0.0001087377,MSK.22.57:0.0001087377):0.000357825,(MSK.22.68:0.0002259348,((MSK.22.45:1.085738e-17,MSK.22.100:1.085738e-17):0.0001087377,MSK.22.11:0.0001087377):0.0001171972):0.0002406279):0.0005098126,MSK.22.113:0.0009763753):0.0003361338,MSK.22.2:0.001312509):0.0001606394):0.0006822291,MSK.22.4:0.002155378):7.73444e-5,MSK.22.106:0.002232722):0.0003276709):0.001586078,MSK.22.34:0.004146471):0.006809865):0.001807876,(MSK.22.80:0.008527999,MSK.22.73:0.008527999):0.004236214):2.348372,((((MSK.9.15:0.1102293,MSK.9.13:0.1102293):0.8040537,(MSK.13.50:0.1088255,(MSK.13.48:0.02865297,MSK.13.59:0.02865297):0.08017258):0.8054574):0.04037098,(MSK.16.22:0.0002646188,MSK.16.45:0.0002646188):0.9543893):0.5339903,(((MSK.17.57:0.04572468,((MSK.17.19:0.002149147,(MSK.17.3:0.001419598,MSK.17.79:0.001419598):0.0007295488):0.002146011,(MSK.17.42:0.001674471,MSK.17.13:0.001674471):0.002620687):0.04142952):0.0140878,(MSK.17.78:0.0001980385,MSK.17.70:0.0001980385):0.05961444):0.9932575,((MSK.22.23:0.02486634,(MSK.22.28:0.00121411,(MSK.22.51:0.0003283846,MSK.22.19:0.0003283846):0.0008857251):0.02365223):0.02434618,MSK.22.92:0.04921252):1.003857):0.4355742):0.8724917):0.3367614,(((MSK.13.15:0.2545377,MSK.13.39:0.2545377):1.181066,((((DFI.1.64:0.1240709,(((DFI.1.210:0.007850086,DFI.1.180:0.007850086):0.001335135,(DFI.1.224:0.003935376,DFI.1.155:0.003935376):0.005249845):0.07300825,((((DFI.1.81:0.007899643,(DFI.1.90:0.0004271598,DFI.1.18:0.0004271598):0.007472483):0.005063089,(DFI.1.219:0.005759959,DFI.1.58:0.005759959):0.007202773):0.01000289,(DFI.1.215:0.01774203,(DFI.1.97:0.006590653,DFI.1.230:0.006590653):0.01115137):0.005223598):0.00744931,DFI.1.72:0.03041493):0.05177854):0.04187744):0.05356914,((DFI.1.220:0.002501934,DFI.1.75:0.002501934):0.01485597,DFI.1.38:0.01735791):0.1602822):0.6333854,MSK.17.61:0.8110254):0.5203661,MSK.23.29:1.331391):0.1042126):0.8429461,((MSK.22.63:1.924021,MSK.8.22:1.924021):0.1901947,((MSK.16.63:1.416139,(((((MSK.16.12:0.02356956,((MSK.16.14:0.002828994,MSK.16.50:0.002828994):0.001042326,MSK.16.11:0.003871319):0.01969824):0.01215441,MSK.16.59:0.03572397):0.9168443,((((MSK.17.85:0.009759773,((MSK.17.27:0.0009075602,MSK.17.39:0.0009075602):0.002127122,MSK.17.59:0.003034682):0.006725091):0.006864783,MSK.17.45:0.01662456):0.03047753,(MSK.17.80:0.01747769,MSK.17.21:0.01747769):0.0296244):0.6151507,DFI.3.84:0.6622528):0.2903154):0.1290716,(((MSK.18.12:0.1586501,MSK.18.76:0.1586501):0.7422176,(MSK.11.23:0.1265154,(MSK.11.50:0.07421506,MSK.11.53:0.07421506):0.05230031):0.7743523):0.1485727,(MSK.20.88:0.06861799,MSK.20.24:0.06861799):0.9808224):0.03219944):0.1224964,DFI.3.85:1.204136):0.2120032):0.1170642,MSK.10.14:1.533204):0.5810122):0.1643344):0.4193471):0.1360889,(((((MSK.22.59:0.03864444,MSK.23.78:0.03864444):0.08558431,MSK.22.93:0.1242288):1.362419,((DFI.4.9:1.377677,((DFI.6.71:1.2296,((DFI.3.45:0.9621406,(((((MSK.20.76:0.005984939,MSK.20.13:0.005984939):0.03242875,MSK.20.44:0.03841369):0.663187,(MSK.18.31:0.135443,MSK.18.3:0.135443):0.5661577):0.06994164,MSK.18.71:0.7715423):0.11539,(MSK.18.16:0.7670487,MSK.17.7:0.7670487):0.1198837):0.07520822):0.1788079,DFI.5.15:1.140949):0.08865152):0.06894472,MSK.20.14:1.298545):0.07913274):0.03180937,((((MSK.19.37:0.9812445,((MSK.14.18:0.008399756,((MSK.15.33:0.001447049,MSK.15.30:0.001447049):0.002441439,MSK.15.6:0.003888489):0.004511268):0.05110599,MSK.15.15:0.05950574):0.9217387):0.167797,MSK.14.2:1.149042):0.115018,MSK.15.29:1.26406):0.09101424,MSK.15.46:1.355074):0.05441309):0.07716042):0.0884017,MSK.23.95:1.575049):0.6137477,((((MSK.6.16:0.2087748,(MSK.6.26:0.006732394,MSK.6.25:0.006732394):0.2020424):1.037348,((MSK.22.86:0.101325,MSK.22.56:0.101325):0.98436,((MSK.6.6:0.6537612,((((MSK.17.18:0.001026552,MSK.17.60:0.001026552):0.5047028,MSK.19.60:0.5057293):0.08250677,((MSK.13.24:0.1359065,MSK.13.38:0.1359065):0.4208074,MSK.15.26:0.5567139):0.03152225):0.0609556,MSK.16.79:0.6491917):0.004569495):0.2206944,MSK.18.38:0.8744556):0.2112294):0.1604382):0.2594362,(MSK.11.25:1.415963,MSK.16.34:1.415963):0.08959638):0.42123,MSK.9.19:1.926789):0.2620073):0.6451896):0.2720521):0.1911737,((((MSK.11.20:0.2185118,((MSK.11.19:0.007115789,MSK.11.27:0.007115789):0.01952213,MSK.11.47:0.02663792):0.1918739):0.6826251,(((MSK.22.35:0.7317976,(MSK.17.49:0.6459351,(MSK.13.17:0.5621654,(MSK.13.2:0.005003621,(MSK.13.6:0.004403209,MSK.13.43:0.004403209):0.0006004121):0.5571618):0.08376975):0.08586242):0.04431587,MSK.11.2:0.7761134):0.120572,MSK.6.11:0.8966854):0.004451496):0.04604542,((((((((MSK.10.27:0.01044862,MSK.10.31:0.01044862):0.05242078,MSK.10.20:0.0628694):0.03998712,MSK.10.7:0.1028565):0.6292068,(((MSK.9.4:0.008945339,MSK.9.9:0.008945339):0.0874901,MSK.9.11:0.09643544):0.566493,(((DFI.4.141:0.03331342,DFI.4.155:0.03331342):0.06626768,DFI.4.26:0.0995811):0.06017711,DFI.4.30:0.1597582):0.5031702):0.06913491):0.130885,MSK.14.37:0.8629483):0.003442671,(MSK.14.17:0.07033848,MSK.14.23:0.07033848):0.7960525):0.04170974,DFI.5.34:0.9081008):0.01721933,(MSK.15.34:0.1403227,(MSK.15.37:0.1083422,MSK.14.29:0.1083422):0.03198043):0.7849974):0.02186222):0.1228962,MSK.15.40:1.070079):2.227134):0.3173817,(MSK.18.5:2.254935,MSK.14.58:2.254935):1.359659):0.6013497);
countmap(strvarobs.NCBI_Phylum)
Dict{String, Int64} with 3 entries:
  "Firmicutes"     => 182
  "Bacteroidetes"  => 150
  "Actinobacteria" => 24
phylum_color_dict = Dict(k=>v for (k,v) in zip(unique(strvarobs.NCBI_Phylum), palette(:Accent)[[4,2,3]]));
phylum_linecolors = map(prewalk(strvartree)) do node; strvarobs, strvar_obsnames, phylum_color_dict
    leafids = getleafnames(node)
    leafphylum = strvarobs.NCBI_Phylum[indexin(leafids, strvar_obsnames)]
    get(phylum_color_dict, mode(leafphylum), :grey)
end |> x->x[2:end] |> permutedims

plot(strvartree,
    fs=2, rightmargin=7Plots.mm,
    c=phylum_linecolors,
    lw=3,
    size=(400, 600),
)
# phyla 
for (i, (k,v)) in enumerate(zip(unique(strvarobs.NCBI_Phylum), palette(:Accent)[[4,2,3]]))
    annotate!(0, i*10 - 30, text(k, 7, :left, :black,v), c=v)
end
plot!()
speciescolordf = CSV.read(datadir("exp_raw","BB669","subsettreecolors.csv"), DataFrame)
species_color_dict = Dict(k=>v for (k,v) in zip(speciescolordf.species_name, speciescolordf.color));
speciesvector = strvarobs.Species[indexin(getleafnames(strvartree), strvar_obsnames)];
species_linecolors = map(prewalk(strvartree)) do node; strvarobs, strvar_obsnames, species_color_dict
    leafids = getleafnames(node)
    leafspecies = strvarobs.Species[indexin(leafids, strvar_obsnames)]
    if length(unique(leafspecies)) == 1
        get(species_color_dict, mode(leafspecies), :grey)
    else
        :grey
    end
end |> x->x[2:end] |> permutedims

 plot(strvartree,
    yticks=false,
    fs=1, rightmargin=5.4Plots.cm,
    c=species_linecolors,
    lw=3, size=(750, 750),
    framestyle=:grid,
)
# species 
for (i, (k,v)) in enumerate(zip(speciescolordf.species_name, speciescolordf.color))
    yval = median(findall(==(k), speciesvector))
    annotate!(4.5, yval, text(k, 9, :left, v))
end
annotate!(4.5, 356, text("Blautia luti (2)", 9, :left, species_color_dict["Blautia luti"]))
# vline!([.5,2.5])
fancy_tree_plot = plot!()
savefig(joinpath(pdir, "str_var_tree_colored_by_species.pdf"));

OGG annotations to COG groups

testresults = CSV.read(joinpath(rdir, "full_ogg_wilcoxon_testresults_on_big10species.csv"), DataFrame);
sigtestresults = filter(:qval_BH => <(0.05), testresults) |>
      df -> filter(:logeffectsize => >(1), df) |>
      df -> filter(:effectsize => >(1), df) |>
      df -> filter(:log2FC => x -> abs.(x) .> 1, df);
first(sigtestresults, 5)
5×26 DataFrame
Row nodeids nodeheight nodedepth grp1_N grp2_N grp1_phylum_mode grp2_phylum_mode grp1_species_mode grp2_species_mode grp1_species_donor_mode grp2_species_donor_mode ogg_name ogg_idx ogg_freqs grp1_prp_exp grp2_prp_exp effectsize logeffectsize log2FC cliffs_d pvals qvals pval_BH pval_Bon qval_BH tree_level
Int64 Float64 Float64 Int64 Int64 String15 String15 String31 String31 String String String15 Int64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 String15
1 1 0.0 4.21594 150 206 Bacteroidetes Firmicutes Phocaeicola vulgatus [Ruminococcus] gnavus Phocaeicola vulgatus MSK.22 Bifidobacterium breve MSK.22 2Z7ID 8 0.407303 0.946667 0.0145631 1.09688 1.02216 1.0335 0.934628 2.22378e-67 0.0 2.08024e-65 1.16327e-61 0.0 phylum level
2 1 0.0 4.21594 150 206 Bacteroidetes Firmicutes Phocaeicola vulgatus [Ruminococcus] gnavus Phocaeicola vulgatus MSK.22 Bifidobacterium breve MSK.22 2Z7IJ 9 0.410112 0.0 0.708738 1.19222 1.32881 -1.16471 -0.708738 8.5582e-38 0.0 6.95591e-36 4.47683e-32 0.0 phylum level
3 1 0.0 4.21594 150 206 Bacteroidetes Firmicutes Phocaeicola vulgatus [Ruminococcus] gnavus Phocaeicola vulgatus MSK.22 Bifidobacterium breve MSK.22 2Z7KK 20 0.441011 0.873333 0.126214 1.54894 1.13715 1.17935 0.822006 7.76604e-49 0.0 6.69046e-47 4.06245e-43 0.0 phylum level
4 1 0.0 4.21594 150 206 Bacteroidetes Firmicutes Phocaeicola vulgatus [Ruminococcus] gnavus Phocaeicola vulgatus MSK.22 Bifidobacterium breve MSK.22 2Z7T7 46 0.511236 0.0 0.883495 1.57742 1.6749 -1.08511 -0.883495 4.2507e-54 0.0 3.73707e-52 2.22356e-48 0.0 phylum level
5 1 0.0 4.21594 150 206 Bacteroidetes Firmicutes Phocaeicola vulgatus [Ruminococcus] gnavus Phocaeicola vulgatus MSK.22 Bifidobacterium breve MSK.22 2Z7X2 68 0.384831 0.913333 0.0 2.32667 1.54657 1.54657 0.913333 1.23025e-63 0.0 1.14185e-61 6.43549e-58 0.0 phylum level
  • CELLULAR PROCESSES AND SIGNALING
    • [D] Cell cycle control, cell division, chromosome partitioning
    • [M] Cell wall/membrane/envelope biogenesis
    • [N] Cell motility
    • [O] Post-translational modification, protein turnover, and chaperones
    • [T] Signal transduction mechanisms
    • [U] Intracellular trafficking, secretion, and vesicular transport
    • [V] Defense mechanisms
    • [W] Extracellular structures
    • [Y] Nuclear structure
    • [Z] Cytoskeleton
  • INFORMATION STORAGE AND PROCESSING
    • [A] RNA processing and modification
    • [B] Chromatin structure and dynamics
    • [J] Translation, ribosomal structure and biogenesis
    • [K] Transcription
    • [L] Replication, recombination and repair
  • METABOLISM
    • [C] Energy production and conversion
    • [E] Amino acid transport and metabolism
    • [F] Nucleotide transport and metabolism
    • [G] Carbohydrate transport and metabolism
    • [H] Coenzyme transport and metabolism
    • [I] Lipid transport and metabolism
    • [P] Inorganic ion transport and metabolism
    • [Q] Secondary metabolites biosynthesis, transport, and catabolism
  • POORLY CHARACTERIZED
    • [R] General function prediction only
    • [S] Function unknown
COG_cat_desc_map = Dict(
    'D' => "Cell cycle control, cell division, chromosome partitioning",
    'M' => "Cell wall/membrane/envelope biogenesis",
    'N' => "Cell motility",
    'O' => "Post-translational modification, protein turnover, and chaperones",
    'T' => "Signal transduction mechanisms",
    'U' => "Intracellular trafficking, secretion, and vesicular transport",
    'V' => "Defense mechanisms",
    'W' => "Extracellular structures",
    'Y' => "Nuclear structure",
    'Z' => "Cytoskeleton",
    'A' => "RNA processing and modification",
    'B' => "Chromatin structure and dynamics",
    'J' => "Translation, ribosomal structure and biogenesis",
    'K' => "Transcription",
    'L' => "Replication, recombination and repair",
    'C' => "Energy production and conversion",
    'E' => "Amino acid transport and metabolism",
    'F' => "Nucleotide transport and metabolism",
    'G' => "Carbohydrate transport and metabolism",
    'H' => "Coenzyme transport and metabolism",
    'I' => "Lipid transport and metabolism",
    'P' => "Inorganic ion transport and metabolism",
    'Q' => "Secondary metabolites biosynthesis, transport, and catabolism",
    'R' => "General function prediction only",
    'S' => "Function unknown",
)
COG_cat_group_map = Dict(
    'D' => "CELLULAR PROCESSES AND SIGNALING",
    'M' => "CELLULAR PROCESSES AND SIGNALING",
    'N' => "CELLULAR PROCESSES AND SIGNALING",
    'O' => "CELLULAR PROCESSES AND SIGNALING",
    'T' => "CELLULAR PROCESSES AND SIGNALING",
    'U' => "CELLULAR PROCESSES AND SIGNALING",
    'V' => "CELLULAR PROCESSES AND SIGNALING",
    'W' => "CELLULAR PROCESSES AND SIGNALING",
    'Y' => "CELLULAR PROCESSES AND SIGNALING",
    'Z' => "CELLULAR PROCESSES AND SIGNALING",
    'A' => "INFORMATION STORAGE AND PROCESSING",
    'B' => "INFORMATION STORAGE AND PROCESSING",
    'J' => "INFORMATION STORAGE AND PROCESSING",
    'K' => "INFORMATION STORAGE AND PROCESSING",
    'L' => "INFORMATION STORAGE AND PROCESSING",
    'C' => "METABOLISM",
    'E' => "METABOLISM",
    'F' => "METABOLISM",
    'G' => "METABOLISM",
    'H' => "METABOLISM",
    'I' => "METABOLISM",
    'P' => "METABOLISM",
    'Q' => "METABOLISM",
    'R' => "UNKNOWN FUNCTION",
    'S' => "UNKNOWN FUNCTION",
);
ogg_annotations =
    @chain begin
        CSV.read(datadir("exp_raw", "UP7047", "2_annotations.tsv"), DataFrame)
        coalesce.("")
        select(Not(:level))
        transform(
            :category => ByRow(x -> !contains(x, r"S|R")) => :isannotated,
            :category => ByRow(x-> join(unique([COG_cat_desc_map[c] for c in x]), " & ")) => :category_description,
            :category => ByRow(x-> join(unique([COG_cat_group_map[c] for c in x]), " & ")) => :category_group,
        )
    end
206782×6 DataFrame
206757 rows omitted
Row ogg_name category description isannotated category_description category_group
String7 String7 String Bool String String
1 2Z7HP S Tocopherol cyclase false Function unknown UNKNOWN FUNCTION
2 2Z7HQ S Protein of unknown function (DUF1329) false Function unknown UNKNOWN FUNCTION
3 2Z7HR S Domain of unknown function (DUF4474) false Function unknown UNKNOWN FUNCTION
4 2Z7HS S Psort location CytoplasmicMembrane, score false Function unknown UNKNOWN FUNCTION
5 2Z7HT S Protein of unknown function (DUF1822) false Function unknown UNKNOWN FUNCTION
6 2Z7HU S false Function unknown UNKNOWN FUNCTION
7 2Z7HV S false Function unknown UNKNOWN FUNCTION
8 2Z7HW S InterPro IPR010496 false Function unknown UNKNOWN FUNCTION
9 2Z7HX S TraU protein false Function unknown UNKNOWN FUNCTION
10 2Z7HY S Phage major capsid protein, P2 family false Function unknown UNKNOWN FUNCTION
11 2Z7HZ S Domain of unknown function (DUF4392) false Function unknown UNKNOWN FUNCTION
12 2Z7I0 L Restriction endonuclease EcoRV true Replication, recombination and repair INFORMATION STORAGE AND PROCESSING
13 2Z7I1 S false Function unknown UNKNOWN FUNCTION
206771 COG5652 V VanZ like family true Defense mechanisms CELLULAR PROCESSES AND SIGNALING
206772 COG5653 M Protein involved in cellulose biosynthesis true Cell wall/membrane/envelope biogenesis CELLULAR PROCESSES AND SIGNALING
206773 COG5654 K RES domain protein true Transcription INFORMATION STORAGE AND PROCESSING
206774 COG5655 L RePlication protein true Replication, recombination and repair INFORMATION STORAGE AND PROCESSING
206775 COG5658 S integral membrane protein false Function unknown UNKNOWN FUNCTION
206776 COG5659 L transposition true Replication, recombination and repair INFORMATION STORAGE AND PROCESSING
206777 COG5660 K integral membrane protein true Transcription INFORMATION STORAGE AND PROCESSING
206778 COG5661 O secreted Zn-dependent protease true Post-translational modification, protein turnover, and chaperones CELLULAR PROCESSES AND SIGNALING
206779 COG5662 K AntiSigma factor true Transcription INFORMATION STORAGE AND PROCESSING
206780 COG5663 F Belongs to the 5'(3')-deoxyribonucleotidase family true Nucleotide transport and metabolism METABOLISM
206781 COG5664 O Bacterial protein of unknown function (DUF922) true Post-translational modification, protein turnover, and chaperones CELLULAR PROCESSES AND SIGNALING
206782 COG5665 L phosphatidylcholine phospholipase C activity true Replication, recombination and repair INFORMATION STORAGE AND PROCESSING
df = leftjoin(
    select(sigtestresults, [:nodeids, :nodeheight, :tree_level, :ogg_name,]),
    ogg_annotations,
    on=:ogg_name,
)
pltdf =
    @chain df begin
        coalesce.("")
        groupby(:tree_level)
        combine(:isannotated => mean)
    end
3×2 DataFrame
Row tree_level isannotated_mean
String15 Float64
1 phylum level 0.838554
2 species level 0.821844
3 strain level 0.571963
@df pltdf bar(:tree_level, :isannotated_mean,
    ylims=(0,1),
    size=(300, 400),
    xrotation=20,
    ylabel="fraction annotated",
)
savefig(joinpath(pdir, "fraction_oggs_annotated_allnodes.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/reviewer_plots/fraction_oggs_annotated_allnodes.pdf"

Acetate specific annotations

strvar_tree = readnw(open(readline, datadir("exp_pro","BB669","strvar-spitree.nw")));
met_mtx = biobank["metabolites_foldchange"][:, "Acetate_rel"].X;
met_ids = biobank.obs_names;
# nodes where the average change in aceate is 2 times higher in one group vs the other
acetate_differences =
    map(prewalk(strvar_tree)) do node
        (isleaf(node) || any(isleaf.(children(node)))) && return missing
        nodeid = id(node)
        g1 = met_mtx[indexin(getleafnames(node[1]), met_ids)]
        g2 = met_mtx[indexin(getleafnames(node[2]), met_ids)]
        effect = abs(mean(g1) - mean(g2))
        return (; nodeid, effect)
    end |> skipmissing |> DataFrame |> df -> (filter(:effect => >=(1), df))
12×2 DataFrame
Row nodeid effect
UInt16 Float64
1 2 1.10003
2 88 1.97374
3 136 1.28064
4 302 3.38908
5 383 1.43674
6 385 2.48998
7 435 1.39884
8 483 1.31227
9 491 1.04347
10 631 1.15746
11 649 1.111
12 685 1.02646
@chain sigtestresults begin
    subset(:nodeids => ByRow((acetate_differences.nodeid)))
    @aside @info "number of sig oggs: $(nrow(_)); over $(length(unique(_.nodeids))) unique nodes: $(unique(_.nodeids))" 
    groupby([:tree_level, :nodeids]) 
    combine(nrow)
    sort(:nrow, rev=true)
end
┌ Info: number of sig oggs: 728; over 10 unique nodes: [2, 88, 136, 302, 383, 385, 435, 631, 649, 685]
└ @ Main /Users/bend/projects/Doran_etal_2023/notebooks/jl_notebook_cell_df34fa98e69747e1a8f8a730347b8e2f_X21sZmlsZQ==.jl:3
10×3 DataFrame
Row tree_level nodeids nrow
String15 Int64 Int64
1 species level 385 296
2 species level 2 118
3 species level 302 113
4 species level 383 82
5 strain level 435 41
6 strain level 136 38
7 strain level 88 17
8 strain level 685 11
9 strain level 631 7
10 strain level 649 5
colordict = Dict(
    k => v for (k, v) in zip(sort(unique(ogg_annotations.category_group)), palette(:seaborn_colorblind))
)
Dict{String, RGB{Float64}} with 10 entries:
  "CELLULAR PROCESSES AND SIGNALIN… => RGB{Float64}(0.00392157,0.45098,0.698039)
  "INFORMATION STORAGE AND PROCESS… => RGB{Float64}(0.792157,0.568627,0.380392)
  "CELLULAR PROCESSES AND SIGNALIN… => RGB{Float64}(0.870588,0.560784,0.0196078)
  "INFORMATION STORAGE AND PROCESS… => RGB{Float64}(0.8,0.470588,0.737255)
  "METABOLISM"                      => RGB{Float64}(0.984314,0.686275,0.894118)
  "CELLULAR PROCESSES AND SIGNALIN… => RGB{Float64}(0.00784314,0.619608,0.45098)
  "UNKNOWN FUNCTION"                => RGB{Float64}(0.337255,0.705882,0.913725)
  "METABOLISM & INFORMATION STORAG… => RGB{Float64}(0.92549,0.882353,0.2)
  "INFORMATION STORAGE AND PROCESS… => RGB{Float64}(0.835294,0.368627,0.0)
  "METABOLISM & CELLULAR PROCESSES… => RGB{Float64}(0.580392,0.580392,0.580392)
strain_level_acetate_oggs =
    @chain sigtestresults begin
        subset(
            :nodeids => ByRow((acetate_differences.nodeid)),
            :tree_level => ByRow(==("strain level")),
        )
        # select([:nodeids, :grp1_N, :grp2_N, :ogg_name])
        select([:ogg_name])
        groupby(:ogg_name)
        combine(nrow)
        leftjoin(ogg_annotations, on=:ogg_name)
        unique
        # subset(:isannotated => ByRow(==(true)))
        sort([:nrow, :category, :ogg_name])
        # groupby([:nodeids])
        # combine(:isannotated => mean)
    end

labs, vals =
    @chain begin
        countmap(strain_level_acetate_oggs.category_group)
        sort(byvalue=true, rev=true)
        (collect(keys(_)), collect(values(_)))
    end


pie(labs, vals,
    c=getindex.(Ref(colordict), labs),
    size=(550, 500),
    legend=:outerbottom,
    title="Strain level"
)
thetas = 2pi * (cumsum(vals / sum(vals)) - ((vals / sum(vals)) / 2))
r = 1.1
annotate!(r * cos.(thetas), r * sin.(thetas), text.(string.(vals), 6))
savefig(joinpath(pdir, "acetate_strain_level_ogg_category_groups.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/reviewer_plots/acetate_strain_level_ogg_category_groups.pdf"
species_level_acetate_oggs =
    @chain sigtestresults begin
        subset(
            :nodeids => ByRow((acetate_differences.nodeid)),
            :tree_level => ByRow(==("species level")),
        )
        # select([:nodeids, :grp1_N, :grp2_N, :ogg_name])
        select([:ogg_name])
        groupby(:ogg_name)
        combine(nrow)
        leftjoin(ogg_annotations, on=:ogg_name)
        unique
        # subset(:isannotated => ByRow(==(true)))
        sort([:nrow, :category, :ogg_name])
        # groupby([:nodeids])
        # combine(:isannotated => mean)
    end

labs, vals =
    @chain begin
        countmap(species_level_acetate_oggs.category_group)
        sort(byvalue=true, rev=true)
        (collect(keys(_)), collect(values(_)))
    end


pie(labs, vals,
    c=getindex.(Ref(colordict), labs),
    size=(550, 500),
    legend=:outerbottom,
    title="Species level",
)
thetas = 2pi * (cumsum(vals / sum(vals)) - ((vals / sum(vals)) / 2))
r = 1.1
annotate!(r * cos.(thetas), r * sin.(thetas), text.(string.(vals), 6))
savefig(joinpath(pdir, "acetate_species_level_ogg_category_groups.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/reviewer_plots/acetate_species_level_ogg_category_groups.pdf"

All Metabolites

# get metabolite data
metabolite_names_full = biobank["metabolites_foldchange"].var.label;
bb_met_lfc = biobank["metabolites_foldchange"].X[:, :];

# filter to metaboltes with at least 10% measureable data
measurable_metabolites_mask = mapslices(c->mean(c .==0.) < .9, bb_met_lfc[biobank.obs.kept_species .== 1, :], dims=1)|>vec;
keepmetabolites_mask = measurable_metabolites_mask;
metabolite_names = metabolite_names_full[keepmetabolites_mask];
metabolite_label = biobank["metabolites_foldchange"].var.label[keepmetabolites_mask];
metabolite_ids = biobank["metabolites_foldchange"].var.ID[keepmetabolites_mask];
met_mtx = bb_met_lfc[:, keepmetabolites_mask]
met_ids = biobank.obs_names;
strvar_tree = readnw(open(readline, datadir("exp_pro","BB669","strvar-spitree.nw")));
# nodes where the average change in aceate is 2 times higher in one group vs the other
metabolite_differences =
    map(prewalk(strvar_tree)) do node
        (isleaf(node) || any(isleaf.(children(node)))) && return missing
        nodeid = id(node)
        g1 = met_mtx[indexin(getleafnames(node[1]), met_ids), :]
        g2 = met_mtx[indexin(getleafnames(node[2]), met_ids), :]
        effect = abs.(mean(g1, dims=1) .- mean(g2, dims=1))
        return (; nodeid, effect)
    end

threshold = 1 # 2-fold difference

node_selectors =
    @chain metabolite_differences begin
        skipmissing
        DataFrame
        transform(:effect => ByRow(x -> tuple((x .> threshold)...)) => "effect_" .* metabolite_ids)
        select(Not(:effect))
        getindex.(Ref(_.nodeid), eachcol(_)[2:end])
    end;
df =
    map(metabolite_label, node_selectors) do ml, ns
        @chain sigtestresults begin
            subset(:nodeids => ByRow((ns)))
            select([:tree_level, :ogg_name])
            groupby([:tree_level, :ogg_name])
            combine(nrow)
            leftjoin(ogg_annotations, on=:ogg_name)
            unique
            sort([:nrow, :category, :ogg_name])
            insertcols(1, :metabolite_label => ml)
        end
    end |> x -> reduce(vcat, x)
pltdf =
    @chain df begin
        groupby([:metabolite_label, :tree_level, :category_group])
        combine(nrow)
        groupby([:metabolite_label, :tree_level])
        transform(
            :nrow => (x -> x ./ sum(x)) => :nrow_frac
        )
        groupby([:tree_level, :category_group])
        combine(
            :nrow_frac => mean,
            :nrow_frac => std,
            :nrow_frac => median,
            :nrow_frac => iqr,
        )
        sort([:tree_level, :nrow_frac_mean, :category_group])
    end
22×6 DataFrame
Row tree_level category_group nrow_frac_mean nrow_frac_std nrow_frac_median nrow_frac_iqr
String15 String? Float64 Float64 Float64 Float64
1 phylum level CELLULAR PROCESSES AND SIGNALING & METABOLISM 0.00240964 4.54848e-19 0.00240964 0.0
2 phylum level INFORMATION STORAGE AND PROCESSING & CELLULAR PROCESSES AND SIGNALING 0.00481928 9.09697e-19 0.00481928 0.0
3 phylum level METABOLISM & INFORMATION STORAGE AND PROCESSING 0.00722892 0.0 0.00722892 0.0
4 phylum level METABOLISM & CELLULAR PROCESSES AND SIGNALING 0.0216867 0.0 0.0216867 0.0
5 phylum level INFORMATION STORAGE AND PROCESSING 0.13253 2.91103e-17 0.13253 0.0
6 phylum level UNKNOWN FUNCTION 0.161446 2.91103e-17 0.161446 0.0
7 phylum level CELLULAR PROCESSES AND SIGNALING 0.306024 5.82206e-17 0.306024 0.0
8 phylum level METABOLISM 0.363855 5.82206e-17 0.363855 0.0
9 species level METABOLISM & INFORMATION STORAGE AND PROCESSING 0.00661051 0.00263808 0.00559246 0.00354854
10 species level METABOLISM & CELLULAR PROCESSES AND SIGNALING 0.00788784 0.00386172 0.00840574 0.004182
11 species level INFORMATION STORAGE AND PROCESSING & CELLULAR PROCESSES AND SIGNALING 0.013102 0.00595208 0.0137633 0.00927329
12 species level INFORMATION STORAGE AND PROCESSING 0.130227 0.100512 0.110672 0.0882675
13 species level UNKNOWN FUNCTION 0.209926 0.0964315 0.188362 0.169214
14 species level CELLULAR PROCESSES AND SIGNALING 0.234602 0.0220361 0.237209 0.0187001
15 species level METABOLISM 0.425399 0.0809253 0.430108 0.122563
16 strain level METABOLISM & CELLULAR PROCESSES AND SIGNALING 0.00904381 0.00679671 0.00729927 0.00388252
17 strain level INFORMATION STORAGE AND PROCESSING & CELLULAR PROCESSES AND SIGNALING 0.0102366 0.00388368 0.00928793 0.00342857
18 strain level METABOLISM & INFORMATION STORAGE AND PROCESSING 0.0449953 0.0712116 0.00555556 0.0550548
19 strain level METABOLISM 0.132385 0.045579 0.122262 0.0422955
20 strain level CELLULAR PROCESSES AND SIGNALING 0.198778 0.0593026 0.185759 0.0551051
21 strain level INFORMATION STORAGE AND PROCESSING 0.248418 0.191368 0.204357 0.0407011
22 strain level UNKNOWN FUNCTION 0.486934 0.103846 0.508 0.0976351
plt_labels = names(unstack(pltdf, :tree_level, :category_group, :nrow_frac_mean))[2:end]
x_labels = unique(pltdf.tree_level) |> sort
pltmtx_mean = Matrix(coalesce.(unstack(pltdf, :tree_level, :category_group, :nrow_frac_mean), 0.0)[:, 2:end])
pltmtx_std = Matrix(coalesce.(unstack(pltdf, :tree_level, :category_group, :nrow_frac_std), 0.0)[:, 2:end])
groupedbar(pltmtx_mean, 
    yerrorbar=pltmtx_std,
    label=permutedims(plt_labels),
    xticks = (1:3, x_labels),
    legend=:outerbottom,
    size=(650,600),
    ylims=(0, .75),
    c=getindex.(Ref(colordict), plt_labels) |> permutedims,
    ylabel="fraction",
    title="all metabolites",
    )
savefig(joinpath(pdir,"all_metabolites_groupedbar.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/reviewer_plots/all_metabolites_groupedbar.pdf"