┌ Warning: Cannot join columns with the same name because var_names are intersecting.
└ @ Muon /Users/bend/.julia/packages/Muon/UKjAF/src/mudata.jl:367
# 356 species with >= 20 strain replicatesfull_train_mask = biobank.obs.kept_species;
trnYdf = biobank.obs[full_train_mask, :];
# spectral distances are invariant fold partitions # because everything is being projected into UniProt latent space# so precompute them...UPfullPCs = bbusv.U[full_train_mask, :] *Diagonal(bbusv.S[:]);partitions =getintervals(bbusv.S, alpha=1.5, q=0.75);Dij =spectraldistances(bbusv.U, bbusv.S, partitions) ./size(bbusv.V, 1); # 669 full CSBsubsetDij = Dij[full_train_mask, full_train_mask]; # 356 strain replicate CSB
# get metabolite datametabolite_names_full =replace.(biobank["metabolites_foldchange"].var_names, "_rel"=>"");bb_met_lfc = biobank["metabolites_foldchange"].X[:, :];bb_met_lfc[isinf.(bb_met_lfc)] .=0.0;metabolicdistance =pairwise(Euclidean(), bb_met_lfc; dims=1);# filter to metaboltes with at least 10% measureable datameasurable_metabolites_mask =mapslices(c ->mean(c .==0.0) <0.9, bb_met_lfc[full_train_mask, :], dims=1) |> vec;keepmetabolites_mask = measurable_metabolites_mask;metabolite_names = metabolite_names_full[keepmetabolites_mask];metabolite_label = biobank["metabolites_foldchange"].var.label[keepmetabolites_mask]# subset to 356 sample datasetmetab_trnY = bb_met_lfc[full_train_mask, keepmetabolites_mask];metab_bbextraY = bb_met_lfc[.!full_train_mask, keepmetabolites_mask];
Plot tree and metabolites
When we look at the Spectral Tree in relation to metabolites, we see structure in that clades tend to have similar metabolite capacities.
We define the SLE-Lasso models, based on the tree structure by using a Lasso model to discover which clade branches (portions of the tree) are most important for describing particular metabolites.
# Takes 3 minK =1# Make predictions with SPI-LASSO on 1 nearest neighborλ =0.001lambdas =exp10.(range(-4, 0, length=101))lambdacol =last(findmin(x ->abs(x - λ), lambdas))REPS =5NFOLDS =4adjust_rsquared(r2, n, df) =1- (1- r2) * ((n -1) / (n -1- df))
adjust_rsquared (generic function with 1 method)
seed!(424242) # this is stable within julia versions, for exact results use Julia 1.10cv =StratifiedCV(nfolds=NFOLDS, shuffle=true);folds =vcat([train_test_pairs(cv, 1:sum(full_train_mask), trnYdf.Species) for i in1:REPS]...)oof_preds_df_stacked =DataFrame()oof_dropout_preds_df_stacked =DataFrame()inf_preds_df_stacked =DataFrame()coefdf =DataFrame()models_tbl = []for (i, (fold_trn, fold_tst)) incollect(enumerate(folds))# Use tree to get lineage traces for each training and test sample# using projections of taxa into UniProt so these loadings are constant regardless of folds foldPCs = UPfullPCs[fold_trn, :]# trn_nns = map(r->partialsortperm(r, 1:K), eachrow(subsetDij[fold_trn, fold_trn])) tst_nns =map(r ->partialsortperm(r, 1:K), eachrow(subsetDij[fold_tst, fold_trn]))# oob_nns = map(r -> partialsortperm(r, 1:K), eachrow(subset_oob_Dij[:, fold_trn])) bbextra_nns =map(r ->partialsortperm(r, 1:K), eachrow(subset_bbextra_Dij[:, fold_trn]))# UPGMA tree building... foldhc =UPGMA_tree(subsetDij[fold_trn, fold_trn]) foldtree =readnw(SpectralInference.newickstring(foldhc, trnYdf.Strain_ID[fold_trn])) ordered_treeids =getleafids(foldtree)[indexin(trnYdf.Strain_ID[fold_trn], getleafnames(foldtree))]# ladderize!(foldtree, rev=false)# Make SLE ancester encoding for training set trnX_all =@chainbeginspectral_lineage_encoding(foldtree, ordered_treeids)getfield.(:sle) stackfloat.(_)end isinternal_fold =map(!isleaf, prewalk(foldtree)) trnX = trnX_all[:, isinternal_fold] num_descendent_species =map(prewalk(foldtree)) do nodeif !isroot(node) sps = trnYdf.Species[indexin(getleafnames(parent(node)), trnYdf.Strain_ID)]returnlength(unique(sps))elseNaNendend# prepare mask for dropping out subspecies branches# any branch that only has a single species as desendents of its parent dropout_mask = num_descendent_species[isinternal_fold] .<=1.0# Get features for each out-of-fold isolate oofX =map(tst_nns) do nn trnX[nn, :] |> mtx ->mean(mtx, dims=1)# df -> combine(df, [c => mean for c in 1:size(df, 2)])end|> x ->vcat(x...)# fit lasso model individual_metabolite_results = []# (target_idx, (target, mlabel)) = (findfirst(==("Phenylacetate"), metabolite_label), ("Phenylacetate", "Phenylacetate"))for (target_idx, (target, mlabel)) inenumerate(zip(metabolite_names, metabolite_label)) mpath =glmnet(trnX, metab_trnY[fold_trn, target_idx], Normal(); lambda=lambdas, )# get coefficients dropping out subspecies branches betas_droppedout =Matrix(deepcopy(mpath.betas)) betas_droppedout[dropout_mask, :] .=0.0push!(models_tbl, (; metabolite_name = target, metabolite_label = mlabel, fold = ((i -1) % NFOLDS) +1, resample = ((i -1) ÷ NFOLDS) +1, model = mpath ))# save results of trained model# in fold predictions inf_preds_df_stacked =vcat(inf_preds_df_stacked, DataFrame(:row_id => fold_trn,:msk_id => trnYdf.Strain_ID[fold_trn],:metabolite_name => target,:metabolite_label => mlabel,:fold => ((i -1) % NFOLDS) +1,:resample => ((i -1) ÷ NFOLDS) +1,:truth => metab_trnY[fold_trn, target_idx],# :preds => GLMNet.predict(mpath, trnX)[:, lambdacol], [Symbol("preds_$i") => v for (i, v) inzip(lambdas, eachcol(GLMNet.predict(mpath, trnX)))]...))# out-of-fold predictions oof_preds_df_stacked =vcat(oof_preds_df_stacked, DataFrame(:row_id => fold_tst,:msk_id => trnYdf.Strain_ID[fold_tst],:metabolite_name => target,:metabolite_label => mlabel,:fold => ((i -1) % NFOLDS) +1,:resample => ((i -1) ÷ NFOLDS) +1,:truth => metab_trnY[fold_tst, target_idx],# :preds => GLMNet.predict(mpath, tstX)[:, lambdacol], [Symbol("preds_$i") => v for (i, v) inzip(lambdas, eachcol(GLMNet.predict(mpath, oofX)))]...))# out-of-fold predictions dropping out subspecies branches oof_dropout_preds_df_stacked =vcat(oof_dropout_preds_df_stacked, DataFrame(:row_id => fold_tst,:msk_id => trnYdf.Strain_ID[fold_tst],:metabolite_name => target,:metabolite_label => mlabel,:fold => ((i -1) % NFOLDS) +1,:resample => ((i -1) ÷ NFOLDS) +1,:truth => metab_trnY[fold_tst, target_idx],# :preds => GLMNet.predict(mpath, tstX)[:, lambdacol], [Symbol("preds_$i") => v for (i, v) inzip(lambdas, eachcol(oofX * betas_droppedout))]...))# coefs of model coefdf =vcat(coefdf, DataFrame(:metabolite_name => target,:metabolite_label => mlabel,:fold => ((i -1) % NFOLDS) +1,:resample => ((i -1) ÷ NFOLDS) +1,:num_species_descendents => num_descendent_species[isinternal_fold], [Symbol("coefs_$k") => v for (k, v) inzip(lambdas, eachcol(mpath.betas))]... ))endprintln("on $(i)th resample")endCSV.write(joinpath(rdir, "oof_predictions_stacked_SLE_lambda=many.csv"), oof_preds_df_stacked)CSV.write(joinpath(rdir, "oof_dropout_predictions_stacked_SLE_lambda=many.csv"), oof_dropout_preds_df_stacked)CSV.write(joinpath(rdir, "infold_predictions_stacked_SLE_lambda=many.csv"), inf_preds_df_stacked)CSV.write(joinpath(rdir, "coefs_SLE_lambda=many.csv"), coefdf)JLD2.save(joinpath(rdir, "models_SLE_lambda=many.jld2"), Dict("models"=> models_tbl))JLD2.save(joinpath(rdir, "folds_SLE_lambda=many.jld2"), @strdict(folds))
on 1th resample
on 2th resample
on 3th resample
on 4th resample
on 5th resample
on 6th resample
on 7th resample
on 8th resample
on 9th resample
on 10th resample
on 11th resample
on 12th resample
on 13th resample
on 14th resample
on 15th resample
on 16th resample
on 17th resample
on 18th resample
on 19th resample
on 20th resample
Predictive capacity under a ‘leave-one-out’ training scheme where entire species are removed from the training set. SLE-LASSO models are trained leaving out 1 of 11 species and tested on the single species that was left out. Each of the 10 species used for training contains 20 or more strains. Mean predictive capacity ± 1 standard deviation (error bars) are shown for the training sets (top) and the validation sets (bottom). ‘inf’ represents adjusted r2 values that could not be calculated.
usingDistributions: Normal# Takes 1 minK =1# Make predictions with SPI-LASSO on 1 nearest neighborλ =1e-3REPS =5NFOLDS =4lambdas =exp10.(range(0, -4, length=100));folds_l1o_species =map(sort(unique(trnYdf.Species))) do speciesname trn_idx =findall(!=(speciesname), trnYdf.Species) tst_idx =findall(==(speciesname), trnYdf.Species) (trn_idx, tst_idx)endresults = []for (metname, metlabel) inzip(metabolite_names, metabolite_label)for (i, (fold_trn, fold_tst)) incollect(enumerate(folds_l1o_species))# Use tree to get lineage traces for each training and test sample# using projections of taxa into UniProt so these loadings are constant regardless of folds foldPCs = UPfullPCs[fold_trn, :] trn_nns =map(r ->partialsortperm(r, 1:K), eachrow(subsetDij[fold_trn, fold_trn])) tst_nns =map(r ->partialsortperm(r, 1:K), eachrow(subsetDij[fold_tst, fold_trn]))# oob_nns = map(r->partialsortperm(r, 1:K), eachrow(subset_oob_Dij[:, fold_trn]))# UPGMA tree building... foldhc =UPGMA_tree(subsetDij[fold_trn, fold_trn]) foldtree =readnw(SpectralInference.newickstring(foldhc, trnYdf.Strain_ID[fold_trn]))# ladderize!(foldtree, rev=false)# Make SLE ancester encoding trnXdf_all =map(prewalk(foldtree)) do node tmp =zeros(length(fold_trn)) tmp[indexin(getleafnames(node), trnYdf.Strain_ID[fold_trn])] .=1"node__$(id(node))"=> tmpend|> DataFrame isinternal_fold =map(!isleaf, prewalk(foldtree))# reorder nodes by tree depth treedists =mapinternalnodes(foldtree) do nodenetwork_distance(foldtree, node)end trnXdf = trnXdf_all[:, isinternal_fold] trnXdf = trnXdf[:, sortperm(treedists)]rename!(trnXdf, ["node__$i" for i in1:size(trnXdf, 2)])# Get features for each out-of-fold isolate tstXdf =map(tst_nns) do nn trnXdf[nn, :] |> df ->combine(df, [c => mean for c in1:size(df, 2)])end|> x ->vcat(x...) trnX =Matrix(trnXdf) tstX =Matrix(tstXdf) trnY = metab_trnY[fold_trn, metabolite_names.==metname] |> vec tstY = metab_trnY[fold_tst, metabolite_names.==metname] |> vec# @show metname, size(trnX), size(trnY) modelpath =glmnet(trnX, trnY, Normal(); lambda=lambdas) n =size(trnX, 2) dfs =map(x ->sum(x .!=0), eachcol(modelpath.betas)) trnR2 =rsquared.(eachcol(GLMNet.predict(modelpath, trnX)), Ref(trnY)) tstR2 =rsquared.(eachcol(GLMNet.predict(modelpath, tstX)), Ref(tstY))push!(results, (;# model=modelpath, metabolite_name=metname, metabolite_label=metlabel, leftout_species=(only∘unique)(trnYdf.Species[fold_tst]), train_r2=trnR2, test_r2=tstR2, train_adjr2=adjust_rsquared.(trnR2, n, dfs), test_adjr2=adjust_rsquared.(tstR2, n, dfs), lambda=modelpath.lambda, model_dfs=dfs ))endend
Adjusted r2 (y-axis) versus penalty value of SLE-LASSO model (x-axis) for predicting metabolite concentrations across all strains. Adjusted r2 value plotted for both training and test set with peak predictive capacity of the test set delineated as a yellow dot in each plot (see key).
Mean magnitude of coefficients in SLE-model (y-axis) versus penalty value (x-axis) for each metabolite (panel). Dashed curves correspond to coefficients of SLE branches defining differences amongst strains belonging to the same species (‘inter-species variation’); solid curves corresponds to coefficients of SLE branches defining differences amongst strains belonging to different species (‘intra-species variation’). Solid vertical gray line in each plot delineates the penalty value for which the peak predictive SLE-LASSO model is observed as shown in Figure 7D.
seed!(424242)cv =StratifiedCV(nfolds=NFOLDS, shuffle=true);folds =vcat([train_test_pairs(cv, 1:sum(full_train_mask), trnYdf.Species) for i in1:REPS]...)oof_preds_df_stacked =DataFrame()inf_preds_df_stacked =DataFrame()coefdf =DataFrame()models_tbl = [](i, (fold_trn, fold_tst)) = (1, folds[1])# (i, (fold_trn, fold_tst)) = (2, folds[2])# Use tree to get lineage traces for each training and test sample# using projections of taxa into UniProt so these loadings are constant regardless of foldsfoldPCs = UPfullPCs[fold_trn, :]tst_nns =map(r ->partialsortperm(r, 1:K), eachrow(subsetDij[fold_tst, fold_trn]))# UPGMA tree building...foldhc =UPGMA_tree(subsetDij[fold_trn, fold_trn])foldtree =readnw(SpectralInference.newickstring(foldhc, trnYdf.Strain_ID[fold_trn]))ladderize!(foldtree, rev=false)ordered_treeids =getleafids(foldtree)[indexin(trnYdf.Strain_ID[fold_trn], getleafnames(foldtree))]# Make SLE ancester encoding for training settrnX_all =@chainbeginspectral_lineage_encoding(foldtree, ordered_treeids)getfield.(:sle) stackfloat.(_)endisinternal_fold =map(!isleaf, prewalk(foldtree))trnX = trnX_all[:, isinternal_fold]num_descendent_species =map(prewalk(foldtree)) do nodeif !isroot(node) sps = trnYdf.Species[indexin(getleafnames(parent(node)), trnYdf.Strain_ID)]returnlength(unique(sps))elseNaNendend# Get features for each out-of-fold isolateoofX =map(tst_nns) do nn trnX[nn, :] |> mtx ->mean(mtx, dims=1)end|> x ->vcat(x...)# fit lasso model(target_idx, (target, mlabel)) = (findfirst(==("Acetate"), metabolite_label), ("Acetate", "Acetate"))mdl = GLMNet.glmnetcv(trnX, metab_trnY[fold_trn, target_idx], Normal(); lambda=lambdas,)push!(models_tbl, (; metabolite_name=target, metabolite_label=mlabel, fold=((i -1) % NFOLDS) +1, resample=((i -1) ÷ NFOLDS) +1, fold_trn, fold_tst, foldtree=foldtree, model=mdl,))println("on $(i)th resample")
on 1th resample
# best lambda model for acetatebest_lambda_from_fig7 =only(@rsubset(bestlambdamodels, :metabolite_label =="Acetate").lambda)bestlambda_idx =last(findmin(x->abs(x-best_lambda_from_fig7), mdl.lambda))
Measured (y-axis) versus predicted (x-axis) relative concentration of metabolite where prediction is computed from the peak predictive SLE-LASSO model from Figure 7D. Dots are individual strains colored by species (see color key).