Fig. 2 - Convergent evolution example

Author

Benjamin Doran

Published

January 17, 2025

Julia Setup

using DrWatson
@quickactivate projectdir()

using SpectralInference # custom package from $projectdir/src
using NewickTree
using Gotree_jll, Goalign_jll
using StatsBase, NeighborJoining
using Distances, Clustering
using DataFrames, CSV
using Muon
using Symbolics
using StatsPlots
theme(:default, grid=false, label=false, tickdir=:out)
using LinearAlgebra
using LaTeXStrings
include(srcdir("helpers.jl"))
heatmapcols = [:purple, :black, :yellow];
generationcols = ["#B3D2FC" "#2B689E" "#338591"];

Main - Example with convergent evolution

Spectral decomposition’s like SVD and PCA are based on co-variation, therefore they incorporate information about contextual dependence between features and taxa. This means that Spectral Inference can infer the correct ancestral tree even in cases of convergent evolution. Below we created an alignment of 18 taxa described by 9 features, where there are multiple evolutionary paths to get to each feature, i.e., each feature is only a good genomic feature that matches a true ancestral divergence in the context of other features.

Spectral Inference is able to perfectly recapitulate this ancestral tree, where standard phylogenetic tools fail.

M = Float64.([
    1 0 1 0 0 0 1 0 1;
    0 1 1 0 0 0 1 0 1;
    1 1 0 0 0 0 1 0 1;
    1 0 1 0 0 0 0 1 1;
    0 1 1 0 0 0 0 1 1;
    1 1 0 0 0 0 0 1 1;
    1 0 1 0 0 0 1 1 0;
    0 1 1 0 0 0 1 1 0;
    1 1 0 0 0 0 1 1 0;
    1 0 1 1 0 1 0 0 0;
    0 1 1 1 0 1 0 0 0;
    1 1 0 1 0 1 0 0 0;
    1 0 1 0 1 1 0 0 0;
    0 1 1 0 1 1 0 0 0;
    1 1 0 0 1 1 0 0 0;
    1 0 1 1 1 0 0 0 0;
    0 1 1 1 1 0 0 0 0;
    1 1 0 1 1 0 0 0 0;
]);
heatmap(M, 
    c=[:white, :black], 
    ratio=1, 
    xticks=(1:9, ["Pos $i" for i in 1:9]),
    xrotation=45,
    yticks=(1:18, 'a':'r'),
    yflip=true,
    size=(300, 600),
    margin=5Plots.mm,
)

usv = svd(M);

plot(
    ylabel="explained variance (%)", 
    xlabel="spectral component", 
    legend=:none, 
    ylims=(0,1), xlims=(.5,9.5), 
    size=(500,500)
)
scatter!((usv.S.^2)/sum(usv.S.^2), 
    xticks=1:14, c=:black, markersize=5,
)
annotate!((1:length(usv.S)) .+ .1 , (usv.S.^2)/sum(usv.S.^2).+.06, round.((usv.S.^2)/sum(usv.S.^2)*100, digits=1))

Phylogenetic inference with FastME, PhyML, and SPI

In the tree plots below only branches with an orange dot are statistically supported.

toyMSA_18x9_dir = datadir("sims", "toyMSA_18x9") |> mkpath
# write out alignment
writephylip(
    joinpath(toyMSA_18x9_dir, "MSA.phylip"),
    join.(eachrow((replace(M, 0.0 => 'A', 1.0 => 'T')))),
    collect('a':'r')
)

# FastME
run(`julia $(projectdir("scripts", "runners", "runFastME.jl")) 
    -i $(datadir("sims", "toyMSA_18x9", "MSA.phylip"))
    -o $(projectdir("_research", "toyMSA_18x9", "FastME"))
    -m JC69
`)
# PhyML
run(`julia $(projectdir("scripts", "runners", "runPhyML.jl")) 
    -i $(datadir("sims", "toyMSA_18x9", "MSA.phylip"))
    -o $(projectdir("_research", "toyMSA_18x9", "PhyML"))
    -m JC69
`)
# SpectralInference
run(`julia $(projectdir("scripts", "runners", "runSPI.jl")) 
    -i $(datadir("sims", "toyMSA_18x9", "MSA.phylip"))
    -o $(projectdir("_research", "toyMSA_18x9", "SPI"))
    -m JC69
`);
pdir_toyMSA_18x9 = plotsdir("toyMSA_18x9") |> mkpath

  Activating project at `~/projects/Doran_etal_2023`

[ Info: Starting FastME on MSA
[ Info: using Booster to compute support values
[ Info: stopping run
 ──────────────────────────────────────────────────────────────────────
                              Time                    Allocations      
                     ───────────────────────   ────────────────────────
  Tot / % measured:       586ms /  91.3%           51.6MiB /  89.7%    

 Section     ncalls     time    %tot     avg     alloc    %tot      avg
 ──────────────────────────────────────────────────────────────────────
 total            1    535ms  100.0%   535ms   46.3MiB  100.0%  46.3MiB
   booster        1    220ms   41.1%   220ms   1.16MiB    2.5%  1.16MiB
   FastME         1    181ms   33.9%   181ms   3.29MiB    7.1%  3.29MiB
 ──────────────────────────────────────────────────────────────────────
┌ Info: timing
│   show(time) = nothing
└   println("") = nothing

  Activating project at `~/projects/Doran_etal_2023`

[ Info: Starting PhyML on MSA
[ Info: stopping run
 ────────────────────────────────────────────────────────────────────
                            Time                    Allocations      
                   ───────────────────────   ────────────────────────
 Tot / % measured:      375ms /  87.4%           50.1MiB /  89.4%    

 Section   ncalls     time    %tot     avg     alloc    %tot      avg
 ────────────────────────────────────────────────────────────────────
 total          1    328ms  100.0%   328ms   44.8MiB  100.0%  44.8MiB
   PhyML        1    194ms   59.2%   194ms   2.97MiB    6.6%  2.97MiB
 ────────────────────────────────────────────────────────────────────
┌ Info: timing
│   show(time) = nothing
└   println("") = nothing

  Activating project at `~/projects/Doran_etal_2023`

[ Info: Starting SPI inference
[ Info: Setting up workspace
[ Info: Running SPI
[ Info: Writing out SPI Tree
[ Info: Starting Bootstrap with 100
[ Info: Writing out Bootstrap trees
[ Info: using Booster to compute support values
[ Info: Finishing run
 ────────────────────────────────────────────────────────────────────────────────
                                        Time                    Allocations      
                               ───────────────────────   ────────────────────────
       Tot / % measured:            1.68s /  95.9%            294MiB /  98.0%    

 Section               ncalls     time    %tot     avg     alloc    %tot      avg
 ────────────────────────────────────────────────────────────────────────────────
 total                      1    1.61s  100.0%   1.61s    289MiB  100.0%   289MiB
   running SPI              1    930ms   57.8%   930ms    167MiB   58.0%   167MiB
   running bootstra...      1    183ms   11.4%   183ms   67.4MiB   23.3%  67.4MiB
 ────────────────────────────────────────────────────────────────────────────────
┌ Info: 
│ timing
│   show(time) = nothing
└   println("") = nothing

"/Users/bend/projects/Doran_etal_2023/plots/toyMSA_18x9"

method = "FastME"
run(pipeline(`$(gotree()) draw svg -c -w 400 -H 400 --support-cutoff .5 --with-branch-support`,
    stdin=projectdir("_research", "toyMSA_18x9", method, "MSA-supporttree.nw"),
    stdout=joinpath(pdir_toyMSA_18x9, method * ".svg")
))
show_svg(joinpath(pdir_toyMSA_18x9, method * ".svg"))

method = "PhyML"
run(pipeline(`$(gotree()) draw svg -c -w 400 -H 400 --support-cutoff .5 --with-branch-support`,
    stdin=projectdir("_research", "toyMSA_18x9", method, "MSA.phylip-supporttree.txt"),
    stdout=joinpath(pdir_toyMSA_18x9, method * ".svg")
))
show_svg(joinpath(pdir_toyMSA_18x9, method * ".svg"))

method = "SPI"
run(pipeline(`$(gotree()) draw svg -c -w 400 -H 400 --support-cutoff .5 --with-branch-support`,
    stdin=projectdir("_research", "toyMSA_18x9", method, "MSA-supporttree.nw"),
    stdout=joinpath(pdir_toyMSA_18x9, method * ".svg")
))
show_svg(joinpath(pdir_toyMSA_18x9, method * ".svg"))

How is information encoded about convergent processes?

We show that subsets of principal components, hold information that project taxa to different positions based on the ancestral path they took to obtain a particular genomic feature.

bar(M[:, 5],
    xticks=(1:18, 'a':'r'),
    xlabel="taxa",
    ylabel="Pos 5",
    size=(600, 300),
)
vline!([9.5])

taxa a-i and j-r split at the first generation. using position 5 as a gene marker would place j-l with a-i as they both lost this feature through convergent processes.

Mspectrallyfiltered = usv.U[:, 5:8] * Diagonal(usv.S[5:8]) * usv.Vt[5:8, :]
heatmap(Mspectrallyfiltered, 
    c=:bwr, 
    ratio=1, 
    xticks=(1:9, ["Pos $i" for i in 1:9]),
    xrotation=45,
    yticks=(1:18, 'a':'r'),
    yflip=true,
    size=(300, 600),
    margin=5Plots.mm,
)

recreated alignment matrix from components 5-8. These components shows the mutations incuded in generation F2 in blue

bar(Mspectrallyfiltered[:, 5],
    xticks=(1:18, 'a':'r'),
    xlabel="taxa",
    ylabel="Pos 5",
    size=(600, 300),
    ylims=(-.6667, .33334),
    widen=true,
    yticks=([-0.666, 0.0, .3333], [L"\frac{-2}{3}", L"0", L"\frac{1}{3}"])
)
vline!([9.5])

recreating the alignment using components 5-8, transforms position 5 in such a way that taxa j-l are distict because they lost this feature in a different context as to taxa a-i

We show in this case as well that information regarding more recent generational differences correlate to deeper principal components

spectralcorrs = map([i:(i+2) for i in 1:(size(M,2)-2)]) do window
    spectralcorrelations(usv.U, window)
end;

F1mask = kron([1 0; 0 1], ones(3,3), ones(3,3))
F2mask = kron(Diagonal(ones(6)), ones(3,3));
uppertriangle = triu(trues(18, 18), 1);

f1mi = map(spectralcorrs) do spcorr 
    empiricalMI(spcorr[uppertriangle], (F1mask .== 1)[uppertriangle]) # edges=-1:0.001:1
end
f2mi = map(spectralcorrs) do spcorr 
    empiricalMI(spcorr[uppertriangle], (F2mask .== 1)[uppertriangle]) # edges=-1:0.001:1
end

plot(
    ylabel="Cumulative MI\n (density)",
    xlabel="Principal component Window\n [Principal component start to end]",
    yticks=[0.0, .5, 1.0],
    xticks=(2:8, ["[$i to $(i+2)]" for i in 1:7]),
    xrotation=45,
    margin=5Plots.mm,
)
plot!(scaledcumsum(vcat(0, f1mi)), c=:red, marker=true, label="F1", lw=2,)
plot!(scaledcumsum(vcat(0, f2mi)), c=:orange, marker=true, label="F2", lw=2,)

Fig. S5 - Timing benchmark

Spectral Inference is fast, it is based on PCA. So it has the potential to scale quite large. Thousands of taxa on a laptop, larger using clusters and distributed computing.

Here we just show that Spectral Inference (SPI) does not grow exponentially with the number of taxa.

## create UniProt subsets
uniprot = readh5ad(datadir("exp_raw", "UP7047", "2020_02_UP7047.h5ad"))
rowmeta = uniprot.obs;

outdir = datadir("exp_pro", "UPsubsetMSAs") |> mkpath
pdir = plotsdir("timeing_benchmark") |> mkpath

"/Users/bend/projects/Doran_etal_2023/plots/timeing_benchmark"

Subset to different number of taxa

25 taxa at Genus level (Ruminococcus)
50 taxa at Family level (Rhodospirillaceae)
103 taxa at Order level (Oceanospirillales)
211 taxa at Class level (Bacteroidia)

write out each subset as pseudo-alignment

## Genus
genus_choice_df = combine(nrow, groupby(rowmeta, :Genus)) |>
    df->sort(df, :nrow, rev=true) |>
    df->filter(:nrow=>x->24 .< x .< 27, df)
display(genus_choice_df)

mask = findall(==("Ruminococcus"), rowmeta.Genus);
subsetMSA = Char.(replace(uniprot.X[mask, :] .> 0.0, true=>'T', false=>'A'))
open(joinpath(outdir, "UPsubset_taxa=25_level=Genus_group=Ruminococcus_alphabet=AT.phy"), "w") do io
    println(io, size(subsetMSA, 1), " ", size(subsetMSA, 2))
    for (id, row) in zip(rowmeta.proteomeID[mask], eachslice(subsetMSA, dims=1))
        println(io, id, "    ", join(row))
    end
end

## Family
family_choice_df = combine(nrow, groupby(rowmeta, :Family)) |>
    df->sort(df, :nrow, rev=true) |>
    df->filter(:nrow=>x->48 .< x .< 52, df)
display(family_choice_df)

mask = findall(==("Rhodospirillaceae"), rowmeta.Family);
subsetMSA = Char.(replace(uniprot.X[mask, :] .> 0.0, true=>'T', false=>'A'))
open(joinpath(outdir, "UPsubset_taxa=50_level=Family_group=Rhodospirillaceae_alphabet=AT.phy"), "w") do io
    println(io, size(subsetMSA, 1), " ", size(subsetMSA, 2))
    for (id, row) in zip(rowmeta.proteomeID[mask], eachslice(subsetMSA, dims=1))
        println(io, id, "    ", join(row))
    end
end

## Order
order_choice_df = combine(nrow, groupby(rowmeta, :Order)) |>
    df->sort(df, :nrow, rev=true) |>
    df->filter(:nrow=>x->95 .< x .< 105, df)
display(order_choice_df)

mask = findall(==("Oceanospirillales"), rowmeta.Order);
subsetMSA = Char.(replace(uniprot.X[mask, :] .> 0.0, true=>'T', false=>'A'))
open(joinpath(outdir, "UPsubset_taxa=103_level=Order_group=Oceanospirillales_alphabet=AT.phy"), "w") do io
    println(io, size(subsetMSA, 1), " ", size(subsetMSA, 2))
    for (id, row) in zip(rowmeta.proteomeID[mask], eachslice(subsetMSA, dims=1))
        println(io, id, "    ", join(row))
    end
end

## Class
class_choice_df = combine(nrow, groupby(rowmeta, :Class)) |>
    df->sort(df, :nrow, rev=true) |>
    df->filter(:nrow=>x->190 .< x .< 215, df)
display(class_choice_df)

mask = findall(==("Bacteroidia"), rowmeta.Class);
subsetMSA = Char.(replace(uniprot.X[mask, :] .> 0.0, true=>'T', false=>'A'))
open(joinpath(outdir, "UPsubset_taxa=211_level=Class_group=Bacteroidia_alphabet=AT.phy"), "w") do io
    println(io, size(subsetMSA, 1), " ", size(subsetMSA, 2))
    for (id, row) in zip(rowmeta.proteomeID[mask], eachslice(subsetMSA, dims=1))
        println(io, id, "    ", join(row))
    end
end

2×2 DataFrame

Row	Genus	nrow
	String	Int64
1	Helicobacter	26
2	Ruminococcus	25

2×2 DataFrame

Row	Family	nrow
	String	Int64
1	Alteromonadaceae	51
2	Rhodospirillaceae	50

2×2 DataFrame

Row	Order	nrow
	String	Int64
1	Oceanospirillales	103
2	Pseudomonadales	97

1×2 DataFrame

Row	Class	nrow
	String	Int64
1	Bacteroidia	211

## submit job to cluster to run all phylogenetic inference tasks in parallel
# run(`sbatch $(projectdir("scripts", "slurm-run-PI-on-UPsubsets.sbatch"))`)

time_per_run_df = CSV.read(projectdir("_research", "UPsubsetMSAs", "timeperjob.log"), DataFrame, delim="\t");

pltdf = sort(time_per_run_df, :Seq) |>
    df->filter(:Exitval=> ==(0), df) |>
    df->transform(df, :Command=>(x->replace.(x, r"(.*)((?<=\/t)[0-9]*)(.*)"=>s"\2"))=> :Ntaxa ) |>
    df->transform(df, :Command=>(x->replace.(x, r"(.*)((?<=runners\/run)[a-zA-Z]*)(.*)"=>s"\2"))=> :method);

pltdf = pltdf[:, [:method, :Ntaxa, :JobRuntime]];
pltdf.Ntaxa = parse.(Int, pltdf.Ntaxa);
pltdf = sort(pltdf, [:method, :Ntaxa]);
pltdf = subset(pltdf, :method => ByRow(!=("FastME")))
pltdf

20×3 DataFrame

Row	method	Ntaxa	JobRuntime
	String	Int64	Float64
1	FastTree	25	883.487
2	FastTree	50	2011.95
3	FastTree	103	4044.1
4	FastTree	211	6906.7
5	MrBayes	25	728.282
6	MrBayes	50	1730.24
7	MrBayes	103	7628.71
8	MrBayes	211	9608.94
9	PhyML	25	439.039
10	PhyML	50	2441.14
11	PhyML	103	7875.46
12	PhyML	211	10119.2
13	RAxML	25	4905.39
14	RAxML	50	8285.4
15	RAxML	103	10093.6
16	RAxML	211	13513.1
17	SPI	25	65.94
18	SPI	50	67.507
19	SPI	103	83.168
20	SPI	211	138.193

plot(title="Job runtime of phylogeny inference methods",
    ylabel="time (seconds)",
    yticks=([60, 600, 1800, 3600, 7200, 10800, 14400], ["1m", "10m", "30m", "1h", "2h", "3h", "4h"]),
    xticks=([25, 50, 103, 211], [
        "Genus:Ruminococcus\n #taxa=25",
        "Family:Rhodospirillaceae\n #taxa=50",
        "Order:Oceanospirillales\n #taxa=103",
        "Class:Bacteroidia\n #taxa=211",
         ]),
    xrotation=30,
    margin=5Plots.Measures.mm,
    ylims=(-100, 14500),
    xlims=(0, 220),
    legend=:topleft,
)
@df pltdf plot!(:Ntaxa, :JobRuntime, group=:method, markers=true,
    linewidth=2,
    markersize=6,
    markerstrokewidth=0,
    c=permutedims(vcat([1],3:7)))

plot(title="Job runtime of phylogeny inference methods",
    ylabel="time (seconds)",
    yticks=([60, 600, 1800, 3600, 7200, 10800, 14400], ["1m", "10m", "30m", "1h", "2h", "3h", "4h"]),
    xticks=([25, 50, 103, 211], [
        "Genus:Ruminococcus\n #taxa=25",
        "Family:Rhodospirillaceae\n #taxa=50",
        "Order:Oceanospirillales\n #taxa=103",
        "Class:Bacteroidia\n #taxa=211",
         ]),
    xrotation=30,
    margin=5Plots.Measures.mm,
    # ylims=(-100, 14500),
    # xlims=(0, 220),
    legend=:right,
)
@df pltdf plot!(:Ntaxa, :JobRuntime, group=:method, markers=true,
    linewidth=2,
    markersize=6,
    markerstrokewidth=0,
    c=permutedims(vcat([1],3:7)),
    yscale=:log10,
)

savefig(joinpath(pdir, "timeing_benchmark_logscale.pdf"))

"/Users/bend/projects/Doran_etal_2023/plots/timeing_benchmark/timeing_benchmark_logscale.pdf"