Setting up data

Author

Benjamin Doran

Published

January 17, 2025

# setup for local file handling
using DrWatson
@quickactivate projectdir()

# load extra libraries
using CSV, DataFrames, Muon, MAT
using StatsBase
include(srcdir("helpers.jl"))

getlims (generic function with 2 methods)

Install & Setup R packages

import CondaPkg; 
CondaPkg.activate!(ENV);
using RCall

R"""
options(repos=c(CRAN="https://repo.miserver.it.umich.edu/cran"))
install.packages("tidyverse")
install.packages("ggplot2")
install.packages("ape")

if (!require("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}

BiocManager::install("treeio")
BiocManager::install("ggtree")
"""

Check R packages

# Check loading package works
R"""
library(ape)
library(treeio)
library(ggtree)
library(ggplot2)
library(tidyverse)

setwd($(projectdir()))
""";

bbdir = datadir("exp_raw", "BB669")
updir = datadir("exp_raw", "UP7047")

"/Users/bend/projects/Doran_etal_2023/data/exp_raw/UP7047"

Read in the UniProt data and associated metadata.

The main data is a count matrix of 7047 bacterial strains described by 10,177 Orthologous Gene Groups (OGGs). Each element of the matrix shows the number (count) of sequences in the strain that belong to that OGG. Along with this are associated metadata for each strain and each orthologous gene group.

uniprot = readh5ad(joinpath(updir, "2020_02_UP7047.h5ad"))

AnnData object 7047 ✕ 10177

# count matrix
uniprot.X[1:5, 1:5]

5×5 Matrix{Float64}:
 1.0  0.0  1.0  2.0  1.0
 1.0  0.0  1.0  2.0  0.0
 1.0  0.0  1.0  3.0  1.0
 0.0  0.0  0.0  0.0  0.0
 1.0  0.0  1.0  2.0  0.0

first(uniprot.obs, 5)

5×17 DataFrame

Row	proteomeID	TaxID	Proteome_ID	Tax_ID	OSCODE	SUPERREGNUM	x__1_	x__2_	x__3_	SpeciesName	Kingdom	Phylum	Class	Order	Family	Genus	Species
	String	Int64	String	Int64	String	String	Int64	Int64	Int64	String	String	String	String	String	String	String	String
1	UP000283387	1261403	UP000283387	1261403		bacteria	4641	0	4643	Mangrovibacterium diazotrophicum	Bacteria	Bacteroidetes	Bacteroidia	Marinilabiliales	Prolixibacteraceae	Mangrovibacterium	Mangrovibacterium diazotrophicum
2	UP000181985	1897729	UP000181985	1897729		bacteria	3147	0	3150	Halomonas aestuarii	Bacteria	Proteobacteria	Gammaproteobacteria	Oceanospirillales	Halomonadaceae	Halomonas	Halomonas aestuarii
3	UP000220675	1962403	UP000220675	1962403		bacteria	4593	0	4595	Novosphingobium sp. PC22D	Bacteria	Proteobacteria	Alphaproteobacteria	Sphingomonadales	Sphingomonadaceae	Novosphingobium	Novosphingobium sp. PC22D
4	UP000013137	1188234	UP000013137	1188234		bacteria	601	0	601	Mycoplasma alkalescens 14918	Bacteria	Tenericutes	Mollicutes	Mycoplasmatales	Mycoplasmataceae	Mycoplasma	Mycoplasma alkalescens
5	UP000067399	1303921	UP000067399	1303921		bacteria	1467	0	1471	endosymbiont of Bathymodiolus septemdierum str. Myojin knoll	Bacteria	Proteobacteria	Gammaproteobacteria				Bathymodiolus septemdierum thioautotrophic gill symbiont

first(uniprot.var, 5)

5×6 DataFrame

Row	og	level	nm	description	COG_categories	OG_ID
	String	Int64	Int64	String	String	String
1	COG0001	2	4694	glutamate-1-semialdehyde 2,1-aminomutase activity	H	COG0001@2
2	COG0003	2	2423	Pfam Anion-transporting ATPase	P	COG0003@2
3	COG0002	2	3957	N-acetyl-gamma-glutamyl-phosphate reductase activity	E	COG0002@2
4	COG0004	2	5256	ammonium transporteR	P	COG0004@2
5	COG0005	2	3783	purine-nucleoside phosphorylase activity	F	COG0005@2

Read in Biobank data & make unified dataset

The BioBank (aka: Commensal Strain Bank, aka: CSB) is the strain bank we collected. It comprises 669 strains that have at least 1% distinct sequence identity to any other strain in the BioBank (i.e. we removed clonal duplicates).

All these strains were whole genome sequenced. And from that sequencing we annotated each strain by orthologous gene groups (OGG). This annotation was done for each strain by counting the number of coding regions that matched to each orthologous gene groups in the eggNOG database (v5.0) database and filtering out gene-groups that had no matches. This created the matrix in BB669_oggs.mat where strains correspond to each row and OGGs correspond to each column and each entry holds the number of protein sequences that matched to that OGG.

We also measured the metabolic phenotype of these strains across a number of short-chain fatty acids and other small molecules. The file B669_metabolites_foldchange.csv contains the measurements we use in this paper, where each row contains a strain and the columns contain metabolite compounds the strains either digest or produce. The degree of digestion or production of each metabolite is compared as the ratio between the relative concentration of the compound in media cultured with the strain versus blank media without any strain. We take the log2 transform of this ratio and report the log2-fold-change (log2FC) as our measure of metabolic activity.

BB669_ogg_data = matread(joinpath(bbdir, "BB669_oggs.mat"))
metadata = CSV.read(joinpath(bbdir, "BB669_rowmeta.tsv"), DataFrame; normalizenames=true)
metabolite_df = CSV.read(joinpath(bbdir, "BB669_metabolites_foldchange.tsv"), DataFrame)
metabolite_labels = replace.(names(metabolite_df), "_rel"=>"")[3:end]
rename!(metabolite_df, names(metabolite_df) .=> CSV.normalizename.(names(metabolite_df)));

#= 
    use function in helpers.jl to match measured orthologs in our strain bank 
    to those measured in UniProt bacteria.
=#
BBoggs_UPorder = match_column_order(
    BB669_ogg_data["data"], 
    BB669_ogg_data["var_names"], 
    uniprot.var_names
);

# this finds all species with at least 20 strain replicates in the dataset
# this subset of species are those we statistically test for strain level differences later on
keptspecies = string.(keys(sort(filter(x-> last(x) >= 20, countmap(metadata.Species)), byvalue=true, rev=true)))
filter!(!=("unclassified"), keptspecies)
keptspecies_mask = in.(metadata.Species, Ref(keptspecies));
keptspecies # species with at least 20 replicates in the dataset

11-element Vector{String}:
 "Phocaeicola vulgatus"
 "[Ruminococcus] gnavus"
 "Bacteroides thetaiotaomicron"
 "Anaerostipes hadrus"
 "Bacteroides uniformis"
 "Blautia luti"
 "Bifidobacterium breve"
 "Coprococcus comes"
 "Dorea formicigenerans"
 "Blautia wexlerae"
 "[Eubacterium] rectale"

Create unified file for future analysis, this will be easier to load in later notebooks

biobank_ogg = AnnData(
    X=BB669_ogg_data["data"],
    obs_names=string.(BB669_ogg_data["obs_names"]),
    var_names=string.(BB669_ogg_data["var_names"])
)
biobank_ogg_UP = AnnData(
    X=BBoggs_UPorder,
    obs_names=string.(BB669_ogg_data["obs_names"]),
    var_names=uniprot.var_names
)

met_mtx = metabolite_df[:, 3:end] |>
    x->coalesce.(x, NaN) |> # missing values (just checking)
    x->Matrix(x) |> # convert datatype
    x->log2.(x) |> # actual transform
    x->replace(x, -Inf => 0.0) # 0s in foldchange were below detection limit

@show any(isnan, met_mtx)

biobank_met = AnnData(
    X=met_mtx,
    obs_names=metabolite_df[:, :ID],
    var_names=names(metabolite_df)[3:end],
    var=DataFrame(ID=names(metabolite_df)[3:end], label=metabolite_labels)
)
biobank_met.layers["raw"] = Matrix(coalesce.(metabolite_df[:, 3:end], NaN))
biobank = MuData(mod=Dict(
    "oggs" => biobank_ogg,
    "UPorder_oggs" => biobank_ogg_UP,
    "metabolites_foldchange" => biobank_met,
))
biobank.obs = insertcols(String.(metadata), :kept_species=>keptspecies_mask)
biobank

any(isnan, met_mtx) = false

┌ Warning: Cannot join columns with the same name because var_names are intersecting.
└ @ Muon /Users/bend/.julia/packages/Muon/UKjAF/src/mudata.jl:367

MuData object 669 ✕ 21475
└ metabolites_foldchange
  AnnData object 669 ✕ 50
└ oggs
  AnnData object 669 ✕ 11248
└ UPorder_oggs
  AnnData object 669 ✕ 10177

writeh5mu(joinpath(bbdir, "BB669.h5mu"), biobank)