# setup for local file handling
using DrWatson
@quickactivate projectdir()
# load extra libraries
using CSV, DataFrames, Muon, MAT
using StatsBase
include(srcdir("helpers.jl"))
getlims (generic function with 2 methods)
# setup for local file handling
using DrWatson
@quickactivate projectdir()
# load extra libraries
using CSV, DataFrames, Muon, MAT
using StatsBase
include(srcdir("helpers.jl"))
getlims (generic function with 2 methods)
import CondaPkg;
activate!(ENV);
CondaPkg.using RCall
"""
Roptions(repos=c(CRAN="https://repo.miserver.it.umich.edu/cran"))
install.packages("tidyverse")
install.packages("ggplot2")
install.packages("ape")
if (!require("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
BiocManager::install("treeio")
BiocManager::install("ggtree")
"""
# Check loading package works
"""
Rlibrary(ape)
library(treeio)
library(ggtree)
library(ggplot2)
library(tidyverse)
setwd($(projectdir()))
""";
= datadir("exp_raw", "BB669")
bbdir = datadir("exp_raw", "UP7047") updir
"/Users/bend/projects/Doran_etal_2023/data/exp_raw/UP7047"
The main data is a count matrix of 7047 bacterial strains described by 10,177 Orthologous Gene Groups (OGGs). Each element of the matrix shows the number (count) of sequences in the strain that belong to that OGG. Along with this are associated metadata for each strain and each orthologous gene group.
= readh5ad(joinpath(updir, "2020_02_UP7047.h5ad")) uniprot
AnnData object 7047 ✕ 10177
# count matrix
1:5, 1:5] uniprot.X[
5×5 Matrix{Float64}:
1.0 0.0 1.0 2.0 1.0
1.0 0.0 1.0 2.0 0.0
1.0 0.0 1.0 3.0 1.0
0.0 0.0 0.0 0.0 0.0
1.0 0.0 1.0 2.0 0.0
first(uniprot.obs, 5)
Row | proteomeID | TaxID | Proteome_ID | Tax_ID | OSCODE | SUPERREGNUM | x__1_ | x__2_ | x__3_ | SpeciesName | Kingdom | Phylum | Class | Order | Family | Genus | Species |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
String | Int64 | String | Int64 | String | String | Int64 | Int64 | Int64 | String | String | String | String | String | String | String | String | |
1 | UP000283387 | 1261403 | UP000283387 | 1261403 | bacteria | 4641 | 0 | 4643 | Mangrovibacterium diazotrophicum | Bacteria | Bacteroidetes | Bacteroidia | Marinilabiliales | Prolixibacteraceae | Mangrovibacterium | Mangrovibacterium diazotrophicum | |
2 | UP000181985 | 1897729 | UP000181985 | 1897729 | bacteria | 3147 | 0 | 3150 | Halomonas aestuarii | Bacteria | Proteobacteria | Gammaproteobacteria | Oceanospirillales | Halomonadaceae | Halomonas | Halomonas aestuarii | |
3 | UP000220675 | 1962403 | UP000220675 | 1962403 | bacteria | 4593 | 0 | 4595 | Novosphingobium sp. PC22D | Bacteria | Proteobacteria | Alphaproteobacteria | Sphingomonadales | Sphingomonadaceae | Novosphingobium | Novosphingobium sp. PC22D | |
4 | UP000013137 | 1188234 | UP000013137 | 1188234 | bacteria | 601 | 0 | 601 | Mycoplasma alkalescens 14918 | Bacteria | Tenericutes | Mollicutes | Mycoplasmatales | Mycoplasmataceae | Mycoplasma | Mycoplasma alkalescens | |
5 | UP000067399 | 1303921 | UP000067399 | 1303921 | bacteria | 1467 | 0 | 1471 | endosymbiont of Bathymodiolus septemdierum str. Myojin knoll | Bacteria | Proteobacteria | Gammaproteobacteria | Bathymodiolus septemdierum thioautotrophic gill symbiont |
first(uniprot.var, 5)
Row | og | level | nm | description | COG_categories | OG_ID |
---|---|---|---|---|---|---|
String | Int64 | Int64 | String | String | String | |
1 | COG0001 | 2 | 4694 | glutamate-1-semialdehyde 2,1-aminomutase activity | H | COG0001@2 |
2 | COG0003 | 2 | 2423 | Pfam Anion-transporting ATPase | P | COG0003@2 |
3 | COG0002 | 2 | 3957 | N-acetyl-gamma-glutamyl-phosphate reductase activity | E | COG0002@2 |
4 | COG0004 | 2 | 5256 | ammonium transporteR | P | COG0004@2 |
5 | COG0005 | 2 | 3783 | purine-nucleoside phosphorylase activity | F | COG0005@2 |
The BioBank (aka: Commensal Strain Bank, aka: CSB) is the strain bank we collected. It comprises 669 strains that have at least 1% distinct sequence identity to any other strain in the BioBank (i.e. we removed clonal duplicates).
All these strains were whole genome sequenced. And from that sequencing we annotated each strain by orthologous gene groups (OGG). This annotation was done for each strain by counting the number of coding regions that matched to each orthologous gene groups in the eggNOG database (v5.0) database and filtering out gene-groups that had no matches. This created the matrix in BB669_oggs.mat
where strains correspond to each row and OGGs correspond to each column and each entry holds the number of protein sequences that matched to that OGG.
We also measured the metabolic phenotype of these strains across a number of short-chain fatty acids and other small molecules. The file B669_metabolites_foldchange.csv
contains the measurements we use in this paper, where each row contains a strain and the columns contain metabolite compounds the strains either digest or produce. The degree of digestion or production of each metabolite is compared as the ratio between the relative concentration of the compound in media cultured with the strain versus blank media without any strain. We take the log2 transform of this ratio and report the log2-fold-change (log2FC) as our measure of metabolic activity.
= matread(joinpath(bbdir, "BB669_oggs.mat"))
BB669_ogg_data = CSV.read(joinpath(bbdir, "BB669_rowmeta.tsv"), DataFrame; normalizenames=true)
metadata = CSV.read(joinpath(bbdir, "BB669_metabolites_foldchange.tsv"), DataFrame)
metabolite_df = replace.(names(metabolite_df), "_rel"=>"")[3:end]
metabolite_labels rename!(metabolite_df, names(metabolite_df) .=> CSV.normalizename.(names(metabolite_df)));
#=
use function in helpers.jl to match measured orthologs in our strain bank
to those measured in UniProt bacteria.
=#
= match_column_order(
BBoggs_UPorder "data"],
BB669_ogg_data["var_names"],
BB669_ogg_data[
uniprot.var_names );
# this finds all species with at least 20 strain replicates in the dataset
# this subset of species are those we statistically test for strain level differences later on
= string.(keys(sort(filter(x-> last(x) >= 20, countmap(metadata.Species)), byvalue=true, rev=true)))
keptspecies filter!(!=("unclassified"), keptspecies)
= in.(metadata.Species, Ref(keptspecies));
keptspecies_mask # species with at least 20 replicates in the dataset keptspecies
11-element Vector{String}:
"Phocaeicola vulgatus"
"[Ruminococcus] gnavus"
"Bacteroides thetaiotaomicron"
"Anaerostipes hadrus"
"Bacteroides uniformis"
"Blautia luti"
"Bifidobacterium breve"
"Coprococcus comes"
"Dorea formicigenerans"
"Blautia wexlerae"
"[Eubacterium] rectale"
Create unified file for future analysis, this will be easier to load in later notebooks
= AnnData(
biobank_ogg =BB669_ogg_data["data"],
X=string.(BB669_ogg_data["obs_names"]),
obs_names=string.(BB669_ogg_data["var_names"])
var_names
)= AnnData(
biobank_ogg_UP =BBoggs_UPorder,
X=string.(BB669_ogg_data["obs_names"]),
obs_names=uniprot.var_names
var_names
)
= metabolite_df[:, 3:end] |>
met_mtx x->coalesce.(x, NaN) |> # missing values (just checking)
x->Matrix(x) |> # convert datatype
x->log2.(x) |> # actual transform
x->replace(x, -Inf => 0.0) # 0s in foldchange were below detection limit
@show any(isnan, met_mtx)
= AnnData(
biobank_met =met_mtx,
X=metabolite_df[:, :ID],
obs_names=names(metabolite_df)[3:end],
var_names=DataFrame(ID=names(metabolite_df)[3:end], label=metabolite_labels)
var
)"raw"] = Matrix(coalesce.(metabolite_df[:, 3:end], NaN))
biobank_met.layers[= MuData(mod=Dict(
biobank "oggs" => biobank_ogg,
"UPorder_oggs" => biobank_ogg_UP,
"metabolites_foldchange" => biobank_met,
))= insertcols(String.(metadata), :kept_species=>keptspecies_mask)
biobank.obs biobank
any(isnan, met_mtx) = false
┌ Warning: Cannot join columns with the same name because var_names are intersecting.
└ @ Muon /Users/bend/.julia/packages/Muon/UKjAF/src/mudata.jl:367
MuData object 669 ✕ 21475
└ metabolites_foldchange
AnnData object 669 ✕ 50
└ oggs
AnnData object 669 ✕ 11248
└ UPorder_oggs
AnnData object 669 ✕ 10177
writeh5mu(joinpath(bbdir, "BB669.h5mu"), biobank)