# Make sure these are all installed as packages first
# Load necessary libraries
library(readxl)
library(pheatmap)
library(dplyr)
library(tidyr)
library(tibble)
library(RColorBrewer)
library(miaViz)
library(scater)
library(mia)
library(TreeSummarizedExperiment)
library(here)
library(readr)
library(phyloseq)
library(DESeq2)
library(ggrepel)
library(gridExtra)
library(vegan)
Healthy Global Dataset Metagenomics: Short-read taxonomic based metagenomic analysis of healthy human saliva samples in R
This analysis was used in the short-read (Illumina) taxonomic based metagenomic analysis of healthy human saliva samples in R posted as a preprint on bioRxiv:
Shotgun metagenomic analysis of the oral microbiomes of children with noma reveals a novel disease-associated organism
Michael Olaleye, Angus M O’Ferrall, Richard N. Goodman, Deogracia W Kabila, Miriam Peters, Gregoire Falq, Joseph Samuel, Donal Doyle, Diana Gomez, Gbemisola Oloruntuyi, Shafiu Isah, Adeniyi S Adetunji, Elise N. Farley, Nicholas J Evans, Mark Sherlock, Adam P. Roberts, Mohana Amirtharajah, Stuart Ainsworth
bioRxiv 2025.06.24.661267; doi: https://doi.org/10.1101/2025.06.24.661267
1. Getting Started in R
1.1 Installing and Loading Packages
Install all necessary packages into R.
2. Import and Clean Data
We will be importing MetaPhlan style Bracken data
First we’ll write a function to import MetaPhlan style bracken data
2.2 Load taxonomic data
= "../data/noma_HMP_saliva_bracken_MetaPhlan_style_report_bacteria_20_health_controls_only.txt"
file_path
# Import data
= loadFromMetaphlan(file_path)
tse_metaphlan_healthy
# Defining the TSE for the rest of the script
= tse_metaphlan_healthy
tse_metaphlan
# Use makePhyloseqFromTreeSE from Miaverse
= makePhyloseqFromTreeSE(tse_metaphlan) phyloseq_metaphlan
2.2 Add metadata
# Define the function to update metadata
= function(tse_object) {
update_sample_metadata # Extract sample names (column names)
= colnames(tse_object)
sample_names
# Create the "accession" column based on sample name prefix
= ifelse(grepl("^SRS", sample_names), "SRS",
accession ifelse(grepl("^SRR", sample_names), "SRR",
ifelse(grepl("^DRR", sample_names), "DRR", NA)))
# Create the "location" column based on sample name prefix
= ifelse(grepl("^SRS", sample_names), "USA",
location ifelse(grepl("^SRR", sample_names), "Denmark",
ifelse(grepl("^DRR", sample_names), "Japan", NA)))
# Create a DataFrame with the new metadata columns
= DataFrame(accession = accession, location = location)
sample_metadata rownames(sample_metadata) = sample_names
# Add the metadata as colData to the TreeSummarizedExperiment object
colData(tse_object) = sample_metadata
# Return the updated object
return(tse_object)
}
# Example usage:
= update_sample_metadata(tse_metaphlan)
tse_metaphlan = altExp(tse_metaphlan, "Genus")
tse_metaphlan_genus = update_sample_metadata(tse_metaphlan_genus)
tse_metaphlan_genus
head(as.data.frame(colData(tse_metaphlan)))
accession location
SRS013942 SRS USA
SRS014468 SRS USA
SRS014692 SRS USA
SRS015055 SRS USA
SRS019120 SRS USA
SRR5892208 SRR Denmark
head(as.data.frame(colData(tse_metaphlan_genus)))
accession location
SRS013942 SRS USA
SRS014468 SRS USA
SRS014692 SRS USA
SRS015055 SRS USA
SRS019120 SRS USA
SRR5892208 SRR Denmark
2.3 Inspecting the Data
3. Non-parametric statistical tests
3.1 Preparing the data
# See above "Converting TSE to other common data formats e.g. Phyloseq"
# Use makePhyloseqFromTreeSE from Miaverse
= tse_metaphlan
tse_metaphlan_healthy = tse_metaphlan_genus
tse_metaphlan_healthy_genus
# make an assay for abundance
= transformAssay(tse_metaphlan_healthy, assay.type="counts", method="relabundance")
tse_metaphlan_healthy
taxonomyRanks(tse_metaphlan_healthy)
[1] "Kingdom" "Phylum" "Class" "Order" "Family" "Genus" "Species"
# Check that colData was added successfully
colData(tse_metaphlan_healthy_genus)
DataFrame with 20 rows and 2 columns
accession location
<character> <character>
SRS013942 SRS USA
SRS014468 SRS USA
SRS014692 SRS USA
SRS015055 SRS USA
SRS019120 SRS USA
... ... ...
DRR214959 DRR Japan
DRR214960 DRR Japan
DRR214961 DRR Japan
DRR214962 DRR Japan
DRR241310 DRR Japan
= colData(tse_metaphlan_healthy_genus)
metadata_df
= as.data.frame(colData(tse_metaphlan_healthy_genus))
metadata_healthy_genus metadata_healthy_genus
accession location
SRS013942 SRS USA
SRS014468 SRS USA
SRS014692 SRS USA
SRS015055 SRS USA
SRS019120 SRS USA
SRR5892208 SRR Denmark
SRR5892209 SRR Denmark
SRR5892210 SRR Denmark
SRR5892211 SRR Denmark
SRR5892212 SRR Denmark
SRR5892213 SRR Denmark
SRR5892214 SRR Denmark
SRR5892215 SRR Denmark
SRR5892216 SRR Denmark
SRR5892217 SRR Denmark
DRR214959 DRR Japan
DRR214960 DRR Japan
DRR214961 DRR Japan
DRR214962 DRR Japan
DRR241310 DRR Japan
# species
= makePhyloseqFromTreeSE(tse_metaphlan_healthy)
phyloseq_healthy # genus
= makePhyloseqFromTreeSE(tse_metaphlan_genus)
phyloseq_healthy
= transform_sample_counts(phyloseq_healthy, function(x) 1E6 * x/sum(x))
phyloseq_healthy_esd
ntaxa(phyloseq_healthy_esd)
[1] 1757
nsamples(phyloseq_healthy_esd)
[1] 20
3.1 Permanova across entire dataset
library(vegan)
set.seed(123456)
# Calculate bray curtis distance matrix on main variables
= phyloseq::distance(phyloseq_healthy_esd, method = "bray")
olp.bray = data.frame(sample_data(phyloseq_healthy_esd))
sample.olp.df = vegan::adonis2(olp.bray ~ accession , data = sample.olp.df)
permanova_all permanova_all
Permutation test for adonis under reduced model
Permutation: free
Number of permutations: 999
vegan::adonis2(formula = olp.bray ~ accession, data = sample.olp.df)
Df SumOfSqs R2 F Pr(>F)
Model 2 0.64828 0.48414 7.9774 0.001 ***
Residual 17 0.69075 0.51586
Total 19 1.33903 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Next we will test the beta dispersion
# All together now
::adonis2(olp.bray ~ accession, data = sample.olp.df) vegan
Permutation test for adonis under reduced model
Permutation: free
Number of permutations: 999
vegan::adonis2(formula = olp.bray ~ accession, data = sample.olp.df)
Df SumOfSqs R2 F Pr(>F)
Model 2 0.64828 0.48414 7.9774 0.001 ***
Residual 17 0.69075 0.51586
Total 19 1.33903 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
= betadisper(olp.bray, sample.olp.df$accession) beta
Warning in betadisper(olp.bray, sample.olp.df$accession): some squared
distances are negative and changed to zero
permutest(beta)
Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 999
Response: Distances
Df Sum Sq Mean Sq F N.Perm Pr(>F)
Groups 2 0.006237 0.0031184 0.3448 999 0.708
Residuals 17 0.153764 0.0090449
# we don't want this to be significant
3.2 Anosim across entire dataset
= get_variable(phyloseq_healthy_esd, "accession")
condition_group set.seed (123456)
anosim(distance(phyloseq_healthy_esd, "bray"), condition_group)
Call:
anosim(x = distance(phyloseq_healthy_esd, "bray"), grouping = condition_group)
Dissimilarity: bray
ANOSIM statistic R: 0.681
Significance: 0.001
Permutation: free
Number of permutations: 999
= anosim(distance(phyloseq_healthy_esd, "bray"), condition_group)
condition_ano condition_ano
Call:
anosim(x = distance(phyloseq_healthy_esd, "bray"), grouping = condition_group)
Dissimilarity: bray
ANOSIM statistic R: 0.681
Significance: 0.001
Permutation: free
Number of permutations: 999
3.3 MRPP across entire dataset
#condition
= phyloseq::distance(phyloseq_healthy_esd, method = "bray") # Calculate bray curtis distance matrix
olp.bray = get_variable(phyloseq_healthy_esd, "accession") # Make condition Grouping
condition_group # Run MRPP
set.seed(123456)
::mrpp(olp.bray, condition_group, permutations = 999,
veganweight.type = 1, strata = NULL, parallel = getOption("mc.cores"))
Call:
vegan::mrpp(dat = olp.bray, grouping = condition_group, permutations = 999, weight.type = 1, strata = NULL, parallel = getOption("mc.cores"))
Dissimilarity index: bray
Weights for groups: n
Class means and counts:
DRR SRR SRS
delta 0.2508 0.2399 0.3368
n 5 10 5
Chance corrected within-group agreement A: 0.2447
Based on observed delta 0.2669 and expected delta 0.3533
Significance of delta: 0.001
Permutation: free
Number of permutations: 999
3.4 Permanova function for specific taxa
# Extract the counts and taxonomic table
= assay(tse_metaphlan_genus, "counts")
counts = rowData(tse_metaphlan_genus)$Genus # Replace "Genus" with your taxonomic level of interest
tax_table = colData(tse_metaphlan_genus)
sample_data
= as.data.frame(sample_data)
groups
# Aggregate counts by Genus
= rowsum(counts, tax_table)
aggregated_counts
# Create a new aggregated TreeSummarizedExperiment object
= TreeSummarizedExperiment(assays = list(counts = aggregated_counts),
tse_aggregated colData = sample_data)
# Calculate relative abundances
= sweep(assay(tse_aggregated, "counts"), 2, colSums(assay(tse_aggregated, "counts")), FUN = "/") * 100
relative_abundances
# Convert to a data frame and group by Treatment
= as.data.frame(t(relative_abundances))
relative_df
set.seed (123456)
# Define the vector of genera names (without the "g__" prefix)
= c("Prevotella", "Treponema", "Neisseria", "Bacteroides",
genera "Filifactor", "Porphyromonas", "Fusobacterium", "Escherichia",
"Selenomonas", "Aggregatibacter", "Capnocytophaga",
"Streptococcus", "Actinomyces", "Campylobacter", "Dialister", "Gemella",
"Haemophilus", "Leptotrichia", "Rothia", "Schaalia", "Tannerella", "Veillonella")
# Initialize an empty data frame to store the results
= data.frame(Genus = character(), pvalue = numeric(), stringsAsFactors = FALSE)
permanova_taxa_results
# Loop over each genus
for (genus in genera) {
set.seed (123456)
# Subset the data for the genus; adjust column selection as needed
= relative_df %>% select(paste0("g__", genus))
subset_data
# Calculate the Bray-Curtis distance
= vegdist(subset_data, method = "bray")
bray_dist
# Run PERMANOVA using adonis2
= adonis2(bray_dist ~ accession, data = groups)
adonis_result
# Extract the p-value for the sample_type factor (usually in the first row)
= adonis_result$`Pr(>F)`[1]
pval
# Append the result to the results data frame
= rbind(permanova_taxa_results, data.frame(Genus = genus, pvalue = pval))
permanova_taxa_results
}
print(permanova_taxa_results)
Genus pvalue
1 Prevotella 0.070
2 Treponema 0.001
3 Neisseria 0.085
4 Bacteroides 0.019
5 Filifactor 0.081
6 Porphyromonas 0.004
7 Fusobacterium 0.001
8 Escherichia 0.115
9 Selenomonas 0.533
10 Aggregatibacter 0.001
11 Capnocytophaga 0.002
12 Streptococcus 0.001
13 Actinomyces 0.002
14 Campylobacter 0.001
15 Dialister 0.001
16 Gemella 0.283
17 Haemophilus 0.001
18 Leptotrichia 0.497
19 Rothia 0.001
20 Schaalia 0.001
21 Tannerella 0.060
22 Veillonella 0.002
= print(permanova_taxa_results) ptr
Genus pvalue
1 Prevotella 0.070
2 Treponema 0.001
3 Neisseria 0.085
4 Bacteroides 0.019
5 Filifactor 0.081
6 Porphyromonas 0.004
7 Fusobacterium 0.001
8 Escherichia 0.115
9 Selenomonas 0.533
10 Aggregatibacter 0.001
11 Capnocytophaga 0.002
12 Streptococcus 0.001
13 Actinomyces 0.002
14 Campylobacter 0.001
15 Dialister 0.001
16 Gemella 0.283
17 Haemophilus 0.001
18 Leptotrichia 0.497
19 Rothia 0.001
20 Schaalia 0.001
21 Tannerella 0.060
22 Veillonella 0.002
4. Principal Components analysis
4.1 PCA with prcomp
# Assume `tse` is your TreeSummarizedExperiment object
# Extract the assay data (e.g., expression matrix)
= assay(tse_metaphlan_genus)
expr_data
colnames(tse_metaphlan)
[1] "SRS013942" "SRS014468" "SRS014692" "SRS015055" "SRS019120"
[6] "SRR5892208" "SRR5892209" "SRR5892210" "SRR5892211" "SRR5892212"
[11] "SRR5892213" "SRR5892214" "SRR5892215" "SRR5892216" "SRR5892217"
[16] "DRR214959" "DRR214960" "DRR214961" "DRR214962" "DRR241310"
= as.data.frame(expr_data)
expr_data_df ncol(expr_data_df)
[1] 20
= t(expr_data_df)
t_expr_data_df
# Identify constant columns (zero variance)
= apply(t_expr_data_df, 2, function(col) var(col) == 0)
zero_var_cols # Display columns with zero variance
names(t_expr_data_df)[zero_var_cols]
NULL
# Remove constant columns
= t_expr_data_df[, !zero_var_cols]
t_expr_data_df_filtered
colnames(t_expr_data_df_filtered) = gsub(".*(g__*)", "\\1", colnames(t_expr_data_df_filtered))
# Optionally scale and center the data (depends on your data)
#expr_data_scaled = scale(t_expr_data_df)
# Run PCA using the prcomp function
= prcomp(t_expr_data_df_filtered, center = TRUE, scale. = TRUE)
pca_result
# View the summary of the PCA result
summary(pca_result)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7
Standard deviation 33.4472 14.4946 9.08689 7.35815 6.88639 5.95437 5.1038
Proportion of Variance 0.6573 0.1234 0.04851 0.03181 0.02786 0.02083 0.0153
Cumulative Proportion 0.6573 0.7807 0.82925 0.86106 0.88892 0.90975 0.9251
PC8 PC9 PC10 PC11 PC12 PC13 PC14
Standard deviation 4.4433 4.15710 3.79462 3.54310 3.48885 3.2999 3.20788
Proportion of Variance 0.0116 0.01015 0.00846 0.00738 0.00715 0.0064 0.00605
Cumulative Proportion 0.9367 0.94681 0.95527 0.96265 0.96980 0.9762 0.98224
PC15 PC16 PC17 PC18 PC19 PC20
Standard deviation 2.95908 2.57030 2.32659 2.19166 2.15529 9.14e-14
Proportion of Variance 0.00514 0.00388 0.00318 0.00282 0.00273 0.00e+00
Cumulative Proportion 0.98739 0.99127 0.99445 0.99727 1.00000 1.00e+00
#dev.off()
4.2 plot PCA with base R
# Plot the PCA results (e.g., using ggplot2 or base R plotting)
plot(pca_result$x[,1], pca_result$x[,2],
xlab = "PC1", ylab = "PC2", main = "PCA Plot")
4.3 plot PCA with ggplot
# Optionally, visualize PCA with ggplot2 for more customized plotting
library(ggplot2)
= as.data.frame(pca_result$x)
pca_df $sample = rownames(pca_result$x)
pca_df$sample2 = colnames(expr_data)
pca_df
ggplot(pca_df, aes(x = PC1, y = PC2, label = sample, col = sample)) +
geom_point() +
geom_text(vjust = -0.5) +
ggtitle("PCA of TreeSummarizedExperiment Data")
4.3 plot PCA without DRR accessions
Now we will remove the DRR accession
= c("DRR214959", "DRR214960", "DRR214961", "DRR214962", "DRR241310")
drops = expr_data_df[ , !(names(expr_data_df) %in% drops)]
expr_data_df ncol(expr_data_df)
[1] 15
= t(expr_data_df)
t_expr_data_df
# Identify constant columns (zero variance)
= apply(t_expr_data_df, 2, function(col) var(col) == 0)
zero_var_cols # Display columns with zero variance
names(t_expr_data_df)[zero_var_cols]
NULL
# Remove constant columns
= t_expr_data_df[, !zero_var_cols]
t_expr_data_df_filtered
colnames(t_expr_data_df_filtered) = gsub(".*(g__*)", "\\1", colnames(t_expr_data_df_filtered))
# Run PCA using the prcomp function
= prcomp(t_expr_data_df_filtered, center = TRUE, scale. = TRUE)
pca_result
#dev.off()
# Optionally, visualize PCA with ggplot2 for more customized plotting
library(ggplot2)
= as.data.frame(pca_result$x)
pca_df $sample = rownames(pca_result$x)
pca_df
ggplot(pca_df, aes(x = PC1, y = PC2, label = sample, col = sample)) +
geom_point() +
geom_text(vjust = -0.5) +
ggtitle("PCA of TreeSummarizedExperiment Data")