## ----setup, include=FALSE----------------------------------------------------------------------- library(knitr) knitr::opts_chunk$set(eval = TRUE) knitr::opts_chunk$set(cache = TRUE) options(width = 98) opts_chunk$set(message = FALSE, error = FALSE, warning = FALSE, cache = TRUE, fig.width = 8, fig.height = 7) ## ----NeededPackages, message=FALSE-------------------------------------------------------------- cranpkgs=c("ggplot2","devtools","adaptiveGPCA","treelapse", "ade4","structSSI", "PMA","vegan", "ggrepel","gridExtra") BioCpkgs=c("dada2", "curatedMetagenomicData", "phyloseq", "DECIPHER","genefilter") ghpkgs= c("treeDA", "treelapse","metavizr") instp <- cranpkgs %in% installed.packages() if(any(!instp)) { install.packages(cranpkgs[!instp],repos="https://cloud.r-project.org") } instp <- BioCpkgs %in% installed.packages() if(any(!instp)) { source("http://bioconductor.org/biocLite.R") biocLite(BioCpkgs [!instp], ask = F) } if(!("treeDA" %in% installed.packages())) BiocInstaller::biocLite("jfukuyama/treeDA", dependencies = TRUE) if(!("treelapse" %in% installed.packages())) BiocInstaller::biocLite("krisrs1128/treelapse", dependencies = TRUE) suppressPackageStartupMessages(sapply(c(cranpkgs, BioCpkgs, ghpkgs), require, character.only = TRUE)) ## ----non-local---------------------------------------------------------------------------------- ps_connect <-url("https://raw.githubusercontent.com/spholmes/F1000_workflow/master/data/ps.rds") ps = readRDS(ps_connect) ps ## ----taxfilter0--------------------------------------------------------------------------------- # Show available ranks in the dataset rank_names(ps) # Create table, number of features for each phyla table(tax_table(ps)[, "Phylum"], exclude = NULL) ## ----removeNAphyla------------------------------------------------------------------------------ ps <- subset_taxa(ps, !is.na(Phylum) & !Phylum %in% c("", "uncharacterized")) ## ----prevfilter0-------------------------------------------------------------------------------- # Compute prevalence of each feature, store as data.frame prevdf = apply(X = otu_table(ps), MARGIN = ifelse(taxa_are_rows(ps), yes = 1, no = 2), FUN = function(x){sum(x > 0)}) # Add taxonomy and total read counts to this data.frame prevdf = data.frame(Prevalence = prevdf, TotalAbundance = taxa_sums(ps), tax_table(ps)) ## ----lowprev------------------------------------------------------------------------------------ plyr::ddply(prevdf, "Phylum", function(df1){cbind(mean(df1$Prevalence),sum(df1$Prevalence))}) ## ----taxfilter---------------------------------------------------------------------------------- # Define phyla to filter filterPhyla = c("Fusobacteria", "Deinococcus-Thermus") # Filter entries with unidentified Phylum. ps1 = subset_taxa(ps, !Phylum %in% filterPhyla) ps1 ## ----plotprevalence, fig.width=9, fig.height=5, fig.cap="Taxa prevalence versus total counts."---- # Subset to the remaining phyla prevdf1 = subset(prevdf, Phylum %in% get_taxa_unique(ps1, "Phylum")) ggplot(prevdf1, aes(TotalAbundance, Prevalence / nsamples(ps),color=Phylum)) + # Include a guess for parameter geom_hline(yintercept = 0.05, alpha = 0.5, linetype = 2) + geom_point(size = 2, alpha = 0.7) + scale_x_log10() + xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") + facet_wrap(~Phylum) + theme(legend.position="none") ## ----prevalencefilter--------------------------------------------------------------------------- # Define prevalence threshold as 5% of total samples prevalenceThreshold = 0.05 * nsamples(ps) prevalenceThreshold # Execute prevalence filter, using `prune_taxa()` function keepTaxa = rownames(prevdf1)[(prevdf1$Prevalence >= prevalenceThreshold)] ps2 = prune_taxa(keepTaxa, ps) ## ----taxglom------------------------------------------------------------------------------------ # How many genera would be present after filtering? length(get_taxa_unique(ps2, taxonomic.rank = "Genus")) ps3 = tax_glom(ps2, "Genus", NArm = TRUE) ## ----tipglom------------------------------------------------------------------------------------ h1 = 0.4 ps4 = tip_glom(ps2, h = h1) ## ----plotglomprep------------------------------------------------------------------------------- multiPlotTitleTextSize = 15 p2tree = plot_tree(ps2, method = "treeonly", ladderize = "left", title = "Before Agglomeration") + theme(plot.title = element_text(size = multiPlotTitleTextSize)) p3tree = plot_tree(ps3, method = "treeonly", ladderize = "left", title = "By Genus") + theme(plot.title = element_text(size = multiPlotTitleTextSize)) p4tree = plot_tree(ps4, method = "treeonly", ladderize = "left", title = "By Height") + theme(plot.title = element_text(size = multiPlotTitleTextSize)) ## ----plotglomtree, fig.width=14, fig.cap="Different types of agglomeration"--------------------- # group plots together grid.arrange(nrow = 1, p2tree, p3tree, p4tree) ## ----abundancetransformation-------------------------------------------------------------------- plot_abundance = function(physeq,title = "", Facet = "Order", Color = "Phylum"){ # Arbitrary subset, based on Phylum, for plotting p1f = subset_taxa(physeq, Phylum %in% c("Firmicutes")) mphyseq = psmelt(p1f) mphyseq <- subset(mphyseq, Abundance > 0) ggplot(data = mphyseq, mapping = aes_string(x = "sex",y = "Abundance", color = Color, fill = Color)) + geom_violin(fill = NA) + geom_point(size = 1, alpha = 0.3, position = position_jitter(width = 0.3)) + facet_wrap(facets = Facet) + scale_y_log10()+ theme(legend.position="none") } ## ----abundancetransformation2------------------------------------------------------------------- # Transform to relative abundance. Save as new object. ps3ra = transform_sample_counts(ps3, function(x){x / sum(x)}) ## ----abundancetransformation3, fig.height=12, fig.width=10.5,fig.cap="Comparison of original abundances with transformed data"---- plotBefore = plot_abundance(ps3,"") plotAfter = plot_abundance(ps3ra,"") # Combine each plot into one graphic. grid.arrange(nrow = 2, plotBefore, plotAfter) ## ----subsettaxa, fig.cap= "Violin plot of relative abundances of Lactobacillales"--------------- psOrd = subset_taxa(ps3ra, Order == "Lactobacillales") plot_abundance(psOrd, Facet = "Genus", Color = NULL) ## ----init-analysis------------------------------------------------------------------------------ .cran_packages <- c( "shiny","miniUI", "caret", "pls", "e1071", "ggplot2", "randomForest", "dplyr", "ggrepel", "nlme", "devtools", "reshape2", "PMA", "structSSI", "ade4", "ggnetwork", "intergraph", "scales") .github_packages <- c("jfukuyama/phyloseqGraphTest") .bioc_packages <- c("genefilter", "impute") # Install CRAN packages (if not already installed) .inst <- .cran_packages %in% installed.packages() if (any(!.inst)){ install.packages(.cran_packages[!.inst],repos = "http://cran.rstudio.com/") } .inst <- .github_packages %in% installed.packages() if (any(!.inst)){ devtools::install_github(.github_packages[!.inst]) } .inst <- .bioc_packages %in% installed.packages() if(any(!.inst)){ source("http://bioconductor.org/biocLite.R") biocLite(.bioc_packages[!.inst]) } ## ----preprocessing-setup, fig.cap="Histogram of age groupings",fig.show="hold"------------------ qplot(sample_data(ps)$age, geom = "histogram",binwidth=20) + xlab("age") ## ----preprocessing2, fig.cap="Histograms comparing raw and log transformed read depths",fig.show="hold"---- qplot(log10(rowSums(otu_table(ps))),binwidth=0.2) + xlab("Logged counts-per-sample") ## ----outlier-detect, fig.cap="Exploratory ordination analysis with log abundances.",fig.wide= TRUE---- sample_data(ps)$age_binned <- cut(sample_data(ps)$age, breaks = c(0, 100, 200, 400)) levels(sample_data(ps)$age_binned) <- list(Young100="(0,100]", Mid100to200="(100,200]", Old200="(200,400]") sample_data(ps)$family_relationship=gsub(" ","",sample_data(ps)$family_relationship) pslog <- transform_sample_counts(ps, function(x) log(1 + x)) out.wuf.log <- ordinate(pslog, method = "MDS", distance = "wunifrac") evals <- out.wuf.log$values$Eigenvalues plot_ordination(pslog, out.wuf.log, color = "age_binned") + labs(col = "Binned Age") + coord_fixed(sqrt(evals[2] / evals[1])) ## ----outlier-analyze, fig.width=9, fig.height=5, fig.cap="The outlier samples are dominated by a single ASV."---- rel_abund <- t(apply(otu_table(ps), 1, function(x) x / sum(x))) qplot(rel_abund[, 12], geom = "histogram",binwidth=0.05) + xlab("Relative abundance") ## ----remove-outliers---------------------------------------------------------------------------- outliers <- c("F5D165", "F6D165", "M3D175", "M4D175", "M5D175", "M6D175") ps <- prune_samples(!(sample_names(ps) %in% outliers), ps) ## ----removelowreads----------------------------------------------------------------------------- which(!rowSums(otu_table(ps)) > 1000) ps <- prune_samples(rowSums(otu_table(ps)) > 1000, ps) pslog <- transform_sample_counts(ps, function(x) log(1 + x)) ## ----ordinations-bray,fig.cap="A PCoA plot using Bray-Curtis between samples."----------------- out.pcoa.log <- ordinate(pslog, method = "MDS", distance = "bray") evals <- out.pcoa.log$values[,1] plot_ordination(pslog, out.pcoa.log, color = "age_binned", shape = "family_relationship") + labs(col = "Binned Age", shape = "Litter")+ coord_fixed(sqrt(evals[2] / evals[1])) ## ----ordinations-dpcoa, fig.wide=TRUE,fig.cap="A DPCoA plot incorporates phylogenetic information, but is dominated by the first axis."---- out.dpcoa.log <- ordinate(pslog, method = "DPCoA") evals <- out.dpcoa.log$eig plot_ordination(pslog, out.dpcoa.log, color = "age_binned", label= "SampleID", shape = "family_relationship") + labs(col = "Binned Age", shape = "Litter")+ coord_fixed(sqrt(evals[2] / evals[1])) ## ----dpcoabiplot,fig.wide=TRUE,fig.cap="Taxa responsible for Axis 1 and 2"---------------------- plot_ordination(pslog, out.dpcoa.log, type = "species", color = "Phylum") + coord_fixed(sqrt(evals[2] / evals[1])) ## ----ordinations-wuf,fig.wide =TRUE, fig.cap="The sample positions produced by a PCoA using weighted Unifrac."---- out.wuf.log <- ordinate(pslog, method = "PCoA", distance ="wunifrac") evals <- out.wuf.log$values$Eigenvalues plot_ordination(pslog, out.wuf.log, color = "age_binned", shape = "family_relationship") + coord_fixed(sqrt(evals[2] / evals[1])) + labs(col = "Binned Age", shape = "Litter") ## ----rankab------------------------------------------------------------------------------------- abund <- otu_table(pslog) abund_ranks <- t(apply(abund, 1, rank)) ## ----rankthreshold------------------------------------------------------------------------------ abund_ranks <- abund_ranks - 329 abund_ranks[abund_ranks < 1] <- 1 ## ----pca-rank-visualize-procedure, fig.cap="Rank threshold transformation"---------------------- library(dplyr) library(reshape2) abund_df <- melt(abund, value.name = "abund") %>% left_join(melt(abund_ranks, value.name = "rank")) colnames(abund_df) <- c("sample", "seq", "abund", "rank") abund_df <- melt(abund, value.name = "abund") %>% left_join(melt(abund_ranks, value.name = "rank")) colnames(abund_df) <- c("sample", "seq", "abund", "rank") sample_ix <- sample(1:nrow(abund_df), 8) ggplot(abund_df %>% filter(sample %in% abund_df$sample[sample_ix])) + geom_point(aes(x = abund, y = rank, col = sample), position = position_jitter(width = 0.2), size = 1.5) + labs(x = "Abundance", y = "Thresholded rank") + scale_color_brewer(palette = "Set2") ## ----pca-rank-pca-setup------------------------------------------------------------------------- library(ade4) ranks_pca <- dudi.pca(abund_ranks, scannf = F, nf = 3) row_scores <- data.frame(li = ranks_pca$li, SampleID = rownames(abund_ranks)) col_scores <- data.frame(co = ranks_pca$co, seq = colnames(abund_ranks)) tax <- tax_table(ps) %>% data.frame(stringsAsFactors = FALSE) tax$seq <- rownames(tax) main_orders <- c("Clostridiales", "Bacteroidales", "Lactobacillales", "Coriobacteriales") tax$Order[!(tax$Order %in% main_orders)] <- "Other" tax$Order <- factor(tax$Order, levels = c(main_orders, "Other")) tax$otu_id <- seq_len(ncol(otu_table(ps))) row_scores <- row_scores %>% left_join(sample_data(pslog)) col_scores <- col_scores %>% left_join(tax) ## ----pca-rank-pca-plot, fig.wide=TRUE,fig.height=8,fig.cap="The biplot resulting from the PCA after the truncated-ranking transformation."---- evals_prop <- 100 * (ranks_pca$eig / sum(ranks_pca$eig)) ggplot() + geom_point(data = row_scores, aes(x = li.Axis1, y = li.Axis2), shape = 2) + geom_point(data = col_scores, aes(x = 25 * co.Comp1, y = 25 * co.Comp2, col = Order), size = .3, alpha = 0.6) + scale_color_brewer(palette = "Set2") + facet_grid(~ age_binned) + guides(col = guide_legend(override.aes = list(size = 3))) + labs(x = sprintf("Axis1 [%s%% variance]", round(evals_prop[1], 2)), y = sprintf("Axis2 [%s%% variance]", round(evals_prop[2], 2))) + coord_fixed(sqrt(ranks_pca$eig[2] / ranks_pca$eig[1])) + theme(panel.border = element_rect(color = "#787878", fill = alpha("white", 0))) ## ----ccpna-correspondence-analysis-------------------------------------------------------------- ps_ccpna <- ordinate(pslog, "CCA", formula = pslog ~ age_binned + family_relationship) ## ----ccpna-join-data, fig.cap="The mouse and bacteria scores generated by CCpnA.", fig.wide=TRUE, fig.height=10---- library(ggrepel) ps_scores <- vegan::scores(ps_ccpna) sites <- data.frame(ps_scores$sites) sites$SampleID <- rownames(sites) sites <- sites %>% left_join(sample_data(ps)) species <- data.frame(ps_scores$species) species$otu_id <- seq_along(colnames(otu_table(ps))) species <- species %>% left_join(tax) evals_prop <- 100 * ps_ccpna$CCA$eig[1:2] / sum(ps_ccpna$CA$eig) ggplot() + geom_point(data = sites, aes(x = CCA1, y = CCA2), shape = 2, alpha = 0.5) + geom_point(data = species, aes(x = CCA1, y = CCA2, col = Order), size = 0.5) + geom_text_repel(data = species %>% filter(CCA2 < -2), aes(x = CCA1, y = CCA2, label = otu_id), size = 1.5, segment.size = 0.1) + facet_grid(. ~ family_relationship) + guides(col = guide_legend(override.aes = list(size = 3))) + labs(x = sprintf("Axis1 [%s%% variance]", round(evals_prop[1], 2)), y = sprintf("Axis2 [%s%% variance]", round(evals_prop[2], 2))) + scale_color_brewer(palette = "Set2") + coord_fixed(sqrt(ps_ccpna$CCA$eig[2] / ps_ccpna$CCA$eig[1])*0.45 ) + theme(panel.border = element_rect(color = "#787878", fill = alpha("white", 0))) ## ----caret-pls---------------------------------------------------------------------------------- library(caret) sample_data(pslog)$age2 <- cut(sample_data(pslog)$age, c(0, 100, 400)) dataMatrix <- data.frame(age = sample_data(pslog)$age2, otu_table(pslog)) # take 8 mice at random to be the training set, and the remaining 4 the test set trainingMice <- sample(unique(sample_data(pslog)$host_subject_id), size = 8) inTrain <- which(sample_data(pslog)$host_subject_id %in% trainingMice) training <- dataMatrix[inTrain,] testing <- dataMatrix[-inTrain,] plsFit <- train(age ~ ., data = training, method = "pls", preProc = "center") ## ----caret-pls-confusion------------------------------------------------------------------------ plsClasses <- predict(plsFit, newdata = testing) table(plsClasses, testing$age) ## ----caret-rf, eval =T-------------------------------------------------------------------------- library(randomForest) rfFit <- train(age ~ ., data = training, method = "rf", preProc = "center", proximity = TRUE) rfClasses <- predict(rfFit, newdata = testing) table(rfClasses, testing$age) ## ----caret-pls-scores-loadings------------------------------------------------------------------ pls_biplot <- list("loadings" = loadings(plsFit$finalModel), "scores" = scores(plsFit$finalModel)) class(pls_biplot$scores) <- "matrix" pls_biplot$scores <- data.frame(sample_data(pslog)[inTrain, ], pls_biplot$scores) tax <- tax_table(ps)@.Data %>% data.frame(stringsAsFactors = FALSE) main_orders <- c("Clostridiales", "Bacteroidales", "Lactobacillales", "Coriobacteriales") tax$Order[!(tax$Order %in% main_orders)] <- "Other" tax$Order <- factor(tax$Order, levels = c(main_orders, "Other")) class(pls_biplot$loadings) <- "matrix" pls_biplot$loadings <- data.frame(tax, pls_biplot$loadings) ## ----caret-pls-scores-plot, fig.height = 7, fig.width=8, fig.wide=TRUE, fig.cap="PLS produces a biplot representation designed to separate samples by a response variable."---- ggplot() + geom_point(data = pls_biplot$scores, aes(x = Comp.1, y = Comp.2), shape = 2) + geom_point(data = pls_biplot$loadings, aes(x = 25 * Comp.1, y = 25 * Comp.2, col = Order), size = 0.3, alpha = 0.6) + scale_color_brewer(palette = "Set2") + labs(x = "Axis1", y = "Axis2", col = "Binned Age") + guides(col = guide_legend(override.aes = list(size = 3))) + facet_grid( ~ age2) + theme(panel.border = element_rect(color = "#787878", fill = alpha("white", 0))) ## ----caret-proximity, fig.wide=TRUE, fig.cap="The random forest model determines a distance between samples, which can be input into PCoA to produce a proximity plot."---- rf_prox <- cmdscale(1 - rfFit$finalModel$proximity) %>% data.frame(sample_data(pslog)[inTrain, ]) ggplot(rf_prox) + geom_point(aes(x = X1, y = X2, col = age_binned), size = 1, alpha = 0.7) + scale_color_manual(values = c("#A66EB8", "#238DB5", "#748B4F")) + guides(col = guide_legend(override.aes = list(size = 4))) + labs(col = "Binned Age", x = "Axis1", y = "Axis2") ## ----caret-rf-importance, fig.height=6, fig.width=9, fig.cap="Random forest aids to interpretation: importance scores."---- as.vector(tax_table(ps)[which.max(importance(rfFit$finalModel)), c("Family", "Genus")]) impOtu <- as.vector(otu_table(pslog)[,which.max(importance(rfFit$finalModel))]) maxImpDF <- data.frame(sample_data(pslog), abund = impOtu) ggplot(maxImpDF) + geom_histogram(aes(x = abund)) + facet_grid(age2 ~ .) + labs(x = "Abundance of discriminative bacteria", y = "Number of samples") ## ----multitable-setup--------------------------------------------------------------------------- metab <- read.csv("https://raw.githubusercontent.com/spholmes/F1000_workflow/master/data/metabolites.csv",row.names = 1) microbe_connect <-url("https://raw.githubusercontent.com/spholmes/F1000_workflow/master/data/microbe.rda") load(microbe_connect) microbe ## ----multitable-filtering----------------------------------------------------------------------- library("genefilter") keep_ix <- rowSums(metab == 0) <= 3 metab <- metab[keep_ix, ] microbe <- prune_taxa(taxa_sums(microbe) > 4, microbe) microbe <- filter_taxa(microbe, filterfun(kOverA(3, 2)), TRUE) metab <- log(1 + metab, base = 10) X <- otu_table(microbe) X[X > 50] <- 50 dim(X) dim(metab) ## ----multitable-sparse-cca---------------------------------------------------------------------- library(PMA) cca_res <- CCA(t(X), t(metab), penaltyx = .15, penaltyz = .15) cca_res ## ----loadade4,echo=FALSE------------------------------------------------------------------------ library(ade4) ## ----multitable-plug-in-pca--------------------------------------------------------------------- combined <- cbind(t(X[cca_res$u != 0, ]), t(metab[cca_res$v != 0, ])) pca_res <- dudi.pca(combined, scannf = F, nf = 3) ## ----annotation--------------------------------------------------------------------------------- genotype <- substr(rownames(pca_res$li), 1, 2) sample_type <- substr(rownames(pca_res$l1), 3, 4) feature_type <- grepl("\\.", colnames(combined)) feature_type <- ifelse(feature_type, "Metabolite", "OTU") sample_info <- data.frame(pca_res$li, genotype, sample_type) feature_info <- data.frame(pca_res$c1, feature = substr(colnames(combined), 1, 6)) ## ----multitable-interpret-pca, fig.wide=TRUE, fig.cap="A PCA triplot produced from the CCA selected features in from muliple data types;metabolites and OTUs."---- ggplot() + geom_point(data = sample_info, aes(x = Axis1, y = Axis2, col = sample_type, shape = genotype), size = 3) + geom_label_repel(data = feature_info, aes(x = 5.5 * CS1, y = 5.5 * CS2, label = feature, fill = feature_type), size = 2, segment.size = 0.3, label.padding = unit(0.1, "lines"), label.size = 0) + geom_point(data = feature_info, aes(x = 5.5 * CS1, y = 5.5 * CS2, fill = feature_type), size = 1, shape = 23, col = "#383838") + scale_color_brewer(palette = "Set2") + scale_fill_manual(values = c("#a6d854", "#e78ac3")) + guides(fill = guide_legend(override.aes = list(shape = 32, size = 0))) + coord_fixed(sqrt(pca_res$eig[2] / pca_res$eig[2])) + labs(x = sprintf("Axis1 [%s%% Variance]", 100 * round(pca_res$eig[1] / sum(pca_res$eig), 2)), y = sprintf("Axis2 [%s%% Variance]", 100 * round(pca_res$eig[2] / sum(pca_res$eig), 2)), fill = "Feature Type", col = "Sample Type") ## ---- eval =FALSE------------------------------------------------------------------------------- # #not run # data(AntibioticPhyloseq) # theme_set(theme_bw()) # pp = processPhyloseq(AntibioticPhyloseq) # out.ff = gpcaFullFamily(pp$X, pp$Q, k = 2) # out.agpca = visualizeFullFamily(out.ff, # sample_data = sample_data(AntibioticPhyloseq), # sample_mapping = aes(x = Axis1, y = Axis2, color = condition), # var_data = tax_table(AntibioticPhyloseq), # var_mapping = aes(x = Axis1, y = Axis2, color = Phylum)) ## ----------------------------------------------------------------------------------------------- ##devtools::install.github("jfukuyama/treeDA") library(treeDA) library(adaptiveGPCA) data(AntibioticPhyloseq) theme_set(theme_bw()) ## ----------------------------------------------------------------------------------------------- out.treeda = treeda(response = sample_data(AntibioticPhyloseq)$type, predictors = otu_table(AntibioticPhyloseq), tree = phy_tree(AntibioticPhyloseq), p = 15)