From 83e7adcbdc97ef95973a625fb7573780a3604602 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9CAndreas?= <eich.andreas@web.de>
Date: Fri, 11 Jul 2025 16:55:52 -0400
Subject: [PATCH 1/3] Add possibility for pairwise comparisons in betta test
 results

---
 R/betta_posthoc.R | 260 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 260 insertions(+)
 create mode 100644 R/betta_posthoc.R

diff --git a/R/betta_posthoc.R b/R/betta_posthoc.R
new file mode 100644
index 0000000..9aba367
--- /dev/null
+++ b/R/betta_posthoc.R
@@ -0,0 +1,260 @@
+#' Helper function to generate compact letter display
+#'
+#' @param pvals Named vector of p-values for pairwise comparisons
+#' @param alpha Significance threshold
+#' @param letter_set Character vector of letters to use
+#' @return Named vector of compact letter display codes
+#' @keywords internal
+generate_cld <- function(pvals, alpha = 0.05, letter_set = LETTERS) {
+  # Use multcompLetters with p-values
+  cld_letters <- multcompView::multcompLetters(
+    pvals,
+    threshold = alpha,
+    Letters = letter_set
+  )$Letters
+
+  return(cld_letters)
+}
+
+#' Post-hoc pairwise comparisons and compact letter display from diversity test object
+#'
+#' Performs pairwise post-hoc linear combination tests based on a fitted betta model object
+#' (`test_obj`) that contains a formula of the form `estimate ~ ...`. The function
+#' extracts the model formula and metadata variables, constructs all pairwise
+#' comparisons between levels, runs `betta_lincom()` on each linear contrast,
+#' adjusts p-values, computes significance codes, and returns compact letter display (CLD)
+#' indicating significantly different groups.
+#'
+#' @param test_obj An object returned from the initial diversity test function (`betta()`).
+#'   This object must contain a formula accessible as `test_obj$function.args$formula`,
+#'   where the RHS defines the model terms used in `model.matrix`.
+#' @param metadata A data frame containing the variables used in the model formula.
+#'   Each variable should correspond to a column used in `test_obj$function.args$formula`.
+#' @param p_adjust_method Character string specifying the p-value adjustment method
+#'   (default `"BH"`). See `?p.adjust` for available methods.
+#' @param alpha Numeric significance level used for compact letter display grouping (default `0.05`).
+#' @param letter_set Character vector of letters or numbers used for compact letter display labels
+#'   (default `LETTERS`).
+#' @param human_sep Character string used as separator for output group labels (default `"."`).
+#'
+#' @return A list with two components:
+#' \describe{
+#'   \item{results_df}{A data frame containing pairwise comparison results with columns:
+#'     `group_1`, `group_2`, `Estimates`, `Standard Errors`, `Lower CIs`, `Upper CIs`,
+#'     `p_value`, `p_adjusted`, and `significance`.}
+#'   \item{cld_df}{A data frame containing group-level compact letter display labels.
+#'     This data frame contains one row per group with columns corresponding to
+#'     the original variables from the formula, a `cld` column for the letter code,
+#'     and group-level estimates: `estimate`, `std_error`, `lower_ci`, `upper_ci`.}
+#' }
+#'
+#' @importFrom dplyr select distinct arrange mutate across everything all_of left_join bind_rows
+#' @importFrom purrr map_dfr
+#' @importFrom tidyr separate
+#' @importFrom stringr str_replace_all
+#' @importFrom multcompView multcompLetters
+#' @importFrom tibble tibble
+#'
+#' @export
+betta_posthoc <- function(
+  test_obj,
+  metadata,
+  p_adjust_method = "BH",
+  alpha = 0.05,
+  letter_set = LETTERS,
+  human_sep = "."
+) {
+  # Input validation
+  # Check for required components
+  if (
+    !is.list(test_obj) ||
+      is.null(test_obj$function.args) ||
+      is.null(test_obj$function.args$formula)
+  ) {
+    stop(
+      "test_obj must be a fitted betta model with accessible formula in test_obj$function.args$formula"
+    )
+  }
+
+  # Additional check to ensure it has betta-like structure for betta_lincom
+  if (is.null(test_obj$table)) {
+    stop(
+      "test_obj must be a fitted betta model with a 'table' component (required for betta_lincom)"
+    )
+  }
+
+  if (!is.data.frame(metadata)) {
+    stop("metadata must be a data frame")
+  }
+
+  # Internal separator for combining group names
+  internal_sep <- "__SEP__"
+
+  # Extract the right-hand side of the formula
+  full_formula <- test_obj$function.args$formula
+  rhs_formula <- as.formula(paste("~", as.character(full_formula)[3]))
+  rhs_vars <- all.vars(rhs_formula)
+
+  # Check that all variables in formula are in metadata
+  missing_vars <- setdiff(rhs_vars, names(metadata))
+  if (length(missing_vars) > 0) {
+    stop(paste(
+      "Variables not found in metadata:",
+      paste(missing_vars, collapse = ", ")
+    ))
+  }
+
+  # Create all unique combinations of factor levels
+  combos <- metadata %>%
+    dplyr::select(dplyr::all_of(rhs_vars)) %>%
+    dplyr::distinct() %>%
+    dplyr::arrange(dplyr::across(dplyr::everything()))
+
+  # Create group labels
+  combos$group <- apply(combos, 1, paste, collapse = internal_sep)
+
+  # Create model matrix for linear combinations
+  mm <- model.matrix(rhs_formula, data = combos)
+
+  # Generate all pairwise comparisons
+  pairwise_indices <- combn(nrow(combos), 2)
+
+  # Perform pairwise comparisons using betta_lincom
+  results_df <- purrr::map_dfr(seq_len(ncol(pairwise_indices)), function(i) {
+    idx1 <- pairwise_indices[1, i]
+    idx2 <- pairwise_indices[2, i]
+
+    g1 <- combos$group[idx1]
+    g2 <- combos$group[idx2]
+
+    # Linear combination: difference between groups
+    lin_com <- mm[idx1, ] - mm[idx2, ]
+
+    # Perform linear combination test
+    res <- betta_lincom(test_obj, lin_com)
+
+    # Extract results
+    res %>%
+      dplyr::mutate(p_value = as.numeric(`p-values`)) %>%
+      dplyr::select(
+        Estimates,
+        `Standard Errors`,
+        `Lower CIs`,
+        `Upper CIs`,
+        p_value
+      ) %>%
+      dplyr::mutate(group_1 = g1, group_2 = g2, .before = Estimates)
+  })
+
+  # Add adjusted p-values and significance codes
+  results_df <- results_df %>%
+    dplyr::mutate(
+      p_adjusted = p.adjust(p_value, method = p_adjust_method),
+      significance = dplyr::case_when(
+        p_adjusted <= 0.001 ~ "***",
+        p_adjusted <= 0.01 ~ "**",
+        p_adjusted <= 0.05 ~ "*",
+        p_adjusted <= 0.1 ~ ".",
+        TRUE ~ " "
+      )
+    )
+
+  # Create p-value vector for compact letter display
+  # multcompLetters needs a named vector of p-values for all pairwise comparisons
+  pvals <- results_df$p_adjusted
+  names(pvals) <- paste(results_df$group_1, results_df$group_2, sep = "-")
+
+  # Generate compact letter display using p-value vector approach
+  cld_letters <- generate_cld(pvals, alpha, letter_set)
+
+  cld_df <- tibble::tibble(
+    group_label = names(cld_letters),
+    cld = as.character(cld_letters)
+  )
+
+  # Add group-level estimates and confidence intervals to CLD table
+  # Calculate predicted values for each group using the model matrix
+  group_estimates <- tibble::tibble()
+
+  for (i in seq_len(nrow(combos))) {
+    group_name <- combos$group[i]
+
+    # Get linear combination for this group (intercept + effects)
+    lin_com_group <- mm[i, ]
+
+    # Calculate group estimate using betta_lincom
+    group_result <- betta_lincom(test_obj, lin_com_group)
+
+    group_estimates <- dplyr::bind_rows(
+      group_estimates,
+      tibble::tibble(
+        group_label = group_name,
+        estimate = as.numeric(group_result$Estimates),
+        std_error = as.numeric(group_result$`Standard Errors`),
+        lower_ci = as.numeric(group_result$`Lower CIs`),
+        upper_ci = as.numeric(group_result$`Upper CIs`)
+      )
+    )
+  }
+
+  # Join estimates with CLD letters
+  cld_df <- cld_df %>%
+    dplyr::left_join(group_estimates, by = "group_label")
+
+  # Safely escape internal separator for tidyr::separate()
+  escaped_sep <- gsub(
+    "([\\^\\$\\.|\\?\\*\\+\\(\\)\\[\\]\\{\\}\\\\])",
+    "\\\\\\1",
+    internal_sep
+  )
+
+  cld_df <- cld_df %>%
+    tidyr::separate(
+      group_label,
+      into = rhs_vars,
+      sep = escaped_sep,
+      remove = TRUE
+    )
+
+  # Replace internal separator with readable one
+  results_df <- results_df %>%
+    dplyr::mutate(
+      group_1 = stringr::str_replace_all(group_1, internal_sep, human_sep),
+      group_2 = stringr::str_replace_all(group_2, internal_sep, human_sep)
+    )
+
+  cld_df <- cld_df %>%
+    dplyr::mutate(
+      dplyr::across(
+        dplyr::all_of(rhs_vars),
+        ~ stringr::str_replace_all(., internal_sep, human_sep)
+      )
+    ) %>%
+    # Also clean up the group_label if it still exists
+    dplyr::mutate(
+      dplyr::across(
+        dplyr::any_of("group_label"),
+        ~ stringr::str_replace_all(., internal_sep, human_sep)
+      )
+    )
+
+  # Return results as a proper betta_posthoc object
+  new_betta_posthoc(results_df, cld_df)
+}
+
+
+#' Create a betta_posthoc object with proper class
+#'
+#' @param results_df Pairwise comparison results
+#' @param cld_df Compact letter display results
+#' @return A betta_posthoc object
+#' @keywords internal
+new_betta_posthoc <- function(results_df, cld_df) {
+  structure(
+    list(
+      results_df = results_df,
+      cld_df = cld_df
+    ),
+    class = "betta_posthoc"
+  )
+}

From beb99e1207a5beb7e72422d0e31aeda04b9b4dea Mon Sep 17 00:00:00 2001
From: Sarah Teichman <teichs@uw.edu>
Date: Thu, 17 Jul 2025 08:58:21 -0400
Subject: [PATCH 2/3] making small updates to betta_posthoc and running it

---
 DESCRIPTION       |  5 ++++-
 R/betta_posthoc.R | 28 ++++++++++++++++++++--------
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 1180aeb..baefbaf 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -35,7 +35,10 @@ Suggests:
     testthat,
     ggplot2,
     R.rsp,
-    speedyseq
+    speedyseq,
+    multcompView,
+    purrr,
+    stringr
 Additional_repositories: https://github.com/mikemc/speedyseq
 Encoding: UTF-8
 LazyData: true
diff --git a/R/betta_posthoc.R b/R/betta_posthoc.R
index 9aba367..efe673c 100644
--- a/R/betta_posthoc.R
+++ b/R/betta_posthoc.R
@@ -64,6 +64,18 @@ betta_posthoc <- function(
   letter_set = LETTERS,
   human_sep = "."
 ) {
+  
+  # check that required packages are installed (since these are Suggests for this package)
+  if (!requireNamespace("multcompView", quietly = TRUE)) {
+    stop("Package 'multicompView' is required for this function. Please install it.")
+  }
+  if (!requireNamespace("purrr", quietly = TRUE)) {
+    stop("Package 'purrr' is required for this function. Please install it.")
+  }
+  if (!requireNamespace("stringr", quietly = TRUE)) {
+    stop("Package 'stringr' is required for this function. Please install it.")
+  }
+  
   # Input validation
   # Check for required components
   if (
@@ -149,14 +161,14 @@ betta_posthoc <- function(
   # Add adjusted p-values and significance codes
   results_df <- results_df %>%
     dplyr::mutate(
-      p_adjusted = p.adjust(p_value, method = p_adjust_method),
-      significance = dplyr::case_when(
-        p_adjusted <= 0.001 ~ "***",
-        p_adjusted <= 0.01 ~ "**",
-        p_adjusted <= 0.05 ~ "*",
-        p_adjusted <= 0.1 ~ ".",
-        TRUE ~ " "
-      )
+      p_adjusted = p.adjust(p_value, method = p_adjust_method)#,
+      #significance = dplyr::case_when(
+      #  p_adjusted <= 0.001 ~ "***",
+      #  p_adjusted <= 0.01 ~ "**",
+      #  p_adjusted <= 0.05 ~ "*",
+      #  p_adjusted <= 0.1 ~ ".",
+      #  TRUE ~ " "
+      #)
     )
 
   # Create p-value vector for compact letter display

From becd145153814861ef0aef9659b72a3626e9cc29 Mon Sep 17 00:00:00 2001
From: Sarah Teichman <teichs@uw.edu>
Date: Tue, 29 Jul 2025 10:24:40 -0700
Subject: [PATCH 3/3] small tweaks to pull request, including adding
 documentation and including necessary functions

---
 DESCRIPTION                         | 10 +++---
 NAMESPACE                           | 12 +++++++
 R/betta_posthoc.R                   | 13 +------
 R/utility.R                         |  2 +-
 man/betta_posthoc.Rd                | 53 +++++++++++++++++++++++++++++
 man/generate_cld.Rd                 | 22 ++++++++++++
 man/new_betta_posthoc.Rd            | 20 +++++++++++
 man/pick_base.Rd                    |  3 ++
 tests/testthat/test-betta_posthoc.R | 17 +++++++++
 9 files changed, 134 insertions(+), 18 deletions(-)
 create mode 100644 man/betta_posthoc.Rd
 create mode 100644 man/generate_cld.Rd
 create mode 100644 man/new_betta_posthoc.Rd
 create mode 100644 tests/testthat/test-betta_posthoc.R

diff --git a/DESCRIPTION b/DESCRIPTION
index baefbaf..b1b45b0 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -21,7 +21,10 @@ Imports:
     stats,
     tibble,
     tidyr,
-    utils
+    utils,
+    multcompView,
+    purrr,
+    stringr
 Suggests:
     doSNOW,
     glasso,
@@ -36,11 +39,8 @@ Suggests:
     ggplot2,
     R.rsp,
     speedyseq,
-    multcompView,
-    purrr,
-    stringr
 Additional_repositories: https://github.com/mikemc/speedyseq
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.1.2
+RoxygenNote: 7.3.2
 VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
index 7ca087c..e3a47c3 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -4,6 +4,7 @@ S3method(plot,diversityEstimates)
 S3method(print,diversityEstimates)
 export(MCmat)
 export(bc_fast)
+export(betta_posthoc)
 export(bray_curtis_true)
 export(divnet)
 export(euc_fast)
@@ -27,16 +28,27 @@ import(stats)
 importFrom(MASS,ginv)
 importFrom(MASS,mvrnorm)
 importFrom(breakaway,make_design_matrix)
+importFrom(dplyr,across)
+importFrom(dplyr,all_of)
+importFrom(dplyr,arrange)
+importFrom(dplyr,bind_rows)
 importFrom(dplyr,distinct)
+importFrom(dplyr,everything)
+importFrom(dplyr,left_join)
 importFrom(dplyr,mutate)
 importFrom(dplyr,select)
 importFrom(magrittr,"%>%")
+importFrom(multcompView,multcompLetters)
 importFrom(mvnfast,rmvn)
 importFrom(phyloseq,get_variable)
 importFrom(phyloseq,otu_table)
 importFrom(phyloseq,sample_data)
 importFrom(phyloseq,sample_names)
+importFrom(purrr,map_dfr)
+importFrom(stringr,str_replace_all)
 importFrom(tibble,add_column)
 importFrom(tibble,rownames_to_column)
+importFrom(tibble,tibble)
 importFrom(tidyr,gather)
+importFrom(tidyr,separate)
 importFrom(utils,globalVariables)
diff --git a/R/betta_posthoc.R b/R/betta_posthoc.R
index efe673c..0db7f4b 100644
--- a/R/betta_posthoc.R
+++ b/R/betta_posthoc.R
@@ -65,17 +65,6 @@ betta_posthoc <- function(
   human_sep = "."
 ) {
   
-  # check that required packages are installed (since these are Suggests for this package)
-  if (!requireNamespace("multcompView", quietly = TRUE)) {
-    stop("Package 'multicompView' is required for this function. Please install it.")
-  }
-  if (!requireNamespace("purrr", quietly = TRUE)) {
-    stop("Package 'purrr' is required for this function. Please install it.")
-  }
-  if (!requireNamespace("stringr", quietly = TRUE)) {
-    stop("Package 'stringr' is required for this function. Please install it.")
-  }
-  
   # Input validation
   # Check for required components
   if (
@@ -129,7 +118,7 @@ betta_posthoc <- function(
   mm <- model.matrix(rhs_formula, data = combos)
 
   # Generate all pairwise comparisons
-  pairwise_indices <- combn(nrow(combos), 2)
+  pairwise_indices <- utils::combn(nrow(combos), 2)
 
   # Perform pairwise comparisons using betta_lincom
   results_df <- purrr::map_dfr(seq_len(ncol(pairwise_indices)), function(i) {
diff --git a/R/utility.R b/R/utility.R
index 852ad30..796be3d 100644
--- a/R/utility.R
+++ b/R/utility.R
@@ -51,7 +51,7 @@ acomb3 <- function(...) abind(..., along = 3)
 #' @param automatic_cutoff Choose detection cutoff automatically? Default is 
 #' FALSE. If TRUE, detection_cutoff will be set equal to the maximum proportion
 #' of samples any taxon is detected in.
-#' @value Index corresponding to taxon chosen as base taxon
+#' @return Index corresponding to taxon chosen as base taxon
 #' @author Amy Willis
 #' @export
 pick_base <- function(W,
diff --git a/man/betta_posthoc.Rd b/man/betta_posthoc.Rd
new file mode 100644
index 0000000..8060acb
--- /dev/null
+++ b/man/betta_posthoc.Rd
@@ -0,0 +1,53 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/betta_posthoc.R
+\name{betta_posthoc}
+\alias{betta_posthoc}
+\title{Post-hoc pairwise comparisons and compact letter display from diversity test object}
+\usage{
+betta_posthoc(
+  test_obj,
+  metadata,
+  p_adjust_method = "BH",
+  alpha = 0.05,
+  letter_set = LETTERS,
+  human_sep = "."
+)
+}
+\arguments{
+\item{test_obj}{An object returned from the initial diversity test function (`betta()`).
+This object must contain a formula accessible as `test_obj$function.args$formula`,
+where the RHS defines the model terms used in `model.matrix`.}
+
+\item{metadata}{A data frame containing the variables used in the model formula.
+Each variable should correspond to a column used in `test_obj$function.args$formula`.}
+
+\item{p_adjust_method}{Character string specifying the p-value adjustment method
+(default `"BH"`). See `?p.adjust` for available methods.}
+
+\item{alpha}{Numeric significance level used for compact letter display grouping (default `0.05`).}
+
+\item{letter_set}{Character vector of letters or numbers used for compact letter display labels
+(default `LETTERS`).}
+
+\item{human_sep}{Character string used as separator for output group labels (default `"."`).}
+}
+\value{
+A list with two components:
+\describe{
+  \item{results_df}{A data frame containing pairwise comparison results with columns:
+    `group_1`, `group_2`, `Estimates`, `Standard Errors`, `Lower CIs`, `Upper CIs`,
+    `p_value`, `p_adjusted`, and `significance`.}
+  \item{cld_df}{A data frame containing group-level compact letter display labels.
+    This data frame contains one row per group with columns corresponding to
+    the original variables from the formula, a `cld` column for the letter code,
+    and group-level estimates: `estimate`, `std_error`, `lower_ci`, `upper_ci`.}
+}
+}
+\description{
+Performs pairwise post-hoc linear combination tests based on a fitted betta model object
+(`test_obj`) that contains a formula of the form `estimate ~ ...`. The function
+extracts the model formula and metadata variables, constructs all pairwise
+comparisons between levels, runs `betta_lincom()` on each linear contrast,
+adjusts p-values, computes significance codes, and returns compact letter display (CLD)
+indicating significantly different groups.
+}
diff --git a/man/generate_cld.Rd b/man/generate_cld.Rd
new file mode 100644
index 0000000..0156c76
--- /dev/null
+++ b/man/generate_cld.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/betta_posthoc.R
+\name{generate_cld}
+\alias{generate_cld}
+\title{Helper function to generate compact letter display}
+\usage{
+generate_cld(pvals, alpha = 0.05, letter_set = LETTERS)
+}
+\arguments{
+\item{pvals}{Named vector of p-values for pairwise comparisons}
+
+\item{alpha}{Significance threshold}
+
+\item{letter_set}{Character vector of letters to use}
+}
+\value{
+Named vector of compact letter display codes
+}
+\description{
+Helper function to generate compact letter display
+}
+\keyword{internal}
diff --git a/man/new_betta_posthoc.Rd b/man/new_betta_posthoc.Rd
new file mode 100644
index 0000000..55cc9bf
--- /dev/null
+++ b/man/new_betta_posthoc.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/betta_posthoc.R
+\name{new_betta_posthoc}
+\alias{new_betta_posthoc}
+\title{Create a betta_posthoc object with proper class}
+\usage{
+new_betta_posthoc(results_df, cld_df)
+}
+\arguments{
+\item{results_df}{Pairwise comparison results}
+
+\item{cld_df}{Compact letter display results}
+}
+\value{
+A betta_posthoc object
+}
+\description{
+Create a betta_posthoc object with proper class
+}
+\keyword{internal}
diff --git a/man/pick_base.Rd b/man/pick_base.Rd
index c780238..15a6aaf 100644
--- a/man/pick_base.Rd
+++ b/man/pick_base.Rd
@@ -17,6 +17,9 @@ unless automatic_cutoff is set to TRUE.}
 FALSE. If TRUE, detection_cutoff will be set equal to the maximum proportion
 of samples any taxon is detected in.}
 }
+\value{
+Index corresponding to taxon chosen as base taxon
+}
 \description{
 Picks the base taxon to be used in divnet fit. If no taxon is detected
 in all samples, returns error; in this case, we recommend manually choosing 
diff --git a/tests/testthat/test-betta_posthoc.R b/tests/testthat/test-betta_posthoc.R
new file mode 100644
index 0000000..b7e9a37
--- /dev/null
+++ b/tests/testthat/test-betta_posthoc.R
@@ -0,0 +1,17 @@
+data("Lee")
+lee_phy <- phyloseq::tax_glom(Lee, taxrank = "Phylum")
+
+test_that("betta_posthoc function works as expected", {
+  div_obj <- divnet(lee_phy)
+  bett_obj <- betta(data = data.frame(sample_data(lee_phy),
+                                      chats = sapply(div_obj$shannon, function(x) x$estimate),
+                                      ses = sapply(div_obj$shannon, function(x) x$error)),
+                    formula = chats ~ char,
+                    ses = "ses")
+  bett_post <- betta_posthoc(test_obj = bett_obj,
+                             metadata = data.frame(sample_data(lee_phy)))
+  expect_true(all.equal(bett_obj$table[-1, 1] %>% unname(), 
+                        -bett_post$results_df$Estimates[1:4] %>% unname()))
+  # we expect estimates to align when comparing baseline level to other levels, not 
+  # p-values because of differences in how these are calculated in betta and betta_lincom
+})