From 4cabdf9154fcd2e5cd3e0cc254f838a8d0dd8f6b Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 24 Jul 2025 16:02:09 -0400 Subject: [PATCH 1/5] add logfc filter and filter exempt proteins filter --- R/getSubnetworkFromIndra.R | 9 +++++++-- R/utils_getSubnetworkFromIndra.R | 26 ++++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/R/getSubnetworkFromIndra.R b/R/getSubnetworkFromIndra.R index e875977..3852abc 100644 --- a/R/getSubnetworkFromIndra.R +++ b/R/getSubnetworkFromIndra.R @@ -24,6 +24,9 @@ #' 0.3 #' @param sources_filter filtering only on specific sources. Default is no filter, i.e. NULL. #' Otherwise, should be a list, e.g. c('reach', 'medscan'). +#' @param logfc_cutoff filtering only on proteins with absolute log fold change +#' greater than a cutoff. +#' @param filter_exempt_proteins list of proteins to exempt in filtering. Default is NULL. #' #' @return list of 2 data.frames, nodes and edges #' @@ -45,8 +48,10 @@ getSubnetworkFromIndra <- function(input, paper_count_cutoff = 1, evidence_count_cutoff = 1, correlation_cutoff = 0.3, - sources_filter = NULL) { - input <- .filterGetSubnetworkFromIndraInput(input, pvalueCutoff) + sources_filter = NULL, + logfc_cutoff = NULL, + filter_exempt_proteins = NULL) { + input <- .filterGetSubnetworkFromIndraInput(input, pvalueCutoff, logfc_cutoff, filter_exempt_proteins) .validateGetSubnetworkFromIndraInput(input, protein_level_data, sources_filter) res <- .callIndraCogexApi(input$HgncId) res <- .filterIndraResponse(res, statement_types, evidence_count_cutoff, sources_filter) diff --git a/R/utils_getSubnetworkFromIndra.R b/R/utils_getSubnetworkFromIndra.R index 318c7b6..66362db 100644 --- a/R/utils_getSubnetworkFromIndra.R +++ b/R/utils_getSubnetworkFromIndra.R @@ -86,19 +86,41 @@ #' Filter groupComparison result input based on user-defined cutoffs #' @param input groupComparison result #' @param pvalueCutoff p-value cutoff +#' @param logfc_cutoff logFC cutoff +#' @param 
filter_exempt_proteins list of proteins to exempt from filtering #' @return filtered groupComparison result #' @keywords internal #' @noRd -.filterGetSubnetworkFromIndraInput <- function(input, pvalueCutoff) { +.filterGetSubnetworkFromIndraInput <- function(input, pvalueCutoff, logfc_cutoff, filter_exempt_proteins) { + # Extract exempt proteins before any filtering + exempt_proteins <- NULL + if (!is.null(filter_exempt_proteins)) { + if (!is.character(filter_exempt_proteins)) { + stop("filter_exempt_proteins must be a character vector") + } + exempt_proteins <- input[input$Protein %in% filter_exempt_proteins,] + } + + # Apply standard filtering input <- input[!is.na(input$adj.pvalue),] if (!is.null(pvalueCutoff)) { input <- input[input$adj.pvalue < pvalueCutoff, ] } + if (!is.null(logfc_cutoff)) { + input <- input[input$log2FC > logfc_cutoff | input$log2FC < -logfc_cutoff, ] + } input <- input[is.na(input$issue), ] + + # Combine filtered data with exempt proteins and remove duplicates + if (!is.null(exempt_proteins) && nrow(exempt_proteins) > 0) { + combined_input <- rbind(input, exempt_proteins) + # Remove duplicates based on Protein column, keeping first occurrence + input <- combined_input[!duplicated(combined_input$Protein), ] + } + input$Protein <- as.character(input$Protein) return(input) } - #' Add additional metadata to an edge #' @param edge object representation of an INDRA statement #' @param input filtered groupComparison result From 007087eaec51e5aee4c7d0c14ea103d2cbf07937 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 24 Jul 2025 17:09:40 -0400 Subject: [PATCH 2/5] update chatgpt comments --- R/utils_getSubnetworkFromIndra.R | 7 +++++-- man/getSubnetworkFromIndra.Rd | 9 ++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/R/utils_getSubnetworkFromIndra.R b/R/utils_getSubnetworkFromIndra.R index 66362db..9135520 100644 --- a/R/utils_getSubnetworkFromIndra.R +++ b/R/utils_getSubnetworkFromIndra.R @@ -107,13 +107,16 @@ input <- 
input[input$adj.pvalue < pvalueCutoff, ] } if (!is.null(logfc_cutoff)) { - input <- input[input$log2FC > logfc_cutoff | input$log2FC < -logfc_cutoff, ] + if (!is.numeric(logfc_cutoff) || length(logfc_cutoff) != 1 || logfc_cutoff <= 0) { + stop("logfc_cutoff must be a single positive numeric value") + } + input <- input[!is.na(input$log2FC) & abs(input$log2FC) > logfc_cutoff, ] } input <- input[is.na(input$issue), ] # Combine filtered data with exempt proteins and remove duplicates if (!is.null(exempt_proteins) && nrow(exempt_proteins) > 0) { - combined_input <- rbind(input, exempt_proteins) + combined_input <- rbind(exempt_proteins, input) # Remove duplicates based on Protein column, keeping first occurrence input <- combined_input[!duplicated(combined_input$Protein), ] } diff --git a/man/getSubnetworkFromIndra.Rd b/man/getSubnetworkFromIndra.Rd index e1c9d6d..806adc3 100644 --- a/man/getSubnetworkFromIndra.Rd +++ b/man/getSubnetworkFromIndra.Rd @@ -12,7 +12,9 @@ getSubnetworkFromIndra( paper_count_cutoff = 1, evidence_count_cutoff = 1, correlation_cutoff = 0.3, - sources_filter = NULL + sources_filter = NULL, + logfc_cutoff = NULL, + filter_exempt_proteins = NULL ) } \arguments{ @@ -44,6 +46,11 @@ cutoff for edges with correlation less than a specified cutoff. Default is \item{sources_filter}{filtering only on specific sources. Default is no filter, i.e. NULL. Otherwise, should be a list, e.g. c('reach', 'medscan').} + +\item{logfc_cutoff}{filtering only on proteins with absolute log fold change +greater than a cutoff.} + +\item{filter_exempt_proteins}{list of proteins to exempt in filtering. 
Default is NULL.} } \value{ list of 2 data.frames, nodes and edges From a29e88c9d05611336a85e5faa95b099dee3ee248 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 24 Jul 2025 17:11:30 -0400 Subject: [PATCH 3/5] additional suggestions --- R/utils_getSubnetworkFromIndra.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/utils_getSubnetworkFromIndra.R b/R/utils_getSubnetworkFromIndra.R index 9135520..1c9b958 100644 --- a/R/utils_getSubnetworkFromIndra.R +++ b/R/utils_getSubnetworkFromIndra.R @@ -98,6 +98,10 @@ if (!is.character(filter_exempt_proteins)) { stop("filter_exempt_proteins must be a character vector") } + missing_prots <- setdiff(filter_exempt_proteins, input$Protein) + if (length(missing_prots) > 0) { + warning("filter_exempt_proteins not found: ", paste(missing_prots, collapse = ", ")) + } exempt_proteins <- input[input$Protein %in% filter_exempt_proteins,] } From 0861e2f4b9f327d90899e32f5b1cb178b6358ca2 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 24 Jul 2025 17:14:49 -0400 Subject: [PATCH 4/5] update docs based on chatgpt feedback --- R/getSubnetworkFromIndra.R | 9 ++++++--- man/getSubnetworkFromIndra.Rd | 9 ++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/R/getSubnetworkFromIndra.R b/R/getSubnetworkFromIndra.R index 3852abc..e30eaf6 100644 --- a/R/getSubnetworkFromIndra.R +++ b/R/getSubnetworkFromIndra.R @@ -24,9 +24,12 @@ #' 0.3 #' @param sources_filter filtering only on specific sources. Default is no filter, i.e. NULL. #' Otherwise, should be a list, e.g. c('reach', 'medscan'). -#' @param logfc_cutoff filtering only on proteins with absolute log fold change -#' greater than a cutoff. -#' @param filter_exempt_proteins list of proteins to exempt in filtering. Default is NULL. +#' @param logfc_cutoff absolute log fold change cutoff for filtering proteins. +#' Only proteins with |logFC| greater than this value will be retained. Default +#' is NULL, i.e. no logFC filtering.
+#' @param filter_exempt_proteins character vector of protein identifiers to exempt +#' from all filtering steps. These proteins will be retained regardless of p-value, +#' logFC, or other filtering criteria. Default is NULL, i.e. no exemptions. #' #' @return list of 2 data.frames, nodes and edges #' diff --git a/man/getSubnetworkFromIndra.Rd b/man/getSubnetworkFromIndra.Rd index 806adc3..6f11a1c 100644 --- a/man/getSubnetworkFromIndra.Rd +++ b/man/getSubnetworkFromIndra.Rd @@ -47,10 +47,13 @@ cutoff for edges with correlation less than a specified cutoff. Default is \item{sources_filter}{filtering only on specific sources. Default is no filter, i.e. NULL. Otherwise, should be a list, e.g. c('reach', 'medscan').} -\item{logfc_cutoff}{filtering only on proteins with absolute log fold change -greater than a cutoff.} +\item{logfc_cutoff}{absolute log fold change cutoff for filtering proteins. +Only proteins with |logFC| greater than this value will be retained. Default +is NULL, i.e. no logFC filtering.} -\item{filter_exempt_proteins}{list of proteins to exempt in filtering. Default is NULL.} +\item{filter_exempt_proteins}{character vector of protein identifiers to exempt +from all filtering steps. These proteins will be retained regardless of p-value, +logFC, or other filtering criteria. Default is NULL, i.e. 
no exemptions.} } \value{ list of 2 data.frames, nodes and edges From 340b385300892980066dbb89e2188673f73181e4 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 24 Jul 2025 17:51:53 -0400 Subject: [PATCH 5/5] change parameter to force include proteins --- R/getSubnetworkFromIndra.R | 6 +++--- R/utils_getSubnetworkFromIndra.R | 16 ++++++++-------- man/getSubnetworkFromIndra.Rd | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/R/getSubnetworkFromIndra.R b/R/getSubnetworkFromIndra.R index e30eaf6..af40f3f 100644 --- a/R/getSubnetworkFromIndra.R +++ b/R/getSubnetworkFromIndra.R @@ -27,7 +27,7 @@ #' @param logfc_cutoff absolute log fold change cutoff for filtering proteins. #' Only proteins with |logFC| greater than this value will be retained. Default #' is NULL, i.e. no logFC filtering. -#' @param filter_exempt_proteins character vector of protein identifiers to exempt +#' @param force_include_proteins character vector of protein identifiers to exempt #' from all filtering steps. These proteins will be retained regardless of p-value, #' logFC, or other filtering criteria. Default is NULL, i.e. no exemptions. 
#' @@ -53,8 +53,8 @@ getSubnetworkFromIndra <- function(input, correlation_cutoff = 0.3, sources_filter = NULL, logfc_cutoff = NULL, - filter_exempt_proteins = NULL) { - input <- .filterGetSubnetworkFromIndraInput(input, pvalueCutoff, logfc_cutoff, filter_exempt_proteins) + force_include_proteins = NULL) { + input <- .filterGetSubnetworkFromIndraInput(input, pvalueCutoff, logfc_cutoff, force_include_proteins) .validateGetSubnetworkFromIndraInput(input, protein_level_data, sources_filter) res <- .callIndraCogexApi(input$HgncId) res <- .filterIndraResponse(res, statement_types, evidence_count_cutoff, sources_filter) diff --git a/R/utils_getSubnetworkFromIndra.R b/R/utils_getSubnetworkFromIndra.R index 1c9b958..cc13156 100644 --- a/R/utils_getSubnetworkFromIndra.R +++ b/R/utils_getSubnetworkFromIndra.R @@ -87,22 +87,22 @@ #' @param input groupComparison result #' @param pvalueCutoff p-value cutoff #' @param logfc_cutoff logFC cutoff -#' @param filter_exempt_proteins list of proteins to exempt from filtering +#' @param force_include_proteins list of proteins to exempt from filtering #' @return filtered groupComparison result #' @keywords internal #' @noRd -.filterGetSubnetworkFromIndraInput <- function(input, pvalueCutoff, logfc_cutoff, filter_exempt_proteins) { +.filterGetSubnetworkFromIndraInput <- function(input, pvalueCutoff, logfc_cutoff, force_include_proteins) { # Extract exempt proteins before any filtering exempt_proteins <- NULL - if (!is.null(filter_exempt_proteins)) { - if (!is.character(filter_exempt_proteins)) { - stop("filter_exempt_proteins must be a character vector") + if (!is.null(force_include_proteins)) { + if (!is.character(force_include_proteins)) { + stop("force_include_proteins must be a character vector") } - missing_prots <- setdiff(filter_exempt_proteins, input$Protein) + missing_prots <- setdiff(force_include_proteins, input$Protein) if (length(missing_prots) > 0) { - warning("filter_exempt_proteins not found: ", paste(missing_prots, 
collapse = ", ")) + warning("force_include_proteins not found: ", paste(missing_prots, collapse = ", ")) } - exempt_proteins <- input[input$Protein %in% filter_exempt_proteins,] + exempt_proteins <- input[input$Protein %in% force_include_proteins,] } # Apply standard filtering diff --git a/man/getSubnetworkFromIndra.Rd b/man/getSubnetworkFromIndra.Rd index 6f11a1c..bc84f34 100644 --- a/man/getSubnetworkFromIndra.Rd +++ b/man/getSubnetworkFromIndra.Rd @@ -14,7 +14,7 @@ getSubnetworkFromIndra( correlation_cutoff = 0.3, sources_filter = NULL, logfc_cutoff = NULL, - filter_exempt_proteins = NULL + force_include_proteins = NULL ) } \arguments{ @@ -51,7 +51,7 @@ Otherwise, should be a list, e.g. c('reach', 'medscan').} Only proteins with |logFC| greater than this value will be retained. Default is NULL, i.e. no logFC filtering.} -\item{filter_exempt_proteins}{character vector of protein identifiers to exempt +\item{force_include_proteins}{character vector of protein identifiers to exempt from all filtering steps. These proteins will be retained regardless of p-value, logFC, or other filtering criteria. Default is NULL, i.e. no exemptions.} }