diff --git a/R/getPathwaysFromIndra.R b/R/getPathwaysFromIndra.R index 06b1a45..76c654e 100644 --- a/R/getPathwaysFromIndra.R +++ b/R/getPathwaysFromIndra.R @@ -107,8 +107,8 @@ getPathwaysFromIndra <- function(annotated_df, main_target = 'MEN1_HUMAN', targe edgeToMetadataMapping[[key]]$data$stmt_type <- c(edge$type) edgeToMetadataMapping[[key]]$source_id <- source_id edgeToMetadataMapping[[key]]$target_id <- obj - edgeToMetadataMapping[[key]] <- MSstatsBioNet:::.addAdditionalMetadataToIndraEdge( - edgeToMetadataMapping[[key]], annotated_df, namespace + edgeToMetadataMapping[[key]] <- .addAdditionalMetadataToIndraEdge( + edgeToMetadataMapping[[key]], annotated_df ) } } diff --git a/R/getSubnetworkFromIndra.R b/R/getSubnetworkFromIndra.R index af40f3f..d8a1896 100644 --- a/R/getSubnetworkFromIndra.R +++ b/R/getSubnetworkFromIndra.R @@ -30,6 +30,9 @@ #' @param force_include_proteins character vector of protein identifiers to exempt #' from all filtering steps. These proteins will be retained regardless of p-value, #' logFC, or other filtering criteria. Default is NULL, i.e. no exemptions. +#' @param force_include_other character vector of identifiers to include in the +#' network, whether or not those IDs are in the input data. Should be formatted +#' as "namespace:identifier", e.g. "HGNC:1234" or "CHEBI:4911". 
#' #' @return list of 2 data.frames, nodes and edges #' @@ -53,10 +56,11 @@ getSubnetworkFromIndra <- function(input, correlation_cutoff = 0.3, sources_filter = NULL, logfc_cutoff = NULL, - force_include_proteins = NULL) { + force_include_proteins = NULL, + force_include_other = NULL) { input <- .filterGetSubnetworkFromIndraInput(input, pvalueCutoff, logfc_cutoff, force_include_proteins) - .validateGetSubnetworkFromIndraInput(input, protein_level_data, sources_filter) - res <- .callIndraCogexApi(input$HgncId) + .validateGetSubnetworkFromIndraInput(input, protein_level_data, sources_filter, force_include_other) + res <- .callIndraCogexApi(input$HgncId, force_include_other) res <- .filterIndraResponse(res, statement_types, evidence_count_cutoff, sources_filter) edges <- .constructEdgesDataFrame(res, input, protein_level_data) edges <- .filterEdgesDataFrame(edges, paper_count_cutoff, correlation_cutoff) diff --git a/R/utils_getSubnetworkFromIndra.R b/R/utils_getSubnetworkFromIndra.R index cb25998..e514d52 100644 --- a/R/utils_getSubnetworkFromIndra.R +++ b/R/utils_getSubnetworkFromIndra.R @@ -2,13 +2,16 @@ #' @param input dataframe from MSstats groupComparison output #' @param protein_level_data dataframe from MSstats dataProcess output #' @param sources_filter sources filter +#' @param force_include_other character vector of identifiers to include in the +#' network. #' @keywords internal #' @noRd -.validateGetSubnetworkFromIndraInput <- function(input, protein_level_data, sources_filter) { +.validateGetSubnetworkFromIndraInput <- function(input, protein_level_data, sources_filter, force_include_other) { if (!"HgncId" %in% colnames(input)) { stop("Invalid Input Error: Input must contain a column named 'HgncId'.") } - if (nrow(input) >= 400) { + num_proteins = nrow(input) + ifelse(!is.null(force_include_other), length(force_include_other), 0) + if (num_proteins >= 400) { stop("Invalid Input Error: INDRA query must contain less than 400 proteins. 
Consider lowering your p-value cutoff") } if (nrow(input) == 0) { @@ -28,16 +31,26 @@ #' Call INDRA Cogex API and return response #' @param hgncIds list of hgnc ids +#' @param force_include_other list of identifiers to include in the network #' @return list of INDRA statements #' @importFrom jsonlite toJSON #' @importFrom httr POST add_headers content #' @keywords internal #' @noRd -.callIndraCogexApi <- function(hgncIds) { +.callIndraCogexApi <- function(hgncIds, force_include_other) { indraCogexUrl <- "https://discovery.indra.bio/api/indra_subnetwork_relations" groundings <- lapply(hgncIds, function(x) list("HGNC", x)) + if (!is.null(force_include_other)) { + groundings <- c(groundings, lapply(force_include_other, function(x) { + parts <- unlist(strsplit(x, ":")) + if (length(parts) != 2) { + stop(paste0("Invalid identifier format: ", x, ". Expected format is 'namespace:identifier', e.g. 'HGNC:1234' or 'CHEBI:4911'.")) + } + list(parts[1], parts[2]) + })) + } groundings <- list(nodes = groundings) groundings <- jsonlite::toJSON(groundings, auto_unbox = TRUE) @@ -133,40 +146,31 @@ #' Add additional metadata to an edge #' @param edge object representation of an INDRA statement #' @param input filtered groupComparison result -#' @param source_namespace namespace of the source for evidence URL #' @return edge with additional metadata #' @keywords internal #' @noRd -.addAdditionalMetadataToIndraEdge <- function(edge, input, source_namespace = "@HGNC") { +.addAdditionalMetadataToIndraEdge <- function(edge, input) { edge$evidence_list <- paste( "https://db.indra.bio/statements/from_agents?subject=", - edge$source_id, source_namespace, "&object=", - edge$target_id, "@HGNC&format=html", + edge$source_id, "@", edge$source_ns, "&object=", + edge$target_id, "@", edge$target_ns, "&format=html", sep = "" ) # Convert back to uniprot IDs - if (source_namespace == "@HGNC") { - matched_rows_source <- input[which(input$HgncId == edge$source_id), ] + matched_rows_source <- 
input[which(input$HgncId == edge$source_id), ] + if (nrow(matched_rows_source) != 1) { + edge$source_uniprot_id <- edge$source_name + } else { + edge$source_uniprot_id <- matched_rows_source$Protein } - matched_rows_target <- input[which(input$HgncId == edge$target_id), ] - - if ((source_namespace == "@HGNC" && nrow(matched_rows_source) != 1) || nrow(matched_rows_target) != 1) { - stop(paste0( - "INDRA Exception: Unexpected number of matches for the following HGNC IDs in the input data: ", - edge$source_id, - " or ", - edge$target_id, - ". Each ID must match exactly one entry in the input data, but 0 or multiple matches were found. Please check the input data for duplicates or missing entries." - )) - } - if (source_namespace == "@HGNC") { - edge$source_uniprot_id <- matched_rows_source$Protein + matched_rows_target <- input[which(input$HgncId == edge$target_id), ] + if (nrow(matched_rows_target) != 1) { + edge$target_uniprot_id <- edge$target_name } else { - edge$source_uniprot_id <- edge$source_id + edge$target_uniprot_id <- matched_rows_target$Protein } - edge$target_uniprot_id <- matched_rows_target$Protein return(edge) } @@ -258,14 +262,25 @@ #' @keywords internal #' @noRd .constructNodesDataFrame <- function(input, edges) { + # Get unique nodes from edges + node_ids <- unique(c(edges$source, edges$target)) + + # Create base nodes dataframe nodes <- data.frame( - id = input$Protein, - logFC = input$log2FC, - adj.pvalue = input$adj.pvalue, - hgncName = if ("HgncName" %in% colnames(input) && is.character(input$HgncName)) input$HgncName else NA, + id = node_ids, stringsAsFactors = FALSE ) - nodes <- nodes[which(nodes$id %in% edges$source | nodes$id %in% edges$target),] + + # Add attributes from input where available + nodes$logFC <- input$log2FC[match(nodes$id, input$Protein)] + nodes$adj.pvalue <- input$adj.pvalue[match(nodes$id, input$Protein)] + nodes$hgncName <- if ("HgncName" %in% colnames(input) && is.character(input$HgncName)) { + hgnc_value <- 
input$HgncName[match(nodes$id, input$Protein)] + ifelse(is.na(hgnc_value), nodes$id, hgnc_value) + } else { + nodes$id + } + return(nodes) } diff --git a/man/getSubnetworkFromIndra.Rd b/man/getSubnetworkFromIndra.Rd index bc84f34..2fc5e80 100644 --- a/man/getSubnetworkFromIndra.Rd +++ b/man/getSubnetworkFromIndra.Rd @@ -14,7 +14,8 @@ getSubnetworkFromIndra( correlation_cutoff = 0.3, sources_filter = NULL, logfc_cutoff = NULL, - force_include_proteins = NULL + force_include_proteins = NULL, + force_include_other = NULL ) } \arguments{ @@ -54,6 +55,10 @@ is NULL, i.e. no logFC filtering.} \item{force_include_proteins}{character vector of protein identifiers to exempt from all filtering steps. These proteins will be retained regardless of p-value, logFC, or other filtering criteria. Default is NULL, i.e. no exemptions.} + +\item{force_include_other}{character vector of identifiers to include in the +network, whether or not those IDs are in the input data. Should be formatted +as "namespace:identifier", e.g. 
"HGNC:1234" or "CHEBI:4911".} } \value{ list of 2 data.frames, nodes and edges diff --git a/tests/testthat/test-getSubnetworkFromIndra.R b/tests/testthat/test-getSubnetworkFromIndra.R index 9e1e487..ddeca35 100644 --- a/tests/testthat/test-getSubnetworkFromIndra.R +++ b/tests/testthat/test-getSubnetworkFromIndra.R @@ -2,7 +2,7 @@ test_that("getSubnetworkFromIndra works correctly", { input <- data.table::fread( system.file("extdata/groupComparisonModel.csv", package = "MSstatsBioNet") ) - local_mocked_bindings(.callIndraCogexApi = function(x) { + local_mocked_bindings(.callIndraCogexApi = function(x,y) { return(readRDS(system.file("extdata/indraResponse.rds", package = "MSstatsBioNet"))) }) suppressWarnings(subnetwork <- getSubnetworkFromIndra(input, statement_types = c("Activation", "Phosphorylation"))) @@ -14,7 +14,7 @@ test_that("getSubnetworkFromIndra with different statement type works correctly" input <- data.table::fread( system.file("extdata/groupComparisonModel.csv", package = "MSstatsBioNet") ) - local_mocked_bindings(.callIndraCogexApi = function(x) { + local_mocked_bindings(.callIndraCogexApi = function(x,y) { return(readRDS(system.file("extdata/indraResponse.rds", package = "MSstatsBioNet"))) }) suppressWarnings(