From 87dbc3c8015a1f648dd95b7edf80aeaabd30f182 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 29 Oct 2025 17:38:44 -0400 Subject: [PATCH 1/6] First stab at returning taxa when a flag is set. --- node_normalizer/model/input.py | 5 +++++ node_normalizer/normalizer.py | 15 ++++++++++----- node_normalizer/server.py | 11 ++++++++--- node_normalizer/set_id.py | 2 +- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/node_normalizer/model/input.py b/node_normalizer/model/input.py index ff854aa..ea7820e 100644 --- a/node_normalizer/model/input.py +++ b/node_normalizer/model/input.py @@ -36,6 +36,11 @@ class CurieList(BaseModel): title="Whether to return individual types for equivalent identifiers" ) + include_taxa: bool = Field( + True, + title="Whether to return taxa for equivalent identifiers" + ) + class Config: schema_extra = { "example": { diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index b92bc9c..2a79411 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -514,7 +514,7 @@ async def get_eqids_and_types( # Every equivalent identifier here has the same type. for eqid in eqids[index]: - eqid.update({'t': [typ]}) + eqid.update({'types': [typ]}) return eqids, types_with_ancestors @@ -525,7 +525,8 @@ async def get_normalized_nodes( conflate_gene_protein: bool, conflate_chemical_drug: bool, include_descriptions: bool = False, - include_individual_types: bool = True + include_individual_types: bool = True, + include_taxa: bool = True, ) -> Dict[str, Optional[str]]: """ Get value(s) for key(s) using redis MGET @@ -634,6 +635,7 @@ async def get_normalized_nodes( input_curie: await create_node(app, canonical_id, dereference_ids, dereference_types, info_contents, include_descriptions=include_descriptions, include_individual_types=include_individual_types, + include_taxa=include_taxa, conflations={ 'GeneProtein': conflate_gene_protein, 'DrugChemical': conflate_chemical_drug, @@ -674,7 +676,7 @@ async def get_info_content_attribute(app, canonical_nonan) -> dict: async def create_node(app, canonical_id, equivalent_ids, types, info_contents, include_descriptions=True, - include_individual_types=False, conflations=None): + include_individual_types=False, include_taxa=False, conflations=None): """Construct the output format given the compressed redis data""" # It's possible that we didn't find a canonical_id if canonical_id is None: @@ -811,9 +813,12 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i # if descriptions is enabled and exist add them to each eq_id entry if include_descriptions and "d" in eqid and len(eqid["d"]): eq_item["description"] = eqid["d"][0] + # if include_taxa is enabled and we have taxa on this node, add them to every eq_id entry + if include_taxa and "t" in eqid and len(eqid["t"]): + eq_item["taxa"] = eqid["t"] # if individual types have been requested, add them too. - if include_individual_types and 't' in eqid: - eq_item["type"] = eqid['t'][-1] + if include_individual_types and 'types' in eqid: + eq_item["type"] = eqid['types'][-1] node["equivalent_identifiers"].append(eq_item) # We need to remove `biolink:Entity` from the types returned. diff --git a/node_normalizer/server.py b/node_normalizer/server.py index 52ec2c4..18ca7ca 100644 --- a/node_normalizer/server.py +++ b/node_normalizer/server.py @@ -263,7 +263,8 @@ async def get_normalized_node_handler( conflate: bool = fastapi.Query(True, description="Whether to apply gene/protein conflation"), drug_chemical_conflate: bool = fastapi.Query(False, description="Whether to apply drug/chemical conflation"), description: bool = fastapi.Query(False, description="Whether to return curie descriptions when possible"), - individual_types: bool = fastapi.Query(False, description="Whether to return individual types for equivalent identifiers") + individual_types: bool = fastapi.Query(False, description="Whether to return individual types for equivalent identifiers"), + include_taxa: bool = fastapi.Query(True, description="Whether to return taxa for equivalent identifiers"), ): """ Get value(s) for key(s) using redis MGET @@ -271,7 +272,9 @@ async def get_normalized_node_handler( # no_conflate = request.args.get('dontconflate',['GeneProtein']) normalized_nodes = await get_normalized_nodes(app, curie, conflate, drug_chemical_conflate, include_descriptions=description, - include_individual_types=individual_types) + include_individual_types=individual_types, + include_taxa=include_taxa, + ) # If curie contains at least one entry, then the only way normalized_nodes could be blank # would be if an error occurred during processing. @@ -291,7 +294,9 @@ async def get_normalized_node_handler_post(curies: CurieList): Get value(s) for key(s) using redis MGET """ normalized_nodes = await get_normalized_nodes(app, curies.curies, curies.conflate, curies.drug_chemical_conflate, - curies.description, include_individual_types=curies.individual_types) + curies.description, include_individual_types=curies.individual_types, + include_taxa=curies.include_taxa, + ) # If curies.curies contains at least one entry, then the only way normalized_nodes could be blank # would be if an error occurred during processing. diff --git a/node_normalizer/set_id.py b/node_normalizer/set_id.py index 0e0f555..f9b1cec 100644 --- a/node_normalizer/set_id.py +++ b/node_normalizer/set_id.py @@ -41,7 +41,7 @@ async def generate_setid(app, curies, conflations) -> SetIDResponse: # We use get_normalized_nodes() to normalize all the CURIEs for us. normalization_results = await get_normalized_nodes( - app, curies, gene_protein_conflation, drug_chemical_conflation, include_descriptions=False + app, curies, gene_protein_conflation, drug_chemical_conflation, include_descriptions=False, include_individual_types=False, include_taxa=True ) # We prepare a set of sorted, deduplicated curies. From 4c6688c9cf8fa4fdce5aedfa41dbe2b01b5eee0c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 29 Oct 2025 17:40:08 -0400 Subject: [PATCH 2/6] Added on-push trigger for testing. --- .github/workflows/release.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 800d57b..e9d8f8f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,6 +1,7 @@ name: 'Publish to GitHub Packages' on: + push: release: types: [published] From bc8b584af5276ae68d48e4576652cd536bf9a15b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 29 Oct 2025 17:53:45 -0400 Subject: [PATCH 3/6] Simplified description and taxa code, added numerical suffix sort. --- node_normalizer/normalizer.py | 35 ++++++++++++++++++++--------------- node_normalizer/util.py | 14 ++++++++++++++ 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 2a79411..159a12a 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -17,7 +17,12 @@ from fastapi import FastAPI from reasoner_pydantic import KnowledgeGraph, Message, QueryGraph, Result, CURIE, Attribute -from .util import LoggingUtil, uniquify_list, BIOLINK_NAMED_THING +from .util import ( + LoggingUtil, + uniquify_list, + BIOLINK_NAMED_THING, + get_numerical_curie_suffix, +) # logger = LoggingUtil.init_logging(__name__, level=logging.INFO, format='medium', logFilePath=os.path.dirname(__file__), logFileLevel=logging.INFO) logger = LoggingUtil.init_logging() @@ -793,34 +798,34 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i # Now that we've determined a label for this clique, we should never use identifiers_with_labels, possible_labels, # or filtered_possible_labels after this point. - # if descriptions are enabled look for the first available description and use that - if include_descriptions: - descriptions = list( - map( - lambda x: x[0], - filter(lambda x: len(x) > 0, [eid['d'] for eid in eids if 'd' in eid]) - ) - ) - if len(descriptions) > 0: - node["id"]["description"] = descriptions[0] - # now need to reformat the identifier keys. It could be cleaner but we have to worry about if there is a label + first_description = None + node_taxa = set() node["equivalent_identifiers"] = [] for eqid in eids: eq_item = {"identifier": eqid["i"]} - if "l" in eqid: + if "l" in eqid and eqid["l"]: eq_item["label"] = eqid["l"] # if descriptions is enabled and exist add them to each eq_id entry - if include_descriptions and "d" in eqid and len(eqid["d"]): + if include_descriptions and "d" in eqid and len(eqid["d"]) > 0: eq_item["description"] = eqid["d"][0] + if not first_description: + first_description = eq_item["description"] # if include_taxa is enabled and we have taxa on this node, add them to every eq_id entry - if include_taxa and "t" in eqid and len(eqid["t"]): + if include_taxa and "t" in eqid and eqid["t"]: eq_item["taxa"] = eqid["t"] + node_taxa.update(eqid["t"]) # if individual types have been requested, add them too. if include_individual_types and 'types' in eqid: eq_item["type"] = eqid['types'][-1] node["equivalent_identifiers"].append(eq_item) + if include_descriptions: + node["description"] = first_description + + if include_taxa: + node["taxa"] = sorted(node_taxa, key=get_numerical_curie_suffix) + # We need to remove `biolink:Entity` from the types returned. # (See explanation at https://github.com/TranslatorSRI/NodeNormalization/issues/173) if 'biolink:Entity' in types[canonical_id]: diff --git a/node_normalizer/util.py b/node_normalizer/util.py index ee47691..46d7e08 100644 --- a/node_normalizer/util.py +++ b/node_normalizer/util.py @@ -11,6 +11,20 @@ # Some constants. BIOLINK_NAMED_THING = "biolink:NamedThing" +def get_numerical_curie_suffix(curie): + """ + If a CURIE has a numerical suffix, return it as an integer. Otherwise return None. + :param curie: A CURIE. + :return: An integer if the CURIE suffix is castable to int, otherwise None. + """ + curie_parts = curie.split(":", 1) + if len(curie_parts) > 0: + # Try to cast the CURIE suffix to an integer. If we get a ValueError, don't worry about it. + try: + return int(curie_parts[1]) + except ValueError: + pass + return None # loggers = {} class LoggingUtil(object): From d338dbe7102152dd768c3bc5c1234bd7c7bb8088 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 29 Oct 2025 18:03:23 -0400 Subject: [PATCH 4/6] Removed on:push trigger after testing. --- .github/workflows/release.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e9d8f8f..800d57b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,7 +1,6 @@ name: 'Publish to GitHub Packages' on: - push: release: types: [published] From d06b66cfc36822ac4bfc09920b1c0d4edcf19994 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 30 Oct 2025 16:46:53 -0400 Subject: [PATCH 5/6] Don't display description or taxa unless we have some. --- node_normalizer/normalizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 159a12a..d346996 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -820,10 +820,10 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i eq_item["type"] = eqid['types'][-1] node["equivalent_identifiers"].append(eq_item) - if include_descriptions: + if include_descriptions and first_description: node["description"] = first_description - if include_taxa: + if include_taxa and node_taxa: node["taxa"] = sorted(node_taxa, key=get_numerical_curie_suffix) # We need to remove `biolink:Entity` from the types returned. From 70a62e217ab9a5f4f4c0730329a504168666cffd Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 30 Oct 2025 16:48:58 -0400 Subject: [PATCH 6/6] Assume we don't need taxa for a setid. --- node_normalizer/set_id.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node_normalizer/set_id.py b/node_normalizer/set_id.py index f9b1cec..3c3dc30 100644 --- a/node_normalizer/set_id.py +++ b/node_normalizer/set_id.py @@ -41,7 +41,7 @@ async def generate_setid(app, curies, conflations) -> SetIDResponse: # We use get_normalized_nodes() to normalize all the CURIEs for us. normalization_results = await get_normalized_nodes( - app, curies, gene_protein_conflation, drug_chemical_conflation, include_descriptions=False, include_individual_types=False, include_taxa=True + app, curies, gene_protein_conflation, drug_chemical_conflation, include_descriptions=False, include_individual_types=False, include_taxa=False ) # We prepare a set of sorted, deduplicated curies.