From 87dbc3c8015a1f648dd95b7edf80aeaabd30f182 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Wed, 29 Oct 2025 17:38:44 -0400
Subject: [PATCH 1/6] First stab at returning taxa when a flag is set.

---
 node_normalizer/model/input.py |  5 +++++
 node_normalizer/normalizer.py  | 15 ++++++++++-----
 node_normalizer/server.py      | 11 ++++++++---
 node_normalizer/set_id.py      |  2 +-
 4 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/node_normalizer/model/input.py b/node_normalizer/model/input.py
index ff854aa..ea7820e 100644
--- a/node_normalizer/model/input.py
+++ b/node_normalizer/model/input.py
@@ -36,6 +36,11 @@ class CurieList(BaseModel):
         title="Whether to return individual types for equivalent identifiers"
     )
 
+    include_taxa: bool = Field(
+        True,
+        title="Whether to return taxa for equivalent identifiers"
+    )
+
     class Config:
         schema_extra = {
             "example": {
diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py
index b92bc9c..2a79411 100644
--- a/node_normalizer/normalizer.py
+++ b/node_normalizer/normalizer.py
@@ -514,7 +514,7 @@ async def get_eqids_and_types(
 
         # Every equivalent identifier here has the same type.
         for eqid in eqids[index]:
-            eqid.update({'t': [typ]})
+            eqid.update({'types': [typ]})
 
     return eqids, types_with_ancestors
 
@@ -525,7 +525,8 @@ async def get_normalized_nodes(
         conflate_gene_protein: bool,
         conflate_chemical_drug: bool,
         include_descriptions: bool = False,
-        include_individual_types: bool = True
+        include_individual_types: bool = True,
+        include_taxa: bool = True,
 ) -> Dict[str, Optional[str]]:
     """
     Get value(s) for key(s) using redis MGET
@@ -634,6 +635,7 @@ async def get_normalized_nodes(
         input_curie: await create_node(app, canonical_id, dereference_ids, dereference_types, info_contents,
                                        include_descriptions=include_descriptions,
                                        include_individual_types=include_individual_types,
+                                       include_taxa=include_taxa,
                                        conflations={
                                            'GeneProtein': conflate_gene_protein,
                                            'DrugChemical': conflate_chemical_drug,
@@ -674,7 +676,7 @@ async def get_info_content_attribute(app, canonical_nonan) -> dict:
 
 
 async def create_node(app, canonical_id, equivalent_ids, types, info_contents, include_descriptions=True,
-                      include_individual_types=False, conflations=None):
+                      include_individual_types=False, include_taxa=False, conflations=None):
     """Construct the output format given the compressed redis data"""
     # It's possible that we didn't find a canonical_id
     if canonical_id is None:
@@ -811,9 +813,12 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i
         # if descriptions is enabled and exist add them to each eq_id entry
         if include_descriptions and "d" in eqid and len(eqid["d"]):
             eq_item["description"] = eqid["d"][0]
+        # if include_taxa is enabled and we have taxa on this node, add them to every eq_id entry
+        if include_taxa and "t" in eqid and len(eqid["t"]):
+            eq_item["taxa"] = eqid["t"]
         # if individual types have been requested, add them too.
-        if include_individual_types and 't' in eqid:
-            eq_item["type"] = eqid['t'][-1]
+        if include_individual_types and 'types' in eqid:
+            eq_item["type"] = eqid['types'][-1]
         node["equivalent_identifiers"].append(eq_item)
 
     # We need to remove `biolink:Entity` from the types returned.
diff --git a/node_normalizer/server.py b/node_normalizer/server.py
index 52ec2c4..18ca7ca 100644
--- a/node_normalizer/server.py
+++ b/node_normalizer/server.py
@@ -263,7 +263,8 @@ async def get_normalized_node_handler(
     conflate: bool = fastapi.Query(True, description="Whether to apply gene/protein conflation"),
     drug_chemical_conflate: bool = fastapi.Query(False, description="Whether to apply drug/chemical conflation"),
     description: bool = fastapi.Query(False, description="Whether to return curie descriptions when possible"),
-    individual_types: bool = fastapi.Query(False, description="Whether to return individual types for equivalent identifiers")
+    individual_types: bool = fastapi.Query(False, description="Whether to return individual types for equivalent identifiers"),
+    include_taxa: bool = fastapi.Query(True, description="Whether to return taxa for equivalent identifiers"),
 ):
     """
     Get value(s) for key(s) using redis MGET
@@ -271,7 +272,9 @@ async def get_normalized_node_handler(
     # no_conflate = request.args.get('dontconflate',['GeneProtein'])
     normalized_nodes = await get_normalized_nodes(app, curie, conflate, drug_chemical_conflate,
                                                   include_descriptions=description,
-                                                  include_individual_types=individual_types)
+                                                  include_individual_types=individual_types,
+                                                  include_taxa=include_taxa,
+                                                  )
 
     # If curie contains at least one entry, then the only way normalized_nodes could be blank
     # would be if an error occurred during processing.
@@ -291,7 +294,9 @@ async def get_normalized_node_handler_post(curies: CurieList):
     Get value(s) for key(s) using redis MGET
     """
     normalized_nodes = await get_normalized_nodes(app, curies.curies, curies.conflate, curies.drug_chemical_conflate,
-                                                  curies.description, include_individual_types=curies.individual_types)
+                                                  curies.description, include_individual_types=curies.individual_types,
+                                                  include_taxa=curies.include_taxa,
+                                                  )
 
     # If curies.curies contains at least one entry, then the only way normalized_nodes could be blank
     # would be if an error occurred during processing.
diff --git a/node_normalizer/set_id.py b/node_normalizer/set_id.py
index 0e0f555..f9b1cec 100644
--- a/node_normalizer/set_id.py
+++ b/node_normalizer/set_id.py
@@ -41,7 +41,7 @@ async def generate_setid(app, curies, conflations) -> SetIDResponse:
 
     # We use get_normalized_nodes() to normalize all the CURIEs for us.
     normalization_results = await get_normalized_nodes(
-        app, curies, gene_protein_conflation, drug_chemical_conflation, include_descriptions=False
+        app, curies, gene_protein_conflation, drug_chemical_conflation, include_descriptions=False, include_individual_types=False, include_taxa=True
     )
 
     # We prepare a set of sorted, deduplicated curies.

From 4c6688c9cf8fa4fdce5aedfa41dbe2b01b5eee0c Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Wed, 29 Oct 2025 17:40:08 -0400
Subject: [PATCH 2/6] Added on-push trigger for testing.

---
 .github/workflows/release.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 800d57b..e9d8f8f 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -1,6 +1,7 @@
 name: 'Publish to GitHub Packages'
 
 on:
+    push:
     release:
         types: [published]
 

From bc8b584af5276ae68d48e4576652cd536bf9a15b Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Wed, 29 Oct 2025 17:53:45 -0400
Subject: [PATCH 3/6] Simplified description and taxa code, added numerical
 suffix sort.

---
 node_normalizer/normalizer.py | 35 ++++++++++++++++++++---------------
 node_normalizer/util.py       | 14 ++++++++++++++
 2 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py
index 2a79411..159a12a 100644
--- a/node_normalizer/normalizer.py
+++ b/node_normalizer/normalizer.py
@@ -17,7 +17,12 @@
 from fastapi import FastAPI
 from reasoner_pydantic import KnowledgeGraph, Message, QueryGraph, Result, CURIE, Attribute
 
-from .util import LoggingUtil, uniquify_list, BIOLINK_NAMED_THING
+from .util import (
+    LoggingUtil,
+    uniquify_list,
+    BIOLINK_NAMED_THING,
+    get_numerical_curie_suffix,
+)
 
 # logger = LoggingUtil.init_logging(__name__, level=logging.INFO, format='medium', logFilePath=os.path.dirname(__file__), logFileLevel=logging.INFO)
 logger = LoggingUtil.init_logging()
@@ -793,34 +798,34 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i
     # Now that we've determined a label for this clique, we should never use identifiers_with_labels, possible_labels,
     # or filtered_possible_labels after this point.
 
-    # if descriptions are enabled look for the first available description and use that 
-    if include_descriptions:
-        descriptions = list(
-            map(
-                lambda x: x[0],
-                filter(lambda x: len(x) > 0, [eid['d'] for eid in eids if 'd' in eid])
-                )
-        )
-        if len(descriptions) > 0:
-            node["id"]["description"] = descriptions[0]
-
     # now need to reformat the identifier keys.  It could be cleaner but we have to worry about if there is a label
+    first_description = None
+    node_taxa = set()
     node["equivalent_identifiers"] = []
     for eqid in eids:
         eq_item = {"identifier": eqid["i"]}
-        if "l" in eqid:
+        if "l" in eqid and eqid["l"]:
             eq_item["label"] = eqid["l"]
         # if descriptions is enabled and exist add them to each eq_id entry
-        if include_descriptions and "d" in eqid and len(eqid["d"]):
+        if include_descriptions and "d" in eqid and len(eqid["d"]) > 0:
             eq_item["description"] = eqid["d"][0]
+            if not first_description:
+                first_description = eq_item["description"]
         # if include_taxa is enabled and we have taxa on this node, add them to every eq_id entry
-        if include_taxa and "t" in eqid and len(eqid["t"]):
+        if include_taxa and "t" in eqid and eqid["t"]:
             eq_item["taxa"] = eqid["t"]
+            node_taxa.update(eqid["t"])
         # if individual types have been requested, add them too.
         if include_individual_types and 'types' in eqid:
             eq_item["type"] = eqid['types'][-1]
         node["equivalent_identifiers"].append(eq_item)
 
+    if include_descriptions:
+        node["description"] = first_description
+
+    if include_taxa:
+        node["taxa"] = sorted(node_taxa, key=get_numerical_curie_suffix)
+
     # We need to remove `biolink:Entity` from the types returned.
     # (See explanation at https://github.com/TranslatorSRI/NodeNormalization/issues/173)
     if 'biolink:Entity' in types[canonical_id]:
diff --git a/node_normalizer/util.py b/node_normalizer/util.py
index ee47691..46d7e08 100644
--- a/node_normalizer/util.py
+++ b/node_normalizer/util.py
@@ -11,6 +11,20 @@
 # Some constants.
 BIOLINK_NAMED_THING = "biolink:NamedThing"
 
+def get_numerical_curie_suffix(curie):
+    """
+    If a CURIE has a numerical suffix, return it as an integer. Otherwise return None.
+    NOTE(review): when used as a sort key, a None result can make sorted() raise TypeError.
+    :param curie: A CURIE.
+    :return: An integer if the CURIE suffix is castable to int, otherwise None.
+    """
+    # partition() always yields a (possibly empty) suffix, so a CURIE without a ":"
+    # falls through to the ValueError path instead of raising an IndexError.
+    try:
+        return int(curie.partition(":")[2])
+    except ValueError:
+        # Missing or non-numerical suffix.
+        return None
 
 # loggers = {}
 class LoggingUtil(object):

From d338dbe7102152dd768c3bc5c1234bd7c7bb8088 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Wed, 29 Oct 2025 18:03:23 -0400
Subject: [PATCH 4/6] Removed on:push trigger after testing.

---
 .github/workflows/release.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index e9d8f8f..800d57b 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -1,7 +1,6 @@
 name: 'Publish to GitHub Packages'
 
 on:
-    push:
     release:
         types: [published]
 

From d06b66cfc36822ac4bfc09920b1c0d4edcf19994 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@ggvaidya.com>
Date: Thu, 30 Oct 2025 16:46:53 -0400
Subject: [PATCH 5/6] Don't display description or taxa unless we have some.

---
 node_normalizer/normalizer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py
index 159a12a..d346996 100644
--- a/node_normalizer/normalizer.py
+++ b/node_normalizer/normalizer.py
@@ -820,10 +820,10 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i
             eq_item["type"] = eqid['types'][-1]
         node["equivalent_identifiers"].append(eq_item)
 
-    if include_descriptions:
+    if include_descriptions and first_description:
         node["description"] = first_description
 
-    if include_taxa:
+    if include_taxa and node_taxa:
         node["taxa"] = sorted(node_taxa, key=get_numerical_curie_suffix)
 
     # We need to remove `biolink:Entity` from the types returned.

From 70a62e217ab9a5f4f4c0730329a504168666cffd Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@ggvaidya.com>
Date: Thu, 30 Oct 2025 16:48:58 -0400
Subject: [PATCH 6/6] Assume we don't need taxa for a setid.

---
 node_normalizer/set_id.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/node_normalizer/set_id.py b/node_normalizer/set_id.py
index f9b1cec..3c3dc30 100644
--- a/node_normalizer/set_id.py
+++ b/node_normalizer/set_id.py
@@ -41,7 +41,7 @@ async def generate_setid(app, curies, conflations) -> SetIDResponse:
 
     # We use get_normalized_nodes() to normalize all the CURIEs for us.
     normalization_results = await get_normalized_nodes(
-        app, curies, gene_protein_conflation, drug_chemical_conflation, include_descriptions=False, include_individual_types=False, include_taxa=True
+        app, curies, gene_protein_conflation, drug_chemical_conflation, include_descriptions=False, include_individual_types=False, include_taxa=False
     )
 
     # We prepare a set of sorted, deduplicated curies.