diff --git a/node_normalizer/model/input.py b/node_normalizer/model/input.py index ff854aa..ea7820e 100644 --- a/node_normalizer/model/input.py +++ b/node_normalizer/model/input.py @@ -36,6 +36,11 @@ class CurieList(BaseModel): title="Whether to return individual types for equivalent identifiers" ) + include_taxa: bool = Field( + True, + title="Whether to return taxa for equivalent identifiers" + ) + class Config: schema_extra = { "example": { diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index b92bc9c..d346996 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -17,7 +17,12 @@ from fastapi import FastAPI from reasoner_pydantic import KnowledgeGraph, Message, QueryGraph, Result, CURIE, Attribute -from .util import LoggingUtil, uniquify_list, BIOLINK_NAMED_THING +from .util import ( + LoggingUtil, + uniquify_list, + BIOLINK_NAMED_THING, + get_numerical_curie_suffix, +) # logger = LoggingUtil.init_logging(__name__, level=logging.INFO, format='medium', logFilePath=os.path.dirname(__file__), logFileLevel=logging.INFO) logger = LoggingUtil.init_logging() @@ -514,7 +519,7 @@ async def get_eqids_and_types( # Every equivalent identifier here has the same type. 
for eqid in eqids[index]: - eqid.update({'t': [typ]}) + eqid.update({'types': [typ]}) return eqids, types_with_ancestors @@ -525,7 +530,8 @@ async def get_normalized_nodes( conflate_gene_protein: bool, conflate_chemical_drug: bool, include_descriptions: bool = False, - include_individual_types: bool = True + include_individual_types: bool = True, + include_taxa: bool = True, ) -> Dict[str, Optional[str]]: """ Get value(s) for key(s) using redis MGET @@ -634,6 +640,7 @@ async def get_normalized_nodes( input_curie: await create_node(app, canonical_id, dereference_ids, dereference_types, info_contents, include_descriptions=include_descriptions, include_individual_types=include_individual_types, + include_taxa=include_taxa, conflations={ 'GeneProtein': conflate_gene_protein, 'DrugChemical': conflate_chemical_drug, @@ -674,7 +681,7 @@ async def get_info_content_attribute(app, canonical_nonan) -> dict: async def create_node(app, canonical_id, equivalent_ids, types, info_contents, include_descriptions=True, - include_individual_types=False, conflations=None): + include_individual_types=False, include_taxa=False, conflations=None): """Construct the output format given the compressed redis data""" # It's possible that we didn't find a canonical_id if canonical_id is None: @@ -791,31 +798,34 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i # Now that we've determined a label for this clique, we should never use identifiers_with_labels, possible_labels, # or filtered_possible_labels after this point. - # if descriptions are enabled look for the first available description and use that - if include_descriptions: - descriptions = list( - map( - lambda x: x[0], - filter(lambda x: len(x) > 0, [eid['d'] for eid in eids if 'd' in eid]) - ) - ) - if len(descriptions) > 0: - node["id"]["description"] = descriptions[0] - # now need to reformat the identifier keys. 
It could be cleaner but we have to worry about if there is a label + first_description = None + node_taxa = set() node["equivalent_identifiers"] = [] for eqid in eids: eq_item = {"identifier": eqid["i"]} - if "l" in eqid: + if "l" in eqid and eqid["l"]: eq_item["label"] = eqid["l"] # if descriptions is enabled and exist add them to each eq_id entry - if include_descriptions and "d" in eqid and len(eqid["d"]): + if include_descriptions and "d" in eqid and len(eqid["d"]) > 0: eq_item["description"] = eqid["d"][0] + if not first_description: + first_description = eq_item["description"] + # if include_taxa is enabled and we have taxa on this node, add them to every eq_id entry + if include_taxa and "t" in eqid and eqid["t"]: + eq_item["taxa"] = eqid["t"] + node_taxa.update(eqid["t"]) # if individual types have been requested, add them too. - if include_individual_types and 't' in eqid: - eq_item["type"] = eqid['t'][-1] + if include_individual_types and 'types' in eqid: + eq_item["type"] = eqid['types'][-1] node["equivalent_identifiers"].append(eq_item) + if include_descriptions and first_description: + node["description"] = first_description + + if include_taxa and node_taxa: + node["taxa"] = sorted(node_taxa, key=get_numerical_curie_suffix) + # We need to remove `biolink:Entity` from the types returned. 
# (See explanation at https://github.com/TranslatorSRI/NodeNormalization/issues/173) if 'biolink:Entity' in types[canonical_id]: diff --git a/node_normalizer/server.py b/node_normalizer/server.py index 52ec2c4..18ca7ca 100644 --- a/node_normalizer/server.py +++ b/node_normalizer/server.py @@ -263,7 +263,8 @@ async def get_normalized_node_handler( conflate: bool = fastapi.Query(True, description="Whether to apply gene/protein conflation"), drug_chemical_conflate: bool = fastapi.Query(False, description="Whether to apply drug/chemical conflation"), description: bool = fastapi.Query(False, description="Whether to return curie descriptions when possible"), - individual_types: bool = fastapi.Query(False, description="Whether to return individual types for equivalent identifiers") + individual_types: bool = fastapi.Query(False, description="Whether to return individual types for equivalent identifiers"), + include_taxa: bool = fastapi.Query(True, description="Whether to return taxa for equivalent identifiers"), ): """ Get value(s) for key(s) using redis MGET @@ -271,7 +272,9 @@ async def get_normalized_node_handler( # no_conflate = request.args.get('dontconflate',['GeneProtein']) normalized_nodes = await get_normalized_nodes(app, curie, conflate, drug_chemical_conflate, include_descriptions=description, - include_individual_types=individual_types) + include_individual_types=individual_types, + include_taxa=include_taxa, + ) # If curie contains at least one entry, then the only way normalized_nodes could be blank # would be if an error occurred during processing. 
@@ -291,7 +294,9 @@ async def get_normalized_node_handler_post(curies: CurieList): Get value(s) for key(s) using redis MGET """ normalized_nodes = await get_normalized_nodes(app, curies.curies, curies.conflate, curies.drug_chemical_conflate, - curies.description, include_individual_types=curies.individual_types) + curies.description, include_individual_types=curies.individual_types, + include_taxa=curies.include_taxa, + ) # If curies.curies contains at least one entry, then the only way normalized_nodes could be blank # would be if an error occurred during processing. diff --git a/node_normalizer/set_id.py b/node_normalizer/set_id.py index 0e0f555..3c3dc30 100644 --- a/node_normalizer/set_id.py +++ b/node_normalizer/set_id.py @@ -41,7 +41,7 @@ async def generate_setid(app, curies, conflations) -> SetIDResponse: # We use get_normalized_nodes() to normalize all the CURIEs for us. normalization_results = await get_normalized_nodes( - app, curies, gene_protein_conflation, drug_chemical_conflation, include_descriptions=False + app, curies, gene_protein_conflation, drug_chemical_conflation, include_descriptions=False, include_individual_types=False, include_taxa=False ) # We prepare a set of sorted, deduplicated curies. diff --git a/node_normalizer/util.py b/node_normalizer/util.py index ee47691..46d7e08 100644 --- a/node_normalizer/util.py +++ b/node_normalizer/util.py @@ -11,6 +11,20 @@ # Some constants. BIOLINK_NAMED_THING = "biolink:NamedThing" +def get_numerical_curie_suffix(curie): + """ + If a CURIE has a numerical suffix, return it as an integer. Otherwise return None. + :param curie: A CURIE. + :return: An integer if the CURIE suffix is castable to int, otherwise None (also when the CURIE has no ':' separator). + """ + curie_parts = curie.split(":", 1) + if len(curie_parts) > 1: + # Only a CURIE containing ':' has a suffix at index 1. Try to cast it to an integer; a ValueError just means it is not numerical. + try: + return int(curie_parts[1]) + except ValueError: + pass + return None # loggers = {} class LoggingUtil(object):