diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh index d77bfe25..6dff8759 100644 --- a/scripts/run_tests.sh +++ b/scripts/run_tests.sh @@ -16,8 +16,8 @@ sh /app/scripts/start_server.sh & coverage erase # spec validation python -m spec.validate -# run importer/, relation_engine_server/, and spec/ tests, skip test_query.py -coverage run --branch -m pytest --ignore=spec/test/stored_queries/test_query.py +# run importer/, relation_engine_server/, and spec/ tests +coverage run --branch -m pytest # RE client tests PYTHONPATH=client_src python -m pytest client_src/test coverage html --omit=*/test_* diff --git a/spec/stored_queries/generic/fulltext_search.yaml b/spec/stored_queries/generic/fulltext_search.yaml index 42d637b3..b8a31b0a 100644 --- a/spec/stored_queries/generic/fulltext_search.yaml +++ b/spec/stored_queries/generic/fulltext_search.yaml @@ -1,4 +1,6 @@ # Search a collection with a fulltext index with an attribute name and search text +# Also supports filtering by outer-level attributes +# Not recommended for fast searching because it can be very slow and even time out at 60s name: fulltext_search params: type: object diff --git a/spec/stored_queries/taxonomy/taxonomy_search_sci_name.yaml b/spec/stored_queries/taxonomy/taxonomy_search_sci_name.yaml index 0e43164c..8217fdf2 100644 --- a/spec/stored_queries/taxonomy/taxonomy_search_sci_name.yaml +++ b/spec/stored_queries/taxonomy/taxonomy_search_sci_name.yaml @@ -1,3 +1,5 @@ +# Should be REVISED then DEPRECATED +# # Search for a taxon with a scientific name # Offset is limited to 10k name: taxonomy_search_sci_name diff --git a/spec/stored_queries/taxonomy/taxonomy_search_species.yaml b/spec/stored_queries/taxonomy/taxonomy_search_species.yaml index fe7eebd0..0e5fbb82 100644 --- a/spec/stored_queries/taxonomy/taxonomy_search_species.yaml +++ b/spec/stored_queries/taxonomy/taxonomy_search_species.yaml @@ -1,3 +1,5 @@ +# DEPRECATED. 
See taxonomy_search_species_strain and taxonomy_search_species_strain_no_sort +# # Search for a species/strain. Similar to search_sci_name, but simpler and quicker name: taxonomy_search_species params: diff --git a/spec/stored_queries/taxonomy/taxonomy_ncbi_species.yaml b/spec/stored_queries/taxonomy/taxonomy_search_species_strain.yaml similarity index 83% rename from spec/stored_queries/taxonomy/taxonomy_ncbi_species.yaml rename to spec/stored_queries/taxonomy/taxonomy_search_species_strain.yaml index 655fa8f2..6ad6ee75 100644 --- a/spec/stored_queries/taxonomy/taxonomy_ncbi_species.yaml +++ b/spec/stored_queries/taxonomy/taxonomy_search_species_strain.yaml @@ -1,15 +1,23 @@ # Search ncbi_taxon collection for species/strains by scientific name -name: taxonomy_ncbi_species +name: taxonomy_search_species_strain params: type: object - required: [search_text] + required: ["@taxon_coll", sciname_field, search_text] additionalProperties: false properties: + "@taxon_coll": + type: string + title: Taxon collection name + examples: [ncbi_taxon, gtdb_taxon] search_text: type: string title: Search text examples: [escherichia, es] description: Text to search on the search attribute values + sciname_field: + type: string + title: Scientific name field name + examples: [scientific_name, name] ts: type: [integer, "null"] title: Versioning timestamp @@ -43,7 +51,7 @@ query: | FOR tok IN search_text__wordboundmod_icu_toks // prepend "prefix:" RETURN CONCAT("prefix:", tok) ) - FOR doc IN FULLTEXT(ncbi_taxon, "scientific_name", search_text__fulltext) + FOR doc IN FULLTEXT(@@taxon_coll, @sciname_field, search_text__fulltext) FILTER @ts ? 
doc.created <= @ts AND doc.expired >= @ts : true FILTER doc.rank IN ["species", "strain"] OR doc.strain LET doc_sciname__norm = REGEX_REPLACE(LOWER(TRIM(doc.scientific_name)), "\\s+", " ") // for exact matching diff --git a/spec/stored_queries/taxonomy/taxonomy_ncbi_species_no_sort.yaml b/spec/stored_queries/taxonomy/taxonomy_search_species_strain_no_sort.yaml similarity index 80% rename from spec/stored_queries/taxonomy/taxonomy_ncbi_species_no_sort.yaml rename to spec/stored_queries/taxonomy/taxonomy_search_species_strain_no_sort.yaml index 21a3cbdf..b9c0a56c 100644 --- a/spec/stored_queries/taxonomy/taxonomy_ncbi_species_no_sort.yaml +++ b/spec/stored_queries/taxonomy/taxonomy_search_species_strain_no_sort.yaml @@ -1,12 +1,20 @@ # Search ncbi_taxon collection for species/strains by scientific name # Except do not sort, just return the first however many documents # Useful for short prefixes (e.g., "s") that would be expensive yet not meaningful to sort -name: taxonomy_ncbi_species_no_sort +name: taxonomy_search_species_strain_no_sort params: type: object - required: [search_text] + required: ["@taxon_coll", sciname_field, search_text] additionalProperties: false properties: + "@taxon_coll": + type: string + title: Taxon collection name + examples: [ncbi_taxon, gtdb_taxon] + sciname_field: + type: string + title: Scientific name field name + examples: [scientific_name, name] search_text: type: string title: Search text @@ -43,7 +51,7 @@ query: | FOR tok IN search_text__wordboundmod_icu_toks // prepend "prefix:" RETURN CONCAT("prefix:", tok) ) - FOR doc IN FULLTEXT(ncbi_taxon, "scientific_name", search_text__fulltext) + FOR doc IN FULLTEXT(@@taxon_coll, @sciname_field, search_text__fulltext) FILTER @ts ? doc.created <= @ts AND doc.expired >= @ts : true FILTER doc.rank IN ["species", "strain"] OR doc.strain LIMIT @offset ? @offset : 0, @limit ? 
@limit : 20 diff --git a/spec/test/stored_queries/test_fulltext_search.py b/spec/test/stored_queries/test_fulltext_search.py index 0ff13fc6..e0340d02 100644 --- a/spec/test/stored_queries/test_fulltext_search.py +++ b/spec/test/stored_queries/test_fulltext_search.py @@ -1,5 +1,12 @@ """ -Tests for the generic fulltext search +Tests for stored queries involving a fulltext search: +* Generic fulltext_search (should be used with caution because it can be slow and timeout at 60s) +* Taxonomy taxonomy_search_species_strain +* Taxonomy taxonomy_search_species_strain_no_sort + +The latter two are switched between depending on the length of the search text. +These stored query tests are all bundled in one test file because their original purpose is to do a species/strain +name search on the ncbi_taxon collection These tests run within the re_api docker image, and require access to the ArangoDB, auth, and workspace images. """ @@ -80,7 +87,149 @@ ] -class Test(unittest.TestCase): +class TestTaxonomySearchSpeciesStrainStoredQueries(unittest.TestCase): + @classmethod + def setUpClass(cls): + check_spec_test_env() + create_test_docs("ncbi_taxon", ncbi_taxa) + + def test_ncbi_taxon_scinames(self): + """Happy path""" + for sciname in scinames_test_all: + _taxonomy_search_species_strain_queries( + self, + taxon_coll="ncbi_taxon", + sciname_field="scientific_name", + search_text=sciname, + ts=_NOW if sciname in scinames_test_latest else None, + offset=None, + limit=LIMIT, + select="scientific_name", + # --- + expect_error=False, + expect_hit=True, + ) + + def test_null_bind_params(self): + """Leave off parameters""" + for sciname in scinames_test_all: + _taxonomy_search_species_strain_queries( + self, + taxon_coll="ncbi_taxon", + sciname_field="scientific_name", + search_text=sciname, + ts=None, + offset=None, + limit=None, + select=None, + # --- + expect_error=False, + expect_hit=True, + ) + + def test_fully_specified_bind_params(self): + """Specify all parameters""" + for 
sciname in scinames_test_all: + _taxonomy_search_species_strain_queries( + self, + taxon_coll="ncbi_taxon", + sciname_field="scientific_name", + search_text=sciname, + ts=_NOW if sciname in scinames_test_latest else None, + offset=0, + limit=LIMIT, + select=["id", "scientific_name"], + # --- + expect_error=False, + expect_hit=True, + ) + + def test_extra_params(self): + """Extra params not in spec/aql""" + _taxonomy_search_species_strain_queries( + self, + taxon_coll="ncbi_taxon", + sciname_field="scientific_name", + search_text="esch", + ts=None, + offset=0, + limit=LIMIT, + select=["id", "scientific_name"], + extra_unused_param=42, + # --- + expect_error=("Additional properties are not allowed"), + ) + + def test_validation_fail(self): + _taxonomy_search_species_strain_queries( + self, + taxon_coll=[], + sciname_field=42, + search_text={"hi": 1}, + ts=None, + offset=None, + limit=None, + select=None, + # --- + expect_error="[] is not of type 'string'", + ) + + def test_aql_error(self): + for sciname in scinames_test_all: + _taxonomy_search_species_strain_queries( + self, + taxon_coll="ncbi_taxon", + sciname_field="fake_attrkey", + search_text=sciname, + ts=None, + offset=None, + limit=None, + select=None, + # --- + expect_error=True, + ) + + def test_no_hit(self): + for sciname in scinames_test_all: + _taxonomy_search_species_strain_queries( + self, + taxon_coll="ncbi_taxon", + sciname_field="scientific_name", + search_text=sciname[::-1], + ts=None, + offset=None, + limit=None, + select=None, + # --- + expect_error=False, + expect_hit=False, + expected_hits=[], + ) + + def test_prefix_hit(self): + """Test search text len being lte 3""" + _taxonomy_search_species_strain_queries( + self, + taxon_coll="ncbi_taxon", + sciname_field="scientific_name", + search_text="inf", + ts=None, + offset=None, + limit=None, + select=None, + # --- + expect_error=False, + expect_hit=False, + expected_hits=[ + "Influenza A virus PX8-XIII(A/USSR/90/77(H1N1)xA/Pintail 
Duck/Primorie/695/76(H2N3))", + "Influenza C virus (C/PIG/Beijing/439/1982)", + "Influenza B virus (B/Ann Arbor/1/1966 [cold-adapted and wild- type])", + "Influenza B virus (B/Brisbane/FSS700/2017)", + ], + ) + + +class TestFulltextSearchStoredQuery(unittest.TestCase): @classmethod def setUpClass(cls): check_spec_test_env() @@ -89,7 +238,7 @@ def setUpClass(cls): def test_ncbi_taxon_scinames(self): """Happy path""" for sciname in scinames_test_all: - _fulltext_query( + _fulltext_search_query( self, coll="ncbi_taxon", search_attrkey="scientific_name", @@ -111,7 +260,7 @@ def test_ncbi_taxon_scinames(self): def test_null_bind_params(self): """Leave off parameters""" for sciname in scinames_test_all: - _fulltext_query( + _fulltext_search_query( self, coll="ncbi_taxon", search_attrkey="scientific_name", @@ -129,7 +278,7 @@ def test_null_bind_params(self): def test_fully_specified_bind_params(self): """Specify all parameters""" for sciname in scinames_test_all: - _fulltext_query( + _fulltext_search_query( self, coll="ncbi_taxon", search_attrkey="scientific_name", @@ -150,7 +299,7 @@ def test_fully_specified_bind_params(self): def test_extra_params(self): """Extra params not in spec/aql""" - _fulltext_query( + _fulltext_search_query( self, coll="ncbi_taxon", search_attrkey="scientific_name", @@ -170,7 +319,7 @@ def test_extra_params(self): ) def test_validation_fail(self): - _fulltext_query( + _fulltext_search_query( self, coll=[], search_attrkey=42, @@ -186,7 +335,7 @@ def test_validation_fail(self): def test_aql_error(self): for sciname in scinames_test_all: - _fulltext_query( + _fulltext_search_query( self, coll="ncbi_taxon", search_attrkey="fake_attrkey", @@ -202,7 +351,7 @@ def test_aql_error(self): def test_no_hit(self): for sciname in scinames_test_all: - _fulltext_query( + _fulltext_search_query( self, coll="ncbi_taxon", search_attrkey="scientific_name", @@ -222,7 +371,56 @@ def test_no_hit(self): # --- Test helpers --- -def _fulltext_query( +def 
_switch_taxonomy_search_species_strain_queries(search_text): + return ( + "taxonomy_search_species_strain_no_sort" + if len(search_text) <= 3 + else "taxonomy_search_species_strain" + ) + + +def _taxonomy_search_species_strain_queries( + self, + taxon_coll, + sciname_field, + search_text, + ts, + offset, + limit, + select, + expect_error=False, + expect_hit=True, + expected_hits=None, + **kw, # for testing passing disallowed properties +): + """ + Run query against ArangoDB server + """ + data = { + "@taxon_coll": taxon_coll, + "sciname_field": sciname_field, + "search_text": search_text, + "ts": ts, + "offset": offset, + "limit": limit, + "select": select, + **kw, + } + stored_query = _switch_taxonomy_search_species_strain_queries(search_text) + _check_query_results( + self, + data, + stored_query, + sciname_field, + search_text, + limit, + expect_error, + expect_hit, + expected_hits, + ) + + +def _fulltext_search_query( self, coll, search_attrkey, @@ -251,9 +449,34 @@ def _fulltext_query( "select": select, **kw, } + stored_query = "fulltext_search" + _check_query_results( + self, + data, + stored_query, + search_attrkey, + search_text, + limit, + expect_error, + expect_hit, + expected_hits, + ) + + +def _check_query_results( + self, + data, + stored_query, + search_attrkey, + search_text, + limit, + expect_error, + expect_hit, + expected_hits, +): resp = requests.post( _CONF["re_api_url"] + "/api/v1/query_results", - params={"stored_query": "fulltext_search"}, + params={"stored_query": stored_query}, data=json.dumps(data), ) @@ -276,13 +499,14 @@ def _fulltext_query( self.assertNotIn(search_text, hits) if expected_hits is not None: - self.assertEqual(expected_hits, hits) + self.assertCountEqual(expected_hits, hits) # Filter out null values + # to see if their default null values would kick in properly data = {k: v for k, v in data.items() if v is not None} resp = requests.post( _CONF["re_api_url"] + "/api/v1/query_results", - params={"stored_query": 
"fulltext_search"}, + params={"stored_query": stored_query}, data=json.dumps(data), ) @@ -305,4 +529,4 @@ def _fulltext_query( self.assertNotIn(search_text, hits) if expected_hits is not None: - self.assertEqual(expected_hits, hits) + self.assertCountEqual(expected_hits, hits) diff --git a/spec/test/stored_queries/test_query.py b/spec/test/stored_queries/test_query.py index 7fe06af6..cf0dbe49 100644 --- a/spec/test/stored_queries/test_query.py +++ b/spec/test/stored_queries/test_query.py @@ -10,12 +10,19 @@ import pytest from typing import Tuple, List from requests.exceptions import ReadTimeout +import unittest from arango import ArangoClient import numpy as np from relation_engine_server.utils import json_validation +# Skip entire module if env var not set +if not os.environ.get("DO_QUERY_TESTING"): + raise unittest.SkipTest( + "Env var DO_QUERY_TESTING not set. Skipping query testing module" + ) + warnings.filterwarnings("ignore") # Directories and files @@ -27,24 +34,23 @@ SCINAMES_LATEST_FP = os.path.join(TMP_OUT_DIR, "ncbi_scinames_latest.json") SAMPLINGS_FP = os.path.join(TMP_OUT_DIR, "samplings.json") STORED_QUERY_FP = os.path.join( - ROOT_DIR, "spec/stored_queries/taxonomy/taxonomy_ncbi_species.yaml" + ROOT_DIR, "spec/stored_queries/taxonomy/taxonomy_search_species_strain.yaml" ) STORED_QUERY_NO_SORT_FP = os.path.join( - ROOT_DIR, "spec/stored_queries/taxonomy/taxonomy_ncbi_species_no_sort.yaml" + ROOT_DIR, "spec/stored_queries/taxonomy/taxonomy_search_species_strain_no_sort.yaml" ) if not os.path.exists(TMP_OUT_DIR): os.mkdir(TMP_OUT_DIR) +# Read config try: with open(CONFIG_FP) as fh: CONFIG = json.load(fh) - if not CONFIG["host"] or not CONFIG["username"] or not CONFIG["password"]: - raise RuntimeError("Missing config fields") CLIENT = ArangoClient(hosts=CONFIG["host"]) DB = CLIENT.db("ci", username=CONFIG["username"], password=CONFIG["password"]) except Exception as e: - help = """ + help_msg = """ Please set host URL, username, and password in 
arango_live_server_config.json, e.g., { "username": "doe_j", @@ -56,18 +62,23 @@ `ssh -L 8532:10.58.1.211:8532 j_doe@login1.berkeley.kbase.us` Then, the url would be `http://localhost:8532` """ - print(help) - raise (e) + print(help_msg) + raise + +# Get pointer to collection NCBI_TAXON = DB.collection("ncbi_taxon") # Load the queries QUERY = json_validation.load_json_yaml(STORED_QUERY_FP)["query"] QUERY_NO_SORT = json_validation.load_json_yaml(STORED_QUERY_NO_SORT_FP)["query"] +# Set query bind parameters LIMIT = 20 NOW = time.time() * 1000 # Load/cache the scinames +# This probably won't work well and will need some fiddling/improvement +# because doing it this way can lead to a timeout on some machine setups if os.path.isfile(SCINAMES_LATEST_FP): with open(SCINAMES_LATEST_FP) as fh: SCINAMES_LATEST = json.load(fh) @@ -85,11 +96,13 @@ and taxa["created"] <= NOW and NOW <= taxa["expired"] ] + # Cache latest scinames with open(SCINAMES_LATEST_FP, "w") as fh: json.dump(SCINAMES_LATEST, fh) def use_sort(search_text): + """Determine whether to use the sorting or non-sorting query""" return len(search_text) > 3 @@ -111,11 +124,13 @@ def jprint(jo, dry=False): print(txt) -def fulltext_search_ncbi_scinames(search_text): - """""" +def taxonomy_search_species_strain(search_text): + """Make the query""" cursor = DB.aql.execute( QUERY if use_sort(search_text) else QUERY_NO_SORT, bind_vars={ + "@taxon_coll": "ncbi_taxon", + "sciname_field": "scientific_name", "search_text": search_text, "ts": NOW, "offset": None, @@ -349,7 +364,7 @@ def do_query_testing( data.append(dat) try: - query_res = fulltext_search_ncbi_scinames(search_text) + query_res = taxonomy_search_species_strain(search_text) except Exception: handle_err("Something went wrong in the query!", dat, failed)