From dd520a1909974d7d8579ece0866f19b2f4749fa9 Mon Sep 17 00:00:00 2001 From: n1mus <709030+n1mus@users.noreply.github.com> Date: Thu, 17 Mar 2022 23:18:09 +0000 Subject: [PATCH 1/4] special char fulltext tests --- spec/test/data/ncbi_taxon.json | 144 ++++++++++++++++++ .../stored_queries/test_fulltext_search.py | 9 ++ 2 files changed, 153 insertions(+) diff --git a/spec/test/data/ncbi_taxon.json b/spec/test/data/ncbi_taxon.json index 9a4092f8..e16451d6 100644 --- a/spec/test/data/ncbi_taxon.json +++ b/spec/test/data/ncbi_taxon.json @@ -2260,5 +2260,149 @@ "expired": 1612915015846, "release_created": 1541030400000, "release_expired": 1612137599999 + }, + { + "_key": "338794_2018-11-01", + "_id": "ncbi_taxon/338794_2018-11-01", + "_rev": "_b2jbO4G--D", + "id": "338794", + "scientific_name": "low G+C Gram-positive bacterium HTA462", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 338794, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "586732_2018-11-01", + "_id": "ncbi_taxon/586732_2018-11-01", + "_rev": "_b2kB1gK--B", + "id": "586732", + "scientific_name": "Integrating expression vector pJEB403+drrA", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 586732, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1127597_2018-11-01", + "_id": "ncbi_taxon/1127597_2018-11-01", + "_rev": "_b2lFmce--B", + "id": "1127597", + "scientific_name": "Fusarium cf. solani 3+4-uuu DPGS-2011", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1127597, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1173779_2018-11-01", + "_id": "ncbi_taxon/1173779_2018-11-01", + "_rev": "_b2lOxFa--_", + "id": "1173779", + "scientific_name": "Salmonella enterica subsp. diarizonae serovar 60:r:e,n,x,z15", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 1173779, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1906029_2018-11-01", + "_id": "ncbi_taxon/1906029_2018-11-01", + "_rev": "_b2nDL5---_", + "id": "1906029", + "scientific_name": "Nostoc sp. 'Peltigera sp. \"hawaiensis\" P1236 cyanobiont'", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1906029, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1945188_2018-11-01", + "_id": "ncbi_taxon/1945188_2018-11-01", + "_rev": "_b2nJbF2--_", + "id": "1945188", + "scientific_name": "Reporter vector p1168hIL6mC/EBP-luc+", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1945188, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1945295_2018-11-01", + "_id": "ncbi_taxon/1945295_2018-11-01", + "_rev": "_b2nJbIK--_", + "id": "1945295", + "scientific_name": "Vector pEntry-attR2-IRES-eGFP-luc+-pA-attL3", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1945295, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "2727889_2021-02-01", + "_id": "ncbi_taxon/2727889_2021-02-01", + "_rev": "_b2n6us---A", + "id": "2727889", + "scientific_name": "Pleurocapsales cyanobacterium 'Beach rock 4+5\"'", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 2727889, + "gencode": 11, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 } ] diff --git a/spec/test/stored_queries/test_fulltext_search.py b/spec/test/stored_queries/test_fulltext_search.py index e0340d02..4cc9536a 100644 --- a/spec/test/stored_queries/test_fulltext_search.py +++ b/spec/test/stored_queries/test_fulltext_search.py @@ -55,6 +55,15 @@ "Vaccinia virus WR 65-16", "Dengue virus 2 Jamaica/1409/1983", "Dengue virus 2 Thailand/NGS-C/1944", + # --- Escape chars (,:+-|"') --- + "Salmonella enterica subsp. diarizonae serovar 60:r:e,n,x,z15", + "Fusarium cf. solani 3+4-uuu DPGS-2011", + "Integrating expression vector pJEB403+drrA", + "Vector pEntry-attR2-IRES-eGFP-luc+-pA-attL3", + "low G+C Gram-positive bacterium HTA462", + "Reporter vector p1168hIL6mC/EBP-luc+", + "Pleurocapsales cyanobacterium 'Beach rock 4+5\"'", + "Nostoc sp. 'Peltigera sp. \"hawaiensis\" P1236 cyanobiont'", # --- Dups (techinically only applicable to live data) --- "environmental samples", "Listeria sp. FSL_L7-0091", From 5d4feb8d04b160970165a3326645ec23851906cb Mon Sep 17 00:00:00 2001 From: n1mus <709030+n1mus@users.noreply.github.com> Date: Fri, 18 Mar 2022 01:36:05 +0000 Subject: [PATCH 2/4] test fake sciname with pipe (|) --- README.md | 2 +- spec/test/data/ncbi_taxon.json | 18 ++++++++++++++++++ .../stored_queries/test_fulltext_search.py | 15 ++++++++++++++- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 85a97cb8..fa850c76 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ These specifications are used by the [Relation Engine API](relation_engine_serve The relation engine server (`relation_engine_server/`) is a simple API that allows KBase community developers to interact with the Relation Engine graph database. You can run stored queries or do bulk updates on documents. ## Relation Engine Startup -* Docker image is built with environment variable `SPEC_RELEASE_PATH=/opt/spec.tar.gz'. This contains the specs from the repo itself. +* Docker image is built with environment variable `SPEC_RELEASE_PATH=/opt/spec.tar.gz`. This contains the specs from the repo itself. * Wait for response from auth, workspace, and arangodb services, as they are set up * Specs are set up. Either the repo specs or remote specs are loaded into the specs root path * Collections, views, and analyzers from the specs are added to the ArangoDB server. If the collection, view, or analyzer already exists, but in a different configuration, it will _not_ be overwritten. diff --git a/spec/test/data/ncbi_taxon.json b/spec/test/data/ncbi_taxon.json index e16451d6..31866edb 100644 --- a/spec/test/data/ncbi_taxon.json +++ b/spec/test/data/ncbi_taxon.json @@ -2404,5 +2404,23 @@ "expired": 9007199254740991, "release_created": 1612137600000, "release_expired": 9007199254740991 + }, + { + "_key": "fake_2021-02-01", + "_id": "ncbi_taxon/fake_2021-02-01", + "_rev": "fake", + "id": "fake", + "scientific_name": "|Fake|fake|fake| ||fake||", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": -1, + "gencode": 11, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 } ] diff --git a/spec/test/stored_queries/test_fulltext_search.py b/spec/test/stored_queries/test_fulltext_search.py index 4cc9536a..86dc0d02 100644 --- a/spec/test/stored_queries/test_fulltext_search.py +++ b/spec/test/stored_queries/test_fulltext_search.py @@ -35,6 +35,7 @@ ncbi_taxa = json.load(fh) # scinames_test_all are all the test scinames +# These are selected from the ncbi_taxon collection scinames_test_all = [ # --- Token preceded by punctuation --- "Lactobacillus sp. 'thermophilus'", @@ -56,6 +57,7 @@ "Dengue virus 2 Jamaica/1409/1983", "Dengue virus 2 Thailand/NGS-C/1944", # --- Escape chars (,:+-|"') --- + # --- TODO sample scinames with the escape chars in different variety of syntaxes --- "Salmonella enterica subsp. diarizonae serovar 60:r:e,n,x,z15", "Fusarium cf. solani 3+4-uuu DPGS-2011", "Integrating expression vector pJEB403+drrA", @@ -64,6 +66,7 @@ "Reporter vector p1168hIL6mC/EBP-luc+", "Pleurocapsales cyanobacterium 'Beach rock 4+5\"'", "Nostoc sp. 'Peltigera sp. \"hawaiensis\" P1236 cyanobiont'", + "|Fake|fake|fake| ||fake||", # --- Dups (techinically only applicable to live data) --- "environmental samples", "Listeria sp. FSL_L7-0091", @@ -73,7 +76,8 @@ "Corticiaceae sp.", "Escherichia coli", ] -# scinames_test_latest are the test scinames that are compatible with a current timestamp +# scinames_test_latest are the test scinames that are not expired and +# compatible with a current timestamp scinames_test_latest = [ "Lactobacillus sp. 'thermophilus'", "Rabbit fibroma virus (strain Kasza)", @@ -88,6 +92,15 @@ "Vaccinia virus WR 65-16", "Dengue virus 2 Jamaica/1409/1983", "Dengue virus 2 Thailand/NGS-C/1944", + "Salmonella enterica subsp. diarizonae serovar 60:r:e,n,x,z15", + "Fusarium cf. solani 3+4-uuu DPGS-2011", + "Integrating expression vector pJEB403+drrA", + "Vector pEntry-attR2-IRES-eGFP-luc+-pA-attL3", + "low G+C Gram-positive bacterium HTA462", + "Reporter vector p1168hIL6mC/EBP-luc+", + "Pleurocapsales cyanobacterium 'Beach rock 4+5\"'", + "Nostoc sp. 'Peltigera sp. \"hawaiensis\" P1236 cyanobiont'", + "|Fake|fake|fake| ||fake||", "environmental samples", "Listeria sp. FSL_L7-0091", "Listeria sp. FSL_L7-1519", From b6ac528f9f917e7acfea0ed84f91ae96552824d2 Mon Sep 17 00:00:00 2001 From: n1mus <709030+n1mus@users.noreply.github.com> Date: Fri, 18 Mar 2022 01:38:40 +0000 Subject: [PATCH 3/4] doc generic fulltext search as problematic --- spec/stored_queries/generic/fulltext_search.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spec/stored_queries/generic/fulltext_search.yaml b/spec/stored_queries/generic/fulltext_search.yaml index b8a31b0a..6859add4 100644 --- a/spec/stored_queries/generic/fulltext_search.yaml +++ b/spec/stored_queries/generic/fulltext_search.yaml @@ -1,3 +1,6 @@ +# Should be REVISED or DEPRECATED. +# Is currently unused outside testing. +# # Search a collection with a fulltext index with an attribute name and search text # Also supports filtering by outer-level attributes # Not recommended for fast searching because it can be very slow and even timeout at 60s From fd93db42a7e97b4c7a620559338711f5abb129b6 Mon Sep 17 00:00:00 2001 From: n1mus <709030+n1mus@users.noreply.github.com> Date: Mon, 21 Mar 2022 20:53:43 +0000 Subject: [PATCH 4/4] correct docs --- spec/test/stored_queries/test_fulltext_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spec/test/stored_queries/test_fulltext_search.py b/spec/test/stored_queries/test_fulltext_search.py index 86dc0d02..99bd4d44 100644 --- a/spec/test/stored_queries/test_fulltext_search.py +++ b/spec/test/stored_queries/test_fulltext_search.py @@ -56,7 +56,7 @@ "Vaccinia virus WR 65-16", "Dengue virus 2 Jamaica/1409/1983", "Dengue virus 2 Thailand/NGS-C/1944", - # --- Escape chars (,:+-|"') --- + # --- Escape chars ( ,:+-|"' ) --- # --- TODO sample scinames with the escape chars in different variety of syntaxes --- "Salmonella enterica subsp. diarizonae serovar 60:r:e,n,x,z15", "Fusarium cf. solani 3+4-uuu DPGS-2011", @@ -67,7 +67,7 @@ "Pleurocapsales cyanobacterium 'Beach rock 4+5\"'", "Nostoc sp. 'Peltigera sp. \"hawaiensis\" P1236 cyanobiont'", "|Fake|fake|fake| ||fake||", - # --- Dups (techinically only applicable to live data) --- + # --- Dups (technically only applicable to live data) --- "environmental samples", "Listeria sp. FSL_L7-0091", "Listeria sp. FSL_L7-1519",