diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..e0862330 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,51 @@ +.dockerignore +.travis.yml +__pycache__ +*.pyc +*.pyo +*.pyd +.Python +.tox +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.log +.git +.git* +CODEOWNERS +LICENSE.md +Makefile +hooks + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +/dist/ +/client_src/dist/ +eggs/ +.eggs/ +*.egg-info/ +*.egg +MANIFEST + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# docker bits +Dockerfile* +docker-compose* \ No newline at end of file diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..2b3b13a8 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,7 @@ +/CODEOWNERS export-ignore +/.gitattributes export-ignore +/.gitignore export-ignore +/.travis.yml export-ignore +README.md export-ignore +/Makefile export-ignore +/test export-ignore diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md new file mode 100644 index 00000000..2b5e08cb --- /dev/null +++ b/.github/CONTRIBUTING.md @@ -0,0 +1,22 @@ +# Contributing to KBase + +Important resources: + +* [The KBase Code of Conduct](https://kbase.us/code-of-conduct/) +* [The KBase user documentation](https://kbase.us/new-to-kbase/) +* [The KBase users' Slack channel](https://kbaseusers.slack.com) +* [The KBase SDK docs](https://kbase.github.io/kb_sdk_docs/) +* [The kbase GitHub organization](https://github.com/kbase) +* [The kbaseapps GitHub organization](https://github.com/kbaseapps) + +## Submitting changes + +After opening a GitHub pull request on the repo you'd like to update, be sure to: + +* Update documentation, including the README.md file, any additional documentation under `/docs`, and any separate documentation sites that cover the functionality. +* Check that you added test coverage for any changes you made. +* Check whether your changes break an API. If so, increment the version and ensure that any current dependents will not break. + +Always write clear log messages for your commits. + +:tada: :100: Thank you for your contributions! diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 00000000..bfc22517 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,8 @@ + +- [ ] I updated the README.md docs to reflect this change. + +For changes to the codebase: + +- [ ] I have written tests to cover this change. +- [ ] This is not a breaking API change OR +- [ ] This is a breaking API change and I have incremented the API version and added a summary to CHANGELOG.md. 
diff --git a/.github/workflows/build_prodrc_pr.yaml b/.github/workflows/build_prodrc_pr.yaml new file mode 100644 index 00000000..2e5034e7 --- /dev/null +++ b/.github/workflows/build_prodrc_pr.yaml @@ -0,0 +1,31 @@ +--- +name: Build Prod RC Image +'on': + pull_request: + branches: + - master + - main + types: + - opened + - synchronize + - ready_for_review +jobs: + docker_build: + runs-on: ubuntu-latest + steps: + - name: Verify merge is develop -> main + if: github.head_ref != 'develop' + run: echo "Must merge from develop -> main/master"; exit 1 + - name: Check out GitHub Repo + if: github.event.pull_request.draft == false && github.head_ref == 'develop' + with: + ref: "${{ github.event.pull_request.head.sha }}" + uses: actions/checkout@v2 + - name: Build and Push to Packages + if: github.event.pull_request.draft == false && github.head_ref == 'develop' + env: + PR: "${{ github.event.pull_request.number }}" + SHA: "${{ github.event.pull_request.head.sha }}" + DOCKER_ACTOR: "${{ secrets.GHCR_USERNAME }}" + DOCKER_TOKEN: "${{ secrets.GHCR_TOKEN }}" + run: "./.github/workflows/scripts/build_prodrc_pr.sh\n" diff --git a/.github/workflows/build_test_pr.yaml b/.github/workflows/build_test_pr.yaml new file mode 100644 index 00000000..b6b53286 --- /dev/null +++ b/.github/workflows/build_test_pr.yaml @@ -0,0 +1,27 @@ +--- +name: Build Test Image +'on': + pull_request: + branches: + - develop + types: + - opened + - synchronize + - ready_for_review +jobs: + docker_build: + runs-on: ubuntu-latest + steps: + - name: Check out GitHub Repo + if: github.event.pull_request.draft == false + with: + ref: "${{ github.event.pull_request.head.sha }}" + uses: actions/checkout@v2 + - name: Build and Push to Packages + if: github.event.pull_request.draft == false + env: + PR: "${{ github.event.pull_request.number }}" + SHA: "${{ github.event.pull_request.head.sha }}" + DOCKER_ACTOR: "${{ secrets.GHCR_USERNAME }}" + DOCKER_TOKEN: "${{ secrets.GHCR_TOKEN }}" + run: "./.github/workflows/scripts/build_test_pr.sh\n" diff --git a/.github/workflows/prod_release.yaml b/.github/workflows/prod_release.yaml new file mode 100644 index 00000000..ffa14533 --- /dev/null +++ b/.github/workflows/prod_release.yaml @@ -0,0 +1,38 @@ +--- +name: Publish Release Image +'on': + release: + branches: + - main + - master + types: + - published +jobs: + docker_build: + runs-on: ubuntu-latest + steps: + - name: Check Tag + id: check-tag + run: |- + if [[ ${{ github.ref_name }} =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo ::set-output name=match::true + fi + - name: Report SemVer Check + if: steps.check-tag.outputs.match != 'true' + run: echo "Release version must follow semantic naming (e.g. 
1.0.2)"; exit 1 + - name: Check Source Branch + if: github.event.release.target_commitish != 'master' && github.event.release.target_commitish != 'main' + run: echo "Releases must be built from master/main branch"; exit 1 + - name: Check out GitHub Repo + with: + ref: "${{ github.sha }}" + uses: actions/checkout@v2 + - name: Build and Push to Packages + env: + ISH: "${{ github.event.release.target_commitish }}" + PR: "${{ github.event.pull_request.number }}" + SHA: "${{ github.sha }}" + VER: "${{ github.event.release.tag_name }}" + DOCKER_ACTOR: "${{ secrets.GHCR_USERNAME }}" + DOCKER_TOKEN: "${{ secrets.GHCR_TOKEN }}" + run: "./.github/workflows/scripts/prod_release.sh\n" diff --git a/.github/workflows/run_tests.yaml b/.github/workflows/run_tests.yaml new file mode 100644 index 00000000..1b864459 --- /dev/null +++ b/.github/workflows/run_tests.yaml @@ -0,0 +1,49 @@ +name: Relation Engine test and deploy +on: + [push, pull_request] +jobs: + run_tests: + runs-on: ubuntu-latest + steps: + - name: checkout git repo + uses: actions/checkout@v2 + + - name: run tests + shell: bash + env: + GITHUB_ACTIONS_CI: 1 + run: | + docker-compose build + docker-compose run re_api sh scripts/run_tests.sh + docker-compose down --remove-orphans + + docker_build_and_push: + runs-on: ubuntu-latest + needs: run_tests + if: (github.ref == 'refs/heads/develop' || github.ref == 'refs/heads/master') && github.event_name == 'push' && !contains(github.event.head_commit.message, 'skip_docker_build') + steps: + - name: checkout git repo + uses: actions/checkout@v2 + + - name: copy VERSION to TAG_NAME + shell: bash + run: | + mkdir -p .target + cp VERSION .target/TAG_NAME + + - name: set env vars + shell: bash + run: | + echo "DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_ENV + echo "BRANCH=${GITHUB_REF#refs/heads/}" >> $GITHUB_ENV + echo "COMMIT=$(git rev-parse --short HEAD)" >> $GITHUB_ENV + + - name: build and push to dockerhub + uses: opspresso/action-docker@master + with: + args: --docker + env: + USERNAME: ${{ secrets.DOCKER_USERNAME }} + PASSWORD: ${{ secrets.DOCKER_PASSWORD }} + DOCKERFILE: "Dockerfile" + IMAGE_NAME: "kbase/relation_engine_api" diff --git a/.github/workflows/scripts/build_prodrc_pr.sh b/.github/workflows/scripts/build_prodrc_pr.sh new file mode 100755 index 00000000..4c7bdf27 --- /dev/null +++ b/.github/workflows/scripts/build_prodrc_pr.sh @@ -0,0 +1,17 @@ +#! /usr/bin/env bash + +export MY_ORG=$(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $1}') +export MY_APP=$(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $2}') +export DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") +export BUILD_DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") +export COMMIT=$(echo "$SHA" | cut -c -7) + +echo "Branch is:" ${GITHUB_HEAD_REF} +docker login -u "$DOCKER_ACTOR" -p "$DOCKER_TOKEN" ghcr.io +docker build --build-arg BUILD_DATE="$DATE" \ + --build-arg COMMIT="$COMMIT" \ + --build-arg BRANCH="$GITHUB_HEAD_REF" \ + --build-arg PULL_REQUEST="$PR" \ + --label us.kbase.vcs-pull-req="$PR" \ + -t ghcr.io/"$MY_ORG"/"$MY_APP":"pr-""$PR" . +docker push ghcr.io/"$MY_ORG"/"$MY_APP":"pr-""$PR" diff --git a/.github/workflows/scripts/build_test_pr.sh b/.github/workflows/scripts/build_test_pr.sh new file mode 100755 index 00000000..546b1b42 --- /dev/null +++ b/.github/workflows/scripts/build_test_pr.sh @@ -0,0 +1,17 @@ +#! 
/usr/bin/env bash + +export MY_ORG=$(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $1}') +export MY_APP=$(echo $(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $2}')"-develop") +export DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") +export BUILD_DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") +export COMMIT=$(echo "$SHA" | cut -c -7) + +echo $DOCKER_TOKEN | docker login ghcr.io -u $DOCKER_ACTOR --password-stdin +docker build --build-arg BUILD_DATE="$DATE" \ + --build-arg COMMIT="$COMMIT" \ + --build-arg BRANCH="$GITHUB_HEAD_REF" \ + --build-arg PULL_REQUEST="$PR" \ + --label us.kbase.vcs-pull-req="$PR" \ + -t ghcr.io/"$MY_ORG"/"$MY_APP":"pr-""$PR" . +docker push ghcr.io/"$MY_ORG"/"$MY_APP":"pr-""$PR" + \ No newline at end of file diff --git a/.github/workflows/scripts/deploy_tag.sh b/.github/workflows/scripts/deploy_tag.sh new file mode 100755 index 00000000..5fb928ab --- /dev/null +++ b/.github/workflows/scripts/deploy_tag.sh @@ -0,0 +1,34 @@ +#! /usr/bin/env bash + +# Usage: ./deploy_tag.sh -e TARGET -o ORG -r REPO -s DEV_PROD -t IMAGE_TAG +# +# Example 1: ./deploy_tag.sh -o "kbase" -r "narrative-traefiker" -s "dev" -t "pr-9001" -e "ci" +# Example 2: ./deploy_tag.sh -o "kbase" -r "narrative" -s "prod" -t "latest" -e "next" +# +# Where: +# -o ORG is the organization (`kbase`, `kbaseapps`, etc.) +# -r REPO is the repository (e.g. `narrative`) +# -s DEV_PROD determines whether to pull the development {APPNAME}-develop or production {APPNAME} image. +# -t IMAGE_TAG is the *current* Docker image tag, typically `pr-#` or `latest` +# -e TARGET is one of: `appdev`, `ci`, or `next` +# +# Be sure to set $TOKEN first! +# See: https://docs.github.com/en/packages/getting-started-with-github-container-registry/migrating-to-github-container-registry-for-docker-images#authenticating-with-the-container-registry + + +while getopts e:o:r:s:t: option + do + case "${option}" + in + e) TARGET=${OPTARG};; + o) ORG=${OPTARG};; + r) REPO=${OPTARG};; + s) DEV_PROD=${OPTARG};; + t) IMAGE_TAG=${OPTARG};; + esac +done + +curl -H "Authorization: token $TOKEN" \ + -H 'Accept: application/vnd.github.everest-preview+json' \ + "https://api.github.com/repos/$ORG/$REPO/dispatches" \ + -d '{"event_type":"Tag '"$DEV_PROD"' '"$IMAGE_TAG"' for '"$TARGET"'", "client_payload": {"image_tag": "'"$IMAGE_TAG"'","target": "'"$TARGET"'","dev_prod": "'"$DEV_PROD"'"}}' diff --git a/.github/workflows/scripts/prod_release.sh b/.github/workflows/scripts/prod_release.sh new file mode 100755 index 00000000..46d008c6 --- /dev/null +++ b/.github/workflows/scripts/prod_release.sh @@ -0,0 +1,24 @@ +#! /usr/bin/env bash + +export MY_ORG=$(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $1}') +export MY_APP=$(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $2}') +export DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") +export BUILD_DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") +export COMMIT=$(echo "$SHA" | cut -c -7) + +echo "ISH is:" $ISH +echo "GITHUB_REF is:" $GITHUB_REF +echo "HEAD_REF is:" $GITHUB_HEAD_REF +echo "BASE_REF is:" $GITHUB_BASE_REF +echo "Release is:" $GITHUB_REF_NAME +echo $DOCKER_TOKEN | docker login ghcr.io -u $DOCKER_ACTOR --password-stdin +docker build --build-arg BUILD_DATE="$DATE" \ + --build-arg COMMIT="$COMMIT" \ + --build-arg BRANCH="$GITHUB_HEAD_REF" \ + --build-arg PULL_REQUEST="$PR" \ + --build-arg VERSION="$VER" \ + --label us.kbase.vcs-pull-req="$PR" \ + -t ghcr.io/"$MY_ORG"/"$MY_APP":"$VER" \ + -t ghcr.io/"$MY_ORG"/"$MY_APP":"latest" . 
+docker push ghcr.io/"$MY_ORG"/"$MY_APP":"$VER" +docker push ghcr.io/"$MY_ORG"/"$MY_APP":"latest" diff --git a/.github/workflows/scripts/tag_environments.sh b/.github/workflows/scripts/tag_environments.sh new file mode 100755 index 00000000..b39732a0 --- /dev/null +++ b/.github/workflows/scripts/tag_environments.sh @@ -0,0 +1,21 @@ +#! /usr/bin/env bash +# Add vars for PR & environments to yaml, as called from external script + +export MY_ORG=$(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $1}') +export MY_APP=$(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $2}') +export DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") +export BUILD_DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") +export COMMIT=$(echo "$SHA" | cut -c -7) + +if [ "$DEV_PROD" = "dev" ] || [ "$DEV_PROD" = "develop" ] +then + IMAGE=$MY_APP"-develop" +else + IMAGE=$MY_APP +fi + +echo "Dev or Prod:" $DEV_PROD +docker login -u "$DOCKER_ACTOR" -p "$DOCKER_TOKEN" ghcr.io +docker pull ghcr.io/"$MY_ORG"/"$IMAGE":"$IMAGE_TAG" +docker tag ghcr.io/"$MY_ORG"/"$IMAGE":"$IMAGE_TAG" ghcr.io/"$MY_ORG"/"$IMAGE":"$TARGET" +docker push ghcr.io/"$MY_ORG"/"$IMAGE":"$TARGET" diff --git a/.github/workflows/scripts/tag_prod_latest.sh b/.github/workflows/scripts/tag_prod_latest.sh new file mode 100755 index 00000000..c3c42252 --- /dev/null +++ b/.github/workflows/scripts/tag_prod_latest.sh @@ -0,0 +1,12 @@ +#! /usr/bin/env bash + +export MY_ORG=$(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $1}') +export MY_APP=$(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $2}') +export DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") +export BUILD_DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") +export COMMIT=$(echo "$SHA" | cut -c -7) + +docker login -u "$DOCKER_ACTOR" -p "$DOCKER_TOKEN" ghcr.io +docker pull ghcr.io/"$MY_ORG"/"$MY_APP":"pr-""$PR" +docker tag ghcr.io/"$MY_ORG"/"$MY_APP":"pr-""$PR" ghcr.io/"$MY_ORG"/"$MY_APP":"latest-rc" +docker push ghcr.io/"$MY_ORG"/"$MY_APP":"latest-rc" diff --git a/.github/workflows/scripts/tag_test_latest.sh b/.github/workflows/scripts/tag_test_latest.sh new file mode 100755 index 00000000..c0dc504a --- /dev/null +++ b/.github/workflows/scripts/tag_test_latest.sh @@ -0,0 +1,12 @@ +#! 
/usr/bin/env bash + +export MY_ORG=$(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $1}') +export MY_APP=$(echo $(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $2}')"-develop") +export DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") +export BUILD_DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") +export COMMIT=$(echo "$SHA" | cut -c -7) + +docker login -u "$DOCKER_ACTOR" -p "$DOCKER_TOKEN" ghcr.io +docker pull ghcr.io/"$MY_ORG"/"$MY_APP":"pr-""$PR" +docker tag ghcr.io/"$MY_ORG"/"$MY_APP":"pr-""$PR" ghcr.io/"$MY_ORG"/"$MY_APP":"latest" +docker push ghcr.io/"$MY_ORG"/"$MY_APP":"latest" diff --git a/.github/workflows/tag_environments.yaml b/.github/workflows/tag_environments.yaml new file mode 100644 index 00000000..6dba7431 --- /dev/null +++ b/.github/workflows/tag_environments.yaml @@ -0,0 +1,19 @@ +--- +name: Tag Image For Deploy +'on': + repository_dispatch +jobs: + tag_environments: + runs-on: ubuntu-latest + steps: + - name: Check out GitHub Repo + uses: actions/checkout@v2 + - name: Tag Deploy Environments + env: + DOCKER_ACTOR: "${{ secrets.GHCR_USERNAME }}" + DOCKER_TOKEN: ${{ secrets.GHCR_TOKEN }} + IMAGE_TAG: ${{ github.event.client_payload.image_tag }} + SHA: ${{ github.event.pull_request.head.sha }} + TARGET: ${{ github.event.client_payload.target }} + DEV_PROD: ${{ github.event.client_payload.dev_prod }} + run: './.github/workflows/scripts/tag_environments.sh' diff --git a/.github/workflows/tag_prod_latest.yaml b/.github/workflows/tag_prod_latest.yaml new file mode 100644 index 00000000..12b23df0 --- /dev/null +++ b/.github/workflows/tag_prod_latest.yaml @@ -0,0 +1,27 @@ +--- +name: Tag Prod Latest +'on': + pull_request: + branches: + - master + - main + types: + - closed +jobs: + docker_tag: + runs-on: ubuntu-latest + steps: + - name: Check out GitHub Repo + if: github.event_name == 'pull_request' && github.event.action == 'closed' && + github.event.pull_request.merged == true + with: + ref: "${{ github.event.pull_request.head.sha }}" + uses: actions/checkout@v2 + - name: Build and Push to Packages + if: github.event.pull_request.draft == false + env: + PR: "${{ github.event.pull_request.number }}" + SHA: "${{ github.event.pull_request.head.sha }}" + DOCKER_ACTOR: "${{ secrets.GHCR_USERNAME }}" + DOCKER_TOKEN: "${{ secrets.GHCR_TOKEN }}" + run: "./.github/workflows/scripts/tag_prod_latest.sh\n" diff --git a/.github/workflows/tag_test_latest.yaml b/.github/workflows/tag_test_latest.yaml new file mode 100644 index 00000000..d8cac465 --- /dev/null +++ b/.github/workflows/tag_test_latest.yaml @@ -0,0 +1,26 @@ +--- +name: Tag Latest Test Image +'on': + pull_request: + branches: + - develop + types: + - closed +jobs: + docker_tag: + runs-on: ubuntu-latest + steps: + - name: Check out GitHub Repo + if: github.event_name == 'pull_request' && github.event.action == 'closed' && + github.event.pull_request.merged == true + with: + ref: "${{ github.event.pull_request.head.sha }}" + uses: actions/checkout@v2 + - name: Build and Push to Packages + if: github.event.pull_request.draft == false + env: + PR: "${{ github.event.pull_request.number }}" + SHA: "${{ github.event.pull_request.head.sha }}" + DOCKER_ACTOR: "${{ secrets.GHCR_USERNAME }}" + DOCKER_TOKEN: "${{ secrets.GHCR_TOKEN }}" + run: "./.github/workflows/scripts/tag_test_latest.sh\n" diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..02727e6d --- /dev/null +++ b/.gitignore @@ -0,0 +1,23 @@ +# Environment variables +.env + +# Test coverage +/htmlcov/ + +# Caches and temp dirs +/build/ +/dist/ +/client_src/dist/ +*.pyc +.mypy_cache/ 
+.cache/ +/tmp/ +coverage_report/ +.coverage +*.egg-info/ + +# Spec archives +spec.tar.gz + +# Creds +arango_live_server_config.json diff --git a/.gitignore/.gitignore b/.gitignore/.gitignore deleted file mode 100644 index b6e47617..00000000 --- a/.gitignore/.gitignore +++ /dev/null @@ -1,129 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..8896d9a2 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,76 @@ +# Changelog for kbase/relation_engine + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + + + +## [0.0.18] - 2022-03-02 +### Added +- taxonomy_search_species_strain and taxonomy_search_species_strain_no_sort stored queries + +## [0.0.17] - 2022-01-25 +### Added +- Ensure local specs match server specs +### Changed +- Remove explicit namespace from "icu_tokenize" analyzer + +## [0.0.16] - 2022-01-14 +### Added +- Added GitHub Actions to build Docker images on ghcr.io +- Added generic fulltext search stored query + +## [0.0.12] - 2021-01-29 +### Added +- In the API, show the source file path or URL when updating the specs + +## [0.0.11] - 2020-11-19 +### Changed +- DJORNL edge spec (`spec/collections/djornl/djornl_edge.yaml`) updated to indicate whether or not the edge is directed. +- DJORNL parser and test suite updated accordingly. 
+ +## [0.0.10] - 2020-10-08 +### Changed +- Clean up some of the configuration logic, and add the `SPEC_REPO_URL` env var instead of hard-coding +- Bundle the spec tarball in the docker image so other programs can use the image for testing + +## [0.0.9] - 2020-10-05 +### Fixed +- Fixed the function that concatenates parts of the query for the API + +## [0.0.8] - 2020-09-18 +### Fixed +- Remove need for authentication when waiting for the ArangoDB dependency to start (this is a staging server restriction) + +## [0.0.7] - 2020-09-18 +### Fixed +- Modified the docker deployment script so it can be used to release to the staging server + +## [0.0.6] - 2020-08-20 + +### Added + +- `relation_engine_server/api_versions/api_v1.py`: add and/or document API endpoints: + - `/api/v1/specs/stored_queries` + - `/api/v1/specs/data_sources` +- `spec/datasets`: new directory for dataset-specific schemas, e.g. DJORNL parser manifest, `spec/datasets/djornl/manifest.schema.json` +- `spec/test/test_manifest_schema.py`: to test the manifest schema against example input +- `importers/djornl/parser.py`: use manifest file to specify the files to be parsed and loaded into ArangoDB + +### Changed + +- `relation_engine_server/utils/spec_loader.py`: refactor to return a schema or the path to a schema file +- `importers/djornl/parser.py`: refactor parsing code to be more flexible and parse multiple files +- `spec/collections/djornl/*`, `spec/stored_queries/djornl/*`, `spec/views/djornl/*`, and `spec/test/djornl`: rename DB fields and headers in test files + +### Removed + +- `spec/test/djornl`: delete unneeded test files + + + +## [0.0.5] + +Last release with RE components in two repositories, https://github.com/kbase/relation_engine_api and https://github.com/kbase/relation_engine_spec diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 00000000..c45e98cf --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1 @@ +* @ialarmedalien @eapearson @n1mus diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..c2a3fce0 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,43 @@ +FROM python:3.7-alpine + +ARG DEVELOPMENT +ARG BUILD_DATE +ARG VCS_REF +ARG BRANCH=develop + +COPY requirements.txt dev-requirements.txt /tmp/ +WORKDIR /app + +# Install dockerize +ENV DOCKERIZE_VERSION v0.6.1 +RUN apk --update add --virtual build-dependencies curl tar gzip && \ + curl -o dockerize.tar.gz \ + https://raw.githubusercontent.com/kbase/dockerize/master/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz && \ + tar -C /usr/local/bin -xvzf dockerize.tar.gz && \ + rm dockerize.tar.gz && \ + apk del build-dependencies + +# Install dependencies +RUN apk --update add --virtual build-dependencies build-base python3-dev && \ + pip install --upgrade pip && \ + pip install --no-cache-dir -r /tmp/requirements.txt && \ + if [ "$DEVELOPMENT" ]; then pip install --no-cache-dir -r /tmp/dev-requirements.txt; fi && \ + apk del build-dependencies && \ + pip install types-requests types-PyYAML + +COPY . 
/app + +# Create tarball of the spec directory so we have it cached in the image +RUN tar czvf /opt/spec.tar.gz /app/spec +ENV SPEC_RELEASE_PATH=/opt/spec.tar.gz + +LABEL org.label-schema.build-date=$BUILD_DATE \ + org.label-schema.vcs-url="https://github.com/kbase/relation_engine_api" \ + org.label-schema.vcs-ref=$VCS_REF \ + org.label-schema.schema-version="1.0.0-rc1" \ + us.kbase.vcs-branch=$BRANCH \ + maintainer="KBase Team" + +EXPOSE 5000 +ENTRYPOINT ["/usr/local/bin/dockerize"] +CMD ["sh", "-x", "/app/scripts/start_server.sh"] diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 00000000..916a8c78 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,7 @@ +Copyright (c) 2020 The KBase Project and its Contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..e4d49ecd --- /dev/null +++ b/Makefile @@ -0,0 +1,32 @@ +QUERY_TESTING_FILE = spec/test/stored_queries/test_query.py + +.PHONY: test shell reset full_query_testing sampling_query_testing compare_query_testing graph_query_testing + +test: + docker-compose build + docker-compose run re_api sh scripts/run_tests.sh + docker-compose down --remove-orphans + +shell: + docker-compose down --remove-orphans + docker-compose build + docker-compose run re_api sh + +reset: + docker-compose down --rmi all -v + docker-compose build + +full_query_testing: + DO_QUERY_TESTING=full time python -m pytest -s $(QUERY_TESTING_FILE) + +sampling_query_testing: + DO_QUERY_TESTING=sampling time python -m pytest -s $(QUERY_TESTING_FILE) + +compare_query_testing: + DO_QUERY_TESTING=compare time python -m pytest -s $(QUERY_TESTING_FILE) + +graph_query_testing: + # invocation example: + # make graph_query_testing data_new_fp="tmp/blah.json" data_old_fp="tmp/bleh.json" + # where `data_new_fp` and `data_old_fp` are generated by `make compare_query_testing` + DO_QUERY_TESTING=graph python $(QUERY_TESTING_FILE) $(data_new_fp) $(data_old_fp) diff --git a/README.md b/README.md new file mode 100644 index 00000000..fa850c76 --- /dev/null +++ b/README.md @@ -0,0 +1,46 @@ +[![Total alerts](https://img.shields.io/lgtm/alerts/g/kbase/relation_engine.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/kbase/relation_engine/alerts/) [![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/kbase/relation_engine.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/kbase/relation_engine/context:python) +![RE test and deploy](https://github.com/kbase/relation_engine/workflows/Relation%20Engine%20test%20and%20deploy/badge.svg) + +# KBase Relation Engine + 
+This repo holds the code associated with the KBase relation engine, previously held in https://github.com/kbase/relation_engine_api and https://github.com/kbase/relation_engine_spec. + +## Relation Engine Spec +### `spec/` + +The `spec/` directory holds the schemas for [stored queries](spec/stored_queries), [collections](spec/collections), [views](spec/views), [analyzers](spec/analyzers), and [migrations](spec/migrations) for the relation engine graph database service. + +These specifications are used by the [Relation Engine API](relation_engine_server). + +## Relation Engine API +### `relation_engine_server/` + +The relation engine server (`relation_engine_server/`) is a simple API that allows KBase community developers to interact with the Relation Engine graph database. You can run stored queries or do bulk updates on documents. + +## Relation Engine Startup +* The Docker image is built with the environment variable `SPEC_RELEASE_PATH=/opt/spec.tar.gz`. This archive contains the specs from the repo itself. +* The server waits for responses from the auth, workspace, and ArangoDB services as they start up. +* Specs are set up: either the repo specs or remote specs are loaded into the specs root path. +* Collections, views, and analyzers from the specs are added to the ArangoDB server. If a collection, view, or analyzer already exists, but in a different configuration, it will _not_ be overwritten. +* Collections, views, and analyzers from the loaded specs are compared to those on the ArangoDB server. If the loaded specs' attributes are not recursively a subset of the server specs, then an exception is raised. (This is just preliminary validation behavior.) + + +## Relation Engine builds + +The Relation Engine is available on GitHub Packages. These images are built by the workflow configs in this repo's `.github` directory. +The develop tags are located at https://github.com/kbase/relation_engine/pkgs/container/relation_engine-develop +e.g. +``` +docker pull ghcr.io/kbase/relation_engine-develop:latest (Built upon merging a PR) +docker pull ghcr.io/kbase/relation_engine-develop:pr-93 (Built upon creating a PR) +``` + +## How to Deploy in CI +In the CI environment, the Relation Engine API runs as the `relationapi` service. +* Press the Upgrade arrow +* Ensure the `relationapi` service uses `ghcr.io/kbase/relation_engine-develop:latest` +* Ensure the `Always pull image before creating` box is ticked +* Press the `Upgrade` button +* If the deployment succeeded, you can finish the upgrade. If not, you can press the rollback button. + +(For deployments to other environments, request help from the #devops channel) diff --git a/VERSION b/VERSION new file mode 100644 index 00000000..32786aa4 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +0.0.18 diff --git a/client_src/README.md b/client_src/README.md new file mode 100644 index 00000000..1eacc4da --- /dev/null +++ b/client_src/README.md @@ -0,0 +1,115 @@ +# Relation engine client + +A pip-installable Python client module for accessing the methods of the Relation Engine API. 
+ +## Installation + +Install with pip: + +```sh +pip install --extra-index-url https://pypi.anaconda.org/kbase/simple \ + releng-client==0.0.1 +``` + +## Usage + +### Initialize the client + +Pass in the URL of the Relation Engine API server you want to use, which is most likely one of the following: + +* `https://kbase.us/services/relation_engine_api` +* `https://ci.kbase.us/services/relation_engine_api` +* `https://appdev.kbase.us/services/relation_engine_api` + +Additionally, pass in a KBase auth token you would like to use for access control and document saving permissions when making requests to the API. + +```py +from relation_engine_client import REClient + +re_client = REClient("https://ci.kbase.us/services/relation_engine_api", "xyz_my_token") +``` + +You can leave off the token if you want to do unauthenticated queries for public data. + +### Basic calls + +#### Stored queries + +To execute a stored/named query, run the following: + +``` +re_client.stored_query(query_name, bind_vars, raise_not_found=False) +``` + +Where: + +* `query_name`: required - str - name of the stored query to execute. +* `bind_vars`: required - dict - variables to use in the query. +* `raise_not_found`: optional - bool - defaults to False - whether to raise an RENotFound error if 0 docs are returned. + +#### Saving documents + +``` +re_client.save_docs(collection_name, docs, on_duplicate='error', display_errors=False) +``` + +Where: + +* `collection_name`: required - str - name of the collection you are saving documents into +* `docs`: required - list of dict or single dict - json-serializable list of + documents to save to the above collection +* `on_duplicate`: optional - one of 'replace', 'update', 'ignore', or 'error' - defaults to 'error' - action to take when we have a duplicate document by + `_key` while saving. +* `display_errors`: optional - bool - defaults to False - whether to return + error messages for every document that failed to save. + +#### Admin queries + +To run an ad-hoc admin query, run: + +```py +re_client.admin_query(aql_query_text, bind_vars) +``` + +You must have an auth token set in the client with the RE admin role. + +### Exceptions + +A few different exceptions can be thrown from each method, which you can import: + +```py +from relation_engine_client.exceptions import REServerError, RERequestError, RENotFound +``` + +#### REServerError + +An error was thrown by the server (status code 500 or above). + +Access the `.resp.text` property on the error object to see the response body from the API, or simply print the error to debug. + +#### RERequestError + +There was an invalid or missing parameter or header in the request. + +Access the `.resp.text` property on the error object to see the response body from the API, or simply print the error to debug. + +#### RENotFound + +The `raise_not_found` argument was set to `True` and no documents were found in the query. + +Access the `.req_body` and `.req_params` properties of the error object to see the request data, or simply print the error to debug. 
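Putting the pieces above together, here is a minimal sketch of a guarded stored-query call. The URL and token are the placeholder values from the earlier examples, and `fetch_test_vertex` is the stored query used by this repo's test suite; substitute your own query name and bind variables:

```py
from relation_engine_client import REClient
from relation_engine_client.exceptions import (
    RENotFound,
    RERequestError,
    REServerError,
)

re_client = REClient("https://ci.kbase.us/services/relation_engine_api", "xyz_my_token")

try:
    # raise_not_found=True turns an empty result set into an RENotFound error
    result = re_client.stored_query(
        "fetch_test_vertex", {"key": "some_key"}, raise_not_found=True
    )
    print(result["results"][0])
except RENotFound as err:
    print(err)  # includes the request body and URL params
except (RERequestError, REServerError) as err:
    print(err)  # includes the response status code and body
```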
+ +## Development + +### Publishing + +Increment the semantic version inside `client_src/setup.py`, then build the package with: + +```sh +python setup.py sdist +``` + +Publish to the anaconda pypi repository with: + +```sh +anaconda upload -i -u kbase dist/releng-client-{version}.tar.gz +``` diff --git a/client_src/relation_engine_client/__init__.py b/client_src/relation_engine_client/__init__.py new file mode 100644 index 00000000..ac0de15e --- /dev/null +++ b/client_src/relation_engine_client/__init__.py @@ -0,0 +1,3 @@ +from .main import REClient + +__all__ = ["REClient"] diff --git a/client_src/relation_engine_client/exceptions.py b/client_src/relation_engine_client/exceptions.py new file mode 100644 index 00000000..58b29943 --- /dev/null +++ b/client_src/relation_engine_client/exceptions.py @@ -0,0 +1,41 @@ +class REServerError(Exception): + """Server-originated error from RE API (i.e. 500+)""" + + def __init__(self, resp): + self.resp = resp + + def __str__(self): + return ( + f"Relation engine API server error:\n" + f"Status: {self.resp.status_code}\n" + f"Response: {self.resp.text}" + ) + + +class RERequestError(Exception): + """Error in the request format or data from the client (i.e. 400)""" + + def __init__(self, resp): + self.resp = resp + + def __str__(self): + return ( + f"Relation engine API client request error:\n" + f"Status: {self.resp.status_code}\n" + f"Response: {self.resp.text}" + ) + + +class RENotFound(Exception): + """The user required some results to be returned, but there were none.""" + + def __init__(self, req_body, req_params): + self.req_body = req_body + self.req_params = req_params + + def __str__(self): + return ( + f"Documents not found in the Relation Engine:\n" + f"Request body: {self.req_body}\n" + f"URL params: {self.req_params}" + ) diff --git a/client_src/relation_engine_client/main.py b/client_src/relation_engine_client/main.py new file mode 100644 index 00000000..8ffa7017 --- /dev/null +++ b/client_src/relation_engine_client/main.py @@ -0,0 +1,157 @@ +import json +import requests +from typing import Optional, List, Dict, Union + +from .exceptions import REServerError, RERequestError, RENotFound + +_QUERY_METHOD = "POST" +_QUERY_ENDPOINT = "/api/v1/query_results" +_SAVE_METHOD = "PUT" +_SAVE_ENDPOINT = "/api/v1/documents" + + +class REClient: + def __init__(self, api_url: str, token: str = None): + self.api_url = api_url + self.token = token + # Type check the constructor parameters + if not self.api_url or not isinstance(self.api_url, str): + raise TypeError("The Relation Engine API URL was not provided.") + # Remove any trailing slash in the API URL so we can append paths + self.api_url = self.api_url.strip("/") + + def admin_query(self, query: str, bind_vars: dict, raise_not_found=False): + """ + Run an ad-hoc query using admin privs. + Params: + query - string - AQL query to execute + bind_vars - dict - JSON serializable bind variables for the query + raise_not_found - bool - Whether to raise an error if there are zero results. 
Defaults to False + Exceptions raised: + RERequestError - 400-499 error from the RE API + REServerError - 500+ error from the RE API + RENotFound - raised when raise_not_found is True and there are 0 results + """ + # Type-check the parameters + if not isinstance(query, str): + raise TypeError("`query` argument must be a str") + if not isinstance(bind_vars, dict): + raise TypeError("`bind_vars` argument must be a dict") + if not isinstance(raise_not_found, bool): + raise TypeError("`raise_not_found` argument must be a bool") + # Construct and execute the request + req_body = dict(bind_vars) + req_body["query"] = query + url = str(self.api_url) + _QUERY_ENDPOINT + resp = self._make_request( + method=_QUERY_METHOD, + url=url, + data=json.dumps(req_body), + params={}, + raise_not_found=raise_not_found, + ) + return resp + + def stored_query(self, stored_query: str, bind_vars: dict, raise_not_found=False): + """ + Run a stored query. + Params: + stored_query - string - name of the stored query to execute + bind_vars - dict - JSON serializable bind variables for the query + raise_not_found - bool - Whether to raise an error if there are zero results. Defaults to False + Exceptions raised: + RERequestError - 400-499 from the RE API (client error) + REServerError - 500+ error from the RE API + RENotFound - raised when raise_not_found is True and there are 0 results + """ + # Type-check the parameters + if not isinstance(stored_query, str): + raise TypeError("`stored_query` argument must be a str") + if not isinstance(bind_vars, dict): + raise TypeError("`bind_vars` argument must be a dict") + if not isinstance(raise_not_found, bool): + raise TypeError("`raise_not_found` argument must be a bool") + # Construct and execute the request + req_body = dict(bind_vars) + url = str(self.api_url) + _QUERY_ENDPOINT + return self._make_request( + method=_QUERY_METHOD, + url=url, + data=json.dumps(req_body), + params={"stored_query": stored_query}, + raise_not_found=raise_not_found, + ) + + def save_docs( + self, + coll: str, + docs: Union[Dict, List[Dict]], + on_duplicate: Optional[str] = None, + display_errors=False, + ): + """ + Save documents to a collection in the relation engine. + Requires an auth token with RE admin privileges. + Params: + coll - str - collection name to save to + docs - a single dict or list of dicts - json-serializable documents to save + on_duplicate - str (defaults to 'error') - what to do when a provided document + already exists in the collection. See options here: + https://github.com/kbase/relation_engine_api#put-apiv1documents + display_errors - bool (defaults to False) - whether to respond with + document save errors (the response will give you an error for every + document that failed to save). 
+ Exceptions raised: + RERequestError - 400-499 from the RE API (client error) + REServerError - 500+ error from the RE API + """ + if isinstance(docs, dict): + docs = [docs] + if not docs: + raise TypeError("No documents provided to save") + if not isinstance(docs, list): + raise TypeError("`docs` argument must be a list") + if on_duplicate and not isinstance(on_duplicate, str): + raise TypeError("`on_duplicate` argument must be a str") + if not isinstance(display_errors, bool): + raise TypeError("`display_errors` argument must be a bool") + params = {"collection": coll} + if display_errors: + params["display_errors"] = "1" + params["on_duplicate"] = on_duplicate or "error" + req_body = "\n".join(json.dumps(d) for d in docs) + url = str(self.api_url) + _SAVE_ENDPOINT + return self._make_request( + method=_SAVE_METHOD, + url=url, + data=req_body, + params=params, + raise_not_found=False, + ) + + def _make_request(self, method, url, data, params, raise_not_found): + """ + Internal utility to make a generic request to the RE API and handle the + response. + """ + headers = {} + if self.token: + headers["Authorization"] = self.token + resp = requests.request( + method=method, url=url, data=data, params=params, headers=headers + ) + if resp.status_code >= 500: + # Server error + raise REServerError(resp) + elif resp.status_code >= 400 and resp.status_code < 500: + # Client error + raise RERequestError(resp) + elif not resp.ok: + raise RuntimeError( + f"Unknown RE API error:\nURL: {resp.url}\nMethod: {method}\n{resp.text}" + ) + resp_json = resp.json() + if raise_not_found and not len(resp_json["results"]): + # Results were required to be non-empty + raise RENotFound(req_body=data, req_params=params) + return resp_json diff --git a/client_src/setup.py b/client_src/setup.py new file mode 100644 index 00000000..f6b4d08a --- /dev/null +++ b/client_src/setup.py @@ -0,0 +1,11 @@ +from setuptools import setup + + +setup( + name="releng-client", + version="0.0.1", + description="KBase Relation Engine API Client Module", + url="https://github.com/kbase/relation_engine_api", + packages=["relation_engine_client"], + install_requires=["requests>=2"], +) diff --git a/client_src/test/__init__.py b/client_src/test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/client_src/test/test_integration.py b/client_src/test/test_integration.py new file mode 100644 index 00000000..baa17e97 --- /dev/null +++ b/client_src/test/test_integration.py @@ -0,0 +1,202 @@ +import unittest +import os +from uuid import uuid4 + +from relation_engine_client import REClient +from relation_engine_client.exceptions import RERequestError, RENotFound + +_API_URL = os.environ.get("RE_API_URL", "http://localhost:5000") +# See the test schemas here: +# https://github.com/kbase/relation_engine/tree/develop/spec/collections/test +_VERT_COLL = "test_vertex" +_EDGE_COLL = "test_edge" +# See the docker-compose.yaml file in the root of this repo +# See the mock auth endpoints in relation_engine_server/test/mock_auth/*.json +_TOK_ADMIN = "admin_token" +_TOK_USER = "non_admin_token" +_TOK_INVALID = "invalid_token" + + +class TestREClientIntegration(unittest.TestCase): + """Integration tests for the REClient package.""" + + @classmethod + def setUpClass(cls): + cls.client = REClient(_API_URL, _TOK_ADMIN) + + def test_admin_query_ok(self): + _id = self._save_test_vert() + bind_vars = {"id": _id} + query = f"FOR vert IN {_VERT_COLL} FILTER vert._key == @id RETURN vert" + result = self.client.admin_query(query, bind_vars) + 
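# The RE API response body includes "count" (the number of docs returned) and "results" (the docs themselves)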
self.assertEqual(result["count"], 1) + self.assertEqual(result["results"][0]["_key"], _id) + + def test_admin_query_empty_auth(self): + client2 = REClient(_API_URL) + query = f"FOR vert IN {_VERT_COLL} FILTER vert._key == @id RETURN vert" + with self.assertRaises(RERequestError) as ctx: + client2.admin_query(query, {"id": "xyz"}) + self.assertEqual(ctx.exception.resp.status_code, 400) + # Mostly make sure that the __str__ method does not throw any errs + self.assertTrue("Missing header: Authorization" in str(ctx.exception)) + + def test_admin_query_invalid_auth(self): + client2 = REClient(_API_URL, "xyz") + query = f"FOR vert IN {_VERT_COLL} FILTER vert._key == @id RETURN vert" + with self.assertRaises(RERequestError) as ctx: + client2.admin_query(query, {"id": "xyz"}) + self.assertEqual(ctx.exception.resp.status_code, 403) + # Mostly make sure that the __str__ method does not throw any errs + self.assertTrue("Unauthorized" in str(ctx.exception)) + + def test_admin_empty_query(self): + bind_vars = {"id": "xyz"} + with self.assertRaises(RERequestError) as ctx: + self.client.admin_query("", bind_vars) + self.assertEqual(ctx.exception.resp.status_code, 400) + # Mostly make sure that the __str__ method does not throw any errs + self.assertTrue("Response:" in str(ctx.exception)) + + def test_admin_missing_param(self): + query = f"FOR vert IN {_VERT_COLL} FILTER vert._key == @id RETURN vert" + with self.assertRaises(RERequestError) as ctx: + self.client.admin_query(query, bind_vars={}) + self.assertEqual(ctx.exception.resp.status_code, 400) + # Mostly make sure that the __str__ method does not throw any errs + self.assertTrue("Response:" in str(ctx.exception)) + + def test_admin_raise_not_found(self): + query = f"FOR vert IN {_VERT_COLL} FILTER vert._key == @id RETURN vert" + _id = str(uuid4()) + bind_vars = {"id": _id} + with self.assertRaises(RENotFound) as ctx: + self.client.admin_query(query, bind_vars, raise_not_found=True) + self.assertTrue(_id in ctx.exception.req_body) + # Mostly make sure that the __str__ method does not throw any errs + self.assertTrue("Request body:" in str(ctx.exception)) + + def test_admin_invalid_args(self): + # No params + with self.assertRaises(TypeError): + self.client.admin_query() + # Wrong type for query + with self.assertRaises(TypeError): + self.client.admin_query(123) + # Wrong type for bind_vars + with self.assertRaises(TypeError): + self.client.admin_query("", 123) + + def test_stored_query_ok(self): + _id = self._save_test_vert() + bind_vars = {"key": _id} + qname = "fetch_test_vertex" + result = self.client.stored_query(qname, bind_vars) + self.assertEqual(result["count"], 1) + self.assertEqual(result["results"][0]["_key"], _id) + + def test_stored_query_invalid_args(self): + with self.assertRaises(TypeError): + self.client.stored_query() + with self.assertRaises(TypeError): + self.client.stored_query(123, 123) + with self.assertRaises(TypeError): + self.client.stored_query("") + + def test_stored_query_unknown_query(self): + qname = "xyz123" + with self.assertRaises(RERequestError) as ctx: + self.client.admin_query(qname, bind_vars={"key": 0}) + self.assertEqual(ctx.exception.resp.status_code, 400) + # Mostly make sure that the __str__ method does not throw any errs + self.assertTrue("Response:" in str(ctx.exception)) + + def test_stored_query_missing_bind_vars(self): + qname = "fetch_test_vertex" + with self.assertRaises(RERequestError) as ctx: + self.client.admin_query(qname, bind_vars={"x": "y"}) + self.assertEqual(ctx.exception.resp.status_code, 
400) + # Mostly make sure that the __str__ method does not throw any errs + self.assertTrue("Response:" in str(ctx.exception)) + + def test_stored_query_raise_not_found(self): + _id = str(uuid4()) + bind_vars = {"key": _id} + qname = "fetch_test_vertex" + with self.assertRaises(RENotFound) as ctx: + self.client.stored_query(qname, bind_vars, raise_not_found=True) + self.assertTrue(_id in ctx.exception.req_body) + # Mostly make sure that the __str__ method does not throw any errs + self.assertTrue("Request body:" in str(ctx.exception)) + + def test_save_docs_ok(self): + _id = str(uuid4()) + docs = [{"_key": _id}] + results = self.client.save_docs(coll=_VERT_COLL, docs=docs) + self.assertEqual(results["created"], 1) + self.assertFalse(results["error"]) + self.assertEqual(results["errors"], 0) + self.assertEqual(results["ignored"], 0) + self.assertEqual(results["updated"], 0) + + def test_save_docs_empty_auth(self): + client2 = REClient(_API_URL) + docs = [{"_key": "xyz"}] + with self.assertRaises(RERequestError) as ctx: + client2.save_docs(coll=_VERT_COLL, docs=docs) + self.assertEqual(ctx.exception.resp.status_code, 400) + # Mostly make sure that the __str__ method does not throw any errs + self.assertTrue("Missing header: Authorization" in str(ctx.exception)) + + def test_save_docs_invalid_auth(self): + client2 = REClient(_API_URL, "xyz") + docs = [{"_key": "xyz"}] + with self.assertRaises(RERequestError) as ctx: + client2.save_docs(coll=_VERT_COLL, docs=docs) + self.assertEqual(ctx.exception.resp.status_code, 403) + # Mostly make sure that the __str__ method does not throw any errs + self.assertTrue("Unauthorized" in str(ctx.exception)) + + def test_save_docs_invalid_args(self): + with self.assertRaises(TypeError): + self.client.save_docs() + with self.assertRaises(TypeError): + self.client.save_docs(123, 456) + # Empty docs list + with self.assertRaises(TypeError): + self.client.save_docs(_VERT_COLL, []) + + def test_save_docs_unknown_coll(self): + with self.assertRaises(RERequestError) as ctx: + self.client.save_docs("xyz123", [{"_key": 0}]) + self.assertEqual(ctx.exception.resp.status_code, 404) + self.assertEqual( + ctx.exception.resp.json(), + { + "error": { + "message": "Not found", + "details": "Collection 'xyz123' does not exist.", + "name": "xyz123", + } + }, + ) + # Mostly make sure that the __str__ method does not throw any errs + self.assertTrue("Response:" in str(ctx.exception)) + + def test_save_docs_invalid_docs(self): + with self.assertRaises(RERequestError) as ctx: + self.client.save_docs(_VERT_COLL, [{"hi": 0}]) + self.assertEqual(ctx.exception.resp.status_code, 400) + # Mostly make sure that the __str__ method does not throw any errs + self.assertTrue("Response:" in str(ctx.exception)) + + # -- Test helpers + + def _save_test_vert(self): + """Create a test vertex with a random & unique id.""" + _id = str(uuid4()) + docs = [{"_key": _id}] + results = self.client.save_docs(coll=_VERT_COLL, docs=docs) + if results["error"]: + raise RuntimeError(results) + return _id diff --git a/dev-requirements.txt b/dev-requirements.txt new file mode 100644 index 00000000..618eea4e --- /dev/null +++ b/dev-requirements.txt @@ -0,0 +1,10 @@ +mypy>=0.630 +bandit==1.5.1 +mccabe==0.6.1 +flake8==3.5.0 +grequests==0.3.0 +coverage==5.2.1 +typed-ast>=1.4.0 +black==22.3.0 +pytest==6.2.5 +jinja2==3.0.3 diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 00000000..e1739190 --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,52 @@ +version: '3' + +# This docker-compose is 
for developer convenience, not for running in production. + +services: + + # For running the Flask server and tests + re_api: + build: + context: . + dockerfile: Dockerfile + args: + DEVELOPMENT: 1 + ports: + - "127.0.0.1:5000:5000" + volumes: + - ${PWD}:/app + depends_on: + - auth + - workspace + - arangodb + environment: + - WORKERS=2 + - DEVELOPMENT=1 + - FLASK_ENV=development + - FLASK_DEBUG=1 + - KBASE_AUTH_URL=http://auth:5000 + - KBASE_WORKSPACE_URL=http://workspace:5000 + - PYTHONUNBUFFERED=true + - SPEC_RELEASE_PATH=/app/relation_engine_server/test/spec_release/spec.tar.gz + - DB_URL=http://arangodb:8529 + - DB_USER=root + - RE_API_URL=http://127.0.0.1:5000 + + # A mock kbase auth server (see relation_engine_server/test/mock_auth/endpoints.json) + auth: + image: mockservices/mock_json_service + volumes: + - ${PWD}/relation_engine_server/test/mock_auth:/config + + # Mock workspace server (see relation_engine_server/test/mock_workspace/endpoints.json) + workspace: + image: mockservices/mock_json_service + volumes: + - ${PWD}/relation_engine_server/test/mock_workspace:/config + + # Arangodb server in cluster mode + arangodb: + image: arangodb:3.5 + ports: + - "127.0.0.1:8529:8529" + command: sh -c "arangodb --starter.local" diff --git a/importers/ONTOLOGY_LOAD.md b/importers/ONTOLOGY_LOAD.md new file mode 100644 index 00000000..4a5cbc57 --- /dev/null +++ b/importers/ONTOLOGY_LOAD.md @@ -0,0 +1,38 @@ +# Loading ontology procedure + +#### Downloading the obo file +* E.g. the PO obo file was downloaded from http://purl.obolibrary.org/obo/po.obo + +#### Converting obo to obograph +* Cloning https://github.com/ontodev/robot +* Running it to do the conversion, e.g. + +```sh +docker run -v `pwd`:`pwd` --user $(id -u) -w `pwd` robot convert \ +--input ~/tmp/gaz.obo --output ~/tmp/gaz.json +``` + +#### Running scripts/prepare_ontology.py to generate yaml files for the ontology +```sh +python3 scripts/prepare_ontology.py scripts/test/data/data_sources.json po_ontology +``` + +#### Preparing a PR with the generated ontology yaml files and requesting merge and deployment +* Corresponding collections should be created in ArangoDB + +#### Preparing relation_engine_importers +* Cloning https://github.com/kbase/relation_engine_importers +* Setting up an SSH tunnel to ArangoDB + +#### Loading with obograph_delta_loader.py +```sh +relation_engine/ontologies/obograph/loaders/obograph_delta_loader.py \ +--file ~/package/plant-ontology/po.json --onto-id-prefix PO \ +--arango-url http://127.0.0.1:48000/ --database luj_test --load-namespace po_ontology \ +--node-collection PO_terms --edge-collection PO_edges --merge-edge-collection PO_merges \ +--load-version release_999 --load-registry-collection delta_load_registry \ +--load-timestamp $(( $(date '+%s%N') / 1000000)) --release-timestamp $(( $(date '+%s%N') / 1000000)) \ +--user $USER --pwd-file passfile --graph-id "http://purl.obolibrary.org/obo/po.owl" +``` +* The passfile contains the user's ArangoDB password. +* The `--graph-id` option is required if there is more than one graph in the obograph file. diff --git a/importers/README.md b/importers/README.md new file mode 100644 index 00000000..e54bb379 --- /dev/null +++ b/importers/README.md @@ -0,0 +1,19 @@ +# RE Importers + +This directory holds Python modules that import data into ArangoDB. + +## Running importers + +Configure importers through environment variables with the `RES_` prefix (which stands for Relation Engine Spec). 
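As a rough sketch of how this `RES_`-prefixed loading can work (the real helper is `importers.utils.config.load_from_env`, used by the DJORNL parser below; the defaults and error handling shown here are illustrative assumptions, not the actual implementation):

```py
import os


def load_from_env(extra_required=None, prefix="RES_"):
    """Collect RES_-prefixed env vars into a config dict, stripping the prefix."""
    conf = {
        key[len(prefix):]: value
        for key, value in os.environ.items()
        if key.startswith(prefix)
    }
    # Hypothetical test defaults for the two global vars listed below
    conf.setdefault("API_URL", "http://localhost:5000")
    conf.setdefault("AUTH_TOKEN", "admin_token")
    missing = [name for name in (extra_required or []) if name not in conf]
    if missing:
        raise RuntimeError(
            "Missing required env vars: "
            + ", ".join(prefix + name for name in missing)
        )
    return conf
```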
+ +Global env vars: + +* `RES_AUTH_TOKEN` - auth token to use when making requests to RE API - defaults to test value +* `RES_API_URL` - url to use for the RE API - defaults to test value + +### djornl + +```sh +RES_ROOT_DATA_PATH=/path/to/djornl_data \ +python -m importers.djornl.parser +``` diff --git a/importers/__init__.py b/importers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/importers/djornl/__init__.py b/importers/djornl/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/importers/djornl/parser.py b/importers/djornl/parser.py new file mode 100644 index 00000000..6f4de607 --- /dev/null +++ b/importers/djornl/parser.py @@ -0,0 +1,845 @@ +""" +Loads the Dan Jacobson/ORNL group's gene and phenotype network data into arangodb. + +Running this requires a set of source files provided by the ORNL group. + +The parser sets up its configuration, including the files it will parse, from the RES_ROOT_DATA_PATH +environment variable once per instantiation. To parse a set of files from a different directory, +create a new parser with RES_ROOT_DATA_PATH set appropriately. + +Sample usage: + +from the command line: + +# load files from /path/to/data/dir +RES_ROOT_DATA_PATH=/path/to/data/dir python -m importers.djornl.parser + +""" +import argparse +import csv +import json +import os +import requests +import yaml + +import importers.utils.config as config +from relation_engine_server.utils.json_validation import ( + run_validator, + get_schema_validator, +) + + +class DJORNL_Parser(object): + def __init__(self): + + # dict of nodes, indexed by node ID (node1 and node2 from the file) + self.node_ix = {} + # dict of edges, indexed by node1__node2__edge_type + self.edge_ix = {} + + # the order in which to parse the different data files + self.parse_order = ["edges", "nodes", "clusters"] + + def config(self, value): + if not hasattr(self, "_config"): + self._configure() + + if value not in self._config: + raise KeyError(f"No such config value: {value}") + + return self._config[value] + + def _configure(self): + + configuration = config.load_from_env(extra_required=["ROOT_DATA_PATH"]) + + # Collection name config + configuration["node_name"] = "djornl_node" + configuration["edge_name"] = "djornl_edge" + + # fetch the manifest and make sure all the files listed actually exist + manifest = self._get_manifest(configuration) + for type in ["node", "edge", "cluster"]: + configuration[type + "_files"] = [] + + error_list = [] + for file in manifest["file_list"]: + file_path = os.path.join(configuration["ROOT_DATA_PATH"], file["path"]) + + if not os.path.exists(file_path): + error_list.append(f"{file_path}: file does not exist") + continue + + if not os.path.isfile(file_path): + error_list.append(f"{file_path}: not a file") + continue + + # add the file to the appropriate list + file["file_path"] = file_path + configuration[file["data_type"] + "_files"].append(file) + + if error_list: + raise RuntimeError("\n".join(error_list)) + + self._config = configuration + return self._config + + def _get_manifest_schema_file(self): + + return os.path.join(self._get_dataset_schema_dir(), "manifest.schema.json") + + def _get_dataset_schema_dir(self): + + if not hasattr(self, "_dataset_schema_dir"): + dir_path = os.path.dirname(os.path.realpath(__file__)) + self._dataset_schema_dir = os.path.join( + dir_path, "../", "../", "spec", "datasets", "djornl" + ) + + return self._dataset_schema_dir + + def _get_manifest(self, configuration): + """ + Read the manifest file, which 
contains path and file type info, and validate it. + The manifest is expected to be at ROOT_DATA_PATH/manifest.yaml + """ + + schema_file = self._get_manifest_schema_file() + + # load the manifest and validate it against the schema + manifest_file = os.path.join(configuration["ROOT_DATA_PATH"], "manifest.yaml") + + try: + with open(manifest_file) as fd: + manifest = yaml.safe_load(fd) + except FileNotFoundError: + raise RuntimeError( + f"No manifest file found at {manifest_file}.\n" + "Please ensure that you have created a manifest that lists the files " + "in the release" + ) + + try: + validated_manifest = run_validator(schema_file=schema_file, data=manifest) + except Exception as err: + print(err) + raise RuntimeError( + "The manifest file failed validation. Please recheck the file and try again." + ) + + return validated_manifest + + def _get_file_reader(self, fd, file): + """Given a dict containing file information, instantiate the correct type of parser""" + + delimiter = "\t" + if file.get("file_format", "").lower() == "csv" or file[ + "path" + ].lower().endswith(".csv"): + delimiter = "," + return csv.reader(fd, delimiter=delimiter) + + def parser_gen(self, file): + """generator function to parse a file""" + expected_col_count = 0 + with open(file["file_path"], newline="") as fd: + csv_reader = self._get_file_reader(fd, file) + line_no = 0 + for row in csv_reader: + line_no += 1 + if not len(row) or len(row[0]) and row[0][0] == "#": + # comment / metadata + continue + + cols = [c.strip() for c in row] + + if len(cols) == expected_col_count: + yield (line_no, cols, None) + continue + + # if we didn't get the expected number of cols: + if expected_col_count == 0: + # this is the header row; set up the expected column count + expected_col_count = len(cols) + yield (line_no, [c.lower() for c in cols], None) + continue + + # otherwise, this row does not have the correct number of columns + col_count = len(cols) + msg = f"expected {expected_col_count} cols, found {col_count}" + yield (line_no, None, f"{file['path']} line {line_no}: {msg}") + + def check_headers(self, headers, validator=None): + """ + Ensure that the file headers contain required columns for the data type. Checks the schema + in the validator to ensure that all required fields are present in the headers. + + :param headers: (list) list containing headers + + :param validator: (obj) validator object, with the appropriate schema loaded + + :return header_errs: (dict) dict of header errors: + 'missing': required headers that are missing from the input + 'invalid': headers that should not be in the input + 'duplicate': duplicated headers (data would be overwritten) + If the list of headers supplied is valid--i.e. 
it
+                              contains all the fields marked as required in the validator
+                              schema--or no validator has been supplied, the method
+                              returns an empty dict
+        """
+
+        if validator is None:
+            return {}
+
+        header_errs = {}
+
+        all_headers = {}
+        # ensure we don't have any duplicate headers
+        for h in headers:
+            if h in all_headers:
+                all_headers[h] += 1
+            else:
+                all_headers[h] = 1
+
+        duplicate_headers = [h for h in all_headers.keys() if all_headers[h] != 1]
+        if duplicate_headers:
+            header_errs["duplicate"] = duplicate_headers
+
+        # check that each required header in the schema is present in headers
+        required_props = validator.schema["required"]
+        missing_headers = [i for i in required_props if i not in headers]
+        if missing_headers:
+            header_errs["missing"] = missing_headers
+
+        if not validator.schema.get("additionalProperties", True):
+            all_props = validator.schema["properties"].keys()
+            extra_headers = [i for i in headers if i not in all_props]
+            if extra_headers:
+                header_errs["invalid"] = extra_headers
+
+        return header_errs
+
+    def remap_object(self, raw_data, remap_functions):
+        """
+        Given a dict, raw_data, create a new dict, remapped_data, using the functions in the
+        dictionary `remap_functions`.
+
+        :param raw_data: (dict) input data for remapping
+
+        :param remap_functions: (dict) mapping of output param names to functions
+
+                         Each function should take the raw_data object as an
+                         argument and return the value for the output parameter.
+                         For parameters that can be copied over to the output
+                         object without modification, set the value to `None`
+                         instead of a function.
+
+        :return remapped_data: (dict) the remapped data!
+        """
+        remapped_data = {}
+        for (key, function) in remap_functions.items():
+            # these keys get copied over unchanged to the new object if they exist in the input obj
+            if function is None:
+                if key in raw_data:
+                    remapped_data[key] = raw_data[key]
+            else:
+                remapped_data[key] = function(raw_data)
+
+        return remapped_data
+
+    def process_file(self, file, remap_fn, store_fn, err_list, validator=None):
+        """process an input file to generate a dataset and possibly an error list
+
+        Each valid line in the file is turned into a dictionary using the header row, and then
+        validated against the appropriate csv validation schema in spec/datasets/djornl/
+        (e.g. csv_edge.yaml for edge files).
+        If that completes successfully, it is transformed using the functions in the dictionary
+        `remap_fn`, checked for uniqueness against existing data, and saved to a dictionary. Once
+        all files of a certain type have been processed, results can be saved to Arango.
+
+        Any errors that occur during parsing and processing are accumulated in `err_list`.
+ + :param file: (dict) file data + :param remap_fn: (dict) mapping of output param names to functions + each function should take the row data object as input and + return the value for the output parameter + + :param store_fn: (func) function to store the results of the remapping + + :param err_list: (list) error list + + :param validator: (Validator) jsonschema validator object + + """ + print("Parsing " + file["data_type"] + " file " + file["file_path"]) + file_parser = self.parser_gen(file) + + def add_error(error): + err_list.append(error) + + try: + (line_no, cols, err_str) = next(file_parser) + except StopIteration: + # no valid lines found in the file + add_error(f"{file['path']}: no header line found") + return + + header_errors = self.check_headers(cols, validator) + if header_errors.keys(): + err_str = { + "duplicate": "duplicate", + "missing": "missing required", + "invalid": "invalid additional", + } + for err_type in ["missing", "invalid", "duplicate"]: + if err_type in header_errors: + add_error( + f"{file['path']}: {err_str[err_type]} headers: " + + ", ".join(sorted(header_errors[err_type])) + ) + return + + headers = cols + n_stored = 0 + for (line_no, cols, err_str) in file_parser: + # mismatch in number of cols + if cols is None: + add_error(err_str) + continue + + # merge headers with cols to create an object + row_object = dict(zip(headers, cols)) + + if validator is not None: + # validate the object + if not validator.is_valid(row_object): + for e in sorted(validator.iter_errors(row_object), key=str): + add_error(f"{file['path']} line {line_no}: " + e.message) + continue + + try: + # transform it using the remap_functions + datum = self.remap_object(row_object, remap_fn) + except Exception as err: + err_type = type(err) + add_error( + f"{file['path']} line {line_no}: error remapping data: {err_type} {err}" + ) + continue + + # and store it + storage_error = store_fn(datum) + if storage_error is None: + n_stored += 1 + else: + add_error(f"{file['path']} line {line_no}: " + storage_error) + + if not n_stored: + add_error(f"{file['path']}: no valid data found") + + def store_parsed_edge_data(self, datum): + """ + store node and edge data in the node (node_ix) and edge (edge_ix) indexes respectively + Nodes are indexed by the '_key' attribute. + Parsed edge data only contains node '_key' values. + + Edges are indexed by the unique combination of the two node IDs, the edge type, and whether + or not it is a directed edge. It is assumed that if there is more than one score for a given + combination of node IDs and edge type, the datum is erroneous. 
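+
+        For example (taken from the test data), an undirected high-throughput PPI
+        edge between AT1G01010 and AT1G01030 is indexed as
+        AT1G01010__AT1G01030__protein-protein-interaction_high-throughput_AraNet_v2__False.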
+ """ + + # there should only be one value for each node<->node edge of a given type, + # so use these values as an index key + if datum["directed"]: + property_array = [ + datum["node1"], + datum["node2"], + datum["edge_type"], + str(datum["directed"]), + ] + else: + # sort undirected nodes to ensure no dupes slip through + property_array = [ + *sorted([datum["node1"], datum["node2"]]), + datum["edge_type"], + str(datum["directed"]), + ] + + edge_key = "__".join(property_array) + + if edge_key in self.edge_ix: + # duplicate lines can be ignored + if datum["score"] == self.edge_ix[edge_key]["score"]: + return None + # report non-matching data + return f"duplicate data for edge {edge_key}" + + # create a unique key for the DB for this record + datum["_key"] = "__".join( + [ + str(datum[_]) + for _ in ["node1", "node2", "edge_type", "directed", "score"] + ] + ) + + # keep track of the nodes mentioned in this edge set + for node_n in ["1", "2"]: + _node_key = datum[f"node{node_n}"] + if _node_key not in self.node_ix: + self.node_ix[_node_key] = {"_key": _node_key} + del datum[f"node{node_n}"] + + self.edge_ix[edge_key] = datum + return None + + def load_edges(self): + """Load edge data from the set of edge files""" + + # error accumulator + err_list = [] + + schema_file = os.path.join(self._get_dataset_schema_dir(), "csv_edge.yaml") + validator = get_schema_validator(schema_file=schema_file) + + node_name = self.config("node_name") + # these functions remap the values in the columns of the input file to + # appropriate values to go into Arango + # note that the functions that assume the presence of a certain key in the input + # can do so because that key is in a 'required' property in the CSV spec file + remap_functions = { + "node1": None, # this will be deleted in the 'store' step + "node2": None, # as will this + "_from": lambda row: node_name + "/" + row["node1"], + "_to": lambda row: node_name + "/" + row["node2"], + "score": lambda row: float(row["score"]), + "edge_type": None, + "directed": lambda row: True if row.get("directed", "") == "1" else False, + } + + for file in self.config("edge_files"): + self.process_file( + file=file, + remap_fn=remap_functions, + store_fn=self.store_parsed_edge_data, + err_list=err_list, + validator=validator, + ) + + return { + "nodes": self.node_ix.values(), + "edges": self.edge_ix.values(), + "err_list": err_list, + } + + def _try_node_merge(self, existing_node, new_node, path=[]): + """ + Try to merge two data structures. These should be JSON compatible, so they will be limited + to lists, dicts, and scalar data types. + + This method tests the keys/values of the two dict objects provided and depending on the type + of the values, merges them or records an error: + + - scalar (strings, ints, floats, etc.): record an error on mismatches + - list: merge list contents, removing duplicates and preserving order + - dict: run _try_node_merge recursively on it + - mismatch of data types between the two nodes: record an error + + :param existing_node: (dict) existing node + :param new_node: (dict) node data to be merged into it + :param path: (list) path to this node in a larger data structure + + :return (merge, err_list): (tuple) + If successful, the method returns the merged dict and [] + If there were errors, err_list will be populated with the + keys/values where mismatches occurred. 
+ """ + + # merge the dictionaries + merge = {**existing_node, **new_node} + + # find the shared keys -- keys in both existing and new nodes where the values differ + shared_keys = [ + i + for i in new_node + if i in existing_node and new_node[i] != existing_node[i] + ] + + # if there were no shared keys, return the merged list + if not shared_keys: + return (merge, []) + + # otherwise, we need to remove the shared keys and examine them individually + for k in shared_keys: + del merge[k] + + err_list = [] + # go through the dict keys, checking their type + for k in sorted(shared_keys): + value_type = type(existing_node[k]) + + # do the types match? If not, these values cannot be merged + if type(new_node[k]) != value_type: + err_list.append("/".join(path + [k])) + continue + + if value_type == list: + # merge lists, preserving order. Data type agnostic. + merge[k] = [] + for i in existing_node[k] + new_node[k]: + if i not in merge[k]: + merge[k].append(i) + continue + + elif value_type == dict: + # recursively check dict data using _try_node_merge + (k_merged, k_errs) = self._try_node_merge( + existing_node[k], new_node[k], path + [k] + ) + if k_errs: + err_list = err_list + k_errs + continue + merge[k] = k_merged + + else: + # this is a scalar (string, number, etc.) so it can't be merged + err_list.append("/".join(path + [k])) + + # at some point, it may be useful to examine these errors in more detail + if err_list: + merge = None + return (merge, err_list) + + def store_parsed_node_data(self, datum): + """ + store node data in the node index, node_ix, indexed by the node _key or gid + + If a node is already present, new data is checked for conflicts with existing data + """ + node_ix = datum.get("gid", datum.get("_key")) + if not node_ix: + return + # check whether we have this node already + if node_ix in self.node_ix: + # identical data: ignore it + if datum == self.node_ix[node_ix]: + return None + + # try merging the data + (merged, err_list) = self._try_node_merge(self.node_ix[node_ix], datum) + if err_list: + return "duplicate data for node " + node_ix + datum = merged + + self.node_ix[node_ix] = datum + + def load_nodes(self): + """Load node metadata""" + + err_list = [] + + schema_file = os.path.join( + self._get_dataset_schema_dir(), "{file_format}_node.yaml" + ) + + def _get_node_validator(file_format): + return get_schema_validator( + schema_file=schema_file.format(file_format=file_format) + ) + + def go_terms(row): + if "go_terms" in row and len(row["go_terms"]): + return [c.strip() for c in row["go_terms"].split(",")] + return [] + + remap_functions = { + # these pass straight through + "gene_full_name": None, + "gene_model_type": None, + "gene_symbol": None, + "go_description": None, + "mapman_bin": None, + "mapman_description": None, + "mapman_name": None, + "node_type": None, + "pheno_aragwas_id": None, + "pheno_description": None, + "pheno_pto_description": None, + "pheno_pto_name": None, + "pheno_reference": None, + "tair_computational_description": None, + "tair_curator_summary": None, + "tair_short_description": None, + "transcript": None, + "user_notes": None, + # rename + "_key": lambda row: row["gid"] if "gid" in row else row["node_id"], + # see functions above + "go_terms": go_terms, + } + + for file in self.config("node_files"): + self.process_file( + file=file, + remap_fn=remap_functions, + store_fn=self.store_parsed_node_data, + err_list=err_list, + validator=_get_node_validator(file_format=file["file_format"]), + ) + + return { + "nodes": 
self.node_ix.values(), + "err_list": err_list, + } + + def store_parsed_cluster_data(self, datum): + """ + store remapped cluster data + + The input is in the form + + {'cluster_id': cluster_id, 'node_ids': [node_id_1, node_id_2, node_id_3, ...]} + + Cluster IDs are stored in the 'clusters' node attribute as a list, with new IDs added to + (rather than replacing) existing IDs + """ + cluster_id = datum["cluster_id"] + # gather a list of cluster IDs for each node + for node_id in datum["node_ids"]: + if node_id not in self.node_ix: + self.node_ix[node_id] = {"_key": node_id, "clusters": [cluster_id]} + elif "clusters" not in self.node_ix[node_id]: + self.node_ix[node_id]["clusters"] = [cluster_id] + elif cluster_id not in self.node_ix[node_id]["clusters"]: + self.node_ix[node_id]["clusters"].append(cluster_id) + return None + + def load_clusters(self): + """Annotate genes with cluster ID fields.""" + + err_list = [] + + schema_file = os.path.join(self._get_dataset_schema_dir(), "csv_cluster.yaml") + validator = get_schema_validator(schema_file=schema_file) + + # these functions remap the values in the columns of the input file to + # appropriate values to go into Arango + # the 'cluster_id' remap function is assigned below on a per-file basis + remap_functions = { + "node_ids": lambda row: [n.strip() for n in row["node_ids"].split(",")] + } + + for file in self.config("cluster_files"): + prefix = file["cluster_prefix"] + remap_functions["cluster_id"] = ( + lambda row: prefix + ":" + row["cluster_id"].replace("Cluster", "") + ) + + self.process_file( + file=file, + remap_fn=remap_functions, + store_fn=self.store_parsed_cluster_data, + err_list=err_list, + validator=validator, + ) + + return { + "nodes": list(self.node_ix.values()), + "err_list": err_list, + } + + def save_dataset(self, dataset=None): + + if dataset is None: + dataset = { + "nodes": list(self.node_ix.values()), + "edges": list(self.edge_ix.values()), + } + + if "nodes" in dataset and len(dataset["nodes"]) > 0: + self.save_docs(self.config("node_name"), dataset["nodes"]) + + if "edges" in dataset and len(dataset["edges"]) > 0: + self.save_docs(self.config("edge_name"), dataset["edges"]) + + def save_docs(self, coll_name, docs, on_dupe="update"): + + resp = requests.put( + self.config("API_URL") + "/api/v1/documents", + params={"collection": coll_name, "on_duplicate": on_dupe}, + headers={"Authorization": self.config("AUTH_TOKEN")}, + data="\n".join(json.dumps(d) for d in docs), + ) + if not resp.ok: + raise RuntimeError(resp.text) + + print(f"Saved docs to collection {coll_name}!") + print(resp.text) + print("=" * 80) + return resp + + def load_data(self, dry_run=False): + all_errs = [] + method_ix = { + "clusters": self.load_clusters, + "edges": self.load_edges, + "nodes": self.load_nodes, + } + for data_type in self.parse_order: + output = method_ix[data_type]() + if output["err_list"]: + all_errs = all_errs + output["err_list"] + + # if there are no errors then save the dataset unless this is a dry run + if len(all_errs) == 0 and not dry_run: + self.save_dataset() + + # report stats on the data that has been gathered + return self.summarise_dataset(all_errs) + + def summarise_dataset(self, errs): + """summarise the data that has been loaded""" + + # go through the node index, checking for nodes that only have one attribute ('_key') or + # were loaded from the clusters files, with only '_key' and 'clusters' attributes + + node_type_ix = {"__NO_TYPE__": 0} + node_data = {"key_only": [], "cluster": [], "full": []} + + for node 
in self.node_ix.values():
+            if len(node.keys()) == 2 and "clusters" in node:
+                node_data["cluster"].append(node)
+            elif len(node.keys()) == 1:
+                node_data["key_only"].append(node)
+            else:
+                node_data["full"].append(node)
+
+            if "node_type" in node:
+                if node["node_type"] in node_type_ix:
+                    node_type_ix[node["node_type"]] += 1
+                else:
+                    node_type_ix[node["node_type"]] = 1
+            else:
+                node_type_ix["__NO_TYPE__"] += 1
+
+        nodes_in_edge_ix = {}
+        edge_type_ix = {}
+        for edge in self.edge_ix.values():
+            nodes_in_edge_ix[edge["_from"]] = 1
+            nodes_in_edge_ix[edge["_to"]] = 1
+            if edge["edge_type"] in edge_type_ix:
+                edge_type_ix[edge["edge_type"]] += 1
+            else:
+                edge_type_ix[edge["edge_type"]] = 1
+
+        return {
+            "nodes_total": len(self.node_ix.keys()),
+            "edges_total": len(self.edge_ix.keys()),
+            "nodes_in_edge": len(nodes_in_edge_ix.keys()),
+            "node_type_count": node_type_ix,
+            "edge_type_count": edge_type_ix,
+            "node_data_available": {
+                "key_only": len(node_data["key_only"]),
+                "cluster": len(node_data["cluster"]),
+                "full": len(node_data["full"]),
+            },
+            "errors_total": len(errs),
+            "errors": errs,
+        }
+
+
+def format_summary(summary, output):
+    if output == "json":
+        return json.dumps(summary)
+    node_type_counts = list(summary["node_type_count"].values())
+    edge_type_counts = list(summary["edge_type_count"].values())
+    values = (
+        [
+            summary["nodes_total"],
+            summary["edges_total"],
+            summary["nodes_in_edge"],
+            summary["node_data_available"]["key_only"],
+            summary["node_data_available"]["cluster"],
+            summary["node_data_available"]["full"],
+            summary.get("errors_total"),
+        ]
+        + node_type_counts
+        + edge_type_counts
+    )
+    value_width = max([len(str(value)) for value in values])
+    node_type_names = dict(__NO_TYPE__="No type")
+    node_types = "\n".join(
+        [
+            f"{count:{value_width}} {node_type_names.get(ntype, ntype)}"
+            for ntype, count in summary["node_type_count"].items()
+        ]
+    )
+    edge_type_names = dict()
+    edge_types = "\n".join(
+        [
+            f"{count:{value_width}} {edge_type_names.get(etype, etype)}"
+            for etype, count in summary["edge_type_count"].items()
+        ]
+    )
+    text_summary = f"""
+{summary["nodes_total"]:{value_width}} Total nodes
+{summary["edges_total"]:{value_width}} Total edges
+{summary["nodes_in_edge"]:{value_width}} Nodes in edge
+---
+Node Types
+{node_types:{value_width}}
+---
+Edge Types
+{edge_types:{value_width}}
+---
+Node data available
+{summary["node_data_available"]["key_only"]:{value_width}} Key only
+{summary["node_data_available"]["cluster"]:{value_width}} Cluster
+{summary["node_data_available"]["full"]:{value_width}} Full
+---
+{summary.get("errors_total"):{value_width}} Errors
+"""
+    if summary.get("errors_total") > 0:
+        text_summary = text_summary + "\n" + "\n".join(summary.get("errors"))
+    return text_summary
+
+
+def main():
+    argparser = argparse.ArgumentParser(description="Load DJORNL data")
+    argparser.add_argument(
+        "--dry-run",
+        dest="dry",
+        action="store_true",
+        help="Perform all actions of the parser, except loading the data.",
+    )
+    argparser.add_argument(
+        "--output",
+        default="text",
+        help="Specify the format of any output generated. 
(text or json)", + ) + args = argparser.parse_args() + parser = DJORNL_Parser() + summary = dict() + try: + summary = parser.load_data(dry_run=args.dry) + except Exception as err: + print("Unhandled exception", err) + exit(1) + errors = summary.get("errors") + if summary: + print(format_summary(summary, args.output)) + if errors: + error_output = f"Aborted with {len(errors)} errors.\n" + raise RuntimeError(error_output) + + +if __name__ == "__main__": + main() diff --git a/importers/test/__init__.py b/importers/test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/importers/test/test_djornl_parser.py b/importers/test/test_djornl_parser.py new file mode 100644 index 00000000..4c119e69 --- /dev/null +++ b/importers/test/test_djornl_parser.py @@ -0,0 +1,472 @@ +""" +Tests for the DJORNL Parser + +At the present time, this just ensures that the files are parsed correctly; +it does not check data loading into the db. + +These tests run within the re_api docker image. +""" +import json +import unittest +import os + +from importers.djornl.parser import DJORNL_Parser +from spec.test.helpers import modified_environ + +_TEST_DIR = "/app/spec/test" + + +class Test_DJORNL_Parser(unittest.TestCase): + @classmethod + def setUpClass(cls): + # import the results file + results_file = os.path.join(_TEST_DIR, "djornl", "results.json") + with open(results_file) as fh: + cls.json_data = json.load(fh) + + cls.maxDiff = None + + def init_parser_with_path(self, root_path): + + with modified_environ(RES_ROOT_DATA_PATH=root_path): + parser = DJORNL_Parser() + # ensure that the configuration has been set + parser._configure() + return parser + + def test_errors(self, parser=None, errs={}): + if parser is None: + self.assertTrue(True) + return + + all_errs = [] + for data_type in parser.parse_order: + if data_type not in errs: + continue + + all_errs = all_errs + errs[data_type] + method = f"load_{data_type}" + output = getattr(parser, method)() + with self.subTest(data_type=data_type): + self.assertEqual(output["err_list"], errs[data_type]) + + with self.subTest(data_type="all types"): + # test all errors + summary = parser.load_data(dry_run=True) + err_list = summary["errors"] + self.assertEqual(err_list, all_errs) + + def test_missing_required_env_var(self): + """test that the parser exits with code 1 if the RES_ROOT_DATA_PATH env var is not set""" + with self.assertRaisesRegex( + RuntimeError, "Missing required env var: RES_ROOT_DATA_PATH" + ): + parser = DJORNL_Parser() + parser.load_edges() + + def test_config(self): + """test that the parser raises an error if a config value cannot be found""" + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, "djornl", "test_data") + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + with self.assertRaisesRegex(KeyError, "No such config value: bananas"): + parser.config("bananas") + + def test_load_no_manifest(self): + """test loading when the manifest does not exist""" + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, "djornl", "no_manifest") + err_str = "No manifest file found at " + os.path.join( + RES_ROOT_DATA_PATH, "manifest.yaml" + ) + with self.assertRaisesRegex(RuntimeError, err_str): + self.init_parser_with_path(RES_ROOT_DATA_PATH) + + def test_load_invalid_manifest(self): + """test an invalid manifest file""" + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, "djornl", "invalid_manifest") + err_str = "The manifest file failed validation" + with self.assertRaisesRegex(RuntimeError, err_str): + self.init_parser_with_path(RES_ROOT_DATA_PATH) + + def 
test_load_invalid_file(self): + """test loading when what is supposed to be a file is actually a directory""" + + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, "djornl", "invalid_file") + + # edges: directory, not a file + err_str = os.path.join(RES_ROOT_DATA_PATH, "edges.tsv") + ": not a file" + with self.assertRaisesRegex(RuntimeError, err_str): + self.init_parser_with_path(RES_ROOT_DATA_PATH) + + def test_load_missing_files(self): + """test loading when files cannot be found""" + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, "djornl", "missing_files") + # not found + err_str = ( + os.path.join(RES_ROOT_DATA_PATH, "edges.tsv") + ": file does not exist" + ) + with self.assertRaisesRegex(RuntimeError, err_str): + self.init_parser_with_path(RES_ROOT_DATA_PATH) + + def test_load_empty_files(self): + """test loading files containing no data""" + + # path: test/djornl/empty_files + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, "djornl", "empty_files") + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + + errs = { + # mix of problems + "clusters": [ + "cluster_data/headers_only.tsv: no valid data found", + "cluster_data/no_content.tsv: no header line found", + "cluster_data/comment_only.tsv: no header line found", + ], + # comments only + "edges": ["merged_edges-AMW-060820_AF.tsv: no header line found"], + # header only, no content + "nodes": [ + "aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv: no valid data found" + ], + } + self.test_errors(parser, errs) + + def test_load_missing_headers(self): + """test loading when files lack required headers""" + RES_ROOT_DATA_PATH = os.path.join( + _TEST_DIR, "djornl", "missing_required_headers" + ) + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + + def invalid_err(file_name, header_list): + return f"{file_name}: invalid additional headers: " + ", ".join( + sorted(header_list) + ) + + def missing_err(file_name, header_list): + return f"{file_name}: missing required headers: " + ", ".join( + sorted(header_list) + ) + + def dupe_err(file_name, header_list): + return f"{file_name}: duplicate headers: " + ", ".join(sorted(header_list)) + + errs = { + "clusters": [ + # tuple containing file name and list of invalid column headers in that file + missing_err("I2_named.tsv", ["cluster_id", "node_ids"]), + invalid_err("I2_named.tsv", ["cluster", "node_list"]), + invalid_err("I4_named.tsv", ["other cool stuff"]), + dupe_err("I6_named.tsv", ["node_ids"]), + ], + "edges": [ + missing_err("edges.tsv", ["score"]), + missing_err("hithruput-edges.csv", ["edge_type"]), + ], + "nodes": [ + missing_err("extra_node.csv", ["node_type"]), + invalid_err("extra_node.csv", ["node_types"]), + missing_err("pheno_nodes.csv", ["node_id"]), + invalid_err("pheno_nodes.csv", ["id", "pheno_ref", "usernotes"]), + ], + } + self.test_errors(parser, errs) + + def test_load_invalid_types(self): + """test file format errors""" + + # path: test/djornl/invalid_types + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, "djornl", "invalid_types") + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + + errs = { + "edges": [ + # invalid edge type + r"edges.tsv line 3: 'Same-Old-Stuff' is not valid under any of the given schemas", + # empty to/from + r"edges.tsv line 4: '' does not match '^\\S{2,}.*$'", + r"edges.tsv line 5: '' does not match '^\\S{2,}.*$'", + # empty edge type + r"edges.tsv line 6: '' is not valid under any of the given schemas", + # invalid score + r"edges.tsv line 7: '2.' 
does not match '^\\d+(\\.\\d+)?$'", + # invalid edge type + r"edges.tsv line 8: 'raNetv2-DC_' is not valid under any of the given schemas", + # invalid score + r"edges.tsv line 10: 'score!' does not match '^\\d+(\\.\\d+)?$'", + # various permutations of edge directedness + r"directed_edges.tsv line 4: 'true' is not one of ['1', '0']", + r"directed_edges.tsv line 5: '' is not one of ['1', '0']", + r"directed_edges.tsv line 6: 'directed' is not one of ['1', '0']", + r"directed_edges.tsv line 8: 'false' is not one of ['1', '0']", + ], + "nodes": [ + # invalid node type + r"nodes.csv line 5: 'Monkey' is not valid under any of the given schemas", + r"nodes.csv line 7: 'A' does not match '^\\S{2,}.*$'", + r"pheno_nodes.csv: no valid data found", + ], + "clusters": [ + r"markov2_named.tsv line 7: 'HoneyNutCluster3' does not match '^Cluster\\d+$'", + r"markov2_named.tsv line 8: expected 2 cols, found 1", + ], + } + self.test_errors(parser, errs) + + def test_load_col_count_errors(self): + """test files with invalid numbers of columns""" + + # path: test/djornl/col_count_errors + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, "djornl", "col_count_errors") + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + + errs = { + "edges": [ + "edges.tsv line 2: expected 5 cols, found 6", + "edges.tsv line 6: expected 5 cols, found 3", + "directed_edges.tsv line 4: expected 6 cols, found 5", + "directed_edges.tsv line 6: expected 6 cols, found 3", + ], + "nodes": ["nodes.csv line 3: expected 20 cols, found 22"], + } + self.test_errors(parser, errs) + + def test_load_valid_edge_data(self): + """ensure that valid edge data can be parsed""" + + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, "djornl", "test_data") + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + + edge_data = parser.load_edges() + expected = self.json_data["load_edges"] + + for data_structure in [edge_data, expected]: + for k in data_structure.keys(): + data_structure[k] = sorted(data_structure[k], key=lambda n: n["_key"]) + expected["err_list"] = [] + + self.assertEqual(edge_data, expected) + + def test_load_valid_node_data(self): + """ensure that valid node data can be parsed""" + + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, "djornl", "test_data") + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + + node_data = parser.load_nodes() + expected = self.json_data["load_nodes"] + + for data_structure in [node_data, expected]: + for k in data_structure.keys(): + data_structure[k] = sorted(data_structure[k], key=lambda n: n["_key"]) + data_structure[k] = [n["_key"] for n in data_structure[k]] + expected["err_list"] = [] + + self.assertEqual(node_data, expected) + + def test_load_valid_cluster_data(self): + """ensure that valid cluster data can be parsed""" + + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, "djornl", "test_data") + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + + cluster_data = parser.load_clusters() + expected = self.json_data["load_clusters"] + expected["err_list"] = [] + + self.assertEqual(cluster_data, expected) + + def test_duplicate_data(self): + """test files with duplicate data that should throw an error""" + + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, "djornl", "duplicate_data") + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + + errs = { + "edges": [ + "edges.tsv line 17: duplicate data for edge " + + "AT1G01100__SDV__protein-protein-interaction_literature-curated_AraNet_v2__False", + "hithruput-edges.csv line 5: duplicate data for edge " + + 
"AT1G01010__AT1G01030__protein-protein-interaction_high-throughput_AraNet_v2__False", + "hithruput-edges.csv line 9: duplicate data for edge " + + "AT1G01030__AT1G01050__pairwise-gene-coexpression_AraNet_v2__False", + "hithruput-edges.csv line 11: duplicate data for edge " + + "SDV__AT1G01100__protein-protein-interaction_literature-curated_AraNet_v2__True", + ], + "nodes": ["extra_node.csv line 5: duplicate data for node AT1G01080"], + } + self.test_errors(parser, errs) + + def test_duplicate_cluster_data(self): + """test files with duplicate cluster data, which should be seamlessly merged""" + + # path: test/djornl/duplicate_data + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, "djornl", "duplicate_data") + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + + cluster_data = parser.load_clusters() + expected = self.json_data["load_clusters"] + expected["err_list"] = [] + + self.assertEqual(cluster_data, expected) + + def test_dry_run(self): + + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, "djornl", "test_data") + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + + output = parser.load_data(dry_run=True) + self.assertEqual( + { + "edge_type_count": { + "phenotype-association_AraGWAS": 3, + "pairwise-gene-coexpression_AraNet_v2": 1, + "domain-co-occurrence_AraNet_v2": 1, + "protein-protein-interaction_high-throughput_AraNet_v2": 2, + "protein-protein-interaction_literature-curated_AraNet_v2": 6, + }, + "edges_total": 13, + "node_data_available": {"cluster": 0, "full": 14, "key_only": 0}, + "node_type_count": {"__NO_TYPE__": 0, "gene": 10, "pheno": 4}, + "nodes_in_edge": 12, + "nodes_total": 14, + "errors_total": 0, + "errors": [], + }, + output, + ) + + def test_try_node_merge(self): + """test node merging""" + + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, "djornl", "test_data") + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + + tests = [ + { + "desc": "existing node is just a _key", + "old": {"_key": "abcde"}, + "new": { + "_key": "abcde", + "node_type": "gene", + "node_quality": "highest", + }, + "out": ( + {"_key": "abcde", "node_type": "gene", "node_quality": "highest"}, + [], + ), + }, + { + "desc": "new node is just a _key", + "old": {"_key": "abcde", "node_type": "gene"}, + "new": {"_key": "abcde"}, + "out": ({"_key": "abcde", "node_type": "gene"}, []), + }, + { + "desc": "no overlapping keys", + "old": {"_key": "abcde", "node_type": "gene"}, + "new": {"_key": "abcde", "node_size": 24}, + "out": ({"_key": "abcde", "node_type": "gene", "node_size": 24}, []), + }, + { + "desc": "mergeable fields", + "old": { + "_key": "abcde", + "go_terms": ["this", "that"], + "colour": "pink", + }, + "new": {"_key": "abcde", "go_terms": ["the other"]}, + "out": ( + { + "_key": "abcde", + "go_terms": ["this", "that", "the other"], + "colour": "pink", + }, + [], + ), + }, + { + "desc": "mergeable fields, removing list duplicates", + "old": { + "_key": "abcde", + "go_terms": ["this", "that", "this", "that", "the"], + "colour": "pink", + }, + "new": { + "_key": "abcde", + "go_terms": ["this", "the", "that", "that", "other", "other"], + }, + "out": ( + { + "_key": "abcde", + "go_terms": ["this", "that", "the", "other"], + "colour": "pink", + }, + [], + ), + }, + { + "desc": "mergeable fields, complex list contents, removing list duplicates", + "old": {"_key": 123, "list": [{"a": "b"}, {"a": "b"}, {"c": "d"}]}, + "new": {"_key": 123, "list": [{"a": "b"}, {"a": "c"}, {"c": "d"}]}, + "out": ( + {"_key": 123, "list": [{"a": "b"}, {"c": "d"}, {"a": "c"}]}, + [], + ), + }, + { + "desc": 
"mergeable fields, no overlapping keys, nested version", + "old": {"_key": "abcde", "type": "gene", "info": {"teeth": 16}}, + "new": {"_key": "abcde", "size": 24, "info": {"colour": "pinkish"}}, + "out": ( + { + "_key": "abcde", + "type": "gene", + "size": 24, + "info": {"teeth": 16, "colour": "pinkish"}, + }, + [], + ), + }, + { + "desc": "single field error: duplicate", + "old": {"_key": "abcde", "node_type": "gene"}, + "new": {"_key": "abcde", "node_type": "pheno"}, + "out": (None, ["node_type"]), + }, + { + "desc": "single field error: type mismatch", + "old": {"_key": "abcde", "node_type": "gene"}, + "new": {"_key": "abcde", "node_type": ["pheno"]}, + "out": (None, ["node_type"]), + }, + { + "desc": "multiple field errors", + "old": {"_key": "abcde", "node_type": "gene", "shark": "Jaws"}, + "new": { + "_key": "abcde", + "node_type": "pheno", + "shark": "Loan", + "fish": "guppy", + }, + "out": (None, ["node_type", "shark"]), + }, + { + "desc": "multiple field errors, nested dicts", + "old": { + "_key": 123, + "a": "A", + "b": {"c": {"d": "D"}, "e": {}, "f": "F"}, + }, + "new": { + "_key": 123, + "a": "A", + "b": {"c": {"d": ["D"]}, "e": "E", "f": "f"}, + }, + "out": (None, ["b/c/d", "b/e", "b/f"]), + }, + ] + + for t in tests: + with self.subTest(desc=t["desc"]): + output = parser._try_node_merge(t["old"], t["new"]) + self.assertEqual(output, t["out"]) diff --git a/importers/test/test_djornl_parser_integration.py b/importers/test/test_djornl_parser_integration.py new file mode 100644 index 00000000..d98ee1d9 --- /dev/null +++ b/importers/test/test_djornl_parser_integration.py @@ -0,0 +1,28 @@ +""" +Tests for the DJORNL Parser + +At the present time, this just ensures that the files are parsed correctly; +it does not check data loading into the db. +""" +import unittest +import os + +from importers.djornl.parser import DJORNL_Parser +from spec.test.helpers import modified_environ, check_spec_test_env + +_TEST_DIR = "/app/spec/test" + + +class Test_DJORNL_Parser_Integration(unittest.TestCase): + @classmethod + def setUpClass(cls): + check_spec_test_env() + + def test_the_full_shebang(self): + + with modified_environ( + RES_ROOT_DATA_PATH=os.path.join(_TEST_DIR, "djornl", "test_data") + ): + parser = DJORNL_Parser() + parser.load_data() + self.assertTrue(bool(parser.load_data())) diff --git a/importers/utils/config.py b/importers/utils/config.py new file mode 100644 index 00000000..da906885 --- /dev/null +++ b/importers/utils/config.py @@ -0,0 +1,28 @@ +""" +Loads and initializes configuration data for importers using environment +variables and a set of default values. 
+""" +import os +from typing import List + + +REQUIRED: List[str] = [] +OPTIONAL = ["AUTH_TOKEN", "API_URL"] +DEFAULTS = { + "AUTH_TOKEN": "admin_token", # test default + "API_URL": "http://localhost:5000", # test default +} + + +def load_from_env(extra_required=None, extra_optional=None, prefix="RES_"): + """Load all configuration vars from environment variables""" + conf = dict(DEFAULTS) + required = list(REQUIRED) + (extra_required or []) + optional = list(OPTIONAL) + (extra_optional or []) + for field in required: + if (prefix + field) not in os.environ: + raise RuntimeError(f"Missing required env var: {prefix + field}") + for field in required + optional: + if (prefix + field) in os.environ: + conf[field] = os.environ[prefix + field] + return conf diff --git a/relation_engine_server/README.md b/relation_engine_server/README.md new file mode 100644 index 00000000..8a3369df --- /dev/null +++ b/relation_engine_server/README.md @@ -0,0 +1,470 @@ +# Relation Engine API + +A simple API that allows KBase community developers to interact with the Relation Engine graph database. You can run stored queries or do bulk updates on documents. + +## Python client + +There is a [pip-installable python client](client_src/README.md) that can be used to access the RE API. + +## HTTP API v1 + +The API is a small, rest-ish service where all data is in JSON format. Replace the `{root_url}` in the examples below with one of: + * Production: `https://kbase.us/services/relation_engine_api` + * Staging: `https://ci.kbase.us/services/relation_engine_api` + * App-dev: `https://appdev.kbase.us/services/relation_engine_api` + +### Error responses + +The majority of errors returned from the server have explanatory information in the response content in the following format: + +```json + +{ + "error": { + "message": "A brief message explaining the error", + } +} +``` + +Specific errors may have other fields giving more details, e.g. JSON parsing errors have `source_json`, `pos`, `lineno`, and `colno` describing the error; ArangoDB errors have an `arango_message` field. + +### GET / + +Returns server status info + +### POST /api/v1/query_results + +Run a query using a stored query or a cursor ID. Semantically, this is a GET, but it's a POST to allow better support for passing JSON in the request body (eg. Postman doesn't allow request body data in get requests) + +_Example request_ + +```sh +curl -X POST -d '{"argument": "value"}' {root_url}/api/v1/query_results?stored_query=example +``` + +_Query params_ +* `stored_query` - required - string - name of the stored query to run as a query against the database +* `cursor_id` - required - string - ID of a cursor that was returned from a previous query with >100 results +* `full_count` - optional - bool - If true, return a count of the total documents before any LIMIT is applied (for example, in pagination). This might make some queries run more slowly + +Pass one of `stored_query` or `cursor_id` -- not both. + +_Request body_ + +When running a new query, the request body can be a JSON object of all bind variables for the query. Anything with a `@name` in the query source should have an entry in the object here. For example, a query with bind vars for `@@collection` and `@value`, you will need to pass: + +```json +{ "@collection": "collection_name", "value": "my_value"} +``` + +If you are using a cursor, the request body should be blank. 
+
+_Example response_
+
+```json
+{
+  "results": [..],
+  "count": 100,
+  "has_more": true,
+  "cursor_id": "123",
+  "stats": {..}
+}
+```
+
+_Response JSON schema_
+
+```json
+{ "type": "object",
+  "properties": {
+    "results": {
+      "type": "array",
+      "description": "Result data from running the query, with a maximum of 100 entries."
+    },
+    "count": {
+      "type": "integer",
+      "description": "Total count of results."
+    },
+    "has_more": {
+      "type": "boolean",
+      "description": "Whether additional results can be fetched with the cursor_id."
+    },
+    "cursor_id": {
+      "type": "string",
+      "description": "A cursor ID that you can use to fetch more results, if they are present."
+    },
+    "stats": {
+      "type": "object",
+      "description": "Information about how this query affected the database and its run-time."
+    }
+  }
+}
+```
+
+Results are limited to 100 items. To continue fetching additional results, use the `cursor_id` parameter.
+
+
+#### Ad-hoc sysadmin queries
+
+System admins can run ad-hoc queries by specifying a "query" property in the JSON request body.
+
+```sh
+curl -d '{"query": "for v in coll sort rand() limit @count return v", "count": 1}' \
+  {root_url}/api/v1/query_results
+```
+
+This will return the same form of results as above.
+
+**Note:** Currently, all queries are read-only. This includes stored queries and ad-hoc admin queries. Commands like `UPDATE` or `REMOVE` will fail.
+
+### PUT /api/v1/documents
+
+Bulk-update documents by either creating, replacing, or updating.
+
+_Example_
+
+```sh
+curl -X PUT {root_url}/api/v1/documents?collection=genes&on_duplicate=update
+```
+
+_Query params_
+* `collection` - required - string - name of the collection that we want to bulk-import into.
+* `on_duplicate` - optional - "replace", "update", "ignore", "error" - Action to take when we find a duplicate document by `_key`. "replace" replaces the whole document. "update" merges in the new values. "ignore" takes no action. "error" cancels the entire transaction.
+* `display_errors` - optional - bool - whether to return error messages for each document that failed to save in the response. This is disabled by default as it will slow down the response time.
+
+_Request body_
+
+The request body should be a series of JSON documents separated by line-breaks. For example:
+
+```
+{"_key": "1", "name": "x"}
+{"_key": "2", "name": "y"}
+```
+
+_Example response_
+
+```json
+{"created": 3, "errors": 2, "empty": 0, "updated": 0, "ignored": 0, "error": false}
+```
+
+_Response JSON schema_
+
+```json
+{ "type": "object",
+  "properties": {
+    "created": {
+      "type": "integer",
+      "description": "Count of documents that were created."
+    },
+    "errors": {
+      "type": "integer",
+      "description": "Count of documents that had an error in saving."
+    },
+    "empty": {
+      "type": "integer",
+      "description": "Count of empty lines in the import."
+    },
+    "updated": {
+      "type": "integer",
+      "description": "Count of documents that were updated with an attribute merge."
+    },
+    "ignored": {
+      "type": "integer",
+      "description": "Count of documents that were not imported due to a match."
+    },
+    "error": {
+      "type": "boolean",
+      "description": "Whether the entire save operation was cancelled due to an error."
+    }
+  }
+}
+```
+
+#### JSON Schema error responses
+
+If you try to update a collection and it fails validation against a JSON schema found in the [relation engine spec](spec/), then you will get a JSON error response with the following fields:
+
+* `"message"` - Human readable message explaining the error
+* `"failed_validator"` - The name of the validator that failed (e.g. "required")
+* `"value"` - The (possibly nested) value in your data that failed validation
+* `"path"` - The path into your data where you can find the value that failed validation
+
+### PUT /api/v1/specs
+
+Manually check and pull spec updates. Requires sysadmin auth.
+
+_Example_
+
+```sh
+curl -X PUT "{root_url}/api/v1/specs"
+```
+
+_Query params_
+* `init_collections` - optional - boolean - defaults to true - whether to initialize any new collections in arango (also creates indexes and views)
+* `release_url` - optional - string - the specific url of the release to download and use (as a tarball). If left blank, then the latest release from github is used (not including any pre-releases or drafts).
+
+Every call to update specs will reset the spec data (do a clean download and overwrite).
+
+### GET /api/v1/specs/collections
+
+Get all collection names (returns an array of strings):
+
+```sh
+GET {root_url}/api/v1/specs/collections
+```
+
+Example response:
+
+```json
+["test_vertex", "test_edge"]
+```
+
+Get the schema for a specific collection:
+
+```sh
+GET "{root_url}/api/v1/specs/collections?name=test_vertex"
+```
+
+Example response:
+
+```json
+{
+  "name": "test_vertex",
+  "type": "vertex",
+  "schema": {
+    "$schema": "http://json-schema.org/draft-07/schema#",
+    "type": "object",
+    "required": ["_key"],
+    "description": "An example vertex schema for testing",
+    "properties": {
+      "_key": {"type": "string"},
+      "is_public": {"type": "boolean"},
+      "ws_id": {"type": "integer"}
+    }
+  }
+}
+```
+
+Get the schema for a particular document by its full ID:
+
+```sh
+GET "{root_url}/api/v1/specs/collections?doc_id=test_vertex/1"
+```
+
+The response will have the same format as the example response above.
+
+### GET /api/v1/specs/data_sources
+
+See also `GET /api/v1/data_sources` for a similar API that returns results in a slightly different format.
+
+Get all data source names (returns an array of strings):
+
+```sh
+GET {root_url}/api/v1/specs/data_sources
+```
+
+Example response:
+
+```json
+["envo_ontology", "go_ontology", "gtdb"]
+```
+
+Get the schema for a specific data source:
+
+```sh
+GET "{root_url}/api/v1/specs/data_sources?name=ncbi_taxonomy"
+```
+
+Example response:
+
+```json
+{
+  "name": "ncbi_taxonomy",
+  "category": "taxonomy",
+  "title": "NCBI Taxonomy",
+  "home_url": "https://www.ncbi.nlm.nih.gov/taxonomy",
+  "data_url": "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/",
+  "logo_url": "https://kbase.us/ui-assets/images/third-party-data-sources/ncbi/logo-51-64.png"
+}
+```
+
+Response JSON schema:
+
+```json
+{ "type": "object",
+  "properties": {
+    "name": {
+      "type": "string",
+      "description": "canonical identifier for this data source"
+    },
+    "category": {
+      "type": "string",
+      "description": "parent category, such as taxonomy or ontology"
+    },
+    "title": {
+      "type": "string",
+      "description": "human readable name for the data source"
+    },
+    "home_url": {
+      "type": "string",
+      "description": "full URL of the home page for the data source"
+    },
+    "data_url": {
+      "type": "string",
+      "description": "full URL from where the data can be downloaded"
+    },
+    "logo_url": {
+      "type": "string",
+      "description": "the URL of a logo image representing this data source"
+    }
+  }
+}
```
+
+
+### GET /api/v1/specs/stored_queries
+
+Get all stored query names (returns an array of strings):
+
+```sh
+GET {root_url}/api/v1/specs/stored_queries
+```
+
+Example response:
+
+```json
+["fetch_test_vertices", "fetch_test_edges", "ncbi_fetch_taxon"]
+```
+
+Get the schema for a specific stored query:
+
+```sh
+GET "{root_url}/api/v1/specs/stored_queries?name=ncbi_fetch_taxon"
+```
+
+Example response:
+
+```json
+{
+  "stored_query": {
+    "name": "ncbi_fetch_taxon",
+    "params": {
+      "type": "object",
+      "required": [
+        "id",
+        "ts"
+      ],
+      "properties": {
+        "id": {
+          "type": "string",
+          "title": "NCBI Taxonomy ID"
+        },
+        "ts": {
+          "type": "integer",
+          "title": "Versioning timestamp"
+        }
+      }
+    },
+    "query": "for t in ncbi_taxon\n filter t.id == @id\n filter t.created <= @ts AND t.expired >= @ts\n limit 1\n return t\n"
+  }
+}
```
+
+
+### GET /api/v1/data_sources
+
+See also `GET /api/v1/specs/data_sources` for the standard `/specs` API endpoint access to this data.
+
+Fetch a list of data source names. Will return an array of strings.
+
+Example response body:
+
+```json
+{"data_sources": ["x", "y", "z"]}
+```
+
+The response is nearly identical to that for `GET /api/v1/specs/data_sources`, but this data is held in an object under the key `data_sources`.
+
+
+Response JSON schema:
+
+```json
+{ "type": "object",
+  "properties": {
+    "data_sources": {
+      "type": "array",
+      "items": { "type": "string" }
+    }
+  }
+}
+```
+
+### GET /api/v1/data_sources/{name}
+
+Fetch the details for a data source by name. Will return an object of key/value details.
+
+Example response body:
+
+```json
+{
+  "data_source": {
+    "name": "envo_ontology",
+    "category": "ontology",
+    "title": "Environment Ontology",
+    "home_url": "http://www.obofoundry.org/ontology/envo.html",
+    "data_url": "https://github.com/EnvironmentOntology/envo/releases",
+    "logo_url": "https://ci.kbase.us/ui-assets/images/third-party-data-sources/envo/logo-119-64.png"
+  }
+}
+```
+
+Response JSON schema is the same as for `GET /api/v1/specs/data_sources?name=data_source_name`, but the data is held in an object under the key `data_source`.
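+
+For example, to fetch the details shown above:
+
+```sh
+curl {root_url}/api/v1/data_sources/envo_ontology
+```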
+
+
+## Administration
+
+The following environment variables should be configured:
+
+* `KBASE_AUTH_URL` - url of the KBase authentication (auth2) server to use
+* `SHARD_COUNT` - number of shards to use when creating new collections
+* `KBASE_WORKSPACE_URL` - url of the KBase workspace server to use (for authorizing workspace access)
+* `DB_URL` - url of the arangodb database to use for http API access
+* `DB_USER` - username for the arangodb database
+* `DB_PASS` - password for the arangodb database
+* `DB_READONLY_USER` - read-only username for the arangodb database
+* `DB_READONLY_PASS` - read-only password for the arangodb database
+
+### Update specs
+
+To update specs while the server is running, use this curl command with an RE_ADMIN token:
+
+```sh
+curl -X PUT -H "Authorization: <RE_ADMIN token>" \
+  "https://ci.kbase.us/services/relation_engine_api/api/v1/specs?init_collections=1"
+```
+
+## Deprecated Endpoints
+
+#### GET `/api/v1/specs/schemas` (replaced by `/api/v1/specs/collections`)
+
+This endpoint has been deprecated; queries should use `/api/v1/specs/collections` instead.
+
+
+## Development
+
+See the [Contribution Guidelines](/.github/CONTRIBUTING.md).
+
+Run tests with:
+
+```sh
+make test
+```
+
+## Deployment
+
+The docker image is pushed to Docker Hub when new commits are made to master. The script that runs when pushing to docker hub is found in `hooks/build`.
+
+Alternatively, set the image name in `scripts/local-build.sh` and run it to build and deploy locally, which may be a lot faster.
+
+## Project anatomy
+
+* The main server code is in `./relation_engine_server`.
+* Tests are in `./relation_engine_server/test`
diff --git a/relation_engine_server/__init__.py b/relation_engine_server/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/relation_engine_server/api_versions/__init__.py b/relation_engine_server/api_versions/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/relation_engine_server/api_versions/api_v1.py b/relation_engine_server/api_versions/api_v1.py
new file mode 100644
index 00000000..8cccfb3c
--- /dev/null
+++ b/relation_engine_server/api_versions/api_v1.py
@@ -0,0 +1,226 @@
+import flask
+from relation_engine_server.utils import (
+    arango_client,
+    spec_loader,
+    auth,
+    bulk_import,
+    pull_spec,
+    config,
+    parse_json,
+    ensure_specs,
+)
+from relation_engine_server.utils.json_validation import run_validator
+from relation_engine_server.exceptions import InvalidParameters
+
+api_v1 = flask.Blueprint("api_v1", __name__)
+
+
+@api_v1.route("/data_sources", methods=["GET"])
+def list_data_sources():
+    # note the custom response format is used by the frontend, so this endpoint is provided
+    # in addition to the /specs/data_sources endpoint
+
+    data_sources = spec_loader.get_names("data_sources")
+    return flask.jsonify({"data_sources": data_sources})
+
+
+@api_v1.route("/data_sources/<name>", methods=["GET"])
+def fetch_data_source(name):
+
+    data_source = spec_loader.get_schema("data_source", name)
+    return flask.jsonify({"data_source": data_source})
+
+
+@api_v1.route("/specs/data_sources", methods=["GET"])
+def show_data_sources():
+    """Show the current data sources loaded from the spec."""
+    name = flask.request.args.get("name")
+    if name:
+        return flask.jsonify(spec_loader.get_schema("data_source", name))
+    return flask.jsonify(spec_loader.get_names("data_sources"))
+
+
+@api_v1.route("/specs/stored_queries", methods=["GET"])
+def show_stored_queries():
+    """Show the current stored query names loaded from the spec."""
+    name 
= flask.request.args.get("name") + if name: + return flask.jsonify( + {"stored_query": spec_loader.get_schema("stored_query", name)} + ) + return flask.jsonify(spec_loader.get_names("stored_query")) + + +@api_v1.route("/specs/collections", methods=["GET"]) +@api_v1.route("/specs/schemas", methods=["GET"]) +def show_collections(): + """Show the names of the (document) collections (edges and vertices) loaded from the spec.""" + name = flask.request.args.get("name") + doc_id = flask.request.args.get("doc_id") + if name: + return flask.jsonify(spec_loader.get_schema("collection", name)) + elif doc_id: + return flask.jsonify(spec_loader.get_schema_for_doc(doc_id)) + else: + return flask.jsonify(spec_loader.get_names("collection")) + + +@api_v1.route("/query_results", methods=["POST"]) +def run_query(): + """ + Run a stored query as a query against the database. + Auth: + - only kbase re admins for ad-hoc queries + - public stored queries (these have access controls within them based on params) + """ + json_body = parse_json.get_json_body() or {} + # fetch number of documents to return + batch_size = int(flask.request.args.get("batch_size", 10000)) + full_count = flask.request.args.get("full_count", False) + + if "query" in json_body: + # Run an adhoc query for a sysadmin + auth.require_auth_token(roles=["RE_ADMIN"]) + query_text = _preprocess_stored_query(json_body["query"], json_body) + del json_body["query"] + if "ws_ids" in query_text: + # Fetch any authorized workspace IDs using a KBase auth token, if present + auth_token = auth.get_auth_header() + json_body["ws_ids"] = auth.get_workspace_ids(auth_token) + + resp_body = arango_client.run_query( + query_text=query_text, + bind_vars=json_body, + batch_size=batch_size, + full_count=full_count, + ) + return flask.jsonify(resp_body) + + if "stored_query" in flask.request.args or "view" in flask.request.args: + # Run a query from a query name + # Note: we are maintaining backwards compatibility here with the "view" arg. + # "stored_query" is the more accurate name + query_name = flask.request.args.get("stored_query") or flask.request.args.get( + "view" + ) + stored_query = spec_loader.get_stored_query(query_name) + + if "params" in stored_query: + # Validate the user params for the query + stored_query_path = spec_loader.get_stored_query(query_name, path_only=True) + run_validator( + schema_file=stored_query_path, data=json_body, validate_at="/params" + ) + + stored_query_source = _preprocess_stored_query( + stored_query["query"], stored_query + ) + if "ws_ids" in stored_query_source: + # Fetch any authorized workspace IDs using a KBase auth token, if present + auth_token = auth.get_auth_header() + json_body["ws_ids"] = auth.get_workspace_ids(auth_token) + + resp_body = arango_client.run_query( + query_text=stored_query_source, + bind_vars=json_body, + batch_size=batch_size, + full_count=full_count, + ) + return flask.jsonify(resp_body) + + if "cursor_id" in flask.request.args: + # Run a query from a cursor ID + cursor_id = flask.request.args["cursor_id"] + resp_body = arango_client.run_query(cursor_id=cursor_id) + return flask.jsonify(resp_body) + # No valid options were passed + raise InvalidParameters("Pass in a query name or a cursor_id") + + +@api_v1.route("/specs", methods=["PUT"]) +def update_specs(): + """ + Manually check for updates, download spec releases, and init new collections. 
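+    Specs are downloaded from the release_url request arg when given; otherwise
+    the latest GitHub release of the spec repo is used.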
+ Auth: admin + """ + auth.require_auth_token(["RE_ADMIN"]) + init_collections = "init_collections" in flask.request.args + release_url = flask.request.args.get("release_url") + update_name = pull_spec.download_specs(init_collections, release_url, reset=True) + return flask.jsonify( + { + "status": "updated", + "updated_from": update_name, + } + ) + + +@api_v1.route("/documents", methods=["PUT"]) +def save_documents(): + """ + Create, update, or replace many documents in a batch. + Auth: admin + """ + auth.require_auth_token(["RE_ADMIN"]) + collection_name = flask.request.args["collection"] + query = {"collection": collection_name, "type": "documents"} + if flask.request.args.get("display_errors"): + # Display an array of error messages + query["details"] = "true" + if flask.request.args.get("on_duplicate"): + query["onDuplicate"] = flask.request.args["on_duplicate"] + if flask.request.args.get("overwrite"): + query["overwrite"] = "true" + resp = bulk_import.bulk_import(query) + if resp.get("errors") > 0: + return (flask.jsonify(resp), 400) + else: + return flask.jsonify(resp) + + +@api_v1.route("/config", methods=["GET"]) +def show_config(): + """Show public config data.""" + conf = config.get_config() + return flask.jsonify( + { + "auth_url": conf["auth_url"], + "workspace_url": conf["workspace_url"], + "kbase_endpoint": conf["kbase_endpoint"], + "db_url": conf["db_url"], + "db_name": conf["db_name"], + "spec_repo_url": conf["spec_repo_url"], + "spec_release_url": conf["spec_release_url"], + "spec_release_path": conf["spec_release_path"], + } + ) + + +@api_v1.route("/ensure_specs", methods=["GET"]) +def ensure_all_specs(): + """ + Ensure that the local index/view/analyzer specs under spec/ have a + corresponding spec on the server. + + This endpoint is not strictly necessary, as the ensure_specs.ensure_all() + code should be triggered in startup scripts. This is extra insurance in case + one wishes to ensure the specs without re-deploying. + + Example ensure_specs.ensure_all() return value: + { + "indexes": [], + "views": ["Compounds/arangosearch", "Reactions/arangosearch"], + "analyzers": ["icu_tokenize/text"] + } + """ + failed_names = ensure_specs.ensure_all() + if any([name for schema_type, names in failed_names.items() for name in names]): + return flask.jsonify(failed_names), 500 + else: + return flask.jsonify(failed_names) + + +def _preprocess_stored_query(query_text, config): + """Inject some default code into each stored query.""" + ws_id_text = " LET ws_ids = @ws_ids " if "ws_ids" in query_text else "" + return "\n".join([config.get("query_prefix", ""), ws_id_text, query_text]) diff --git a/relation_engine_server/exceptions.py b/relation_engine_server/exceptions.py new file mode 100644 index 00000000..5b18f839 --- /dev/null +++ b/relation_engine_server/exceptions.py @@ -0,0 +1,41 @@ +""" +Collection of exception classes for the Relation Engine server.
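+Each maps to an HTTP status code via the error handlers registered in main.py (e.g. InvalidParameters -> 400, UnauthorizedAccess -> 403, NotFound -> 404).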
+""" + + +class InvalidParameters(Exception): + """Invalid request parameters.""" + + def __init__(self, msg): + self.msg = msg + + def __str__(self): + return self.msg + + +class MissingHeader(Exception): + """Missing required header in a request.""" + + def __init__(self, header_name): + self.header_name = header_name + + def __str__(self): + return "Missing header: " + self.header_name + + +class UnauthorizedAccess(Exception): + """Authentication failed for an authorization header.""" + + def __init__(self, auth_url, response): + self.auth_url = auth_url + self.response = response + + +class NotFound(Exception): + """A resource was not found (yields a 404 response).""" + + def __init__(self, details): + self.details = details + + def __str__(self): + return self.details diff --git a/relation_engine_server/main.py b/relation_engine_server/main.py new file mode 100644 index 00000000..e2f14a5f --- /dev/null +++ b/relation_engine_server/main.py @@ -0,0 +1,176 @@ +"""The main entrypoint for running the Flask server.""" +import flask +import json +import os +from uuid import uuid4 +import traceback +from jsonschema.exceptions import ValidationError + +from relation_engine_server.api_versions.api_v1 import api_v1 +from relation_engine_server.exceptions import ( + MissingHeader, + UnauthorizedAccess, + InvalidParameters, + NotFound, +) +from relation_engine_server.utils.spec_loader import SchemaNonexistent +from relation_engine_server.utils import arango_client + +app = flask.Flask(__name__) +app.config["DEBUG"] = os.environ.get("FLASK_DEBUG", True) +app.config["SECRET_KEY"] = os.environ.get("FLASK_SECRET_KEY", str(uuid4())) +app.url_map.strict_slashes = False # allow both `get /v1/` and `get /v1` +app.register_blueprint(api_v1, url_prefix="/api/v1") + + +def return_error(error_dict, code): + """return the appropriate error structure and code + + Errors returned by the server have the basic format + + 'error': { + 'message': , + } + + The 'error' dictionary may have extra keys if there is additional information. + + This helper wraps the whole structure in an extra dict under the key 'error'. + + """ + return (flask.jsonify({"error": error_dict}), code) + + +@app.route("/", methods=["GET"]) +def root(): + """Server status. 
develop is the default branch.""" + if os.path.exists(".git/refs/heads/develop"): + with open(".git/refs/heads/develop", "r") as fd: + commit_hash = fd.read().strip() + else: + commit_hash = "unknown" + arangodb_status = arango_client.server_status() + repo_url = "https://github.com/kbase/relation_engine_api.git" + body = { + "arangodb_status": arangodb_status, + "commit_hash": commit_hash, + "repo_url": repo_url, + } + return flask.jsonify(body) + + +@app.errorhandler(json.decoder.JSONDecodeError) +def json_decode_error(err): + """A problem parsing JSON.""" + resp = { + "message": "Unable to parse JSON", + "source_json": err.doc, + "pos": err.pos, + "lineno": err.lineno, + "colno": err.colno, + } + return return_error(resp, 400) + + +@app.errorhandler(arango_client.ArangoServerError) +def arango_server_error(err): + resp = { + "message": str(err), + "arango_message": err.resp_json["errorMessage"], + } + return return_error(resp, 400) + + +# Invalid request body json params or missing headers +@app.errorhandler(MissingHeader) +@app.errorhandler(InvalidParameters) +def generic_400(err): + resp = { + "message": str(err), + } + return return_error(resp, 400) + + +@app.errorhandler(ValidationError) +def validation_error(err): + """JSON Schema validation error.""" + # Refer to the documentation on jsonschema.exceptions.ValidationError: + # https://python-jsonschema.readthedocs.io/en/stable/errors/ + resp = { + "message": err.message, + "failed_validator": err.validator, + "value": err.instance, + "path": list(err.absolute_path), + } + return return_error(resp, 400) + + +@app.errorhandler(UnauthorizedAccess) +def unauthorized_access(err): + resp = { + "message": "Unauthorized", + "auth_url": err.auth_url, + "auth_response": err.response, + } + return return_error(resp, 403) + + +@app.errorhandler(SchemaNonexistent) +def schema_does_not_exist(err): + """The requested schema or collection does not exist.""" + resp = { + "message": "Not found", + "details": str(err), + "name": err.name, + } + return return_error(resp, 404) + + +@app.errorhandler(NotFound) +@app.errorhandler(404) +def page_not_found(err): + resp = { + "message": "Not found", + } + if hasattr(err, "details"): + resp["details"] = err.details + return return_error(resp, 404) + + +@app.errorhandler(405) +def method_not_allowed(err): + resp = { + "message": "Method not allowed", + } + return return_error(resp, 405) + + +# Any other unhandled exceptions -> 500 +@app.errorhandler(Exception) +@app.errorhandler(500) +def server_error(err): + print("=" * 80) + print("500 Unexpected Server Error") + print("-" * 80) + traceback.print_exc() + print("=" * 80) + resp = {"message": "Unexpected server error"} + # TODO only set below two fields in dev mode + resp["class"] = err.__class__.__name__ + resp["details"] = str(err) + return return_error(resp, 500) + + +@app.after_request +def after_request(resp): + # Log request + print(" ".join([flask.request.method, flask.request.path, "->", resp.status])) + # Enable CORS + resp.headers["Access-Control-Allow-Origin"] = "*" + env_allowed_headers = os.environ.get( + "HTTP_ACCESS_CONTROL_REQUEST_HEADERS", "Authorization, Content-Type" + ) + resp.headers["Access-Control-Allow-Headers"] = env_allowed_headers + # Set JSON content type and response length + resp.headers["Content-Type"] = "application/json" + resp.headers["Content-Length"] = resp.calculate_content_length() + return resp diff --git a/relation_engine_server/test/__init__.py b/relation_engine_server/test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git
a/relation_engine_server/test/data/collections/sample/directory/README.md b/relation_engine_server/test/data/collections/sample/directory/README.md new file mode 100644 index 00000000..e69de29b diff --git a/relation_engine_server/test/data/collections/sample/set/README.md b/relation_engine_server/test/data/collections/sample/set/README.md new file mode 100644 index 00000000..e69de29b diff --git a/relation_engine_server/test/data/collections/straight/edge.yaml b/relation_engine_server/test/data/collections/straight/edge.yaml new file mode 100644 index 00000000..e69de29b diff --git a/relation_engine_server/test/data/collections/straight/to/README.md b/relation_engine_server/test/data/collections/straight/to/README.md new file mode 100644 index 00000000..e69de29b diff --git a/relation_engine_server/test/data/collections/straight/to/the/core.json b/relation_engine_server/test/data/collections/straight/to/the/core.json new file mode 100644 index 00000000..e69de29b diff --git a/relation_engine_server/test/data/collections/straight/to/the/point.json b/relation_engine_server/test/data/collections/straight/to/the/point.json new file mode 100644 index 00000000..e69de29b diff --git a/relation_engine_server/test/data/collections/test_another_node.json b/relation_engine_server/test/data/collections/test_another_node.json new file mode 100644 index 00000000..e69de29b diff --git a/relation_engine_server/test/data/collections/test_edge.yaml b/relation_engine_server/test/data/collections/test_edge.yaml new file mode 100644 index 00000000..e69de29b diff --git a/relation_engine_server/test/data/collections/test_node.yaml b/relation_engine_server/test/data/collections/test_node.yaml new file mode 100644 index 00000000..e69de29b diff --git a/relation_engine_server/test/data/json_validation/defaults.json b/relation_engine_server/test/data/json_validation/defaults.json new file mode 100644 index 00000000..0967ef42 --- /dev/null +++ b/relation_engine_server/test/data/json_validation/defaults.json @@ -0,0 +1 @@ +{} diff --git a/relation_engine_server/test/data/json_validation/defaults.yaml b/relation_engine_server/test/data/json_validation/defaults.yaml new file mode 100644 index 00000000..0967ef42 --- /dev/null +++ b/relation_engine_server/test/data/json_validation/defaults.yaml @@ -0,0 +1 @@ +{} diff --git a/relation_engine_server/test/data/json_validation/fruit.yaml b/relation_engine_server/test/data/json_validation/fruit.yaml new file mode 100644 index 00000000..75e2acf7 --- /dev/null +++ b/relation_engine_server/test/data/json_validation/fruit.yaml @@ -0,0 +1,9 @@ +$schema: "http://json-schema.org/draft-07/schema#" +name: fruit +type: string +oneOf: + - const: peach + - const: plum + - const: dragonfruit + - const: strawberry + - const: pear diff --git a/relation_engine_server/test/data/json_validation/fruits_array.yaml b/relation_engine_server/test/data/json_validation/fruits_array.yaml new file mode 100644 index 00000000..ed6c710f --- /dev/null +++ b/relation_engine_server/test/data/json_validation/fruits_array.yaml @@ -0,0 +1,13 @@ +$schema: "http://json-schema.org/draft-07/schema#" +name: fruits_array +definitions: + fruits: + type: array + items: + $ref: fruit.yaml + default: [] + uniqueItems: true + examples: + - ['peach', 'plum'] + - ['strawberry'] + - [] diff --git a/relation_engine_server/test/data/json_validation/invalid_date.json b/relation_engine_server/test/data/json_validation/invalid_date.json new file mode 100644 index 00000000..9b9a7378 --- /dev/null +++ 
b/relation_engine_server/test/data/json_validation/invalid_date.json @@ -0,0 +1,5 @@ +{ + "name": "invalid_date", + "distance": 1, + "date": "20200606" +} diff --git a/relation_engine_server/test/data/json_validation/invalid_date.yaml b/relation_engine_server/test/data/json_validation/invalid_date.yaml new file mode 100644 index 00000000..e4613be1 --- /dev/null +++ b/relation_engine_server/test/data/json_validation/invalid_date.yaml @@ -0,0 +1,3 @@ +name: invalid_date +distance: 1 +date: "20200606" diff --git a/relation_engine_server/test/data/json_validation/invalid_date_type.json b/relation_engine_server/test/data/json_validation/invalid_date_type.json new file mode 100644 index 00000000..50d309a2 --- /dev/null +++ b/relation_engine_server/test/data/json_validation/invalid_date_type.json @@ -0,0 +1,5 @@ +{ + "name": "invalid_date", + "distance": 1, + "date": 20200606 +} diff --git a/relation_engine_server/test/data/json_validation/invalid_date_type.yaml b/relation_engine_server/test/data/json_validation/invalid_date_type.yaml new file mode 100644 index 00000000..1d029817 --- /dev/null +++ b/relation_engine_server/test/data/json_validation/invalid_date_type.yaml @@ -0,0 +1,3 @@ +name: invalid_date +distance: 1 +date: 20200606 diff --git a/relation_engine_server/test/data/json_validation/invalid_pattern.json b/relation_engine_server/test/data/json_validation/invalid_pattern.json new file mode 100644 index 00000000..9ee2461d --- /dev/null +++ b/relation_engine_server/test/data/json_validation/invalid_pattern.json @@ -0,0 +1,4 @@ +{ + "name": "what's-the-problem with-this-string?", + "distance": 1 +} diff --git a/relation_engine_server/test/data/json_validation/invalid_pattern.yaml b/relation_engine_server/test/data/json_validation/invalid_pattern.yaml new file mode 100644 index 00000000..66a97fff --- /dev/null +++ b/relation_engine_server/test/data/json_validation/invalid_pattern.yaml @@ -0,0 +1,2 @@ +name: what's-the-problem with-this-string? +distance: 1 diff --git a/relation_engine_server/test/data/json_validation/invalid_uri.json b/relation_engine_server/test/data/json_validation/invalid_uri.json new file mode 100644 index 00000000..334aa51f --- /dev/null +++ b/relation_engine_server/test/data/json_validation/invalid_uri.json @@ -0,0 +1,5 @@ +{ + "name": "uri_validation", + "distance": 1, + "home_page": "where is it?" +} diff --git a/relation_engine_server/test/data/json_validation/invalid_uri.yaml b/relation_engine_server/test/data/json_validation/invalid_uri.yaml new file mode 100644 index 00000000..358adc59 --- /dev/null +++ b/relation_engine_server/test/data/json_validation/invalid_uri.yaml @@ -0,0 +1,3 @@ +name: uri_validation +distance: 1 +home_page: where is it? 
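The json_validation fixtures above pair valid and invalid documents with the draft-07 `test_schema` files that follow. A minimal sketch of what they exercise, using plain `jsonschema` with PyYAML; the relative fixture paths and the `FormatChecker` usage are assumptions, and the server's actual validation goes through its `run_validator` helper (with jsonschema `ValidationError`s routed to the 400 handler in main.py) rather than this code:

```python
# Sketch: check two json_validation fixtures against the "params" subschema
# of test_schema.yaml. Paths are assumed relative to the repo root.
import yaml
from jsonschema import validate, FormatChecker
from jsonschema.exceptions import ValidationError

BASE = "relation_engine_server/test/data/json_validation"

with open(f"{BASE}/test_schema.yaml") as fd:
    # The fixture documents are instances of the "params" subschema
    schema = yaml.safe_load(fd)["properties"]["params"]

with open(f"{BASE}/valid_date.yaml") as fd:
    # "2020-06-06" satisfies {type: string, format: date}; no exception raised
    validate(yaml.safe_load(fd), schema, format_checker=FormatChecker())

with open(f"{BASE}/invalid_date.yaml") as fd:
    try:
        # "20200606" fails the date format check
        validate(yaml.safe_load(fd), schema, format_checker=FormatChecker())
    except ValidationError as err:
        print("rejected as expected:", err.message)
```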
diff --git a/relation_engine_server/test/data/json_validation/test_schema.json b/relation_engine_server/test/data/json_validation/test_schema.json new file mode 100644 index 00000000..d0e93a98 --- /dev/null +++ b/relation_engine_server/test/data/json_validation/test_schema.json @@ -0,0 +1,44 @@ +{ + "name": "test_schema", + "$schema": "http://json-schema.org/draft-07/schema#", + "properties": { + "params": { + "type": "object", + "properties": { + "name": { + "type": "string", + "format": "regex", + "pattern": "^\\w+$", + "default": "blank" + }, + "distance": { + "type": "integer", + "minimum": 0, + "maximum": 10, + "default": 1 + }, + "home_page": { + "type": "string", + "format": "uri" + }, + "title": { + "type": "string" + }, + "date": { + "title": "date", + "description": "A type of dried fruit", + "type": "string", + "format": "date" + }, + "fruits": { + "type": "array", + "items": { + "$ref": "fruit.yaml" + }, + "default": [], + "uniqueItems": true + } + } + } + } +} diff --git a/relation_engine_server/test/data/json_validation/test_schema.yaml b/relation_engine_server/test/data/json_validation/test_schema.yaml new file mode 100644 index 00000000..bbdc0707 --- /dev/null +++ b/relation_engine_server/test/data/json_validation/test_schema.yaml @@ -0,0 +1,36 @@ +name: test_schema +$schema: 'http://json-schema.org/draft-07/schema#' +properties: + params: + type: object + properties: + name: + type: string + format: regex + pattern: ^\w+$ + default: blank + title: + type: string + distance: + type: integer + minimum: 0 + maximum: 10 + default: 1 + home_page: + type: string + format: uri + date: + title: date + description: A type of dried fruit + type: string + format: date + fruits: + type: array + items: + $ref: fruit.yaml + default: [] + uniqueItems: true + examples: + - ['peach', 'plum'] + - ['strawberry'] + - [] diff --git a/relation_engine_server/test/data/json_validation/unquoted_date.yaml b/relation_engine_server/test/data/json_validation/unquoted_date.yaml new file mode 100644 index 00000000..9dc694f0 --- /dev/null +++ b/relation_engine_server/test/data/json_validation/unquoted_date.yaml @@ -0,0 +1,3 @@ +name: unquoted_date +distance: 3 +date: 2020-06-06 diff --git a/relation_engine_server/test/data/json_validation/valid_date.json b/relation_engine_server/test/data/json_validation/valid_date.json new file mode 100644 index 00000000..71831bb6 --- /dev/null +++ b/relation_engine_server/test/data/json_validation/valid_date.json @@ -0,0 +1,5 @@ +{ + "name": "valid_date", + "distance": 3, + "date": "2020-06-06" +} diff --git a/relation_engine_server/test/data/json_validation/valid_date.yaml b/relation_engine_server/test/data/json_validation/valid_date.yaml new file mode 100644 index 00000000..2a964808 --- /dev/null +++ b/relation_engine_server/test/data/json_validation/valid_date.yaml @@ -0,0 +1,3 @@ +name: valid_date +distance: 3 +date: "2020-06-06" diff --git a/relation_engine_server/test/data/json_validation/valid_pattern.json b/relation_engine_server/test/data/json_validation/valid_pattern.json new file mode 100644 index 00000000..e02c12d6 --- /dev/null +++ b/relation_engine_server/test/data/json_validation/valid_pattern.json @@ -0,0 +1,4 @@ +{ + "name": "No_problem_with_this_string", + "distance": 3 +} diff --git a/relation_engine_server/test/data/json_validation/valid_pattern.yaml b/relation_engine_server/test/data/json_validation/valid_pattern.yaml new file mode 100644 index 00000000..835d68e6 --- /dev/null +++
b/relation_engine_server/test/data/json_validation/valid_pattern.yaml @@ -0,0 +1,2 @@ +name: No_problem_with_this_string +distance: 3 diff --git a/relation_engine_server/test/data/json_validation/valid_uri.json b/relation_engine_server/test/data/json_validation/valid_uri.json new file mode 100644 index 00000000..e885d722 --- /dev/null +++ b/relation_engine_server/test/data/json_validation/valid_uri.json @@ -0,0 +1,5 @@ +{ + "name": "valid_uri", + "distance": 3, + "home_page": "http://json-validation.com:5000/this/is/valid" +} diff --git a/relation_engine_server/test/data/json_validation/valid_uri.yaml b/relation_engine_server/test/data/json_validation/valid_uri.yaml new file mode 100644 index 00000000..e31a0c49 --- /dev/null +++ b/relation_engine_server/test/data/json_validation/valid_uri.yaml @@ -0,0 +1,3 @@ +name: valid_uri +distance: 3 +home_page: http://json-validation.com:5000/this/is/valid diff --git a/relation_engine_server/test/data/schema_refs/collection_types/common_stuff.json b/relation_engine_server/test/data/schema_refs/collection_types/common_stuff.json new file mode 100644 index 00000000..80f18d5c --- /dev/null +++ b/relation_engine_server/test/data/schema_refs/collection_types/common_stuff.json @@ -0,0 +1,25 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "definitions": { + "marks_out_of_10": { + "type": "number", + "minimum": 0, + "maximum": 10 + }, + "node": { + "type": "object", + "properties": { + "type": { + "$ref": "/app/relation_engine_server/test/data/schema_refs/node_elements.json#/definitions/node/properties/type" + }, + "id": { + "$ref": "file:///app/relation_engine_server/test/data/schema_refs/node_elements.json#/definitions/node/properties/id" + } + }, + "required": ["type", "id"] + }, + "edge": { + "$ref": "../level_1/level_2/edge.yaml" + } + } +} diff --git a/relation_engine_server/test/data/schema_refs/collection_types/common_stuff.yaml b/relation_engine_server/test/data/schema_refs/collection_types/common_stuff.yaml new file mode 100644 index 00000000..593201cb --- /dev/null +++ b/relation_engine_server/test/data/schema_refs/collection_types/common_stuff.yaml @@ -0,0 +1,20 @@ +$schema: 'http://json-schema.org/draft-07/schema#' +definitions: + marks_out_of_10: + type: number + minimum: 0 + maximum: 10 + node: + type: object + properties: + type: + $ref: >- + /app/relation_engine_server/test/data/schema_refs/node_elements.json#/definitions/node/properties/type + id: + $ref: >- + file:///app/relation_engine_server/test/data/schema_refs/node_elements.json#/definitions/node/properties/id + required: + - type + - id + edge: + $ref: ../level_1/level_2/edge.yaml diff --git a/relation_engine_server/test/data/schema_refs/collection_types/node_elements.json b/relation_engine_server/test/data/schema_refs/collection_types/node_elements.json new file mode 100644 index 00000000..231dac44 --- /dev/null +++ b/relation_engine_server/test/data/schema_refs/collection_types/node_elements.json @@ -0,0 +1,23 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "definitions": { + "node": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["gene", "pheno"] + }, + "id": { + "type": "string", + "format": "regex", + "pattern": "^[a-zA-Z]+:\\d+$" + }, + "name": { + "type": "string" + } + }, + "required": ["type", "id"] + } + } +} diff --git a/relation_engine_server/test/data/schema_refs/collection_types/node_elements.yaml b/relation_engine_server/test/data/schema_refs/collection_types/node_elements.yaml new file mode 100644 
index 00000000..d8dcb55a --- /dev/null +++ b/relation_engine_server/test/data/schema_refs/collection_types/node_elements.yaml @@ -0,0 +1,19 @@ +$schema: 'http://json-schema.org/draft-07/schema#' +definitions: + node: + type: object + properties: + type: + type: string + enum: + - gene + - pheno + id: + type: string + format: regex + pattern: '^[a-zA-Z]+:\d+$' + name: + type: string + required: + - type + - id diff --git a/relation_engine_server/test/data/schema_refs/edge.json b/relation_engine_server/test/data/schema_refs/edge.json new file mode 100644 index 00000000..c03ab862 --- /dev/null +++ b/relation_engine_server/test/data/schema_refs/edge.json @@ -0,0 +1,35 @@ +{ + "name": "edge", + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Arabidopsis gene-gene or gene-phenotype edge", + "description": "Generic gene-to-gene or gene-to-phenotype edge for Dan Jacobson Arabidopsis data", + "type": "object", + "required": [ + "score", + "edge_type", + "_from", + "_to", + "_key" + ], + "properties": { + "_key": { + "type": "string", + "title": "Key" + }, + "_from": { + "type": "string", + "title": "Gene ID" + }, + "_to": { + "type": "string", + "title": "Gene or Phenotype ID" + }, + "score": { + "title": "Edge Score (Weight)", + "type": "number" + }, + "edge_type": { + "$ref": "level_1/edge_type.json" + } + } +} diff --git a/relation_engine_server/test/data/schema_refs/edge.yaml b/relation_engine_server/test/data/schema_refs/edge.yaml new file mode 100644 index 00000000..2c006fc4 --- /dev/null +++ b/relation_engine_server/test/data/schema_refs/edge.yaml @@ -0,0 +1,21 @@ +name: edge +"$schema": http://json-schema.org/draft-07/schema# +title: Arabidopsis gene-gene or gene-phenotype edge +description: Generic gene-to-gene or gene-to-phenotype edge for Dan Jacobson Arabidopsis data +type: object +required: [score, edge_type, _from, _to, _key] +properties: + _key: + type: string + title: Key + _from: + type: string + title: Gene ID + _to: + type: string + title: Gene or Phenotype ID + score: + title: Edge Score (Weight) + type: number + edge_type: + $ref: level_1/edge_type.yaml diff --git a/relation_engine_server/test/data/schema_refs/level_1/edge.json b/relation_engine_server/test/data/schema_refs/level_1/edge.json new file mode 100644 index 00000000..7e9939cb --- /dev/null +++ b/relation_engine_server/test/data/schema_refs/level_1/edge.json @@ -0,0 +1,35 @@ +{ + "name": "edge", + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Arabidopsis gene-gene or gene-phenotype edge", + "description": "Generic gene-to-gene or gene-to-phenotype edge for Dan Jacobson Arabidopsis data", + "type": "object", + "required": [ + "score", + "edge_type", + "_from", + "_to", + "_key" + ], + "properties": { + "_key": { + "type": "string", + "title": "Key" + }, + "_from": { + "type": "string", + "title": "Gene ID" + }, + "_to": { + "type": "string", + "title": "Gene or Phenotype ID" + }, + "score": { + "title": "Edge Score (Weight)", + "type": "number" + }, + "edge_type": { + "$ref": "edge_type.json" + } + } +} diff --git a/relation_engine_server/test/data/schema_refs/level_1/edge.yaml b/relation_engine_server/test/data/schema_refs/level_1/edge.yaml new file mode 100644 index 00000000..3009e0be --- /dev/null +++ b/relation_engine_server/test/data/schema_refs/level_1/edge.yaml @@ -0,0 +1,21 @@ +name: edge +"$schema": http://json-schema.org/draft-07/schema# +title: Arabidopsis gene-gene or gene-phenotype edge +description: Generic gene-to-gene or gene-to-phenotype edge for Dan Jacobson 
Arabidopsis data +type: object +required: [score, edge_type, _from, _to, _key] +properties: + _key: + type: string + title: Key + _from: + type: string + title: Gene ID + _to: + type: string + title: Gene or Phenotype ID + score: + title: Edge Score (Weight) + type: number + edge_type: + $ref: edge_type.yaml diff --git a/relation_engine_server/test/data/schema_refs/level_1/edge_type.json b/relation_engine_server/test/data/schema_refs/level_1/edge_type.json new file mode 100644 index 00000000..0f440d79 --- /dev/null +++ b/relation_engine_server/test/data/schema_refs/level_1/edge_type.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "name": "edge_type", + "title": "Edge Type", + "description": "Edge types in Dan Jacobson Exascale dataset", + "type": "string", + "oneOf": [ + { + "const": "domain_co_occur", + "description": "A layer of protein domain co-occurrence values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated from weighted mutual information scores to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015)." + }, + { + "const": "gene_coexpr", + "description": "A subset of pairwise gene coexpression values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated from Pearson correlation coefficients to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015)." + }, + { + "const": "pheno_assn", + "description": "GWAS associations produced by analyzing a subset of phenotypes and SNPs in the Arabidopsis 1001 Genomes database. Edge values are significant association scores after FDR correction." + }, + { + "const": "ppi_hithru", + "description": "Log likelihood score. A layer of protein-protein interaction values derived from four high-throughput PPI screening experiments; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015)." + }, + { + "const": "ppi_liter", + "description": "A layer of protein-protein interaction values from literature-curated small- to medium-scale experimental data; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015)." + } + ] +} diff --git a/relation_engine_server/test/data/schema_refs/level_1/edge_type.yaml b/relation_engine_server/test/data/schema_refs/level_1/edge_type.yaml new file mode 100644 index 00000000..15263b80 --- /dev/null +++ b/relation_engine_server/test/data/schema_refs/level_1/edge_type.yaml @@ -0,0 +1,19 @@ +$schema: "http://json-schema.org/draft-07/schema#" +name: edge_type +title: Edge Type +description: Edge types in Dan Jacobson Exascale dataset +type: string +oneOf: + - const: domain_co_occur + description: A layer of protein domain co-occurrence values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated from weighted mutual information scores to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015). + - const: gene_coexpr + description: A subset of pairwise gene coexpression values from the Arabidopsis AraNetv2 database. 
The LLS scores that serve as edge values were + calculated from Pearson correlation coefficients to normalize the data + for comparison across studies and different types of data layers (Lee et + al, 2015). + - const: pheno_assn + description: GWAS associations produced by analyzing a subset of phenotypes and SNPs in the Arabidopsis 1001 Genomes database. Edge values are significant association scores after FDR correction. + - const: ppi_hithru + description: Log likelihood score. A layer of protein-protein interaction values derived from four high-throughput PPI screening experiments; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015). + - const: ppi_liter + description: A layer of protein-protein interaction values from literature-curated small- to medium-scale experimental data; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015). diff --git a/relation_engine_server/test/data/schema_refs/level_1/level_2/edge.json b/relation_engine_server/test/data/schema_refs/level_1/level_2/edge.json new file mode 100644 index 00000000..9cd204d0 --- /dev/null +++ b/relation_engine_server/test/data/schema_refs/level_1/level_2/edge.json @@ -0,0 +1,35 @@ +{ + "name": "edge", + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Arabidopsis gene-gene or gene-phenotype edge", + "description": "Generic gene-to-gene or gene-to-phenotype edge for Dan Jacobson Arabidopsis data", + "type": "object", + "required": [ + "score", + "edge_type", + "_from", + "_to", + "_key" + ], + "properties": { + "_key": { + "type": "string", + "title": "Key" + }, + "_from": { + "type": "string", + "title": "Gene ID" + }, + "_to": { + "type": "string", + "title": "Gene or Phenotype ID" + }, + "score": { + "title": "Edge Score (Weight)", + "type": "number" + }, + "edge_type": { + "$ref": "../edge_type.json" + } + } +} diff --git a/relation_engine_server/test/data/schema_refs/level_1/level_2/edge.yaml b/relation_engine_server/test/data/schema_refs/level_1/level_2/edge.yaml new file mode 100644 index 00000000..f01cf077 --- /dev/null +++ b/relation_engine_server/test/data/schema_refs/level_1/level_2/edge.yaml @@ -0,0 +1,21 @@ +name: edge +"$schema": http://json-schema.org/draft-07/schema# +title: Arabidopsis gene-gene or gene-phenotype edge +description: Generic gene-to-gene or gene-to-phenotype edge for Dan Jacobson Arabidopsis data +type: object +required: [score, edge_type, _from, _to, _key] +properties: + _key: + type: string + title: Key + _from: + type: string + title: Gene ID + _to: + type: string + title: Gene or Phenotype ID + score: + title: Edge Score (Weight) + type: number + edge_type: + $ref: ../edge_type.yaml diff --git a/relation_engine_server/test/data/schema_refs/level_1/test_object.json b/relation_engine_server/test/data/schema_refs/level_1/test_object.json new file mode 100644 index 00000000..46e6bfaa --- /dev/null +++ b/relation_engine_server/test/data/schema_refs/level_1/test_object.json @@ -0,0 +1,19 @@ +{ + "name": "test_object", + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Test object", + "description": "Object composed from numerous different files", + "type": "object", + "required": ["edge", "node", "marks_out_of_ten"], + "properties": { + "edge": { + "$ref": "edge.json" + }, + 
"node": { + "$ref": "../../schema_refs/collection_types/node_elements.yaml#/definitions/node" + }, + "marks_out_of_ten": { + "$ref": "file:///app/relation_engine_server/test/data/schema_refs/collection_types/common_stuff.json#/definitions/marks_out_of_10" + } + } +} diff --git a/relation_engine_server/test/data/schema_refs/level_1/test_object.yaml b/relation_engine_server/test/data/schema_refs/level_1/test_object.yaml new file mode 100644 index 00000000..d6f6084b --- /dev/null +++ b/relation_engine_server/test/data/schema_refs/level_1/test_object.yaml @@ -0,0 +1,17 @@ +name: test_object +$schema: 'http://json-schema.org/draft-07/schema#' +title: Test object +description: Object composed from numerous different files +type: object +required: + - edge + - marks_out_of_ten + - node +properties: + edge: + $ref: edge.json + node: + $ref: '../../schema_refs/collection_types/node_elements.yaml#/definitions/node' + marks_out_of_ten: + $ref: >- + file:///app/relation_engine_server/test/data/schema_refs/collection_types/common_stuff.json#/definitions/marks_out_of_10 diff --git a/relation_engine_server/test/data/test_file.md b/relation_engine_server/test/data/test_file.md new file mode 100644 index 00000000..e69de29b diff --git a/relation_engine_server/test/mock_auth/auth_admin.json b/relation_engine_server/test/mock_auth/auth_admin.json new file mode 100644 index 00000000..631e5bea --- /dev/null +++ b/relation_engine_server/test/mock_auth/auth_admin.json @@ -0,0 +1,26 @@ +{ + "methods": [ + "GET" + ], + "path": "/api/V2/me", + "headers": { + "Authorization": "admin_token" + }, + "response": { + "status": "200", + "body": { + "created": 1528306100471, + "lastlogin": 1542068355002, + "display": "Test User", + "roles": [], + "customroles": [ + "RE_ADMIN" + ], + "policyids": [], + "user": "username", + "local": false, + "email": "user@example.com", + "idents": [] + } + } +} diff --git a/relation_engine_server/test/mock_auth/auth_invalid.json b/relation_engine_server/test/mock_auth/auth_invalid.json new file mode 100644 index 00000000..e74e7269 --- /dev/null +++ b/relation_engine_server/test/mock_auth/auth_invalid.json @@ -0,0 +1,23 @@ +{ + "methods": [ + "GET" + ], + "path": "/api/V2/me", + "headers": { + "Authorization": "invalid_token" + }, + "response": { + "status": "401", + "body": { + "error": { + "httpcode": 401, + "httpstatus": "Unauthorized", + "appcode": 10020, + "apperror": "Invalid token", + "message": "10020 Invalid token", + "callid": "1757210147564211", + "time": 1542737889450 + } + } + } +} diff --git a/relation_engine_server/test/mock_auth/auth_missing.json b/relation_engine_server/test/mock_auth/auth_missing.json new file mode 100644 index 00000000..1661807c --- /dev/null +++ b/relation_engine_server/test/mock_auth/auth_missing.json @@ -0,0 +1,21 @@ +{ + "methods": [ + "GET" + ], + "path": "/api/V2/me", + "headers": {"Authorization": ""}, + "response": { + "status": "400", + "body": { + "error": { + "httpcode": 400, + "httpstatus": "Bad Request", + "appcode": 10010, + "apperror": "No authentication token", + "message": "10010 No authentication token: No user token provided", + "callid": "7334881776774415", + "time": 1542737656377 + } + } + } +} diff --git a/relation_engine_server/test/mock_auth/auth_non_admin.json b/relation_engine_server/test/mock_auth/auth_non_admin.json new file mode 100644 index 00000000..9236f450 --- /dev/null +++ b/relation_engine_server/test/mock_auth/auth_non_admin.json @@ -0,0 +1,24 @@ +{ + "methods": [ + "GET" + ], + "path": "/api/V2/me", + "headers": { + 
"Authorization": "non_admin_token" + }, + "response": { + "status": "200", + "body": { + "created": 1528306100471, + "lastlogin": 1542068355002, + "display": "Test User", + "roles": [], + "customroles": [], + "policyids": [], + "user": "username", + "local": false, + "email": "user@example.com", + "idents": [] + } + } +} diff --git a/relation_engine_server/test/mock_workspace/list_workspace_ids_invalid.json b/relation_engine_server/test/mock_workspace/list_workspace_ids_invalid.json new file mode 100644 index 00000000..89100454 --- /dev/null +++ b/relation_engine_server/test/mock_workspace/list_workspace_ids_invalid.json @@ -0,0 +1,22 @@ +{ + "methods": ["POST"], + "path": "/", + "headers": {"Authorization": "invalid_token"}, + "body": { + "method": "Workspace.list_workspace_ids", + "version": "1.1", + "params": [{"perm": "r"}] + }, + "response": { + "status": "500", + "body": { + "version": "1.1", + "error": { + "name": "JSONRPCError", + "code": -32400, + "message": "Token validation failed!", + "error": "..." + } + } + } +} diff --git a/relation_engine_server/test/mock_workspace/list_workspace_ids_valid.json b/relation_engine_server/test/mock_workspace/list_workspace_ids_valid.json new file mode 100644 index 00000000..0c879099 --- /dev/null +++ b/relation_engine_server/test/mock_workspace/list_workspace_ids_valid.json @@ -0,0 +1,22 @@ +{ + "methods": ["POST"], + "path": "/", + "headers": {"Authorization": "valid_token"}, + "body": { + "method": "Workspace.list_workspace_ids", + "version": "1.1", + "params": [{"perm": "r"}] + }, + "response": { + "status": "200", + "body": { + "version": "1.1", + "result": [ + { + "workspaces": [1, 2, 3], + "pub": [] + } + ] + } + } +} diff --git a/relation_engine_server/test/mock_workspace/list_workspace_ids_valid2.json b/relation_engine_server/test/mock_workspace/list_workspace_ids_valid2.json new file mode 100644 index 00000000..0c4ac18a --- /dev/null +++ b/relation_engine_server/test/mock_workspace/list_workspace_ids_valid2.json @@ -0,0 +1,17 @@ +{ + "methods": ["POST"], + "path": "/", + "headers": {"Authorization": "admin_token"}, + "body": { + "method": "Workspace.list_workspace_ids", + "version": "1.1", + "params": [{"perm": "r"}] + }, + "response": { + "status": "200", + "body": { + "version": "1.1", + "result": [{"workspaces": [99], "pub": []}] + } + } +} diff --git a/relation_engine_server/test/spec_release/README.md b/relation_engine_server/test/spec_release/README.md new file mode 100644 index 00000000..1a5a479a --- /dev/null +++ b/relation_engine_server/test/spec_release/README.md @@ -0,0 +1,3 @@ +## Test Spec Release + +`sample_spec_release`, and the corresponding archive, `spec.tar.gz`, contain a set of sample schema files suitable for use in tests. diff --git a/relation_engine_server/test/spec_release/sample_spec_release/spec/collections/ncbi/ncbi_taxon.yaml b/relation_engine_server/test/spec_release/sample_spec_release/spec/collections/ncbi/ncbi_taxon.yaml new file mode 100644 index 00000000..18810eba --- /dev/null +++ b/relation_engine_server/test/spec_release/sample_spec_release/spec/collections/ncbi/ncbi_taxon.yaml @@ -0,0 +1,65 @@ +name: ncbi_taxon +type: vertex +delta: true + +indexes: + - type: fulltext + fields: [scientific_name] + minLength: 1 + - type: persistent + fields: [id, expired, created] + - type: persistent + fields: [expired, created, last_version] + +schema: + "$schema": http://json-schema.org/draft-07/schema# + type: object + description: Template for a vertex entry in the NCBI taxonomy tree. 
+ required: [id, scientific_name, rank, strain] + properties: + id: + type: string + description: NCBI Taxon ID (positive integer) + examples: ['1', '2053699'] + scientific_name: + type: string + title: Taxon name + examples: ['Methylophilus methylotrophus', 'Bacteria', 'Firmicutes'] + aliases: + type: array + description: Aliases + examples: + - - category: authority + name: Borreliella burgdorferi (Johnson et al. 1984) Adeolu and Gupta 2015 + - category: genbank common name + name: Lyme disease spirochete + - category: synonym + name: Borrelia burgdorferi + - - category: common name + name: E. coli + - category: authority + name: '"Bacterium coli commune" Escherich 1885' + - category: synonym + name: Bacterium coli + items: + type: object + required: ['category', 'name'] + properties: + category: {type: string} + name: {type: string} + rank: + type: string + title: Taxonomic rank + examples: ["Domain", "Phylum", "no rank"] + strain: + type: boolean + title: Strain flag + description: Whether this node corresponds to a strain. Strains are considered to be nodes + that have a rank of "no rank" and whose parent's rank is either species or subspecies, or + where the parent's strain flag is true. + ncbi_taxon_id: + type: integer + title: The NCBI taxon ID as a number + gencode: + type: integer + title: The numeric ID of the genetic code for this organism diff --git a/relation_engine_server/test/spec_release/sample_spec_release/spec/collections/test/test_edge.yaml b/relation_engine_server/test/spec_release/sample_spec_release/spec/collections/test/test_edge.yaml new file mode 100644 index 00000000..fab7ad6e --- /dev/null +++ b/relation_engine_server/test/spec_release/sample_spec_release/spec/collections/test/test_edge.yaml @@ -0,0 +1,10 @@ +name: test_edge +type: edge +schema: + "$schema": "http://json-schema.org/draft-07/schema#" + type: object + required: [_from, _to] + description: Example edge schema for testing.
+ properties: + _from: {type: string} + _to: {type: string} diff --git a/relation_engine_server/test/spec_release/sample_spec_release/spec/collections/test/test_vertex.yaml b/relation_engine_server/test/spec_release/sample_spec_release/spec/collections/test/test_vertex.yaml new file mode 100644 index 00000000..b2d34668 --- /dev/null +++ b/relation_engine_server/test/spec_release/sample_spec_release/spec/collections/test/test_vertex.yaml @@ -0,0 +1,11 @@ +name: test_vertex +type: vertex +schema: + "$schema": "http://json-schema.org/draft-07/schema#" + type: object + required: [_key] + description: An example vertex schema for testing + properties: + _key: {type: string} + is_public: {type: boolean} + ws_id: {type: integer} diff --git a/relation_engine_server/test/spec_release/sample_spec_release/spec/data_sources/ncbi_taxonomy.yaml b/relation_engine_server/test/spec_release/sample_spec_release/spec/data_sources/ncbi_taxonomy.yaml new file mode 100644 index 00000000..37a88195 --- /dev/null +++ b/relation_engine_server/test/spec_release/sample_spec_release/spec/data_sources/ncbi_taxonomy.yaml @@ -0,0 +1,6 @@ +name: ncbi_taxonomy +category: taxonomy +title: NCBI Taxonomy +home_url: https://www.ncbi.nlm.nih.gov/taxonomy +data_url: ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/ +logo_path: /images/third-party-data-sources/ncbi/logo-51-64.png diff --git a/relation_engine_server/test/spec_release/sample_spec_release/spec/migrations/__init__.py b/relation_engine_server/test/spec_release/sample_spec_release/spec/migrations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/relation_engine_server/test/spec_release/sample_spec_release/spec/migrations/example.py b/relation_engine_server/test/spec_release/sample_spec_release/spec/migrations/example.py new file mode 100644 index 00000000..ce5ce389 --- /dev/null +++ b/relation_engine_server/test/spec_release/sample_spec_release/spec/migrations/example.py @@ -0,0 +1,3 @@ +# TODO + +x = 1 diff --git a/relation_engine_server/test/spec_release/sample_spec_release/spec/stored_queries/ncbi_tax/ncbi_fetch_taxon.yaml b/relation_engine_server/test/spec_release/sample_spec_release/spec/stored_queries/ncbi_tax/ncbi_fetch_taxon.yaml new file mode 100644 index 00000000..3a9c4170 --- /dev/null +++ b/relation_engine_server/test/spec_release/sample_spec_release/spec/stored_queries/ncbi_tax/ncbi_fetch_taxon.yaml @@ -0,0 +1,18 @@ +# Fetch a taxon document by taxonomy ID +name: ncbi_fetch_taxon +params: + type: object + required: [id, ts] + properties: + id: + type: string + title: NCBI Taxonomy ID + ts: + type: integer + title: Versioning timestamp +query: | + for t in ncbi_taxon + filter t.id == @id + filter t.created <= @ts AND t.expired >= @ts + limit 1 + return t diff --git a/relation_engine_server/test/spec_release/sample_spec_release/spec/stored_queries/test/fetch_test_vertex.yaml b/relation_engine_server/test/spec_release/sample_spec_release/spec/stored_queries/test/fetch_test_vertex.yaml new file mode 100644 index 00000000..8845f4a1 --- /dev/null +++ b/relation_engine_server/test/spec_release/sample_spec_release/spec/stored_queries/test/fetch_test_vertex.yaml @@ -0,0 +1,13 @@ +# Test query - fetch a single test vertex by ID +name: fetch_test_vertex +params: + type: object + required: [key] + properties: + key: + type: string + title: _key to match on +query: | + FOR o IN test_vertex + FILTER o._key == @key + RETURN o diff --git a/relation_engine_server/test/spec_release/sample_spec_release/spec/stored_queries/test/list_test_vertices.yaml 
b/relation_engine_server/test/spec_release/sample_spec_release/spec/stored_queries/test/list_test_vertices.yaml new file mode 100644 index 00000000..5d027d78 --- /dev/null +++ b/relation_engine_server/test/spec_release/sample_spec_release/spec/stored_queries/test/list_test_vertices.yaml @@ -0,0 +1,7 @@ +# Test query - List all test vertices +# Has some simple auth against ws_ids +name: list_test_vertices +query: | + FOR o IN test_vertex + FILTER o.is_public || o.ws_id IN ws_ids + RETURN o diff --git a/relation_engine_server/test/spec_release/sample_spec_release/spec/views/test_vertices.json b/relation_engine_server/test/spec_release/sample_spec_release/spec/views/test_vertices.json new file mode 100644 index 00000000..d45c3731 --- /dev/null +++ b/relation_engine_server/test/spec_release/sample_spec_release/spec/views/test_vertices.json @@ -0,0 +1,34 @@ +{ + "name": "test_vertices", + "type": "arangosearch", + "writebufferIdle": 64, + "writebufferActive": 0, + "primarySort": [], + "writebufferSizeMax": 33554432, + "commitIntervalMsec": 1000, + "consolidationPolicy": { + "type": "bytes_accum", + "threshold": 0.1 + }, + "cleanupIntervalStep": 10, + "links": { + "test_vertex": { + "analyzers": [ + "identity" + ], + "fields": { + "_key": { + "analyzers": [ + "text_en" + ] + }, + "is_public": {}, + "ws_id": {} + }, + "includeAllFields": false, + "storeValues": "none", + "trackListPositions": false + } + }, + "consolidationIntervalMsec": 60000 +} diff --git a/relation_engine_server/test/test_api_v1.py b/relation_engine_server/test/test_api_v1.py new file mode 100644 index 00000000..113471f5 --- /dev/null +++ b/relation_engine_server/test/test_api_v1.py @@ -0,0 +1,913 @@ +""" +Simple integration tests on the API itself. + +These tests run within the re_api docker image, and require access to the ArangoDB, auth, and workspace images. 
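+The mock_auth and mock_workspace JSON fixtures above define the stubbed responses those services return.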
+""" +import unittest +import requests +import json +import os + +from relation_engine_server.utils.config import get_config +from relation_engine_server.utils.wait_for import wait_for_api +from spec.test.test_ensure_specs import ensure_borked_indexes + +_CONF = get_config() + +# Use the mock auth tokens +NON_ADMIN_TOKEN = "non_admin_token" +ADMIN_TOKEN = "admin_token" +INVALID_TOKEN = "invalid_token" + +# Use the docker-compose url of the running flask server +URL = os.environ.get("TEST_URL", "http://localhost:5000") +VERSION = "v1" +API_URL = "/".join([URL, "api", VERSION]) + +HEADERS_NON_ADMIN = { + "Authorization": "Bearer " + NON_ADMIN_TOKEN, + "Content-Type": "application/json", +} +HEADERS_ADMIN = { + "Authorization": "Bearer " + ADMIN_TOKEN, + "Content-Type": "application/json", +} + + +def create_test_docs(count): + """Produce some test documents.""" + + def doc(i): + return '{"name": "name", "_key": "%s", "is_public": true}' % i + + return "\n".join(doc(i) for i in range(0, count)) + + +def create_test_edges(count): + """Produce some test edges.""" + + def doc(i): + return '{"_from": "test_vertex/%s", "_to": "test_vertex/%s"}' % (i, i) + + return "\n".join(doc(i) for i in range(0, count)) + + +def save_test_docs(count, edges=False): + if edges: + docs = create_test_edges(count) + collection = "test_edge" + else: + docs = create_test_docs(count) + collection = "test_vertex" + return requests.put( + API_URL + "/documents", + params={"overwrite": True, "collection": collection}, + data=docs, + headers=HEADERS_ADMIN, + ).json() + + +class TestApi(unittest.TestCase): + @classmethod + def setUpClass(cls): + wait_for_api() + cls.maxDiff = None + + def test_request( + self, + url=None, + params=None, + data=None, + headers=None, + method="get", + status_code=200, + resp_json=None, + resp_test=None, + ): + """test a request to the server + + arguments: + url url to be appended to API_URL (i.e. 
request will be made to API_URL + url) + params request parameters + data query data, encoded as JSON + method HTTP method; defaults to 'get' + status_code expected response status; defaults to 200 + resp_json expected response content (JSON) + resp_test a function to perform on the response to test that it is as expected + """ + + # this method should only be run from another test method + if url is None: + self.assertTrue(True) + return + + resp = requests.request( + method, + API_URL + url, + params=params, + data=data, + headers=headers, + ) + self.assertEqual(resp.status_code, status_code) + if resp_json: + self.assertEqual(resp_json, resp.json()) + + if resp_test: + resp_test(self, resp) + + def test_root(self): + """Test root path for api.""" + resp_json = requests.get(URL + "/").json() + self.assertEqual(resp_json["arangodb_status"], "connected_authorized") + self.assertTrue(resp_json["commit_hash"]) + self.assertTrue(resp_json["repo_url"]) + + def test_config(self): + """Test config fetch.""" + resp_json = requests.get(API_URL + "/config").json() + self.assertTrue(len(resp_json["auth_url"])) + self.assertTrue(len(resp_json["workspace_url"])) + self.assertTrue(len(resp_json["kbase_endpoint"])) + self.assertTrue(len(resp_json["db_url"])) + self.assertTrue(len(resp_json["db_name"])) + + def test_update_specs(self): + """Test the endpoint that triggers an update on the specs.""" + resp = requests.put( + API_URL + "/specs", + headers=HEADERS_ADMIN, + params={"reset": "1", "init_collections": "1"}, + ) + resp_json = resp.json() + self.assertEqual(resp.status_code, 200) + self.assertEqual(resp_json["status"], "updated") + self.assertEqual( + resp_json["updated_from"], + "/app/relation_engine_server/test/spec_release/spec.tar.gz", + ) + + # delete the SPEC_TEST_READY env var as it is no longer true + os.environ.pop("SPEC_TEST_READY", None) + + # Test that the indexes get created and not duplicated + url = _CONF["db_url"] + "/_api/index" + auth = (_CONF["db_user"], _CONF["db_pass"]) + resp = requests.get(url, params={"collection": "ncbi_taxon"}, auth=auth) + resp_json = resp.json() + indexes = resp_json["indexes"] + self.assertEqual(len(indexes), 4) + fields = [i["fields"] for i in indexes] + self.assertEqual( + set(tuple(f) for f in fields), + { + ("_key",), + ("scientific_name",), + ("id", "expired", "created"), + ("expired", "created", "last_version"), + }, + ) + + def check_list_contains(self, the_list, must_contain): + """ensure the_list contains the items in must_contain""" + for item in must_contain: + self.assertIn(item, the_list) + + def test_list_collections(self): + """Test the listing out of registered collection schemas for vertices and edges.""" + for variant in ["schemas", "collections"]: + + def check_resp_json_contains(self, resp): + resp_json = resp.json() + self.check_list_contains( + resp_json, ["test_edge", "test_vertex", "ncbi_taxon"] + ) + + self.test_request("/specs/" + variant, resp_test=check_resp_json_contains) + + def test_list_data_sources(self): + """test the data source listing endpoints""" + + # there are two different data_sources endpoints that return very similar results + # /data_sources is used by the UI and requires slightly different response formatting + # /specs/data_sources is in the standard /specs format used by collections and stored_queries + + data_sources = ["ncbi_taxonomy"] + + # /spec/data_sources endpoint + def check_resp_json_spec_endpoint(self, resp): + resp_json = resp.json() + self.check_list_contains( + resp_json, + data_sources, + ) 
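+ + # the UI-oriented /data_sources endpoint is exercised separately in test_show_data_sources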
+ + self.test_request( + "/specs/data_sources", resp_test=check_resp_json_spec_endpoint + ) + + def test_list_stored_queries(self): + """Test the listing out of saved AQL stored queries.""" + + def check_resp_json_contains(self, resp): + resp_json = resp.json() + self.check_list_contains( + resp_json, + ["fetch_test_vertex", "list_test_vertices", "ncbi_fetch_taxon"], + ) + + self.test_request( + "/specs/stored_queries", + resp_test=check_resp_json_contains, + ) + + def test_fetch_collection_and_fetch_schema_for_doc(self): + """Given a collection name or a document ID, fetch its schema.""" + + name = "test_vertex" + collection_params = {"name": name} # valid collection + document_params = {"doc_id": name + "/123"} # valid document + + def check_resp_json(self, resp): + resp_json = resp.json() + self.assertEqual(resp_json["name"], name) + self.assertEqual(resp_json["type"], "vertex") + self.assertTrue(resp_json["schema"]) + + for variant in ["schemas", "collections"]: + for params in [document_params, collection_params]: + self.test_request( + "/specs/" + variant, + params=params, + resp_test=check_resp_json, + ) + + def test_fetch_data_source(self): + """fetch a data source by name""" + + name = "ncbi_taxonomy" + + def check_resp_json(self, resp): + resp_json = resp.json() + self.assertEqual(type(resp_json), dict) + self.assertEqual( + set(resp_json.keys()), + {"name", "category", "title", "home_url", "data_url", "logo_url"}, + ) + self.assertTrue( + "/ui-assets/images/third-party-data-sources/ncbi" + in resp_json["logo_url"] + ) + + self.test_request( + "/specs/data_sources", {"name": name}, resp_test=check_resp_json + ) + + def test_fetch_stored_query(self): + """fetch a stored query by name""" + + name = "fetch_test_vertex" + + # note that the stored_queries endpoint returns the query data in a dict + # under the key 'stored_query' + def check_resp_json(self, resp): + resp_json = resp.json() + self.assertEqual(type(resp_json["stored_query"]), dict) + self.assertEqual(resp_json["stored_query"]["name"], name) + self.assertEqual( + set(resp_json["stored_query"].keys()), {"name", "query", "params"} + ) + + self.test_request( + "/specs/stored_queries", {"name": name}, resp_test=check_resp_json + ) + + def test_fetch_invalid_data_source(self): + """Unknown data source name should yield 404 status.""" + + name = "invalid_data_source" + self.test_request( + "/specs/data_sources", + {"name": name}, + status_code=404, + resp_json={ + "error": { + "message": "Not found", + "details": f"Data source '{name}' does not exist.", + "name": name, + } + }, + ) + + def test_fetch_invalid_collections_and_documents(self): + """Test the case where the collection or document does not exist.""" + + name = "fake_collection" + collection_params = {"name": name} # fetch an invalid collection + document_params = {"doc_id": name + "/123"} # fetch an invalid document + for variant in ["schemas", "collections"]: + for params in [document_params, collection_params]: + + self.test_request( + "/specs/" + variant, + params=params, + status_code=404, + resp_json={ + "error": { + "message": "Not found", + "details": f"Collection '{name}' does not exist.", + "name": name, + } + }, + ) + + def test_fetch_invalid_stored_queries(self): + """Test the case where the stored query does not exist.""" + + name = "made_up_stored_query" + self.test_request( + "/specs/stored_queries", + params={"name": name}, + status_code=404, + resp_json={ + "error": { + "message": "Not found", + "details": f"Stored query '{name}' does not exist.", + 
"name": name, + } + }, + ) + + def test_ensure_specs(self): + """Test endpoint for testing local specs against server specs""" + self.test_request( + "/ensure_specs", + resp_json={ + "indexes": [], + "views": [], + "analyzers": [], + }, + ) + + @unittest.skip("TODO - DELETE index") + def test_ensure_specs_fail(self): + self.test_request( + "/ensure_specs", + status_code=500, + resp_json={ + "indexes": ensure_borked_indexes()[0], + "views": [], + "analyzers": [], + }, + ) + + def test_show_data_sources(self): + resp = requests.get(API_URL + "/data_sources") + self.assertTrue(resp.ok) + resp_json = resp.json() + self.assertTrue(len(resp_json["data_sources"]) > 0) + self.assertEqual(set(type(x) for x in resp_json["data_sources"]), {str}) + + def test_show_data_source(self): + + name = "ncbi_taxonomy" + + def check_resp_json(self, resp): + resp_json = resp.json() + self.assertEqual(type(resp_json["data_source"]), dict) + self.assertEqual( + set(resp_json["data_source"].keys()), + {"name", "category", "title", "home_url", "data_url", "logo_url"}, + ) + self.assertTrue( + "/ui-assets/images/third-party-data-sources/ncbi" + in resp_json["data_source"]["logo_url"] + ) + + self.test_request("/data_sources/" + name, resp_test=check_resp_json) + + resp = requests.get(API_URL + "/data_sources/ncbi_taxonomy") + self.assertTrue(resp.ok) + resp_json = resp.json() + self.assertEqual(type(resp_json["data_source"]), dict) + self.assertEqual( + set(resp_json["data_source"].keys()), + {"name", "category", "title", "home_url", "data_url", "logo_url"}, + ) + self.assertTrue( + "/ui-assets/images/third-party-data-sources/ncbi" + in resp_json["data_source"]["logo_url"] + ) + + def test_show_data_source_unknown(self): + """Unknown data source name should yield 404 status.""" + name = "xyzyxz" + + self.test_request( + f"/data_sources/{name}", + status_code=404, + resp_json={ + "error": { + "message": "Not found", + "details": f"Data source '{name}' does not exist.", + "name": name, + } + }, + ) + + def test_save_documents_missing_auth(self): + """Test an invalid attempt to save a doc with a missing auth token.""" + self.test_request( + "/documents?on_duplicate=error&overwrite=true&collection", + method="put", + status_code=400, + resp_json={"error": {"message": "Missing header: Authorization"}}, + ) + + def test_save_documents_invalid_auth(self): + """Test an invalid attempt to save a doc with a bad auth token.""" + + # see ./mock_auth/auth_invalid.json for the response + auth_response = { + "error": { + "httpcode": 401, + "httpstatus": "Unauthorized", + "appcode": 10020, + "apperror": "Invalid token", + "message": "10020 Invalid token", + "callid": "1757210147564211", + "time": 1542737889450, + } + } + + self.test_request( + "/documents?on_duplicate=error&overwrite=true&collection", + headers={"Authorization": "Bearer " + INVALID_TOKEN}, + method="put", + status_code=403, + resp_json={ + "error": { + "message": "Unauthorized", + "auth_url": "http://auth:5000", + "auth_response": json.dumps(auth_response), + } + }, + ) + + def test_save_documents_non_admin(self): + """Test an invalid attempt to save a doc as a non-admin.""" + self.test_request( + "/documents?on_duplicate=error&overwrite=true&collection", + headers=HEADERS_NON_ADMIN, + method="put", + status_code=403, + resp_json={ + "error": { + "auth_response": "Missing role", + "auth_url": "http://auth:5000", + "message": "Unauthorized", + } + }, + ) + + def test_save_documents_invalid_schema(self): + """Test the case where some documents fail against their 
schema.""" + + self.test_request( + "/documents", + params={"on_duplicate": "ignore", "collection": "test_vertex"}, + data='{"name": "x"}\n{"name": "y"}', + headers=HEADERS_ADMIN, + method="put", + status_code=400, + resp_json={ + "error": { + "message": "'_key' is a required property", + "value": {"name": "x"}, + "path": [], + "failed_validator": "required", + } + }, + ) + + def test_save_documents_missing_schema(self): + """Test the case where the collection/schema does not exist.""" + + name = "fake_collection" + self.test_request( + "/documents", + method="put", + params={"collection": name}, + data="", + headers=HEADERS_ADMIN, + status_code=404, + resp_json={ + "error": { + "message": "Not found", + "details": f"Collection '{name}' does not exist.", + "name": name, + } + }, + ) + + def test_save_documents_invalid_json(self): + """Test an attempt to save documents with an invalid JSON body.""" + resp_json = requests.put( + API_URL + "/documents", + params={"collection": "test_vertex"}, + data="\n", + headers=HEADERS_ADMIN, + ).json() + self.assertTrue("Unable to parse" in resp_json["error"]["message"]) + self.assertEqual(resp_json["error"]["pos"], 1) + self.assertEqual(resp_json["error"]["source_json"], "\n") + + def test_create_documents(self): + """Test all valid cases for saving documents.""" + resp = save_test_docs(3) + expected = { + "created": 3, + "errors": 0, + "empty": 0, + "updated": 0, + "ignored": 0, + "error": False, + } + self.assertEqual(resp, expected) + + def test_create_edges(self): + """Test all valid cases for saving edges.""" + resp = save_test_docs(3, edges=True) + expected = { + "created": 3, + "errors": 0, + "empty": 0, + "updated": 0, + "ignored": 0, + "error": False, + } + self.assertEqual(resp, expected) + + def test_update_documents(self): + """Test updating existing documents.""" + resp_json = requests.put( + API_URL + "/documents", + params={"on_duplicate": "update", "collection": "test_vertex"}, + data=create_test_docs(3), + headers=HEADERS_ADMIN, + ).json() + expected = { + "created": 0, + "errors": 0, + "empty": 0, + "updated": 3, + "ignored": 0, + "error": False, + } + self.assertEqual(resp_json, expected) + + def test_update_edge(self): + """Test updating existing edge.""" + edges = create_test_edges(3) + resp = requests.put( + API_URL + "/documents", + params={"on_duplicate": "update", "collection": "test_edge"}, + data=create_test_edges(3), + headers=HEADERS_ADMIN, + ) + self.assertTrue(resp.ok) + resp_json = requests.put( + API_URL + "/documents", + params={"on_duplicate": "update", "collection": "test_edge"}, + data=edges, + headers=HEADERS_ADMIN, + ).json() + expected = { + "created": 0, + "errors": 0, + "empty": 0, + "updated": 3, + "ignored": 0, + "error": False, + } + self.assertEqual(resp_json, expected) + + def test_replace_documents(self): + """Test replacing of existing documents.""" + resp_json = requests.put( + API_URL + "/documents", + params={"on_duplicate": "replace", "collection": "test_vertex"}, + data=create_test_docs(3), + headers=HEADERS_ADMIN, + ).json() + expected = { + "created": 0, + "errors": 0, + "empty": 0, + "updated": 3, + "ignored": 0, + "error": False, + } + self.assertEqual(resp_json, expected) + + def test_save_documents_dupe_errors(self): + """Test where we want to raise errors on duplicate documents.""" + save_test_docs(3) + resp_json = requests.put( + API_URL + "/documents", + params={ + "on_duplicate": "error", + "collection": "test_vertex", + "display_errors": "1", + }, + data=create_test_docs(3), + 
headers=HEADERS_ADMIN,
+        ).json()
+        self.assertEqual(resp_json["created"], 0)
+        self.assertEqual(resp_json["errors"], 3)
+        self.assertTrue(resp_json["details"])
+
+    def test_save_documents_ignore_dupes(self):
+        """Test ignoring duplicate, existing documents when saving."""
+        resp_json = requests.put(
+            API_URL + "/documents",
+            params={"on_duplicate": "ignore", "collection": "test_vertex"},
+            data=create_test_docs(3),
+            headers=HEADERS_ADMIN,
+        ).json()
+        expected = {
+            "created": 0,
+            "errors": 0,
+            "empty": 0,
+            "updated": 0,
+            "ignored": 3,
+            "error": False,
+        }
+        self.assertEqual(resp_json, expected)
+
+    def test_admin_query(self):
+        """Test an ad-hoc query made by an admin."""
+        save_test_docs(1)
+        query = "for v in test_vertex sort rand() limit @count return v._id"
+        resp_json = requests.post(
+            API_URL + "/query_results",
+            params={},
+            headers=HEADERS_ADMIN,
+            data=json.dumps({"query": query, "count": 1}),
+        ).json()
+        self.assertEqual(resp_json["count"], 1)
+        self.assertEqual(len(resp_json["results"]), 1)
+
+    def test_admin_query_non_admin(self):
+        """Test an ad-hoc query error as a non-admin."""
+        query = "for v in test_vertex sort rand() limit @count return v._id"
+        self.test_request(
+            "/query_results",
+            method="post",
+            params={},
+            headers=HEADERS_NON_ADMIN,
+            data=json.dumps({"query": query, "count": 1}),
+            status_code=403,
+            resp_json={
+                "error": {
+                    "message": "Unauthorized",
+                    "auth_url": "http://auth:5000",
+                    "auth_response": "Missing role",
+                }
+            },
+        )
+
+    def test_admin_query_invalid_auth(self):
+        """Test the error response for an ad-hoc admin query without auth."""
+
+        # see ./mock_auth/auth_invalid.json for response
+        query = "for v in test_vertex sort rand() limit @count return v._id"
+        self.test_request(
+            "/query_results",
+            method="post",
+            params={},
+            headers={"Authorization": INVALID_TOKEN},
+            data=json.dumps({"query": query, "count": 1}),
+            status_code=403,
+            resp_json={
+                "error": {
+                    "message": "Unauthorized",
+                    "auth_url": "http://auth:5000",
+                    "auth_response": json.dumps(
+                        {
+                            "error": {
+                                "httpcode": 401,
+                                "httpstatus": "Unauthorized",
+                                "appcode": 10020,
+                                "apperror": "Invalid token",
+                                "message": "10020 Invalid token",
+                                "callid": "1757210147564211",
+                                "time": 1542737889450,
+                            }
+                        }
+                    ),
+                }
+            },
+        )
+
+    def test_query_with_cursor(self):
+        """Test getting more data via a query cursor and setting batch size."""
+        save_test_docs(count=20)
+        resp_json = requests.post(
+            API_URL + "/query_results",
+            params={
+                "stored_query": "list_test_vertices",
+                "batch_size": 10,
+                "full_count": True,
+            },
+        ).json()
+        self.assertTrue(resp_json["cursor_id"])
+        self.assertEqual(resp_json["has_more"], True)
+        self.assertEqual(resp_json["count"], 20)
+        self.assertEqual(resp_json["stats"]["fullCount"], 20)
+        self.assertEqual(len(resp_json["results"]), 10)
+
+        cursor_id = resp_json["cursor_id"]
+        resp_json = requests.post(
+            API_URL + "/query_results", params={"cursor_id": cursor_id}
+        ).json()
+        self.assertEqual(resp_json["count"], 20)
+        self.assertEqual(resp_json["stats"]["fullCount"], 20)
+        self.assertEqual(resp_json["has_more"], False)
+        self.assertEqual(resp_json["cursor_id"], None)
+        self.assertEqual(len(resp_json["results"]), 10)
+
+        # Try to get the same cursor again
+        self.test_request(
+            "/query_results",
+            method="post",
+            params={"cursor_id": cursor_id},
+            status_code=400,
+            resp_json={
+                "error": {
+                    "message": "ArangoDB server error.",
+                    "arango_message": "cursor not found",
+                }
+            },
+        )
+
+    def test_query_no_name(self):
+        """Test a query error with a 
stored query name that does not exist.""" + + name = "nonexistent" + self.test_request( + "/query_results", + method="post", + params={"stored_query": name}, + status_code=404, + resp_json={ + "error": { + "message": "Not found", + "details": f"Stored query '{name}' does not exist.", + "name": name, + } + }, + ) + + def test_query_missing_bind_var(self): + """Test a query error with a missing bind variable.""" + + arango_msg = ( + "AQL: bind parameter 'xyz' was not declared in the query (while parsing)" + ) + self.test_request( + "/query_results", + method="post", + params={"stored_query": "list_test_vertices"}, + data=json.dumps({"xyz": "test_vertex"}), + status_code=400, + resp_json={ + "error": { + "message": "ArangoDB server error.", + "arango_message": arango_msg, + } + }, + ) + + def test_auth_query_with_access(self): + """Test the case where we query a collection with specific workspace access.""" + ws_id = 3 + # Remove all test vertices and create one with a ws_id + requests.put( + API_URL + "/documents", + params={"overwrite": True, "collection": "test_vertex"}, + data=json.dumps({"name": "requires_auth", "_key": "123", "ws_id": ws_id}), + headers=HEADERS_ADMIN, + ) + resp_json = requests.post( + API_URL + "/query_results", + params={"stored_query": "list_test_vertices"}, + headers={ + "Authorization": "valid_token" + }, # see ./mock_workspace/endpoints.json + ).json() + self.assertEqual(resp_json["count"], 1) + self.assertEqual(resp_json["results"][0]["ws_id"], ws_id) + + def test_auth_query_no_access(self): + """Test the case where we try to query a collection without the right workspace access.""" + # Remove all test vertices and create one with a ws_id + requests.put( + API_URL + "/documents", + params={"overwrite": True, "collection": "test_vertex"}, + data='{"name": "requires_auth", "_key": "1", "ws_id": 9999}', + headers=HEADERS_ADMIN, + ) + resp_json = requests.post( + API_URL + "/query_results", + params={"stored_query": "list_test_vertices"}, + headers={ + "Authorization": "valid_token" + }, # see ./mock_workspace/endpoints.json + ).json() + self.assertEqual(resp_json["count"], 0) + + def test_query_cannot_pass_ws_ids(self): + """Test that users cannot set the ws_ids param.""" + ws_id = 99 + requests.put( + API_URL + "/documents", + params={"overwrite": True, "collection": "test_vertex"}, + data='{"name": "requires_auth", "_key": "1", "ws_id": 99}', + headers=HEADERS_ADMIN, + ) + resp_json = requests.post( + API_URL + "/query_results", + params={"view": "list_test_vertices"}, + data=json.dumps({"ws_ids": [ws_id]}), + headers={"Authorization": "valid_token"}, + ).json() + self.assertEqual(resp_json["count"], 0) + + def test_auth_query_invalid_token(self): + """Test the case where we try to authorize a query using an invalid auth token.""" + requests.put( + API_URL + "/documents", + params={"overwrite": True, "collection": "test_vertex"}, + data='{"name": "requires_auth", "_key": "1", "ws_id": 99}', + headers=HEADERS_ADMIN, + ) + + # see ./mock_workspace/list_workspace_ids_invalid.json for response + self.test_request( + "/query_results", + params={"view": "list_test_vertices"}, + data=json.dumps({"ws_ids": [1]}), + headers={"Authorization": INVALID_TOKEN}, + method="post", + status_code=403, + resp_json={ + "error": { + "message": "Unauthorized", + "auth_url": "http://workspace:5000", + "auth_response": json.dumps( + { + "version": "1.1", + "error": { + "name": "JSONRPCError", + "code": -32400, + "message": "Token validation failed!", + "error": "...", + }, + } + ), + } + 
}, + ) + + def test_auth_adhoc_query(self): + """Test that the 'ws_ids' bind-var is set for RE_ADMINs.""" + ws_id = 99 + requests.put( + API_URL + "/documents", + params={"overwrite": True, "collection": "test_vertex"}, + data=json.dumps({"name": "requires_auth", "key": "1", "ws_id": ws_id}), + headers={"Authorization": "valid_token"}, + ) + # This is the same query as list_test_vertices.aql in the spec + query = "for o in test_vertex filter o.is_public || o.ws_id IN ws_ids return o" + resp_json = requests.post( + API_URL + "/query_results", + data=json.dumps({"query": query}), + headers={ + "Authorization": ADMIN_TOKEN + }, # see ./mock_workspace/endpoints.json + ).json() + self.assertEqual(resp_json["count"], 1) + + def test_save_docs_invalid(self): + """Test that an invalid bulk save returns a 400 response""" + doc = {"_from": "|||", "_to": "|||"} + resp = requests.put( + API_URL + "/documents", + params={"overwrite": True, "collection": "test_edge", "display_errors": 1}, + data=json.dumps(doc), + headers=HEADERS_ADMIN, + ) + self.assertEqual(resp.status_code, 400) + resp_json = resp.json() + self.assertEqual(resp_json["errors"], 1) diff --git a/relation_engine_server/test/test_json_validation.py b/relation_engine_server/test/test_json_validation.py new file mode 100644 index 00000000..f5a57d0b --- /dev/null +++ b/relation_engine_server/test/test_json_validation.py @@ -0,0 +1,517 @@ +""" +Test JSON validation functions + +The majority of the validation tests use `test_schema`, defined below and replicated as +JSON and YAML files. The tests are run with files and data structures for both the schema +and the data to be validated to ensure that all formats function the same. + +Test data files are in relation_engine_server/test/data/json_validation + +schema files: test_schema.json and test_schema.yaml (replicates test_schema) +data files: generally named (in)?valid_.(json|yaml) + +Other validation tests are at the bottom of the file. + +These tests run within the re_api docker image. 
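+
+For orientation, a typical assertion in this file calls run_validator (imported
+below) against test_schema (defined below), for example:
+
+    run_validator(
+        schema=test_schema,
+        data={"name": "name", "distance": 3},
+        validate_at="/properties/params",
+    )
+
+which validates the data against the "params" subschema and returns it with the
+schema defaults filled in.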
+""" +import unittest +import os.path as os_path +import json +import yaml +from relation_engine_server.utils.json_validation import run_validator +from jsonschema.exceptions import ValidationError, RefResolutionError +from jsonpointer import JsonPointerException + +test_data_dirs = ["/app", "relation_engine_server", "test", "data"] +json_validation_dir = os_path.join(*(test_data_dirs + ["json_validation"])) +schema_refs_dir = os_path.join(*(test_data_dirs + ["schema_refs"])) + + +test_schema = { + "properties": { + "params": { + "type": "object", + "properties": { + "name": { + "type": "string", + "format": "regex", + "pattern": "^\\w+$", + "default": "blank", + }, + "distance": { + "type": "integer", + "minimum": 0, + "maximum": 10, + "default": 1, + }, + "home_page": { + "type": "string", + "format": "uri", + }, + "date": { + "title": "date", + "description": "A type of dried fruit", + "type": "string", + "format": "date", + }, + "fruits": { + "type": "array", + "items": { + "name": "fruit", + "type": "string", + "oneOf": [ + {"const": "peach"}, + {"const": "plum"}, + {"const": "pear"}, + {"const": "strawberry"}, + ], + }, + "default": [], + "uniqueItems": True, + }, + }, + } + } +} + +fruits_explicit = { + "type": "array", + "items": { + "name": "fruit", + "type": "string", + "oneOf": [ + {"const": "peach"}, + {"const": "plum"}, + {"const": "pear"}, + {"const": "strawberry"}, + ], + }, + "default": [], + "uniqueItems": True, +} + +fruits_array_ref = { + "$ref": "file://" + + os_path.join(json_validation_dir, "fruits_array.yaml") + + "#/definitions/fruits" +} + +fruit_ref = { + "type": "array", + "items": {"$ref": "file://" + os_path.join(json_validation_dir, "fruit.yaml")}, + "default": [], + "uniqueItems": True, +} + +valid_json_loc = "/properties/params" +schema_defaults = {"name": "blank", "distance": 1, "fruits": []} + +test_schema_list = [ + ["schema", test_schema], + ["schema_file", os_path.join(json_validation_dir, "test_schema.json")], + ["schema_file", os_path.join(json_validation_dir, "test_schema.yaml")], +] + +valid_edge_data = { + "_from": "here", + "_to": "eternity", + "score": 1.23456, + "_key": "abcdefg", + "edge_type": "domain_co_occur", +} + +invalid_edge_data = { + "_from": "here", + "_to": "eternity", + "score": 1.23456, + "_key": "abcdefg", + "edge_type": "whatever", +} + + +class TestJsonValidation(unittest.TestCase): + def test_non_validation_validator_errors(self): + """test errors in the validator that are unrelated to the validation functionality""" + + err_str = "Please supply either a schema or a schema file path" + with self.assertRaisesRegex(ValueError, err_str): + run_validator() + + with self.assertRaisesRegex(ValueError, err_str): + run_validator(data={}) + + # only supply one of schema or schema_file + with self.assertRaisesRegex(ValueError, err_str): + run_validator(schema={}, schema_file="/path/to/file") + + err_str = "Please supply either a data structure or a data file path" + with self.assertRaisesRegex(ValueError, err_str): + run_validator(schema={}) + + with self.assertRaisesRegex(ValueError, err_str): + run_validator(schema={}, data={}, data_file="") + + with self.assertRaisesRegex(ValueError, err_str): + run_validator(schema={}, data=None, data_file=None) + + # invalid file type + test_file = os_path.join(*(test_data_dirs + ["test_file.md"])) + err_msg = f"Unknown file type encountered: {test_file}" + with self.assertRaisesRegex(TypeError, err_msg): + run_validator(schema_file=test_file, data={}) + + # invalid jsonpointer string - note the 
grammar error is from jsonpointer + err_str = "location must starts with /" + json_loc = "start validating here" + with self.assertRaisesRegex(JsonPointerException, err_str): + run_validator(schema=test_schema, data={}, validate_at=json_loc) + + # invalid jsonpointer ref + err_str = "member 'property' not found in" + json_loc = "/properties/params/property" + with self.assertRaisesRegex(JsonPointerException, err_str): + run_validator(schema=test_schema, data={}, validate_at=json_loc) + + # finally!! + output = run_validator( + schema=test_schema, + data={"name": "name", "distance": 3}, + validate_at=valid_json_loc, + ) + self.assertEqual(output, {**schema_defaults, **{"name": "name", "distance": 3}}) + + def test_json_validation(self): + """Generic JSON validation tests to ensure that all is working as expected""" + + # run these tests with the schema as a data structure, as JSON, and as YAML + test_list = [ + self.test_add_defaults, + self.test_pattern_validation, + self.test_uri_validation, + self.test_date_format_validation, + self.test_array_validation, + ] + + for test_schema in test_schema_list: + schema_file_arg = schema_arg = test_schema[1] + + if test_schema[0] == "schema": + schema_file_arg = None + else: + schema_arg = None + + for test_name in test_list: + with self.subTest(test_name=test_name.__name__): + test_name(schema_arg, schema_file_arg) + + def execute_tests( + self, schema_arg, schema_file_arg, tests, file_types=[None, "json", "yaml"] + ): + + for t in tests: + for file_ext in file_types: + data = t["input"] + data_file = os_path.join(json_validation_dir, f"{t['file']}.{file_ext}") + if file_ext is None: + data_file = None + else: + data = None + + with self.subTest(input=t["input"], file_type=file_ext): + if "err_str" in t: + with self.assertRaisesRegex(ValidationError, t["err_str"]): + run_validator( + schema=schema_arg, + schema_file=schema_file_arg, + data=data, + data_file=data_file, + validate_at=valid_json_loc, + ) + + else: + output = run_validator( + schema=schema_arg, + schema_file=schema_file_arg, + data=data, + data_file=data_file, + validate_at=valid_json_loc, + ) + self.assertEqual(output, {**schema_defaults, **t["output"]}) + + def test_add_defaults(self, schema_arg=None, schema_file_arg=None): + """Test that the jsonschema validator sets default values.""" + + # skip if the test is not being called from test_json_validation + if schema_arg is None and schema_file_arg is None: + self.assertTrue(True) + return + + tests = [ + { + "input": {}, + "file": "defaults", + "output": schema_defaults, + } + ] + + self.execute_tests(schema_arg, schema_file_arg, tests) + + def test_pattern_validation(self, schema_arg=None, schema_file_arg=None): + """Test pattern validation""" + + # skip if the test is not being called from test_json_validation + if schema_arg is None and schema_file_arg is None: + self.assertTrue(True) + return + + tests = [ + { + "input": { + "name": "what's-the-problem with-this-string?", + "distance": 3, + }, + "file": "invalid_pattern", + "err_str": '"what\'s-the-problem with-this-string\?" 
does not match .*?', + }, + { + "input": {"name": "No_problem_with_this_string", "distance": 3}, + "file": "valid_pattern", + "output": { + "name": "No_problem_with_this_string", + "distance": 3, + }, + }, + ] + self.execute_tests(schema_arg, schema_file_arg, tests) + + def test_uri_validation(self, schema_arg=None, schema_file_arg=None): + """Test URI validation is operational""" + + # skip if the test is not being called from test_json_validation + if schema_arg is None and schema_file_arg is None: + self.assertTrue(True) + return + + tests = [ + { + "input": { + "name": "valid_uri", + "distance": 3, + "home_page": "http://json-validation.com:5000/this/is/valid", + }, + "file": "valid_uri", + "output": { + "name": "valid_uri", + "distance": 3, + "home_page": "http://json-validation.com:5000/this/is/valid", + }, + }, + { + "input": {"name": "invalid_uri", "home_page": "where is it?"}, + "file": "invalid_uri", + "err_str": "'where is it\?' is not a 'uri'", + }, + ] + + self.execute_tests(schema_arg, schema_file_arg, tests) + + def test_date_format_validation(self, schema_arg=None, schema_file_arg=None): + """ensure that fancy date formats are correctly validated""" + + # skip if the test is not being called from test_json_validation + if schema_arg is None and schema_file_arg is None: + self.assertTrue(True) + return + + tests = [ + { + "input": {"date": "20200606"}, + "file": "invalid_date", + "err_str": "'20200606' is not a 'date'", + }, + { + "input": {"date": 20200606}, + "file": "invalid_date_type", + "err_str": "20200606 is not of type 'string'", + }, + { + "input": {"name": "valid_date", "date": "2020-06-06", "distance": 3}, + "file": "valid_date", + "output": { + **schema_defaults, + "name": "valid_date", + "date": "2020-06-06", + "distance": 3, + }, + }, + ] + + self.execute_tests(schema_arg, schema_file_arg, tests) + + # pyyaml-specific issue: dates get automatically parsed into datetime objects (doh!) + file_path = os_path.join(json_validation_dir, "unquoted_date.yaml") + err_str = "datetime.date\(2020, 6, 6\) is not of type 'string'" + with self.assertRaisesRegex(ValidationError, err_str): + run_validator( + schema=schema_arg, + schema_file=schema_file_arg, + data_file=file_path, + validate_at=valid_json_loc, + ) + + def test_array_validation(self, schema_arg=None, schema_file_arg=None): + """ + check array validation and default population works correctly when refs are used + + The current implementation of the population of defaults does not allow defaults to be + populated if the property is a reference, i.e. + + 'properties': { + 'fruits': { + '$ref': '...' + } + } + + """ + + # skip if the test is not being called from test_json_validation + if schema_arg is None and schema_file_arg is None: + self.assertTrue(True) + return + + # test the use of refs when populating defaults + tests = [ + { + "fruits": fruit_ref, + "name": "using fruit.yaml -- array item is a ref", + "output": {"params": {"name": "name", "distance": 1, "fruits": []}}, + }, + { + # N.b. the default does not get populated in this case! 
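+                # (Most likely because the default-filling extension,
+                # extend_with_default in json_validation.py, only checks for
+                # a literal "default" key on each property subschema; a
+                # {"$ref": ...} subschema carries no such key, and the
+                # referenced schema is only resolved later, inside the "$ref"
+                # validator itself.)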
+ # This is a change from the expected functionality + "fruits": fruits_array_ref, + "name": "using fruits_array.yaml -- the array is a ref", + "output": { + "params": { + "name": "name", + "distance": 1, + } + }, + }, + { + "fruits": fruits_explicit, + "name": "with no references", + "output": {"params": {"name": "name", "distance": 1, "fruits": []}}, + }, + ] + + for t in tests: + with self.subTest(desc=t["name"]): + test_schema["properties"]["params"]["properties"]["fruits"] = t[ + "fruits" + ] + output = run_validator( + schema=test_schema, data={"params": {"name": "name"}} + ) + self.assertEqual(output, t["output"]) + + # restore the original value + test_schema["properties"]["params"]["properties"]["fruits"] = fruits_explicit + + def test_schema_references(self): + """Ensure referenced schemas, including those written in yaml, can be accessed.""" + + # same schema in different places + path_list = [[], ["level_1"], ["level_1", "level_2"]] + + err_msg = "'whatever' is not valid under any of the given schemas" + for path in path_list: + + for file_ext in ["json", "yaml"]: + with self.subTest(file_ext=file_ext): + file_path = os_path.join( + *(test_data_dirs + ["schema_refs"] + path), "edge." + file_ext + ) + + # fails due to invalid data + with self.assertRaisesRegex(ValidationError, err_msg): + run_validator( + schema_file=file_path, + data=invalid_edge_data, + ) + + # valid data + self.assertEqual( + run_validator( + schema_file=file_path, + data=valid_edge_data, + ), + valid_edge_data, + ) + + # validate using the schema instead of the schema_file + with open(file_path) as fd: + contents = ( + yaml.safe_load(fd) if file_ext == "yaml" else json.load(fd) + ) + + # if there is no $id in the schema, the ref resolver won't know + # where the schema file is located and will not resolve relative references + with self.assertRaisesRegex( + RefResolutionError, "No such file or directory" + ): + run_validator(schema=contents, data=valid_edge_data) + + # inject an $id with the current file path + contents["$id"] = file_path + self.assertEqual( + run_validator( + schema=contents, + data=valid_edge_data, + ), + valid_edge_data, + ) + + def test_complex_schema_references(self): + """test validation with complex references that reference other references""" + + valid_data = { + "node": { + "id": "TAIR:19830", + "type": "gene", + }, + "edge": valid_edge_data, + "marks_out_of_ten": 5, + } + + invalid_data = { + "node": { + "id": "TAIR:19830", + "type": "gene", + }, + "edge": invalid_edge_data, + "marks_out_of_ten": 5, + } + + err_msg = "'whatever' is not valid under any of the given schemas" + for file_ext in ["json", "yaml"]: + with self.subTest(file_ext=file_ext): + file_path = os_path.join( + *(test_data_dirs + ["schema_refs", "level_1"]), + "test_object." 
+ file_ext, + ) + + # data fails validation + with self.assertRaisesRegex(ValidationError, err_msg): + run_validator( + schema_file=file_path, + data=invalid_data, + ) + + self.assertEqual( + run_validator( + schema_file=file_path, + data=valid_data, + ), + valid_data, + ) diff --git a/relation_engine_server/test/test_pull_spec.py b/relation_engine_server/test/test_pull_spec.py new file mode 100644 index 00000000..7ca658d7 --- /dev/null +++ b/relation_engine_server/test/test_pull_spec.py @@ -0,0 +1,43 @@ +import unittest +from unittest import mock +import re + +from relation_engine_server.utils.pull_spec import download_specs +from relation_engine_server.utils.wait_for import wait_for_api +from relation_engine_server.utils.config import get_config +from spec.test.test_ensure_specs import ensure_borked_indexes + +_CONF = get_config() + + +class TestPullSpec(unittest.TestCase): + @classmethod + def setUpClass(cls): + wait_for_api() + + def test_download_specs__success(self): + """Test ensure specs in `download_specs` for success case""" + update_name = download_specs(init_collections=True, reset=True) + self.assertEqual(_CONF["spec_release_path"], update_name) + + @mock.patch( + "relation_engine_server.utils.ensure_specs.ensure_indexes", + ensure_borked_indexes, + ) + def test_download_specs__fail(self): + """Test ensure specs in `download_specs` for fail case""" + with self.assertRaisesRegex( + RuntimeError, + re.escape( + """Some local specs have no matching server specs: +{ + "indexes": [ + "%s" + ], + "views": [], + "analyzers": [] +}""" + % ensure_borked_indexes()[0][0] + ), + ): + download_specs(init_collections=True, reset=True) diff --git a/relation_engine_server/test/test_spec_loader.py b/relation_engine_server/test/test_spec_loader.py new file mode 100644 index 00000000..746949b1 --- /dev/null +++ b/relation_engine_server/test/test_spec_loader.py @@ -0,0 +1,232 @@ +""" +Test spec_loader functions + +These tests run within the re_api docker image. 
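+
+The spec_loader helpers under test share one calling convention, sketched here
+for orientation (get_collection stands in for any of the get_<schema_type>
+functions):
+
+    spec_loader.get_collection("ncbi_taxon")                  # -> schema dict
+    spec_loader.get_collection("ncbi_taxon", path_only=True)  # -> path to file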
+""" +import unittest +import os.path as os_path +from urllib.parse import urlparse +from relation_engine_server.utils import spec_loader +from relation_engine_server.utils.spec_loader import SchemaNonexistent +from relation_engine_server.utils.config import get_config + + +class TestSpecLoader(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.test_dir = os_path.join("/app", "relation_engine_server", "test") + cls.test_spec_dir = os_path.join( + cls.test_dir, "spec_release", "sample_spec_release", "spec" + ) + + cls.config = get_config() + cls.repo_path = cls.config["spec_paths"]["root"] + for key in cls.config["spec_paths"].keys(): + if cls.repo_path in cls.config["spec_paths"][key]: + cls.config["spec_paths"][key] = cls.config["spec_paths"][key].replace( + cls.repo_path, cls.test_spec_dir + ) + + @classmethod + def tearDownClass(cls): + # undo all the config changes + for key in cls.config["spec_paths"].keys(): + if cls.test_spec_dir in cls.config["spec_paths"][key]: + cls.config["spec_paths"][key] = cls.config["spec_paths"][key].replace( + cls.test_spec_dir, cls.repo_path + ) + + def test_get_names(self, schema_type_names=[], expected=[]): + """test getting the names of all the schemas of a given type""" + + # this method should only be run from another test method + if len(schema_type_names) == 0: + self.assertTrue(True) + return + + schema_type_singular = schema_type_names[0] + schema_type_plural = schema_type_names[1] + method = getattr(spec_loader, "get_" + schema_type_singular + "_names") + + # save the original value + original_config_dir = self.config["spec_paths"][schema_type_plural] + # set the config to the test directory + self.config["spec_paths"][schema_type_plural] = os_path.join( + self.test_dir, "data", schema_type_plural + ) + + got_names_method = method() + got_names_singular = spec_loader.get_names(schema_type_singular) + got_names_plural = spec_loader.get_names(schema_type_plural) + + self.config["spec_paths"][schema_type_plural] = os_path.join( + self.test_dir, "data", "empty" + ) + got_names_method_empty = method() + got_names_empty = spec_loader.get_names(schema_type_singular) + + # restore the original value before running tests + self.config["spec_paths"][schema_type_plural] = original_config_dir + + # ensure the results are as expected + # get_collection_names + self.assertEqual(set(expected), set(got_names_method)) + # get_names('collection') + self.assertEqual(set(expected), set(got_names_singular)) + # get_names('collections') + self.assertEqual(set(expected), set(got_names_plural)) + + # empty collections dir + self.assertEqual(got_names_method_empty, []) + self.assertEqual(got_names_empty, []) + + def test_run_spec_loading_tests(self, schema_type_names=[], test_name=None): + """test the different ways of returning a schema file path or its contents""" + + # only run the test if it's being called from another test + if test_name is None: + self.assertTrue(True) + return + + schema_type_singular = schema_type_names[0] + schema_type_plural = schema_type_names[1] + # e.g. 
'spec_loader.get_collection' + method = getattr(spec_loader, "get_" + schema_type_singular) + + # get the path of the requested file + result_path = method(test_name, path_only=True) + self.assertIsInstance(result_path, str) + self.assertIn(test_name, result_path) + self.assertIn( + self.config["spec_paths"][schema_type_plural], + result_path, + ) + + # use get_schema directly to get the file path + for schema_type in schema_type_names: + self.assertEqual( + result_path, spec_loader.get_schema(schema_type, test_name, True) + ) + + # get the file contents + result_obj = method(test_name) + self.assertIs(type(result_obj), dict) + self.assertEqual(result_obj["name"], test_name) + + # check the contents of the dict when getting a data source + if schema_type_singular == "data_source": + + # logo_url should start with the same base as config['kbase_endpoint'] + endpoint = urlparse(self.config["kbase_endpoint"]) + self.assertIn( + endpoint.scheme + "://" + endpoint.netloc, result_obj["logo_url"] + ) + + # logo_path is deleted + self.assertNotIn("logo_path", result_obj.keys()) + + # a nonexistent file raises the appropriate error + fake_name = "test/test_node" + err_msg = ( + schema_type_singular.capitalize().replace("_", " ") + + " '" + + fake_name + + "' does not exist." + ) + with self.assertRaisesRegex(SchemaNonexistent, err_msg): + method(fake_name, path_only=True) + + def test_get_schemas_of_various_types(self): + """test retrieving schemas or paths to schemas for the different schema types""" + + schema_type_list = [ + { + # schema_type_names: singular, plural + "schema_type_names": ["collection", "collections"], + "example": "ncbi_taxon", + "names": [ + "core", + "edge", + "point", + "test_another_node", + "test_edge", + "test_node", + ], + }, + { + "schema_type_names": ["data_source", "data_sources"], + "example": "ncbi_taxonomy", + }, + { + "schema_type_names": ["stored_query", "stored_queries"], + "example": "ncbi_fetch_taxon", + }, + { + "schema_type_names": ["view", "views"], + "example": "test_vertices", + }, + ] + + for schema in schema_type_list: + with self.subTest(schema=schema["schema_type_names"][0]): + self.test_run_spec_loading_tests( + schema["schema_type_names"], schema["example"] + ) + if "names" in schema: + self.test_get_names(schema["schema_type_names"], schema["names"]) + + def test_non_existent_schema(self): + + err_msg = "Reality does not exist" + with self.assertRaisesRegex(SchemaNonexistent, err_msg): + spec_loader.get_names("Reality") + + def test_get_schema_for_doc(self): + """test getting the schema for a specific document""" + + test_name = "ncbi_taxon" + test_doc = test_name + "/12345" + # get the path of the requested file + result_path = spec_loader.get_schema_for_doc(test_doc, path_only=True) + self.assertIsInstance(result_path, str) + self.assertIn(test_name, result_path) + self.assertIn( + self.config["spec_paths"]["collections"], + result_path, + ) + + # get the file contents + result_obj = spec_loader.get_schema_for_doc(test_doc) + self.assertIs(type(result_obj), dict) + self.assertEqual(result_obj["name"], test_name) + + fake_name = "fake_name/12345" + # a nonexistent file raises the appropriate error + err_msg = f"Collection 'fake_name' does not exist." 
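+        # get_schema_for_doc should derive the collection name from the part
+        # of the document ID before the "/", so a bogus collection in the ID
+        # surfaces as SchemaNonexistent: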
+ with self.assertRaisesRegex(SchemaNonexistent, err_msg): + spec_loader.get_schema_for_doc(fake_name, path_only=True) + + def test_prevent_non_spec_dir_access(self): + """ + Ensure that matching files in directories outside the designated spec repo cannot be retrieved + """ + + # this query is OK as the file is still in the spec repo + path_in_spec_repo = "../../../../../**/fetch_test_vertex" + result = spec_loader.get_schema( + "stored_queries", path_in_spec_repo, path_only=True + ) + self.assertEqual( + result, + os_path.join( + self.test_spec_dir, "stored_queries", "test", "fetch_test_vertex.yaml" + ), + ) + + # this matches a file in one of the other test data dirs => should throw an error + path_outside_spec_repo = "../../../../data/collections/test_node" + err_msg = f"Stored query '{path_outside_spec_repo}' does not exist" + with self.assertRaisesRegex(SchemaNonexistent, err_msg): + spec_loader.get_schema( + "stored_queries", path_outside_spec_repo, path_only=True + ) diff --git a/relation_engine_server/utils/__init__.py b/relation_engine_server/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/relation_engine_server/utils/arango_client.py b/relation_engine_server/utils/arango_client.py new file mode 100644 index 00000000..920cab6c --- /dev/null +++ b/relation_engine_server/utils/arango_client.py @@ -0,0 +1,427 @@ +""" +Make ajax requests to the ArangoDB server. +""" +import sys +import os +import requests +import json + +from relation_engine_server.utils.config import get_config + +_CONF = get_config() + + +def adb_request(req_method, url_append, **kw): + """Make HTTP request to ArangoDB server""" + resp = req_method( + _CONF["api_url"] + url_append, + auth=(_CONF["db_user"], _CONF["db_pass"]), + **kw, + ) + if not resp.ok or resp.json()["error"]: + raise ArangoServerError(resp.text) + return resp.json() + + +def server_status(): + """Get the status of our connection and authorization to the ArangoDB server.""" + auth = (_CONF["db_user"], _CONF["db_pass"]) + adb_url = f"{_CONF['api_url']}/version" + try: + resp = requests.get(adb_url, auth=auth) + except requests.exceptions.ConnectionError: + return "no_connection" + if resp.ok: + return "connected_authorized" + elif resp.status_code == 401: + return "unauthorized" + else: + return "unknown_failure" + + +def run_query( + query_text=None, cursor_id=None, bind_vars=None, batch_size=10000, full_count=False +): + """Run a query using the arangodb http api. 
Can return a cursor to get more results.""" + url = _CONF["api_url"] + "/cursor" + req_json = { + "batchSize": min(5000, batch_size), + "memoryLimit": 16000000000, # 16gb + } + if cursor_id: + method = "PUT" + url += "/" + cursor_id + else: + method = "POST" + req_json["count"] = True + req_json["query"] = query_text + if full_count: + req_json["options"] = {"fullCount": True} + if bind_vars: + req_json["bindVars"] = bind_vars + # Run the query as the readonly user + resp = requests.request( + method, + url, + data=json.dumps(req_json), + auth=(_CONF["db_readonly_user"], _CONF["db_readonly_pass"]), + ) + resp_json = resp.json() + if not resp.ok or resp_json["error"]: + raise ArangoServerError(resp.text) + return { + "results": resp_json["result"], + "count": resp_json["count"], + "has_more": resp_json["hasMore"], + "cursor_id": resp_json.get("id"), + "stats": resp_json["extra"]["stats"], + } + + +def get_all_collections(): + """ + Fetch information for all existing non-system collections + + Resp to GET /_api/collection is + { + "error": False, + "code": 200, + "result": [ + { + "id": str of int, + "name": str, + "status": int, + "type": int, + "isSystem": bool, + "globallyUniqueId": str, + }, + ... + ] + } + + Returns + [ + {"id": ..., ...}, + {"id": ..., ...}, + ... + ] + """ + resp_json = adb_request( + req_method=requests.get, + url_append="/collection", + # --- + params={"excludeSystem": True}, + ) + return resp_json["result"] + + +def create_collection(name, config): + """ + Create a single collection by name using some basic defaults. + We ignore duplicates. For any other server error, an exception is thrown. + Shard the new collection based on the number of db nodes (10 shards for each). + """ + is_edge = config["type"] == "edge" + num_shards = int(os.environ.get("SHARD_COUNT", 30)) + url = _CONF["api_url"] + "/collection" + # collection types: + # 2 is a document collection + # 3 is an edge collection + collection_type = 3 if is_edge else 2 + print(f"Creating collection {name} (edge: {is_edge})") + data = json.dumps( + { + "keyOptions": {"allowUserKeys": True}, + "name": name, + "type": collection_type, + "numberOfShards": num_shards, + "waitForSync": True, + } + ) + resp = requests.post(url, data, auth=(_CONF["db_user"], _CONF["db_pass"])) + resp_json = resp.json() + if not resp.ok: + if "duplicate" not in resp_json["errorMessage"]: + # Unable to create a collection + raise ArangoServerError(resp.text) + print(f"Successfully created collection {name}") + if config.get("indexes"): + _create_indexes(name, config) + + +def get_all_indexes(): + """ + Fetch all existing indexes for all non-system collections + + Returns + { + "coll_name_0": + [ + { + "deduplicate" : true, + "estimates" : true, + "fields" : [ + "price" + ], + "id" : "products/68128", + "name" : "idx_1721606625944403968", + "selectivityEstimate" : 1, + "sparse" : true, + "type" : "skiplist", + "unique" : false + }, + ... + ], + ... 
+ } + """ + coll_names = [coll["name"] for coll in get_all_collections()] + all_indexes = {} + for coll_name in coll_names: + all_indexes[coll_name] = _get_coll_indexes(coll_name) + return all_indexes + + +def _get_coll_indexes(coll_name): + """ + Fetch existing indexes for a collection + Resp to GET /_api/index is + { + "error" : False, + "code" : 200, + "indexes" : [ + { + "deduplicate" : true, + "estimates" : true, + "fields" : [ + "price" + ], + "id" : "products/68128", + "name" : "idx_1721606625944403968", + "selectivityEstimate" : 1, + "sparse" : true, + "type" : "skiplist", + "unique" : false + }, + ... + ], + ... + } + """ + resp_json = adb_request( + req_method=requests.get, + url_append="/index", + params={"collection": coll_name}, + ) + return resp_json["indexes"] + + +def _create_indexes(coll_name, config): + """Create indexes for a collection""" + url = _CONF["api_url"] + "/index" + indexes = _get_coll_indexes(coll_name) + for idx_conf in config["indexes"]: + idx_type = idx_conf["type"] + idx_url = url + "#" + idx_type + if _index_exists(idx_conf, indexes): + # POSTing again would not overwrite anyway + continue + print(f"Creating {idx_type} index for collection {coll_name}: {idx_conf}") + resp = requests.post( + idx_url, + params={"collection": coll_name}, + data=json.dumps(idx_conf), + auth=(_CONF["db_user"], _CONF["db_pass"]), + ) + if not resp.ok: + raise RuntimeError(resp.text) + print( + f'Successfully created {idx_type} index on {idx_conf["fields"]} for {coll_name}.' + ) + + +def _index_exists(idx_conf, indexes): + """ + Check if an index for a collection was already created in the database. + idx_conf - index config object from a collection schema + indexes - result of request to arangodb's /_api/index?collection=coll_name + """ + for idx in indexes: + if idx_conf["fields"] == idx["fields"] and idx_conf["type"] == idx["type"]: + return True + return False + + +def import_from_file(file_path, query): + """Import documents from a file.""" + with open(file_path, "rb") as file_desc: + resp = requests.post( + _CONF["api_url"] + "/import", + data=file_desc, + auth=(_CONF["db_user"], _CONF["db_pass"]), + params=query, + ) + if not resp.ok: + raise ArangoServerError(resp.text) + resp_json = resp.json() + if resp_json.get("errors", 0) > 0: + err_msg = f"{resp_json['errors']} errors creating documents\n" + sys.stderr.write(err_msg) + details = resp_json.get("details") + if details: + sys.stderr.write(f"Error details:\n{details[0]}\n") + return resp_json + + +def get_all_views(): + """ + Fetch all existing views from server + + Resp to GET /_api/view is + { + "error": false, + "code": 200, + "result": [ + {"id": str, "name": str, "type": str}, + ... + ] + } + + Resp to GET /_api/view/{view_name}/properties is + { + "error" : false, + "code" : 200, + "writebufferIdle" : 64, + "type" : "arangosearch", + "writebufferSizeMax" : 33554432, + "consolidationPolicy" : { + "type" : "tier", + "segmentsBytesFloor" : 2097152, + "segmentsBytesMax" : 5368709120, + "segmentsMax" : 10, + "segmentsMin" : 1, + "minScore" : 0 + }, + "name" : "products", + "primarySort" : [ ], + "globallyUniqueId" : "hA5F3C05BE80C/68910", + "id" : "68910", + "storedValues" : [ ], + "writebufferActive" : 0, + "consolidationIntervalMsec" : 1000, + "cleanupIntervalStep" : 2, + "commitIntervalMsec" : 1000, + "links" : { + }, + "primarySortCompression" : "lz4" + } + + Returns + [ + {}, + {}, + ... 
+ ] + where each item is the properties dict (from above) + """ + resp_json = adb_request( + req_method=requests.get, + url_append="/view", + ) + view_names = [view["name"] for view in resp_json["result"]] + + view_properties = [] + for view_name in view_names: + resp_json = adb_request( + req_method=requests.get, + url_append=f"/view/{view_name}/properties", + ) + view_properties.append(resp_json) + + return view_properties + + +def create_view(name, config): + """ + Create a view by name, ignoring duplicates. + For any other server error, an exception is thrown. + """ + + url = _CONF["api_url"] + "/view#arangosearch" + + if "name" not in config: + config["name"] = name + if "type" not in config: + config["type"] = "arangosearch" + print(f"Creating view {name}") + data = json.dumps(config) + resp = requests.post(url, data, auth=(_CONF["db_user"], _CONF["db_pass"])) + resp_json = resp.json() + if not resp.ok: + if "duplicate" not in resp_json["errorMessage"]: + # Unable to create the view + raise ArangoServerError(resp.text) + + +def get_all_analyzers(): + """ + Fetch all existing analyzers from server + Resp to GET /_api/analyzer is + { + "error" : false, + "code" : 200, + "result" : [ + { + "name" : "text_pt", + "type" : "text", + "properties" : { + "locale" : "pt.utf-8", + "case" : "lower", + "stopwords" : [ ], + "accent" : false, + "stemming" : true + }, + "features" : [ + "frequency", + "norm", + "position" + ] + }, + ... + ] + } + + Returns + [ + { ... } + ] + """ + resp_json = adb_request( + requests.get, + url_append="/analyzer", + ) + analyzers = resp_json["result"] + return analyzers + + +def create_analyzer(name, config): + print(f"Creating analyzer {name}") + resp = requests.post( + url=_CONF["api_url"] + "/analyzer", + data=json.dumps(config), + auth=(_CONF["db_user"], _CONF["db_pass"]), + ) + if not resp.ok: + if "duplicate" not in resp.json()["errorMessage"]: + raise ArangoServerError(resp.text) + + +class ArangoServerError(Exception): + """A request to the ArangoDB server has failed (non-2xx).""" + + def __init__(self, resp_text): + self.resp_text = resp_text + self.resp_json = json.loads(resp_text) + + def __str__(self): + return "ArangoDB server error." diff --git a/relation_engine_server/utils/auth.py b/relation_engine_server/utils/auth.py new file mode 100644 index 00000000..9d8f3096 --- /dev/null +++ b/relation_engine_server/utils/auth.py @@ -0,0 +1,68 @@ +""" +Authorization and authentication utilities. +""" +import json +import flask +import requests + +from relation_engine_server.utils.config import get_config +from relation_engine_server.exceptions import MissingHeader, UnauthorizedAccess + + +def require_auth_token(roles=[]): + """ + Function that validates an authentication token in a flask request context. + + If any roles are provided, the token holder must have *at least one* of the roles. + + Raises some exception if any auth requirement is not met. 
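+
+    A typical call from a request handler looks like (sketch; RE_ADMIN is the
+    admin role referenced elsewhere in this service):
+
+        require_auth_token(roles=["RE_ADMIN"])  # raises on any auth failure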
+ """ + config = get_config() + if not flask.request.headers.get("Authorization"): + # No authorization token was provided in the headers + raise MissingHeader("Authorization") + token = get_auth_header() + # Make an authorization request to the kbase auth2 server + headers = {"Authorization": token} + auth_url = config["auth_url"] + "/api/V2/me" + auth_resp = requests.get(auth_url, headers=headers) + if not auth_resp.ok: + print("-" * 80) + print(auth_resp.text) + raise UnauthorizedAccess(config["auth_url"], auth_resp.text) + auth_json = auth_resp.json() + if len(roles): + check_roles( + required=roles, given=auth_json["customroles"], auth_url=config["auth_url"] + ) + + +def check_roles(required, given, auth_url): + for role in required: + if role in given: + return + raise UnauthorizedAccess(auth_url, "Missing role") + + +def get_auth_header(): + return flask.request.headers.get("Authorization", "").replace("Bearer", "").strip() + + +def get_workspace_ids(auth_token): + """Get a list of workspace IDs that the given username is allowed to access in + the workspace.""" + if not auth_token: + return [] # anonymous users + config = get_config() + ws_url = config["workspace_url"] + # Make an admin request to the workspace (command is 'listWorkspaceIds') + payload = { + "method": "Workspace.list_workspace_ids", + "version": "1.1", + "params": [{"perm": "r"}], + } + headers = {"Authorization": auth_token} + resp = requests.post(ws_url, data=json.dumps(payload), headers=headers) + if not resp.ok: + raise UnauthorizedAccess(ws_url, resp.text) + return resp.json()["result"][0]["workspaces"] diff --git a/relation_engine_server/utils/bulk_import.py b/relation_engine_server/utils/bulk_import.py new file mode 100644 index 00000000..543ffba0 --- /dev/null +++ b/relation_engine_server/utils/bulk_import.py @@ -0,0 +1,48 @@ +import time +import os +import tempfile +import flask +import json +import hashlib + +from relation_engine_server.utils.json_validation import get_schema_validator +from relation_engine_server.utils.spec_loader import get_collection +from relation_engine_server.utils.arango_client import import_from_file + + +def bulk_import(query_params): + """ + Stream lines of JSON from a request body, validating each one against a + schema, then write them into a temporary file that can be passed into the + arango client. 
+ """ + schema_file = get_collection(query_params["collection"], path_only=True) + validator = get_schema_validator(schema_file=schema_file, validate_at="/schema") + # We can't use a context manager here + # We need to close the file to have the file contents readable + # and we need to prevent deletion of the temp file on close (default behavior of tempfiles) + temp_fd = tempfile.NamedTemporaryFile(mode="a", delete=False) + try: + # Stream request data line-by-line + # Parse each line to json, validate the schema, and write to a file + for line in flask.request.stream: + json_line = json.loads(line) + validator.validate(json_line) + json_line = _write_edge_key(json_line) + json_line["updated_at"] = int(time.time() * 1000) + temp_fd.write(json.dumps(json_line) + "\n") + temp_fd.close() + resp_json = import_from_file(temp_fd.name, query_params) + finally: + # Always remove the temp file + os.remove(temp_fd.name) + return resp_json + + +def _write_edge_key(json_line): + """For edges, we want a deterministic key so there are no duplicates.""" + if "_key" not in json_line and "_from" in json_line and "_to" in json_line: + json_line["_key"] = hashlib.blake2b( + json_line["_from"].encode() + json_line["_to"].encode(), digest_size=8 + ).hexdigest() + return json_line diff --git a/relation_engine_server/utils/config.py b/relation_engine_server/utils/config.py new file mode 100644 index 00000000..fdea1b05 --- /dev/null +++ b/relation_engine_server/utils/config.py @@ -0,0 +1,58 @@ +""" +Load configuration data from environment variables. +""" +import os +import functools +from urllib.parse import urljoin + + +@functools.lru_cache(maxsize=1) +def get_config(): + """Load environment configuration data.""" + spec_path = os.environ.get("SPEC_PATH", "/spec") # /spec + + # The root url of a remote git repo that holds the specifications (ie. 
this repo) + spec_repo_url = os.environ.get("SPEC_REPO_URL") + # The specific URL of the spec tarball + spec_release_url = os.environ.get("SPEC_RELEASE_URL") + # The specific local path of the spec tarball + spec_release_path = os.environ.get("SPEC_RELEASE_PATH") + + kbase_endpoint = os.environ.get("KBASE_ENDPOINT", "https://ci.kbase.us/services") + auth_url = os.environ.get("KBASE_AUTH_URL", urljoin(kbase_endpoint + "/", "auth")) + workspace_url = os.environ.get( + "KBASE_WORKSPACE_URL", urljoin(kbase_endpoint + "/", "ws") + ) + + db_url = os.environ.get("DB_URL", "http://arangodb:8529") + db_name = os.environ.get("DB_NAME", "_system") + db_user = os.environ.get("DB_USER", "root") + db_pass = os.environ.get("DB_PASS", "") + db_readonly_user = os.environ.get("DB_READONLY_USER", db_user) + db_readonly_pass = os.environ.get("DB_READONLY_PASS", db_pass) + api_url = db_url + "/_db/" + db_name + "/_api" + return { + "auth_url": auth_url, + "workspace_url": workspace_url, + "kbase_endpoint": kbase_endpoint, + "db_url": db_url, + "api_url": api_url, + "db_name": db_name, + "db_user": db_user, + "db_pass": db_pass, + "db_readonly_user": db_readonly_user, + "db_readonly_pass": db_readonly_pass, + "spec_repo_url": spec_repo_url, + "spec_release_url": spec_release_url, + "spec_release_path": spec_release_path, + "spec_paths": { + "root": spec_path, # /spec + "release_id": os.path.join(spec_path, ".release_id"), + "collections": os.path.join(spec_path, "collections"), # /spec/collections + "datasets": os.path.join(spec_path, "datasets"), + "data_sources": os.path.join(spec_path, "data_sources"), + "stored_queries": os.path.join(spec_path, "stored_queries"), + "views": os.path.join(spec_path, "views"), + "analyzers": os.path.join(spec_path, "analyzers"), + }, + } diff --git a/relation_engine_server/utils/ensure_specs.py b/relation_engine_server/utils/ensure_specs.py new file mode 100644 index 00000000..835deb06 --- /dev/null +++ b/relation_engine_server/utils/ensure_specs.py @@ -0,0 +1,278 @@ +""" +Ensure that all the specs in the spec/**/*.json and spec/**/*.yaml are +present in the server, with the top-level fields of the local specs being +a subset of the top-level fields of the server specs +""" +from typing import Union, Callable + +from relation_engine_server.utils.json_validation import load_json_yaml +from relation_engine_server.utils import arango_client +from spec.validate import get_schema_type_paths + + +def match(spec_local, specs_server): + for spec_server in specs_server: + if is_obj_subset_rec(spec_local, spec_server): + return True + return False + + +def get_local_coll_indexes(): + """ + Read all schemas for the collection schema type + Return just collection name and indexes + """ + coll_spec_paths = [] + coll_name_2_indexes = {} + for coll_spec_path in get_schema_type_paths("collection"): + coll = load_json_yaml(coll_spec_path) + if "indexes" not in coll: + continue + coll_spec_paths.append(coll_spec_path) + coll_name_2_indexes[coll["name"]] = coll["indexes"] + return coll_spec_paths, coll_name_2_indexes + + +def get_local_views(): + view_spec_paths = get_schema_type_paths("view") + view_specs = [load_json_yaml(view_spec_path) for view_spec_path in view_spec_paths] + return view_spec_paths, view_specs + + +def get_local_analyzers(): + analyzer_spec_paths = get_schema_type_paths("analyzer") + analyzer_specs = [ + load_json_yaml(analyzer_spec_path) for analyzer_spec_path in analyzer_spec_paths + ] + return analyzer_spec_paths, analyzer_specs + + +def ensure_indexes(): + """ + Returns 
tuple
+    First item is list of borked index names, e.g.
+    [
+        "coll_name_3/fulltext/['scientific_name']",
+        "coll_name_4/persistent/['id', 'key']",
+    ]
+    Second item is struct of failed indexes, e.g.,
+    {
+        coll_name_3: [
+            {"type": "fulltext", "fields": ["scientific_name"] ...}
+        ],
+        coll_name_4: [
+            {"type": "persistent", "fields": ["id", "key"] ...}
+        ]
+    }
+    """
+    coll_name_2_indexes_server = arango_client.get_all_indexes()
+    coll_spec_paths, coll_name_2_indexes_local = get_local_coll_indexes()
+
+    failed_specs = {}
+    for coll_spec_path, (coll_name, indexes_local) in zip(
+        coll_spec_paths, coll_name_2_indexes_local.items()
+    ):
+        print(f"Ensuring indexes for {coll_spec_path}")
+        if coll_name not in coll_name_2_indexes_server:
+            failed_specs[coll_name] = indexes_local
+            continue
+        else:
+            failed_specs[coll_name] = []
+        indexes_server = coll_name_2_indexes_server[coll_name]
+        for index_local in indexes_local:
+            if not match(index_local, indexes_server):
+                failed_specs[coll_name].append(index_local)
+
+    failed_specs = {
+        k: v for k, v in failed_specs.items() if v
+    }  # filter out 0-failure colls
+    if failed_specs:
+        print_failed_specs("indexes", failed_specs)
+    else:
+        print("All index specs ensured")
+
+    return get_names(failed_specs, "indexes"), failed_specs
+
+
+def ensure_views():
+    """
+    Returns tuple
+    First item is list of failed view names, e.g.,
+    [
+        "Compounds/arangosearch"
+    ]
+    Second item is list of failed specs, e.g.,
+    [
+        {"name": "Compounds", "type": "arangosearch", ...}
+    ]
+    """
+    all_views_server = arango_client.get_all_views()
+    mod_obj_literal(all_views_server, float, round_float)
+
+    failed_specs = []
+    for view_spec_path, view_local in zip(*get_local_views()):
+        print(f"Ensuring view {view_spec_path}")
+        if not match(view_local, all_views_server):
+            failed_specs.append(view_local)
+
+    if failed_specs:
+        print_failed_specs("views", failed_specs)
+    else:
+        print("All view specs ensured")
+
+    return get_names(failed_specs, "views"), failed_specs
+
+
+def ensure_analyzers():
+    """
+    Returns tuple
+    First item is list of failed analyzer names, e.g.,
+    [
+        "icu_tokenize/text"
+    ]
+    Second item is list of failed specs, e.g.,
+    [
+        {"name": "icu_tokenize", "type": "text", ...}
+    ]
+    """
+    all_analyzers_server = arango_client.get_all_analyzers()
+    mod_obj_literal(all_analyzers_server, str, excise_namespace)
+
+    failed_specs = []
+    for analyzer_spec_path, analyzer_local in zip(*get_local_analyzers()):
+        print(f"Ensuring analyzer {analyzer_spec_path}")
+        if not match(analyzer_local, all_analyzers_server):
+            failed_specs.append(analyzer_local)
+
+    if failed_specs:
+        print_failed_specs("analyzers", failed_specs)
+    else:
+        print("All analyzer specs ensured")
+
+    return get_names(failed_specs, "analyzers"), failed_specs
+
+
+def ensure_all():
+    """
+    Return names of failed specs if any, e.g.,
+    {
+        "indexes": [
+        ],
+        "views": [
+            "Compounds/arangosearch",
+            "Reactions/arangosearch",
+        ],
+        "analyzers": [
+            "icu_tokenize/text",
+        ],
+    }
+    """
+    failed_indexes_names, _ = ensure_indexes()
+    failed_views_names, _ = ensure_views()
+    failed_analyzers_names, _ = ensure_analyzers()
+
+    return {
+        "indexes": failed_indexes_names,
+        "views": failed_views_names,
+        "analyzers": failed_analyzers_names,
+    }
+
+
+def get_names(specs, schema_type):
+    """
+    Given views/analyzers/collections, collate names using required properties
+    """
+    names = []
+    if schema_type in ["views", "analyzers"]:
+        for spec in specs:
+            names.append(f"{spec['name']}/{spec['type']}")
+    elif schema_type in ["indexes"]:
+        for 
coll_name, indexes in specs.items(): + for index in indexes: + names.append(f"{coll_name}/{index['type']}/{index['fields']}") + else: + raise RuntimeError(f'Unknown schema type "{schema_type}"') + return names + + +def print_failed_specs(schema_type, failed_specs): + """ + Print message with names of failed local specs + """ + + fail_msg = ( + "\n" + f"----------> {len(failed_specs)} {schema_type} failed ---------->" + "\n" + f"----------> names: {get_names(failed_specs, schema_type)} ---------->" + "\n" + f"----------> Please compare local/server specs ---------->" + ) + + print(fail_msg) + + +def round_float(num: float) -> float: + """ + For round-off error in floats + Arbitrarily chose 7 places + """ + return round(num, 7) + + +def excise_namespace(analyzer_name: str) -> str: + """ + Remove namespace prefix, e.g., + namespace::thing -> thing + """ + return analyzer_name.split("::")[-1] + + +def is_obj_subset_rec( + l: Union[dict, list, float, str, int], + r: Union[dict, list, float, str, int], +): + """ + Compare two JSON objects, to see if, essentially, l <= r + If comparing dicts, recursively compare + If comparing lists, shallowly compare. For now, YAGN more + """ + if isinstance(l, dict) and isinstance(r, dict): + return all( + [k in r.keys() and is_obj_subset_rec(l[k], r[k]) for k in l.keys()] + ) # ignore: typing + elif isinstance(l, list) and isinstance(r, list): + return all([le in r for le in l]) + else: + return l == r # noqa: E741 + + +def mod_obj_literal( + spec_unit: Union[list, dict], + literal_type: type, + func: Callable[[Union[float, str]], Union[float, str]], +) -> None: + """ + Modify dict in-place recursively + Some specs won't match because of + * round-off error in floats + * namespacing in analyzers, e.g., "_system::icu_tokenize" + + Parameters + ---------- + spec_unit - recursively accessed data structure unit of JSON obj + literal_type - str or float + func - function called to modify that str or float in-place + """ + if isinstance(spec_unit, dict): + for k, v in spec_unit.items(): + if isinstance(v, dict) or isinstance(v, list): + mod_obj_literal(v, literal_type, func) + elif isinstance(v, literal_type): + spec_unit[k] = func(v) # type: ignore + elif isinstance(spec_unit, list): + for i, v in enumerate(spec_unit): + if isinstance(v, dict) or isinstance(v, list): + mod_obj_literal(v, literal_type, func) + elif isinstance(v, literal_type): + spec_unit[i] = func(v) # type: ignore diff --git a/relation_engine_server/utils/json_validation.py b/relation_engine_server/utils/json_validation.py new file mode 100644 index 00000000..c44538f5 --- /dev/null +++ b/relation_engine_server/utils/json_validation.py @@ -0,0 +1,177 @@ +""" +JSON Schema validation + +See the docs on adding default values: https://python-jsonschema.readthedocs.io/en/stable/faq/ + +Example usage: + + schema = {'properties': {'foo': {'default': 'bar'}}} + obj = {} + Validator(schema).validate(obj) + assert obj == {'foo': 'bar'} +""" +from jsonschema import validators, Draft7Validator, FormatChecker, RefResolver + +from jsonschema.compat import ( + urlopen, + urlsplit, +) +from jsonschema.exceptions import ValidationError +from jsonpointer import resolve_pointer +import yaml +import json +import requests + + +def extend_with_default(validator_class): + validate_properties = validator_class.VALIDATORS["properties"] + + def set_defaults(validator, properties, instance, schema): + for property, subschema in properties.items(): + if "default" in subschema: + instance.setdefault(property, 
subschema["default"]) + for error in validate_properties(validator, properties, instance, schema): + yield error + + return validators.extend(validator_class, {"properties": set_defaults}) + + +Validator = extend_with_default(Draft7Validator) + + +def get_schema_validator(schema=None, schema_file=None, validate_at=""): + """ + Get a validator for the supplied schema + + :param schema: (dict) the schema as a data structure + :param schema_file: (string) path to the schema file (json or yaml format) + + :param validate_at: (string) where in the data structure the schema to validate against + is located, in JSON pointer syntax + defaults to the root of the schema object if not set + + only one of `schema` and `schema_file` should be supplied + + :return: + validator (Validator) jsonschema validator instance + + """ + + if ( + schema == schema_file + and schema is None + or schema is not None + and schema_file is not None + ): + raise ValueError("Please supply either a schema or a schema file path") + + # schema to validate against + if schema is None: + schema = load_json_yaml(schema_file) + + # get the appropriate location in the schema + validation_schema = resolve_pointer(schema, validate_at) + + if schema_file: + resolver = ExtendedRefResolver(schema_file, schema) + else: + resolver = ExtendedRefResolver.from_schema(schema) + + return Validator( + validation_schema, format_checker=FormatChecker(), resolver=resolver + ) + + +def run_validator( + schema=None, + schema_file=None, + validate_at="", + data=None, + data_file=None, + nicer_errors=False, +): + """ + Validate data against a schema, filling in defaults if appropriate + + :param schema: (dict) the schema as a data structure + :param schema_file: (string) path to the schema file (json or yaml format) + + :param validate_at: (string) where in the data structure the schema to validate against + is located, in JSON pointer syntax + defaults to the root of the schema object if not set + + :param data: (*) data to validate + :param data_file: (string) path to file containing data (json or yaml format) + + + only one of `schema` and `schema_file` should be supplied + + only one of `data` and `data_file` should be supplied + + :return: + data (*) validated data + + """ + + validator = get_schema_validator(schema, schema_file, validate_at) + + if data is None and data_file is None or data is not None and data_file is not None: + raise ValueError("Please supply either a data structure or a data file path") + + # data to validate + if data is None: + data = load_json_yaml(data_file) + + if validator.is_valid(data): + return data + + if not nicer_errors: + # this will throw a ValidationError + validator.validate(data) + + err_msg = "".join( + e.message + "\n" for e in sorted(validator.iter_errors(data), key=str) + ) + + raise ValidationError(err_msg) + + +def load_json_yaml(file): + """Loads the given JSON/YAML file""" + + with open(file) as fd: + if file.endswith(".yaml") or file.endswith(".yml"): + return yaml.safe_load(fd) + + if file.endswith(".json"): + return json.load(fd) + + raise TypeError("Unknown file type encountered: " + file) + + +class ExtendedRefResolver(RefResolver): + def resolve_remote(self, uri): + + scheme = urlsplit(uri).scheme + # if there's no scheme, it's a local file, so prefix it with "file://" + if scheme == "": + uri = "file://" + uri + + if scheme in self.handlers: + result = self.handlers[scheme](uri) + elif scheme in [u"http", u"https"]: + # Requests has support for detecting the correct encoding of + # json over 
diff --git a/relation_engine_server/utils/parse_json.py b/relation_engine_server/utils/parse_json.py
new file mode 100644
index 00000000..a65fd5c1
--- /dev/null
+++ b/relation_engine_server/utils/parse_json.py
@@ -0,0 +1,14 @@
+import json
+import flask
+
+
+def get_json_body():
+    """
+    Parse json out of a request body, if present.
+    If the request body is empty, we return None rather than throwing any parsing errors.
+    """
+    json_body = None  # type: ignore
+    req_data = flask.request.get_data()
+    if req_data:
+        json_body = json.loads(req_data)
+    return json_body
diff --git a/relation_engine_server/utils/pull_spec.py b/relation_engine_server/utils/pull_spec.py
new file mode 100644
index 00000000..e75f0a41
--- /dev/null
+++ b/relation_engine_server/utils/pull_spec.py
@@ -0,0 +1,145 @@
+import os
+import requests
+import tarfile
+import tempfile
+import shutil
+import json
+import yaml
+from typing import Optional
+
+from relation_engine_server.utils import arango_client
+from relation_engine_server.utils.config import get_config
+from relation_engine_server.utils.ensure_specs import ensure_all
+from spec.validate import get_schema_type_paths
+
+_CONF = get_config()
+
+
+def download_specs(
+    init_collections: bool = True,
+    release_url: Optional[str] = None,
+    reset: bool = False,
+) -> Optional[str]:
+    """
+    Check and download the latest spec and extract it to the spec path.
+    Returns:
+        The name or path of the release used to update the specs
+    """
+    update_name: Optional[str] = None
+    if reset or not os.path.exists(_CONF["spec_paths"]["root"]):
+        # Remove the spec directory, ignoring if it is already missing
+        shutil.rmtree(_CONF["spec_paths"]["root"], ignore_errors=True)
+        # Directory to extract into
+        temp_dir = tempfile.mkdtemp()
+        # Download and extract a new release to /spec/repo
+        if _CONF["spec_release_path"]:
+            update_name = _CONF["spec_release_path"]
+            _extract_tarball(_CONF["spec_release_path"], temp_dir)
+        else:
+            if _CONF["spec_release_url"]:
+                tarball_url = _CONF["spec_release_url"]
+            else:
+                tarball_url = _fetch_github_release_url()
+            update_name = tarball_url
+            resp = requests.get(tarball_url, stream=True)
+            with tempfile.NamedTemporaryFile() as temp_file:
+                # The temp file will be closed/deleted when the context ends
+                # Download from the tarball url to the temp file
+                _download_file(resp, temp_file.name)
+                # Extract the downloaded tarball into the spec path
+                _extract_tarball(temp_file.name, temp_dir)
+        # At this point, the repo content is extracted into the temp directory
+        # Get the top-level directory name from the tarball
+        subdir = os.listdir(temp_dir)[0]
+        # Move /tmp/temp_dir/x/spec into /spec
+        shutil.move(os.path.join(temp_dir, subdir, "spec"), _CONF["spec_paths"]["root"])
+        # Remove our temporary extraction directory
+        shutil.rmtree(temp_dir)
+    # Initialize all the collections
+    if init_collections:
+        do_init_collections()
+        do_init_views()
+        do_init_analyzers()
+        # Check that local specs have matching server specs
+        # Necessary because creating resources like indexes
+        # does not overwrite any pre-existing indexes
+        failed_names = ensure_all()
+        if any([name for schema_type, names in failed_names.items()
for name in names]): + raise RuntimeError( + "Some local specs have no matching server specs:" + "\n" + json.dumps(failed_names, indent=4) + ) + return update_name + + +def do_init_collections(): + """Initialize any uninitialized collections in the database from a set of collection schemas.""" + for path in get_schema_type_paths("collection"): + coll_name = os.path.basename(os.path.splitext(path)[0]) + with open(path) as fd: + config = yaml.safe_load(fd) + arango_client.create_collection(coll_name, config) + + +def do_init_views(): + """Initialize any uninitialized views in the database from a set of schemas.""" + for path in get_schema_type_paths("view"): + view_name = os.path.basename(os.path.splitext(path)[0]) + with open(path) as fd: + config = json.load(fd) + arango_client.create_view(view_name, config) + + +def do_init_analyzers(): + for path in get_schema_type_paths("analyzer"): + analyzer_name = os.path.basename(os.path.splitext(path)[0]) + with open(path) as fd: + config = json.load(fd) + arango_client.create_analyzer(analyzer_name, config) + + +def _fetch_github_release_url(): + """Find the latest relation engine spec release using the github api.""" + # Download information about the latest release + release_resp = requests.get(_CONF["spec_repo_url"] + "/releases/latest") + release_info = release_resp.json() + if release_resp.status_code != 200: + # This may be a github API rate usage limit, or some other error + raise RuntimeError(release_info["message"]) + return release_info["tarball_url"] + + +def _download_file(resp, path): + """Download a streaming response as a file to path.""" + with open(path, "wb") as tar_file: + for chunk in resp.iter_content(chunk_size=1024): + tar_file.write(chunk) + + +def _extract_tarball(tar_path, dest_dir): + """Extract a gzipped tarball to a destination directory.""" + with tarfile.open(tar_path, "r:gz") as tar: + tar.extractall(path=dest_dir) + + +def _has_latest_spec(info): + """Check if downloaded release info matches the latest downloaded spec.""" + release_id = str(info["id"]) + if os.path.exists(_CONF["spec_paths"]["release_id"]): + with open(_CONF["spec_paths"]["release_id"], "r") as fd: + current_release_id = fd.read() + if release_id == current_release_id: + return True + return False + + +def _save_release_id(info): + """Save a release ID as the latest downloaded spec.""" + release_id = str(info["id"]) + # Write the release ID to /spec/.release_id + with open(_CONF["spec_release_id_path"], "w") as fd: + fd.write(release_id) + + +if __name__ == "__main__": + download_specs() diff --git a/relation_engine_server/utils/spec_loader.py b/relation_engine_server/utils/spec_loader.py new file mode 100644 index 00000000..8e2863da --- /dev/null +++ b/relation_engine_server/utils/spec_loader.py @@ -0,0 +1,184 @@ +""" +Utilities for loading stored queries, collections, and migrations from the spec. 
+""" +import glob +import json +import os +import re +import yaml + +from relation_engine_server.utils.config import get_config + +_CONF = get_config() + +_schema_types = { + # singular version of schema_type names + "singular": ["collection", "dataset", "data_source", "stored_query", "view"], + # plural version of schema_type names + "plural": ["collections", "datasets", "data_sources", "stored_queries", "views"], +} + +_VALID_SCHEMA_TYPES = _schema_types["singular"] + _schema_types["plural"] + + +def _switch_schema_type_name(schema_type, to_form): + """switch a schema_type name to the `to_form` version, ensuring that the schema exists first""" + + # this schema type does not exist + if schema_type not in _VALID_SCHEMA_TYPES: + raise SchemaNonexistent(schema_type) + + if schema_type in _schema_types[to_form]: + return schema_type + + from_form = "singular" if to_form == "plural" else "plural" + ix = _schema_types[from_form].index(schema_type) + return _schema_types[to_form][ix] + + +def pluralise_schema_type(schema_type): + """ensure a schema_type is in the plural form""" + return _switch_schema_type_name(schema_type, "plural") + + +def singularise_schema_type(schema_type): + """ensure a schema_type is in the singular form""" + return _switch_schema_type_name(schema_type, "singular") + + +def get_names(schema_type): + """ + get a list of all schemas of the specified schema_type + + Throws a SchemaNonexistent error if the schema_type does not exist. + """ + + # ensure that the name is in the plural form + schema_search_type = pluralise_schema_type(schema_type) + + yaml_paths = _find_paths(_CONF["spec_paths"][schema_search_type], "*.yaml") + json_paths = _find_paths(_CONF["spec_paths"][schema_search_type], "*.json") + + names = [_get_file_name(path) for path in sorted(yaml_paths + json_paths)] + + return names + + +def get_schema(schema_type, name, path_only=False): + """ + Get content or file path for a named schema of specified schema_type. + If path_only is true, the file path is returned; if not, the file contents are returned. + + Throws a SchemaNonexistent error if the named schema does not exist. 
+ """ + + schema_search_type = pluralise_schema_type(schema_type) + + yaml_paths = _find_paths(_CONF["spec_paths"][schema_search_type], f"{name}.yaml") + json_paths = _find_paths(_CONF["spec_paths"][schema_search_type], f"{name}.json") + # ensure we're using the canonical path and that all paths are unique + # we are only interested in paths that are in the designated spec repo + repo_path = os.path.abspath(_CONF["spec_paths"]["root"]) + all_paths_set = set(os.path.abspath(path) for path in yaml_paths + json_paths) + all_paths = [p for p in all_paths_set if p.startswith(repo_path)] + + if len(all_paths) == 0: + raise SchemaNonexistent(singularise_schema_type(schema_type), name) + + # ignore duplicates or multiple results, just go with the first one + path = all_paths[0] + if path_only: + return path + + with open(path) as fd: + if path.endswith(".json"): + contents = json.load(fd) + else: + contents = yaml.safe_load(fd) + + if schema_search_type == "data_sources" and "logo_path" in contents: + # Append the logo root url to be the ui-assets server url with the correct environment + base_logo_url = re.sub( + r"\/services\/?", "/ui-assets", _CONF["kbase_endpoint"] + ) + contents["logo_url"] = base_logo_url + contents["logo_path"] + del contents["logo_path"] + + return contents + + +def get_collection_names(): + """Return a dict of vertex and edge base names.""" + return get_names("collections") + + +def get_data_source_names(): + """Return an array of all the data source names.""" + return get_names("data_sources") + + +def get_stored_query_names(): + """Return an array of all stored queries base names.""" + return get_names("stored_queries") + + +def get_view_names(): + """Return an array of all view base names.""" + return get_names("views") + + +def get_collection(name, path_only=False): + """Get YAML content (or file path) for a specific collection. Throws an error if nonexistent.""" + return get_schema("collection", name, path_only) + + +def get_schema_for_doc(doc_id, path_only=False): + """Get the schema for a particular document by its full ID.""" + (coll_name, _) = doc_id.split("/") + return get_schema("collection", coll_name, path_only) + + +def get_data_source(name, path_only=False): + """Get YAML content (or file path) for a data source. Throws an error if it does not exist.""" + return get_schema("data_source", name, path_only) + + +def get_stored_query(name, path_only=False): + """Get AQL content or file path for a specific stored query. Throws an error if nonexistent.""" + return get_schema("stored_query", name, path_only) + + +def get_view(name, path_only=False): + """Get content or file path for a view file. Throws an error if nonexistent.""" + return get_schema("view", name, path_only) + + +def _find_paths(dir_path, file_pattern): + """ + Return all file paths from a filename pattern, starting from a parent + directory and looking in all subdirectories. + """ + pattern = os.path.join(dir_path, "**", file_pattern) + return glob.glob(pattern, recursive=True) + + +def _get_file_name(path): + """ + Get the file base name without extension from a file path. + """ + return os.path.splitext(os.path.basename(path))[0] + + +class SchemaNonexistent(Exception): + """Requested schema or schema type is not in the spec""" + + def __init__(self, schema_type, name=None): + self.schema_type = schema_type + self.name = name + + def __str__(self): + schema_type = self.schema_type.capitalize().replace("_", " ") + if self.name is None: + return f"{schema_type} does not exist." 
+ + return f"{schema_type} '{self.name}' does not exist." diff --git a/relation_engine_server/utils/wait_for.py b/relation_engine_server/utils/wait_for.py new file mode 100644 index 00000000..06b4a59a --- /dev/null +++ b/relation_engine_server/utils/wait_for.py @@ -0,0 +1,89 @@ +""" +Block until all dependent services come online. +""" +import requests +import time +import sys +from relation_engine_server.utils.config import get_config +from typing import List + +_CONF = get_config() + + +def get_service_conf(): + return { + "arangodb": { + "url": _CONF["api_url"] + "/collection", + "callback": _assert_json_content, + "raise_for_status": True, + }, + "auth": { + "url": _CONF["auth_url"], + }, + "workspace": { + "url": _CONF["workspace_url"], + }, + "localhost": { + "url": "http://127.0.0.1:5000", + "raise_for_status": True, + }, + } + + +def wait_for_service(service_list: List[str]) -> None: + """wait for a service or list of services to start up""" + timeout = int(time.time()) + 60 + services_pending = set(service_list) + service_conf = get_service_conf() + while services_pending: + still_pending = set() + for name in services_pending: + try: + conf = service_conf[name] + auth = (_CONF["db_user"], _CONF["db_pass"]) + resp = requests.get(conf["url"], auth=auth) + if conf.get("raise_for_status"): + resp.raise_for_status() + if conf.get("callback") is not None: + conf["callback"](resp) + # The service is up + except Exception as err: + print(f"Still waiting for {name} to start...") + if int(time.time()) > timeout: + raise RuntimeError( + f"Timed out waiting for {name} to start with error: {err}" + ) + still_pending.add(name) + time.sleep(3) + services_pending = still_pending + print(f"{', '.join(service_list)} started!") + + +def wait_for_arangodb(): + """wait for arangodb to be ready""" + wait_for_service(["arangodb"]) + + +def wait_for_services(): + """wait for the workspace, auth, and arango to start up""" + wait_for_service(["auth", "workspace", "arangodb"]) + + +def wait_for_api(): + """wait for the workspace, auth, arango, AND localhost:5000 to start up""" + wait_for_services() + wait_for_service(["localhost"]) + + +def _assert_json_content(resp: requests.models.Response) -> None: + """Assert that a response body has non-empty JSON content.""" + if len(resp.content) == 0: + raise RuntimeError("No content in response") + resp.json() + + +if __name__ == "__main__": + if sys.argv[1] == "services": + wait_for_services() + elif sys.argv[1] == "api": + wait_for_api() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..0960d7cc --- /dev/null +++ b/requirements.txt @@ -0,0 +1,14 @@ +Flask==1.0.2 +itsdangerous==2.0.1 +greenlet==0.4.16 +gunicorn==19.9.0 +gevent==1.3.7 +simplejson==3.16.0 +python-dotenv==0.9.1 +requests==2.20.0 +jsonpointer==2.0 +jsonschema==3.2.0 +jsonschema[format]==3.2.0 +pyyaml==5.4 +rfc3987==1.3.8 +jinja2==3.0.3 diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/data/edges.yaml b/scripts/data/edges.yaml new file mode 100644 index 00000000..8d36c3e0 --- /dev/null +++ b/scripts/data/edges.yaml @@ -0,0 +1,33 @@ +name: __NAME___edges +type: edge +delta: true + +indexes: + - type: persistent + fields: [id, expired, created] + - type: persistent + fields: [expired, created, last_version] + +schema: + "$schema": http://json-schema.org/draft-07/schema# + title: __NAME___edges + type: object + description: A entry for edges in the __NAME__ ontology hierarchy + properties: + id: + 
type: string + description: an edge ID, consisting of from::to::type + type: + type: string + description: __NAME__ edge type + from: + type: string + description: __NAME__ id + to: + type: string + description: __NAME__ id + required: + - id + - type + - from + - to diff --git a/scripts/data/merges.yaml b/scripts/data/merges.yaml new file mode 100644 index 00000000..fe32ff7c --- /dev/null +++ b/scripts/data/merges.yaml @@ -0,0 +1,33 @@ +name: __NAME___merges +type: edge +delta: true + +indexes: + - type: persistent + fields: [id, expired, created] + - type: persistent + fields: [expired, created, last_version] + +schema: + "$schema": http://json-schema.org/draft-07/schema# + title: __NAME___merges + type: object + description: A entry for merge edges in the __NAME__ ontology hierarchy + properties: + id: + type: string + description: an edge ID, consisting of from::to::type + type: + type: string + description: __NAME__ merge edge type + from: + type: string + description: __NAME__ id + to: + type: string + description: __NAME__ id + required: + - id + - type + - from + - to diff --git a/scripts/data/terms.yaml b/scripts/data/terms.yaml new file mode 100644 index 00000000..a838ee2b --- /dev/null +++ b/scripts/data/terms.yaml @@ -0,0 +1,111 @@ +name: __NAME___terms +type: vertex +delta: true + +indexes: + - type: persistent + fields: [id, expired, created] + - type: persistent + fields: [expired, created, last_version] + +schema: + "$schema": http://json-schema.org/draft-07/schema# + title: __NAME___terms + type: object + description: A entry for vertices in the __NAME__ ontology hierarchy + properties: + id: + type: string + description: The unique id of the current term. + type: + type: string + description: The type of the node. + name: + type: ["null", "string"] # some OBO classes have no label + description: The term name. + namespace: + type: ["null", "string"] # some OBO classes have no namespace + description: The namespace of the term. + alt_ids: + type: array + items: + type: string + description: Defines an alternate id for this term. A term may have any number + of alternate ids. + def: + type: ["null", "object"] # some OBO classes have no definition + description: The definition of the current term. + required: + - val + properties: + pred: + type: string + description: The definition predicate + val: + type: string + description: The definition value + xrefs: + type: array + description: A dbxref that describes an analagous term in another vocabulary + items: + type: string + comments: + type: array + items: + type: string + description: Comments for this term. + subsets: + type: array + items: + type: string + description: This tag indicates a term subset to which this term belongs. + synonyms: + description: This tag gives a synonym for this term, some xrefs to describe the + origins of the synonym, and may indicate a synonym category or scope information. 
+ type: array + items: + type: object + required: + - val + properties: + pred: + type: string + description: The synonym predicate + val: + type: string + description: The synonym value + xrefs: + type: array + description: A dbxref that describes an analagous term in another vocabulary + items: + type: string + xrefs: + description: DBxrefs that describes an analagous term in another vocabulary + type: array + items: + type: object + required: + - val + properties: + pred: + type: string + description: The xref predicate + val: + type: string + description: The xref value + xrefs: + type: array + description: A dbxref that describes an analagous term in another vocabulary + items: + type: string + required: + - id + - type + - name + - namespace + - alt_ids + - def + - comments + - subsets + - synonyms + - xrefs diff --git a/scripts/docker_deploy b/scripts/docker_deploy new file mode 100755 index 00000000..7c4c0a10 --- /dev/null +++ b/scripts/docker_deploy @@ -0,0 +1,18 @@ +#!/bin/bash +# Build and deploy the docker image to Dockerhub + +# Exit on error +set -e +# Show the commands we execute +set -o xtrace + +ver=$(cat VERSION) +export IMAGE_NAME="kbase/relation_engine_api:$ver" +export BRANCH=`git symbolic-ref --short HEAD` +export DATE=`date -u +"%Y-%m-%dT%H:%M:%SZ"` +export COMMIT=`git rev-parse --short HEAD` +docker build --build-arg BUILD_DATE=$DATE \ + --build-arg VCS_REF=$COMMIT \ + --build-arg BRANCH=$BRANCH \ + -t ${IMAGE_NAME} . +docker push $IMAGE_NAME diff --git a/scripts/prepare_ontology.py b/scripts/prepare_ontology.py new file mode 100644 index 00000000..1b4e9f48 --- /dev/null +++ b/scripts/prepare_ontology.py @@ -0,0 +1,92 @@ +import sys +import os +import yaml +import json +import shutil +import warnings + +""" +python3 scripts/prepare_ontology.py scripts/test/data/data_sources.json fake_ontology +""" + +PLACEHOLDER = "__NAME__" +BIN_PATH = os.path.dirname(os.path.abspath(__file__)) +COLLECTIONS_PATH = os.path.join(BIN_PATH, "../spec/collections") +DATASOURCES_PATH = os.path.join(BIN_PATH, "../spec/data_sources") +DATAFILES_PATH = os.path.join(BIN_PATH, "data") +COLLECTIONS_DATAFILES = ["terms", "edges", "merges"] + + +def main(): + if len(sys.argv) <= 2: + raise ValueError("data_source and/or namespace are missing") + + datasource = parse_input(sys.argv[1], sys.argv[2]) + + prepare_collections_file(datasource, COLLECTIONS_PATH) + prepare_data_sources_file(datasource, DATASOURCES_PATH) + return + + +def parse_input(input, name): + with open(input) as file: + for d in json.load(file): + if d.get("ns") == name: + return d + raise ValueError("no namespace: " + name) + + +def prepare_collections_file(datasource, collections_path): + if not os.path.exists(collections_path): + raise FileNotFoundError(collections_path + " doesn't exists") + name, type = parse_namespace(datasource["ns"]) + target_dir = os.path.join(collections_path, name.upper()) + os.makedirs(target_dir, exist_ok=True) + for f in COLLECTIONS_DATAFILES: + source_file = os.path.join(DATAFILES_PATH, f + ".yaml") + target_file = os.path.join(target_dir, name.upper() + "_" + f + ".yaml") + data = "" + with open(source_file, "r") as source: + data = yaml.safe_load(source.read().replace(PLACEHOLDER, name.upper())) + if not os.path.exists(target_file): + with open(target_file, "w") as target: + yaml.dump(data, target) + else: + warnings.warn(target_file + " exists") + return target_dir + + +def prepare_data_sources_file(datasource, datasources_path): + if not os.path.exists(datasources_path): + raise 
FileNotFoundError(datasources_path + " doesn't exists") + name, type = parse_namespace(datasource["ns"]) + target_file = os.path.join(datasources_path, datasource["ns"] + ".yaml") + data = { + "name": datasource["ns"], + "category": type, + "title": datasource["title"], + "home_url": datasource["home_url"], + "data_url": datasource["data_url"], + } + if not os.path.exists(target_file): + with open(target_file, "w") as target: + yaml.dump(data, target) + else: + warnings.warn(target_file + " exists") + return target_file + + +def parse_namespace(ns): + return tuple(ns.split("_")) + + +def clean_up_data(path): + if os.path.exists(path): + if os.path.isfile(path): + os.remove(path) + elif os.path.isdir(path): + shutil.rmtree(path) + + +if __name__ == "__main__": + main() diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh new file mode 100644 index 00000000..6dff8759 --- /dev/null +++ b/scripts/run_tests.sh @@ -0,0 +1,23 @@ +#!/bin/sh + +set -e + +# Create tarball of the test spec directory +(cd /app/relation_engine_server/test/spec_release && \ + tar czvf spec.tar.gz sample_spec_release) + +black . +flake8 --max-complexity 20 /app +mypy --ignore-missing-imports /app +bandit -r /app + +# start server, using the specs in /spec/repo +sh /app/scripts/start_server.sh & +coverage erase +# spec validation +python -m spec.validate +# run importer/, relation_engine_server/, and spec/ tests +coverage run --branch -m pytest +# RE client tests +PYTHONPATH=client_src python -m pytest client_src/test +coverage html --omit=*/test_* diff --git a/scripts/start_server.sh b/scripts/start_server.sh new file mode 100644 index 00000000..71e06535 --- /dev/null +++ b/scripts/start_server.sh @@ -0,0 +1,19 @@ +#!/bin/sh +set -e + +# Set the number of gevent workers to number of cores * 2 + 1 +# See: http://docs.gunicorn.org/en/stable/design.html#how-many-workers +calc_workers="$(($(nproc) * 2 + 1))" +# Use the WORKERS environment variable, if present +workers=${WORKERS:-$calc_workers} + +python -m relation_engine_server.utils.wait_for services +python -m relation_engine_server.utils.pull_spec + +gunicorn \ + --worker-class gevent \ + --timeout 1800 \ + --workers $workers \ + --bind :5000 \ + ${DEVELOPMENT:+"--reload"} \ + relation_engine_server.main:app diff --git a/scripts/test/__init__.py b/scripts/test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/test/data/data_sources.json b/scripts/test/data/data_sources.json new file mode 100644 index 00000000..1ad34c47 --- /dev/null +++ b/scripts/test/data/data_sources.json @@ -0,0 +1,209 @@ +[ + { + "ns": "ncbi_taxonomy", + "type": "taxonomy", + "title": "National Center for Biotechnology Information", + "short_title": "NCBI", + "data_url": "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/", + "home_url": "https://www.ncbi.nlm.nih.gov/taxonomy", + "logo_url": "https://ci.kbase.us/ui-assets/images/third-party-data-sources/ncbi/logo-51-64.png", + "license": null, + "item_link": { + "url_template": "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id={{id}}", + "label": "NCBI Taxonomy" + }, + "citation": "Schoch CL, et al. NCBI Taxonomy: a comprehensive update on curation, resources and tools. Database (Oxford). 2020: baaa062. [PubMed](https://www.ncbi.nlm.nih.gov/pubmed/32761142)\n\nSayers EW, et al. GenBank. Nucleic Acids Res. 2019. 47(D1):D94-D99. 
[PubMed](https://www.ncbi.nlm.nih.gov/pubmed/30365038)", + "additional_fields": [ + { + "id": "ncbi_taxon_id", + "type": "number", + "label": "NCBI ID", + "tooltip": "ID for this taxon at NCBI", + "description": "" + }, + { + "id": "gencode", + "type": "number", + "label": "Genetic Code", + "tooltip": "NCBI Genetic code", + "description": "" + }, + { + "id": "aliases", + "type": "array", + "label": "Aliases", + "tooltip": "Aliases for this taxon", + "description": "" + } + ] + }, + { + "ns": "gtdb", + "type": "taxonomy", + "title": "Genome Taxonomy Database", + "short_title": "GTDB", + "data_url": "https://data.ace.uq.edu.au/public/gtdb/data/releases/", + "home_url": "https://gtdb.ecogenomic.org", + "logo_url": "https://ci.kbase.us/ui-assets/images/third-party-data-sources/gtdb/logo-128-64.png", + "license": { + "url": "http://creativecommons.org/licenses/by-sa/4.0/", + "label": "Creative Commons Attribution-ShareAlike 4.0 International License" + }, + "item_link": { + "url_template": "https://gtdb.ecogenomic.org/genomes?gid={{id}}", + "label": "GTDB Taxonomy" + }, + "citation": "Parks, D.H., et al. (2020). [\"A complete domain-to-species taxonomy for Bacteria and Archaea.\"](https://rdcu.be/b3OI7) Nature Biotechnology, https://doi.org/10.1038/s41587-020-0501-8.\n\nParks, D.H., et al. (2018). [\"A standardized bacterial taxonomy based on genome phylogeny substantially revises the tree of life.\"](https://www.nature.com/articles/nbt.4229) Nature Biotechnology, 36: 996-1004.", + "additional_fields": [] + }, + { + "ns": "rdp_taxonomy", + "type": "taxonomy", + "title": "Ribosomal Database Project", + "short_title": "RDP", + "data_url": "http://rdp.cme.msu.edu/misc/resources.jsp", + "home_url": "http://rdp.cme.msu.edu/taxomatic/main.spr", + "logo_url": "http://rdp.cme.msu.edu/images/rdpinsider108x81.png", + "license": { + "url": "http://creativecommons.org/licenses/by-sa/3.0/", + "label": "Creative Commons Attribution-ShareAlike 3.0 Unported License" + }, + "item_link": null, + "citation": "Cole, J. R., Q. Wang, J. A. Fish, B. Chai, D. M. McGarrell, Y. Sun, C. T. Brown, A. Porras-Alfaro, C. R. Kuske, and J. M. Tiedje. 2014. Ribosomal Database Project: data and tools for high throughput rRNA analysis Nucl. Acids Res. 
42(Database issue):D633-D642; doi: [10.1093/nar/gkt1244](http://dx.doi.org/10.1093/nar/gkt1244) [[PMID: 24288368]](http://www.ncbi.nlm.nih.gov/pubmed/24288368)", + "additional_fields": [ + { + "id": "incertae_sedis", + "type": "boolean", + "label": "Incertae Sedis?", + "tooltip": "ID for this taxon at NCBI", + "description": "Indicates a taxonomic group where its broader relationships are unknown or undefined" + }, + { + "id": "molecule", + "type": "string", + "label": "Molecule", + "tooltip": "", + "description": "" + }, + { + "id": "unclassified", + "type": "boolean", + "label": "Unclassified?", + "tooltip": "", + "description": "" + } + ] + }, + { + "ns": "silva_taxonomy", + "type": "taxonomy", + "title": "SILVA", + "short_title": "SILVA", + "data_url": "https://arb-silva.de/no_cache/download/archive/", + "home_url": "https://arb-silva.de", + "logo_url": "https://www.arb-silva.de/fileadmin/graphics_general/main/logos/silva-subtitle.svg", + "license": { + "url": "https://creativecommons.org/licenses/by/4.0/", + "label": "Create Commons Attribution 4.0 (CC-BY 4.0)" + }, + "item_link": { + "url_template": "https://www.arb-silva.de/browser/ssu/silva/{{id}}", + "label": "SILVA Taxonomy" + }, + "citation": "Quast C, Pruesse E, Yilmaz P, Gerken J, Schweer T, Yarza P, Peplies J, Glöckner FO (2013) The SILVA ribosomal RNA gene database project: improved data processing and web-based tools. [Nucl. Acids Res. 41 (D1): D590-D596](http://nar.oxfordjournals.org/content/41/D1/D590).\n\nYilmaz P, Parfrey LW, Yarza P, Gerken J, Pruesse E, Quast C, Schweer T, Peplies J, Ludwig W, Glöckner FO (2014) The SILVA and \"All-species Living Tree Project (LTP)\" taxonomic frameworks. [Nucl. Acids Res. 42:D643-D648](http://nar.oxfordjournals.org/content/42/D1/D643.full)\n\nGlöckner FO, Yilmaz P, Quast C, Gerken J, Beccati A, Ciuprina A, Bruns G, Yarza P, Peplies J, Westram R, Ludwig W (2017) 25 years of serving the community with ribosomal RNA gene reference databases and tools. [J. Biotechnol](http://www.sciencedirect.com/science/article/pii/S0168165617314943).", + "additional_fields": [ + { + "id": "datasets", + "type": "array", + "label": "Data Sets", + "tooltip": "", + "description": "" + }, + { + "id": "sequence", + "type": "sequence", + "label": "Sequence", + "tooltip": "", + "description": "" + } + ] + }, + { + "ns": "go_ontology", + "type": "ontology", + "title": "Gene Ontology", + "short_title": "GO", + "data_url": "http://release.geneontology.org/", + "home_url": "http://geneontology.org/", + "logo_url": "https://ci.kbase.us/ui-assets/images/third-party-data-sources/go/logo-248-64.png", + "license": { + "url": "https://creativecommons.org/licenses/by/4.0/legalcode", + "label": "Creative Commons Attribution 4.0 Unported License" + }, + "citation": "Ashburner et al. Gene ontology: tool for the unification of biology. Nat Genet. May 2000;25(1):25-9. [[abstract](https://www.ncbi.nlm.nih.gov/pubmed/10802651) | [full text](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3037419/)]\n\nThe Gene Ontology Consortium. The Gene Ontology Resource: 20 years and still GOing strong. Nucleic Acids Res. Jan 2019;47(D1):D330-D338. 
[[abstract](https://www.ncbi.nlm.nih.gov/pubmed/30395331) | [full text](https://academic.oup.com/nar/article/47/D1/D330/5160994)]", + "item_link": { + "url_template": "http://amigo.geneontology.org/amigo/term/{{term}}", + "label": "Gene Ontology AmiGO" + }, + "additional_fields": [ + { + "id": "synonyms", + "type": "array", + "label": "Synonyms", + "tooltip": "", + "description": "" + } + ] + }, + { + "ns": "envo_ontology", + "type": "ontology", + "title": "Environmental Ontology", + "short_title": "ENVO", + "data_url": "https://github.com/EnvironmentOntology/envo/releases", + "home_url": "https://sites.google.com/site/environmentontology/", + "logo_url": "https://ci.kbase.us/ui-assets/images/third-party-data-sources/envo/logo-119-64.png", + "license": { + "url": "https://creativecommons.org/licenses/by/3.0/", + "label": "Attribution 3.0 Unported (CC BY 3.0)" + }, + "citation": "Buttigieg, P. L., Morrison, N., Smith, B., Mungall, C. J., & Lewis, S. E. (2013). The environment ontology: contextualising biological and biomedical entities. Journal of Biomedical Semantics, 4(1), 43. [doi:10.1186/2041-1480-4-43](http://www.dx.doi.org/10.1186/2041-1480-4-43)\n \nButtigieg, P. L., Pafilis, E., Lewis, S. E., Schildhauer, M. P., Walls, R. L., & Mungall, C. J. (2016). The environment ontology in 2016: bridging domains with increased scope, semantic density, and interoperation. Journal of Biomedical Semantics, 7(1), 57. [doi:10.1186/s13326-016-0097-6](https://doi.org/10.1186/s13326-016-0097-6)\n ", + "item_link": { + "url_template": "http://purl.obolibrary.org/obo/{{term}}", + "label": "ENVO Ontology Ontobee" + }, + "additional_fields": [] + }, + { + "ns": "gaz_ontology", + "type": "ontology", + "title": "Gazetteer Ontology", + "short_title": "GAZ", + "home_url": "http://environmentontology.github.io/gaz/", + "data_url": "http://purl.obolibrary.org/obo/gaz.obo" + }, + { + "ns": "uo_ontology", + "type": "ontology", + "title": "Units of measurement ontology", + "short_title": "UO", + "home_url": "https://github.com/bio-ontology-research-group/unit-ontology", + "data_url": "http://purl.obolibrary.org/obo/uo.obo" + }, + { + "ns": "po_ontology", + "type": "ontology", + "title": "Plant Ontology", + "short_title": "PO", + "home_url": "http://browser.planteome.org/amigo", + "data_url": "http://purl.obolibrary.org/obo/po.obo" + }, + { + "ns": "fake_ontology", + "type": "ontology", + "title": "Fake Ontology", + "short_title": "FAKE", + "home_url": "http://environmentontology.github.io/fake/", + "data_url": "http://purl.obolibrary.org/obo/fake.obo" + } +] diff --git a/scripts/test/test_prepare_ontology.py b/scripts/test/test_prepare_ontology.py new file mode 100644 index 00000000..a800bfe1 --- /dev/null +++ b/scripts/test/test_prepare_ontology.py @@ -0,0 +1,61 @@ +""" +Tests for the prepare_ontology + +These tests run within the re_api docker image. 
+""" +import unittest +import os +from scripts.prepare_ontology import ( + prepare_collections_file, + prepare_data_sources_file, + parse_input, + parse_namespace, + clean_up_data, +) + +_TEST_DIR = "/app/scripts/test" +_TEST_NAMESPACE = "fake_ontology" + + +class Test_prepare_ontology(unittest.TestCase): + @classmethod + def setUpClass(self): + self.data_sources_file = os.path.join(_TEST_DIR, "data", "data_sources.json") + + def test_parse_input(self): + d = parse_input(self.data_sources_file, _TEST_NAMESPACE) + self.assertEqual(d["ns"], _TEST_NAMESPACE) + with self.assertRaises(FileNotFoundError): + parse_input("non_exist_file", _TEST_NAMESPACE) + with self.assertRaises(ValueError) as ctx: + parse_input(self.data_sources_file, "non_exist_ns") + self.assertEqual("no namespace: non_exist_ns", str(ctx.exception)) + + def test_parse_namespace(self): + n, t = parse_namespace(_TEST_NAMESPACE) + self.assertEqual(n, "fake") + self.assertEqual(t, "ontology") + + def test_data_sources_file(self): + d = parse_input(self.data_sources_file, _TEST_NAMESPACE) + ret = prepare_data_sources_file(d, _TEST_DIR) + self.assertTrue(os.path.exists(ret)) + with self.assertWarns(UserWarning): + prepare_data_sources_file(d, _TEST_DIR) + clean_up_data(ret) + self.assertFalse(os.path.exists(ret)) + with self.assertRaises(FileNotFoundError) as ctx: + prepare_data_sources_file(d, "non_exist_path") + self.assertEqual("non_exist_path doesn't exists", str(ctx.exception)) + + def test_collections_file(self): + d = parse_input(self.data_sources_file, _TEST_NAMESPACE) + ret = prepare_collections_file(d, _TEST_DIR) + self.assertTrue(os.path.exists(ret)) + with self.assertWarns(UserWarning): + prepare_collections_file(d, _TEST_DIR) + clean_up_data(ret) + self.assertFalse(os.path.exists(ret)) + with self.assertRaises(FileNotFoundError) as ctx: + prepare_collections_file(d, "non_exist_path") + self.assertEqual("non_exist_path doesn't exists", str(ctx.exception)) diff --git a/spec/README.md b/spec/README.md new file mode 100644 index 00000000..3266d3e4 --- /dev/null +++ b/spec/README.md @@ -0,0 +1,19 @@ +# Relation Engine Spec + +This repo holds the [stored queries](spec/stored_queries), [collections](spec/collections), and [migrations](migrations) for the relation engine graph database service. + +These specifications are used by the [Relation Engine API](relation_engine_server). + +* **[Stored queries](spec/stored_queries)** are stored [AQL queries](https://docs.arangodb.com/3.5/AQL/index.html) that can be used +by KBase apps to fetch data from the database. +* **[Collections, or document schemas,](spec/collections)** are [JSON schemas](https://json-schema.org/) that define what form of data can be stored in the database's collections. +* **[Datasets](spec/datasets)** contain partial and full schemas specific to a certain dataset. +* **[Data sources](spec/data_sources)** contain general information about where some of our imported data comes from. +* **[Views](spec/views)** are raw ArangoSearch view configuration files +* **[Analyzers](spec/analyzers)** are analyzer configuration files + +## Development + +### Running tests + +Tests are located in the [spec/tests](spec/tests) directory, and are run as part of the test suite triggered by `scripts/run_tests.sh`. 
diff --git a/spec/__init__.py b/spec/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/spec/analyzer_schema.yaml b/spec/analyzer_schema.yaml
new file mode 100644
index 00000000..ec724819
--- /dev/null
+++ b/spec/analyzer_schema.yaml
@@ -0,0 +1,11 @@
+name: analyzer_schema
+type: object
+required: ['name', 'type']
+properties:
+  name:
+    type: string
+    title: Analyzer name
+  type:
+    type: string
+    title: Analyzer type
+    examples: ['identity', 'text']
diff --git a/spec/analyzers/README.md b/spec/analyzers/README.md
new file mode 100644
index 00000000..e7905e26
--- /dev/null
+++ b/spec/analyzers/README.md
@@ -0,0 +1,3 @@
+# Analyzers
+
+These are JSON files for Arango analyzers. The data in them is used by the [Relation Engine API](https://github.com/kbase/relation_engine) to create analyzers via the `POST /_api/analyzer` endpoint of the ArangoDB HTTP interface. Please [see the ArangoDB docs](https://www.arangodb.com/docs/3.5/http/analyzers.html) for the full set of parameters available.
diff --git a/spec/analyzers/icu_tokenize.json b/spec/analyzers/icu_tokenize.json
new file mode 100644
index 00000000..3f69a950
--- /dev/null
+++ b/spec/analyzers/icu_tokenize.json
@@ -0,0 +1,11 @@
+{
+  "name": "icu_tokenize",
+  "type": "text",
+  "properties": {
+    "locale": "c.utf-8",
+    "accent": false,
+    "case": "lower",
+    "stemming": false,
+    "stopwords": []
+  }
+}
\ No newline at end of file
diff --git a/spec/collection_schema.yaml b/spec/collection_schema.yaml
new file mode 100644
index 00000000..b35d9fed
--- /dev/null
+++ b/spec/collection_schema.yaml
@@ -0,0 +1,45 @@
+name: collection_schema
+type: object
+required: ['name', 'type', 'schema']
+additionalProperties: false
+properties:
+  delta:
+    type: boolean
+    default: false
+  indexes:
+    type: array
+    items:
+      type: object
+      required: ['fields', 'type']
+      properties:
+        fields:
+          type: array
+          items:
+            type: string
+        type:
+          type: string
+          enum: ['fulltext', 'geo', 'hash', 'persistent']
+  name:
+    type: string
+    title: Collection name
+    format: regex
+    pattern: ^\w+$
+  schema:
+    type: object
+    required: ['properties', 'required']
+    properties:
+      description:
+        type: string
+      properties:
+        type: object
+      required:
+        type: array
+        items:
+          type: string
+      title:
+        type: string
+      type:
+        type: string
+  type:
+    type: string
+    enum: ['vertex', 'edge']
diff --git a/spec/collections/ENVO/ENVO_edges.yaml b/spec/collections/ENVO/ENVO_edges.yaml
new file mode 100644
index 00000000..b0e22639
--- /dev/null
+++ b/spec/collections/ENVO/ENVO_edges.yaml
@@ -0,0 +1,43 @@
+name: ENVO_edges
+type: edge
+delta: true
+
+indexes:
+  - type: persistent
+    fields: [id, expired, created]
+  - type: persistent
+    fields: [expired, created, last_version]
+
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  title: ENVO_edges
+  type: object
+  description: An entry for edges in the Environment Ontology (ENVO) hierarchy
+  properties:
+    id:
+      type: string
+      description: an edge ID, consisting of from::to::type
+      examples:
+        - ENVO:0000136::ENVO:0031501::is_a
+        - ENVO:0000022::ENVO:0051231::is_a
+    type:
+      type: string
+      description: ENVO edge type
+      examples:
+        - is_a
+        - causally_upstream_of_or_within
+    from:
+      type: string
+      description: ENVO id
+      examples:
+        - ENVO:0023052
+    to:
+      type: string
+      title: ENVO id
+      examples:
+        - ENVO:0008150
+  required:
+    - id
+    - type
+    - from
+    - to
diff --git a/spec/collections/ENVO/ENVO_merges.yaml b/spec/collections/ENVO/ENVO_merges.yaml
new file mode 100644
index 00000000..c0c59bdc
--- /dev/null
+++ b/spec/collections/ENVO/ENVO_merges.yaml
@@ -0,0 +1,43 @@
+name: ENVO_merges
+type: edge
+delta: true
+
+indexes:
+  - type: persistent
+    fields: [id, expired, created]
+  - type: persistent
+    fields: [expired, created, last_version]
+
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  title: ENVO_merges
+  type: object
+  description: An entry for merge edges in the Environment Ontology (ENVO) hierarchy
+  properties:
+    id:
+      type: string
+      description: an edge ID, consisting of from::to::type
+      examples:
+        - ENVO:0000136::ENVO:0031501::consider
+        - ENVO:0000022::ENVO:0051231::replaced_by
+    type:
+      type: string
+      description: ENVO merge edge type
+      examples:
+        - consider
+        - replaced_by
+    from:
+      type: string
+      description: ENVO id
+      examples:
+        - ENVO:0023052
+    to:
+      type: string
+      title: ENVO id
+      examples:
+        - ENVO:0008150
+  required:
+    - id
+    - type
+    - from
+    - to
diff --git a/spec/collections/ENVO/ENVO_terms.yaml b/spec/collections/ENVO/ENVO_terms.yaml
new file mode 100644
index 00000000..48076a28
--- /dev/null
+++ b/spec/collections/ENVO/ENVO_terms.yaml
@@ -0,0 +1,147 @@
+name: ENVO_terms
+type: vertex
+delta: true
+
+indexes:
+  - type: persistent
+    fields: [id, expired, created]
+  - type: persistent
+    fields: [expired, created, last_version]
+
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  title: ENVO_terms
+  type: object
+  description: An entry for vertices in the Environment Ontology (ENVO) hierarchy
+  properties:
+    id:
+      type: string
+      description: The unique id of the current term.
+      examples:
+        - ENVO:0022609
+        - ENVO:0044848
+    type:
+      type: string
+      description: The type of the node.
+      examples:
+        - CLASS
+        - INDIVIDUAL
+    name:
+      type: ["null", "string"]  # some OBO classes have no label
+      description: The term name.
+      examples:
+        - ice cap dome
+        - horse manure
+    namespace:
+      type: ["null", "string"]  # some OBO classes have no namespace
+      description: The namespace of the term.
+      examples:
+        - ENVO
+    alt_ids:
+      type: array
+      items:
+        type: string
+      description: Defines an alternate id for this term. A term may have any number
+        of alternate ids.
+      examples:
+        - ["ENVO:0019952"]
+        - ["ENVO:0050876"]
+        - ["ENVO:0044848"]
+    def:
+      type: ["null", "object"]  # some OBO classes have no definition
+      description: The definition of the current term.
+      required:
+        - val
+      properties:
+        pred:
+          type: string
+          description: The definition predicate
+        val:
+          type: string
+          description: The definition value
+          examples:
+            - A natural/cultural feature of outstanding or unique value because of its inherent
+              rarity, representative of aesthetic qualities or cultural significance.
+        xrefs:
+          type: array
+          description: A dbxref that describes an analogous term in another vocabulary
+          items:
+            type: string
+          examples:
+            - ["Geonames:feature"]
+            - ["https://en.wikipedia.org/wiki/Natural_Monument"]
+    comments:
+      type: array
+      items:
+        type: string
+      description: Comments for this term.
+      examples:
+        - ["This class refers to strictly sealed enclosures such as Biosphere 2 (https://en.wikipedia.org/wiki/Biosphere_2), rather than vivaria which allow matter exchange with external environmental systems."]
+    subsets:
+      type: array
+      items:
+        type: string
+      description: This tag indicates a term subset to which this term belongs.
+      examples:
+        - ["wwfBiome"]
+        - ["environmental_hazards"]
+    synonyms:
+      description: This tag gives a synonym for this term, some xrefs to describe the
+        origins of the synonym, and may indicate a synonym category or scope information.
+ type: array + items: + type: object + required: + - val + properties: + pred: + type: string + description: The synonym predicate + examples: + - hasBroadSynonym + - hasNarrowSynonym + val: + type: string + description: The synonym value + examples: + - HydrothermalVents + xrefs: + type: array + description: A dbxref that describes an analagous term in another vocabulary + items: + type: string + examples: + - ["NASA:earthrealm"] + xrefs: + description: DBxrefs that describes an analagous term in another vocabulary + type: array + items: + type: object + required: + - val + properties: + pred: + type: string + description: The xref predicate + val: + type: string + description: The xref value + examples: + - SPIRE:Soil + - https://en.wikipedia.org/wiki/Soil + xrefs: + type: array + description: A dbxref that describes an analagous term in another vocabulary + items: + type: string + required: + - id + - type + - name + - namespace + - alt_ids + - def + - comments + - subsets + - synonyms + - xrefs diff --git a/spec/collections/GAZ/GAZ_edges.yaml b/spec/collections/GAZ/GAZ_edges.yaml new file mode 100644 index 00000000..32a15f07 --- /dev/null +++ b/spec/collections/GAZ/GAZ_edges.yaml @@ -0,0 +1,37 @@ +delta: true +indexes: +- fields: + - id + - expired + - created + type: persistent +- fields: + - expired + - created + - last_version + type: persistent +name: GAZ_edges +schema: + $schema: http://json-schema.org/draft-07/schema# + description: A entry for edges in the GAZ ontology hierarchy + properties: + from: + description: GAZ id + type: string + id: + description: an edge ID, consisting of from::to::type + type: string + to: + description: GAZ id + type: string + type: + description: GAZ edge type + type: string + required: + - id + - type + - from + - to + title: GAZ_edges + type: object +type: edge diff --git a/spec/collections/GAZ/GAZ_merges.yaml b/spec/collections/GAZ/GAZ_merges.yaml new file mode 100644 index 00000000..e05bccd7 --- /dev/null +++ b/spec/collections/GAZ/GAZ_merges.yaml @@ -0,0 +1,37 @@ +delta: true +indexes: +- fields: + - id + - expired + - created + type: persistent +- fields: + - expired + - created + - last_version + type: persistent +name: GAZ_merges +schema: + $schema: http://json-schema.org/draft-07/schema# + description: A entry for merge edges in the GAZ ontology hierarchy + properties: + from: + description: GAZ id + type: string + id: + description: an edge ID, consisting of from::to::type + type: string + to: + description: GAZ id + type: string + type: + description: GAZ merge edge type + type: string + required: + - id + - type + - from + - to + title: GAZ_merges + type: object +type: edge diff --git a/spec/collections/GAZ/GAZ_terms.yaml b/spec/collections/GAZ/GAZ_terms.yaml new file mode 100644 index 00000000..810f7026 --- /dev/null +++ b/spec/collections/GAZ/GAZ_terms.yaml @@ -0,0 +1,121 @@ +delta: true +indexes: +- fields: + - id + - expired + - created + type: persistent +- fields: + - expired + - created + - last_version + type: persistent +name: GAZ_terms +schema: + $schema: http://json-schema.org/draft-07/schema# + description: A entry for vertices in the GAZ ontology hierarchy + properties: + alt_ids: + description: Defines an alternate id for this term. A term may have any number + of alternate ids. + items: + type: string + type: array + comments: + description: Comments for this term. + items: + type: string + type: array + def: + description: The definition of the current term. 
+ properties: + pred: + description: The definition predicate + type: string + val: + description: The definition value + type: string + xrefs: + description: A dbxref that describes an analagous term in another vocabulary + items: + type: string + type: array + required: + - val + type: + - 'null' + - object + id: + description: The unique id of the current term. + type: string + name: + description: The term name. + type: + - 'null' + - string + namespace: + description: The namespace of the term. + type: + - 'null' + - string + subsets: + description: This tag indicates a term subset to which this term belongs. + items: + type: string + type: array + synonyms: + description: This tag gives a synonym for this term, some xrefs to describe + the origins of the synonym, and may indicate a synonym category or scope information. + items: + properties: + pred: + description: The synonym predicate + type: string + val: + description: The synonym value + type: string + xrefs: + description: A dbxref that describes an analagous term in another vocabulary + items: + type: string + type: array + required: + - val + type: object + type: array + type: + description: The type of the node. + type: string + xrefs: + description: DBxrefs that describes an analagous term in another vocabulary + items: + properties: + pred: + description: The xref predicate + type: string + val: + description: The xref value + type: string + xrefs: + description: A dbxref that describes an analagous term in another vocabulary + items: + type: string + type: array + required: + - val + type: object + type: array + required: + - id + - type + - name + - namespace + - alt_ids + - def + - comments + - subsets + - synonyms + - xrefs + title: GAZ_terms + type: object +type: vertex diff --git a/spec/collections/GO/GO_edges.yaml b/spec/collections/GO/GO_edges.yaml new file mode 100644 index 00000000..f2d5a1ee --- /dev/null +++ b/spec/collections/GO/GO_edges.yaml @@ -0,0 +1,43 @@ +name: GO_edges +type: edge +delta: true + +indexes: + - type: persistent + fields: [id, expired, created] + - type: persistent + fields: [expired, created, last_version] + +schema: + "$schema": http://json-schema.org/draft-07/schema# + title: GO_edges + type: object + description: A entry for edges in the Gene Ontology (GO) hierarchy + properties: + id: + type: string + description: an edge ID, consisting of from::to::type + examples: + - GO:0000136::GO:0031501::is_a + - GO:0000022::GO:0051231::is_a + type: + type: string + description: GO edge type + examples: + - is_a + - part_of + from: + type: string + description: GO id + examples: + - GO:0023052 + to: + type: string + title: GO id + examples: + - GO:0008150 + required: + - id + - type + - from + - to diff --git a/spec/collections/GO/GO_merges.yaml b/spec/collections/GO/GO_merges.yaml new file mode 100644 index 00000000..80ad08ee --- /dev/null +++ b/spec/collections/GO/GO_merges.yaml @@ -0,0 +1,45 @@ +name: GO_merges +type: edge +delta: true + +indexes: + - type: hash # don't think this needs to be a skiplist / persistent index + fields: [from] + - type: persistent + fields: [id, expired, created] + - type: persistent + fields: [expired, created, last_version] + +schema: + "$schema": http://json-schema.org/draft-07/schema# + title: GO_merges + type: object + description: A entry for merge edges in the Gene Ontology (GO) hierarchy + properties: + id: + type: string + description: an edge ID, consisting of from::to::type + examples: + - GO:0000136::GO:0031501::consider + - 
GO:0000022::GO:0051231::replaced_by + type: + type: string + description: GO merge edge type + examples: + - consider + - replaced_by + from: + type: string + description: GO id + examples: + - GO:0023052 + to: + type: string + title: GO id + examples: + - GO:0008150 + required: + - id + - type + - from + - to diff --git a/spec/collections/GO/GO_terms.yaml b/spec/collections/GO/GO_terms.yaml new file mode 100644 index 00000000..88804755 --- /dev/null +++ b/spec/collections/GO/GO_terms.yaml @@ -0,0 +1,154 @@ +name: GO_terms +type: vertex +delta: true + +indexes: + - type: persistent + fields: [id, expired, created] + - type: persistent + fields: [expired, created, last_version] + +schema: + "$schema": http://json-schema.org/draft-07/schema# + title: GO_terms + type: object + description: A entry for vertices in the Gene Ontology (GO) hierarchy + properties: + id: + type: string + description: The unique id of the current term. + examples: + - GO:0022609 + - GO:0044848 + type: + type: string + description: The type of the node. + examples: + - CLASS + - INDIVIDUAL + name: + type: ["null", "string"] # some OBO classes have no label + description: The term name. + examples: + - mitochondrial genome maintenance + - reproduction + namespace: + type: ["null", "string"] # some OBO classes have no namespace + description: Denotes which of the three sub-ontologies the term belongs to. + examples: + - cellular component + - biological process + - molecular function + alt_ids: + type: array + items: + type: string + description: Defines an alternate id for this term. A term may have any number + of alternate ids. + examples: + - ["GO:0019952"] + - ["GO:0050876"] + - ["GO:0044848"] + def: + type: ["null", "object"] # some OBO classes have no definition + description: The definition of the current term. + required: + - val + properties: + pred: + type: string + description: The definition predicate + val: + type: string + description: The definition value + examples: + - Any process that modulates the frequency, rate or extent of glycolysis. + xrefs: + type: array + description: A dbxref that describes an analagous term in another vocabulary + items: + type: string + examples: + - ["GOC:go_curators"] + - ["ISBN:0815340729"] + comments: + type: array + items: + type: string + description: Comments for this term. + examples: + - ["This term was made obsolete because it refers to a class of gene products and + a biological process rather than a molecular function."] + subsets: + type: array + items: + type: string + description: This tag indicates a term subset to which this term belongs. + examples: + - ["goslim_yeast"] + - ["goslim_chembl"] + - ["goslim_metagenomics"] + - ["goslim_pir"] + - ["goslim_plant"] + synonyms: + description: This tag gives a synonym for this term, some xrefs to describe the + origins of the synonym, and may indicate a synonym category or scope information. 
+      type: array
+      items:
+        type: object
+        required:
+          - val
+        properties:
+          pred:
+            type: string
+            description: The synonym predicate
+            examples:
+              - hasBroadSynonym
+              - hasNarrowSynonym
+          val:
+            type: string
+            description: The synonym value
+            examples:
+              - regulation of blood angiotensin level
+          xrefs:
+            type: array
+            description: A dbxref that describes an analogous term in another vocabulary
+            items:
+              type: string
+            examples:
+              - ["GOC:TermGenie"]
+    xrefs:
+      description: DBxrefs that describe an analogous term in another vocabulary
+      type: array
+      items:
+        type: object
+        required:
+          - val
+        properties:
+          pred:
+            type: string
+            description: The xref predicate
+          val:
+            type: string
+            description: The xref value
+            examples:
+              - EC:2.3.1
+              - Reactome:REACT_10010
+          xrefs:
+            type: array
+            description: A dbxref that describes an analogous term in another vocabulary
+            items:
+              type: string
+            examples:
+              - ["GOC:TermGenie"]
+  required:
+    - id
+    - type
+    - name
+    - namespace
+    - alt_ids
+    - def
+    - comments
+    - subsets
+    - synonyms
+    - xrefs
\ No newline at end of file
diff --git a/spec/collections/PO/PO_edges.yaml b/spec/collections/PO/PO_edges.yaml
new file mode 100644
index 00000000..3f917b6e
--- /dev/null
+++ b/spec/collections/PO/PO_edges.yaml
@@ -0,0 +1,37 @@
+delta: true
+indexes:
+- fields:
+  - id
+  - expired
+  - created
+  type: persistent
+- fields:
+  - expired
+  - created
+  - last_version
+  type: persistent
+name: PO_edges
+schema:
+  $schema: http://json-schema.org/draft-07/schema#
+  description: An entry for edges in the PO ontology hierarchy
+  properties:
+    from:
+      description: PO id
+      type: string
+    id:
+      description: an edge ID, consisting of from::to::type
+      type: string
+    to:
+      description: PO id
+      type: string
+    type:
+      description: PO edge type
+      type: string
+  required:
+  - id
+  - type
+  - from
+  - to
+  title: PO_edges
+  type: object
+type: edge
diff --git a/spec/collections/PO/PO_merges.yaml b/spec/collections/PO/PO_merges.yaml
new file mode 100644
index 00000000..63b6fc98
--- /dev/null
+++ b/spec/collections/PO/PO_merges.yaml
@@ -0,0 +1,37 @@
+delta: true
+indexes:
+- fields:
+  - id
+  - expired
+  - created
+  type: persistent
+- fields:
+  - expired
+  - created
+  - last_version
+  type: persistent
+name: PO_merges
+schema:
+  $schema: http://json-schema.org/draft-07/schema#
+  description: An entry for merge edges in the PO ontology hierarchy
+  properties:
+    from:
+      description: PO id
+      type: string
+    id:
+      description: an edge ID, consisting of from::to::type
+      type: string
+    to:
+      description: PO id
+      type: string
+    type:
+      description: PO merge edge type
+      type: string
+  required:
+  - id
+  - type
+  - from
+  - to
+  title: PO_merges
+  type: object
+type: edge
diff --git a/spec/collections/PO/PO_terms.yaml b/spec/collections/PO/PO_terms.yaml
new file mode 100644
index 00000000..f874c9fc
--- /dev/null
+++ b/spec/collections/PO/PO_terms.yaml
@@ -0,0 +1,121 @@
+delta: true
+indexes:
+- fields:
+  - id
+  - expired
+  - created
+  type: persistent
+- fields:
+  - expired
+  - created
+  - last_version
+  type: persistent
+name: PO_terms
+schema:
+  $schema: http://json-schema.org/draft-07/schema#
+  description: An entry for vertices in the PO ontology hierarchy
+  properties:
+    alt_ids:
+      description: Defines an alternate id for this term. A term may have any number
+        of alternate ids.
+      items:
+        type: string
+      type: array
+    comments:
+      description: Comments for this term.
+      items:
+        type: string
+      type: array
+    def:
+      description: The definition of the current term.
+      properties:
+        pred:
+          description: The definition predicate
+          type: string
+        val:
+          description: The definition value
+          type: string
+        xrefs:
+          description: A dbxref that describes an analogous term in another vocabulary
+          items:
+            type: string
+          type: array
+      required:
+      - val
+      type:
+      - 'null'
+      - object
+    id:
+      description: The unique id of the current term.
+      type: string
+    name:
+      description: The term name.
+      type:
+      - 'null'
+      - string
+    namespace:
+      description: The namespace of the term.
+      type:
+      - 'null'
+      - string
+    subsets:
+      description: This tag indicates a term subset to which this term belongs.
+      items:
+        type: string
+      type: array
+    synonyms:
+      description: This tag gives a synonym for this term, some xrefs to describe
+        the origins of the synonym, and may indicate a synonym category or scope information.
+      items:
+        properties:
+          pred:
+            description: The synonym predicate
+            type: string
+          val:
+            description: The synonym value
+            type: string
+          xrefs:
+            description: A dbxref that describes an analogous term in another vocabulary
+            items:
+              type: string
+            type: array
+        required:
+        - val
+        type: object
+      type: array
+    type:
+      description: The type of the node.
+      type: string
+    xrefs:
+      description: DBxrefs that describe an analogous term in another vocabulary
+      items:
+        properties:
+          pred:
+            description: The xref predicate
+            type: string
+          val:
+            description: The xref value
+            type: string
+          xrefs:
+            description: A dbxref that describes an analogous term in another vocabulary
+            items:
+              type: string
+            type: array
+        required:
+        - val
+        type: object
+      type: array
+  required:
+  - id
+  - type
+  - name
+  - namespace
+  - alt_ids
+  - def
+  - comments
+  - subsets
+  - synonyms
+  - xrefs
+  title: PO_terms
+  type: object
+type: vertex
diff --git a/spec/collections/README.md b/spec/collections/README.md
new file mode 100644
index 00000000..82379e44
--- /dev/null
+++ b/spec/collections/README.md
@@ -0,0 +1,32 @@
+# Relation Engine Document Schemas
+
+Document schemas define a required format for each collection in the database. Schemas use the
+[JSON Schema](https://json-schema.org/specification.html) specification.
+
+## Guidelines
+
+- Every schema file should have `name`, `type` ("vertex" or "edge"), and `schema` (JSON schema) fields
+- Every JSON schema should have a "$schema" field
+- You can add reusable JSON schema definitions by placing them in the [`./definitions`](/src/schemas/definitions) directory.
+
+## Testing your schema format
+
+Run `make test` in the root of the repo, which will validate all the schemas in this directory. You
+can also run `make test-schemas` or `make test-schema ` to test schemas specifically.
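For orientation, a minimal collection spec following the guidelines above might look like the sketch below; the `example_vertex` name and its single property are hypothetical, not a real collection in this repo:

```yaml
# Hypothetical minimal collection spec -- illustrative only
name: example_vertex
type: vertex
schema:
  "$schema": http://json-schema.org/draft-07/schema#
  title: example_vertex
  type: object
  description: A minimal example vertex.
  required: [_key]
  properties:
    _key:
      type: string
      description: The document key.
```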
+
+## Resources
+
+- Quickly validate JSON schemas: https://www.jsonschemavalidator.net/
+
+## Data
+
+### Ontologies
+
+* Gene Ontology (GO): **[go/](/spec/collections/GO)**
+* Environment Ontology (ENVO): **[envo/](/spec/collections/ENVO)**
+
+### Taxonomies
+
+* Genome Taxonomy Database (GTDB): **[gtdb/](/spec/collections/gtdb)**
+* Ribosomal Database Project (RDP): **[rdp/](/spec/collections/rdp)**
+* SILVA: **[silva/](/spec/collections/silva)**
diff --git a/spec/collections/UO/UO_edges.yaml b/spec/collections/UO/UO_edges.yaml
new file mode 100644
index 00000000..525ff5b1
--- /dev/null
+++ b/spec/collections/UO/UO_edges.yaml
@@ -0,0 +1,37 @@
+delta: true
+indexes:
+- fields:
+  - id
+  - expired
+  - created
+  type: persistent
+- fields:
+  - expired
+  - created
+  - last_version
+  type: persistent
+name: UO_edges
+schema:
+  $schema: http://json-schema.org/draft-07/schema#
+  description: An entry for edges in the UO ontology hierarchy
+  properties:
+    from:
+      description: UO id
+      type: string
+    id:
+      description: an edge ID, consisting of from::to::type
+      type: string
+    to:
+      description: UO id
+      type: string
+    type:
+      description: UO edge type
+      type: string
+  required:
+  - id
+  - type
+  - from
+  - to
+  title: UO_edges
+  type: object
+type: edge
diff --git a/spec/collections/UO/UO_merges.yaml b/spec/collections/UO/UO_merges.yaml
new file mode 100644
index 00000000..42303972
--- /dev/null
+++ b/spec/collections/UO/UO_merges.yaml
@@ -0,0 +1,37 @@
+delta: true
+indexes:
+- fields:
+  - id
+  - expired
+  - created
+  type: persistent
+- fields:
+  - expired
+  - created
+  - last_version
+  type: persistent
+name: UO_merges
+schema:
+  $schema: http://json-schema.org/draft-07/schema#
+  description: An entry for merge edges in the UO ontology hierarchy
+  properties:
+    from:
+      description: UO id
+      type: string
+    id:
+      description: an edge ID, consisting of from::to::type
+      type: string
+    to:
+      description: UO id
+      type: string
+    type:
+      description: UO merge edge type
+      type: string
+  required:
+  - id
+  - type
+  - from
+  - to
+  title: UO_merges
+  type: object
+type: edge
diff --git a/spec/collections/UO/UO_terms.yaml b/spec/collections/UO/UO_terms.yaml
new file mode 100644
index 00000000..0d38690a
--- /dev/null
+++ b/spec/collections/UO/UO_terms.yaml
@@ -0,0 +1,121 @@
+delta: true
+indexes:
+- fields:
+  - id
+  - expired
+  - created
+  type: persistent
+- fields:
+  - expired
+  - created
+  - last_version
+  type: persistent
+name: UO_terms
+schema:
+  $schema: http://json-schema.org/draft-07/schema#
+  description: An entry for vertices in the UO ontology hierarchy
+  properties:
+    alt_ids:
+      description: Defines an alternate id for this term. A term may have any number
+        of alternate ids.
+      items:
+        type: string
+      type: array
+    comments:
+      description: Comments for this term.
+      items:
+        type: string
+      type: array
+    def:
+      description: The definition of the current term.
+      properties:
+        pred:
+          description: The definition predicate
+          type: string
+        val:
+          description: The definition value
+          type: string
+        xrefs:
+          description: A dbxref that describes an analogous term in another vocabulary
+          items:
+            type: string
+          type: array
+      required:
+      - val
+      type:
+      - 'null'
+      - object
+    id:
+      description: The unique id of the current term.
+      type: string
+    name:
+      description: The term name.
+      type:
+      - 'null'
+      - string
+    namespace:
+      description: The namespace of the term.
+      type:
+      - 'null'
+      - string
+    subsets:
+      description: This tag indicates a term subset to which this term belongs.
+      items:
+        type: string
+      type: array
+    synonyms:
+      description: This tag gives a synonym for this term, some xrefs to describe
+        the origins of the synonym, and may indicate a synonym category or scope information.
+      items:
+        properties:
+          pred:
+            description: The synonym predicate
+            type: string
+          val:
+            description: The synonym value
+            type: string
+          xrefs:
+            description: A dbxref that describes an analogous term in another vocabulary
+            items:
+              type: string
+            type: array
+        required:
+        - val
+        type: object
+      type: array
+    type:
+      description: The type of the node.
+      type: string
+    xrefs:
+      description: DBxrefs that describe an analogous term in another vocabulary
+      items:
+        properties:
+          pred:
+            description: The xref predicate
+            type: string
+          val:
+            description: The xref value
+            type: string
+          xrefs:
+            description: A dbxref that describes an analogous term in another vocabulary
+            items:
+              type: string
+            type: array
+        required:
+        - val
+        type: object
+      type: array
+  required:
+  - id
+  - type
+  - name
+  - namespace
+  - alt_ids
+  - def
+  - comments
+  - subsets
+  - synonyms
+  - xrefs
+  title: UO_terms
+  type: object
+type: vertex
diff --git a/spec/collections/deltaloader/delta_load_registry.yaml b/spec/collections/deltaloader/delta_load_registry.yaml
new file mode 100644
index 00000000..419b2a78
--- /dev/null
+++ b/spec/collections/deltaloader/delta_load_registry.yaml
@@ -0,0 +1,14 @@
+name: delta_load_registry
+type: vertex
+
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  title: delta_load_registry
+  type: object
+  description: Don't touch this. It's for the exclusive use of delta loaders.
+  properties:
+    _key:
+      type: string
+      description: Required to pass the validator.
+  required:
+    - _key
\ No newline at end of file
diff --git a/spec/collections/djornl/djornl_edge.yaml b/spec/collections/djornl/djornl_edge.yaml
new file mode 100644
index 00000000..8576e811
--- /dev/null
+++ b/spec/collections/djornl/djornl_edge.yaml
@@ -0,0 +1,30 @@
+name: djornl_edge
+type: edge
+delta: false
+
+indexes:
+  - type: hash
+    fields: [edge_type]
+  - type: persistent
+    fields: [score]
+
+schema:
+  $schema: http://json-schema.org/draft-07/schema#
+  title: Arabidopsis gene-gene or gene-phenotype edge
+  description: Generic gene-to-gene or gene-to-phenotype edge for Dan Jacobson Arabidopsis data
+  type: object
+  required: [score, edge_type, _from, _to, _key, directed]
+  additionalProperties: false
+  properties:
+    _key:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_edge/_key
+    _from:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_edge/_from
+    _to:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_edge/_to
+    score:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_edge/score
+    edge_type:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_edge/edge_type
+    directed:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_edge/directed
diff --git a/spec/collections/djornl/djornl_node.yaml b/spec/collections/djornl/djornl_node.yaml
new file mode 100644
index 00000000..eb85040b
--- /dev/null
+++ b/spec/collections/djornl/djornl_node.yaml
@@ -0,0 +1,58 @@
+name: djornl_node
+type: vertex
+delta: false
+
+indexes:
+  - type: hash
+    fields: ["clusters[*]"]
+
+schema:
+  $schema: http://json-schema.org/draft-07/schema#
+  title: Gene and Phenotype Vertices
+  description: Arabidopsis gene and phenotype nodes from the Dan Jacobson Lab
+  type: object
+  required: [_key]
+  additionalProperties: false
+  properties:
+    _key:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/_key
+    clusters:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/clusters
+    node_type:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/node_type
+    transcript:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/transcript
+    gene_symbol:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/gene_symbol
+    gene_full_name:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/gene_full_name
+    gene_model_type:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/gene_model_type
+    tair_computational_description:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/tair_computational_description
+    tair_curator_summary:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/tair_curator_summary
+    tair_short_description:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/tair_short_description
+    go_description:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/go_description
+    go_terms:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/go_terms
+    mapman_bin:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/mapman_bin
+    mapman_name:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/mapman_name
+    mapman_description:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/mapman_description
+    pheno_aragwas_id:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/pheno_aragwas_id
+    pheno_description:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/pheno_description
+    pheno_pto_name:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/pheno_pto_name
+    pheno_pto_description:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/pheno_pto_description
+    pheno_reference:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/pheno_reference
+    user_notes:
+      $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/user_notes
diff --git a/spec/collections/gtdb/README.md b/spec/collections/gtdb/README.md
new file mode 100644
index 00000000..05d97d6c
--- /dev/null
+++ b/spec/collections/gtdb/README.md
@@ -0,0 +1,7 @@
+# Genome Taxonomy Database
+
+KBase Relation Engine schemas for GTDB taxonomy data
+
+References:
+
+* http://gtdb.ecogenomic.org/
diff --git a/spec/collections/gtdb/gtdb_child_of_taxon.yaml b/spec/collections/gtdb/gtdb_child_of_taxon.yaml
new file mode 100644
index 00000000..78261151
--- /dev/null
+++ b/spec/collections/gtdb/gtdb_child_of_taxon.yaml
@@ -0,0 +1,25 @@
+name: gtdb_child_of_taxon
+type: edge
+delta: true
+
+indexes:
+  - type: persistent
+    fields: [id, expired, created]
+  - type: persistent
+    fields: [expired, created, last_version]
+
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  required: [from, to, id]
+  description: Edges which create the taxonomy tree for GTDB taxons.
+  properties:
+    id:
+      type: string
+      description: The id of the edge. This is the same as the from ID for GTDB.
+    from:
+      type: string
+      description: The child taxon.
+    to:
+      type: string
+      description: The parent taxon.
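To make the edge shape concrete, a document conforming to the `gtdb_child_of_taxon` schema above might look like the sketch below. The taxon ids are invented for illustration, and the delta-load bookkeeping fields used by the indexes (`created`, `expired`, `last_version`) are assumed to be filled in by the delta loader rather than supplied in the source document:

```yaml
# Illustrative gtdb_child_of_taxon edge; note that id duplicates from for GTDB
id: 's:Sediminibacterium_sp002786355'
from: 's:Sediminibacterium_sp002786355'
to: 'g:Sediminibacterium'
```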
diff --git a/spec/collections/gtdb/gtdb_taxon.yaml b/spec/collections/gtdb/gtdb_taxon.yaml
new file mode 100644
index 00000000..80d5bec6
--- /dev/null
+++ b/spec/collections/gtdb/gtdb_taxon.yaml
@@ -0,0 +1,33 @@
+name: gtdb_taxon
+type: vertex
+delta: true
+
+indexes:
+  - type: fulltext
+    fields: [scientific_name]
+    minLength: 1
+  - type: persistent
+    fields: [id, expired, created]
+  - type: persistent
+    fields: [expired, created, last_version]
+
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  description: Template for a vertex entry in the GTDB taxonomy tree.
+  required: [id, scientific_name, rank]
+  properties:
+    id:
+      type: string
+      description: GTDB Taxon id. For a non-organism node, this is the rank abbreviation joined
+        with ':' and the taxon name with spaces replaced by underscores. For an organism node,
+        it is the accession ID.
+      examples: ['p:Firmicutes', 's:Sediminibacterium_sp002786355', 'RS_GCF_000169355.1']
+    scientific_name:
+      type: string
+      description: The name of the taxon
+      examples: ['Firmicutes', 'Sediminibacterium sp002786355']
+    rank:
+      type: string
+      title: Taxonomic rank
+      examples: [domain, phylum]
diff --git a/spec/collections/mash/mash_genome_similar_to.yaml b/spec/collections/mash/mash_genome_similar_to.yaml
new file mode 100644
index 00000000..445e23c3
--- /dev/null
+++ b/spec/collections/mash/mash_genome_similar_to.yaml
@@ -0,0 +1,14 @@
+name: mash_genome_similar_to
+type: edge
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  required: [_from, _to]
+  description: The workspace object is similar to another object
+  properties:
+    _from:
+      type: string
+      examples: ["wsprov_object/1:2:3"]
+    _to:
+      type: string
+      examples: ["wsprov_object/1:2:3"]
diff --git a/spec/collections/ncbi/README.md b/spec/collections/ncbi/README.md
new file mode 100644
index 00000000..d5dabfab
--- /dev/null
+++ b/spec/collections/ncbi/README.md
@@ -0,0 +1 @@
+# NCBI genbank data
diff --git a/spec/collections/ncbi/ncbi_child_of_taxon.yaml b/spec/collections/ncbi/ncbi_child_of_taxon.yaml
new file mode 100644
index 00000000..77210a1b
--- /dev/null
+++ b/spec/collections/ncbi/ncbi_child_of_taxon.yaml
@@ -0,0 +1,25 @@
+name: ncbi_child_of_taxon
+type: edge
+delta: true
+
+indexes:
+  - type: persistent
+    fields: [id, expired, created]
+  - type: persistent
+    fields: [expired, created, last_version]
+
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  required: [from, to, id]
+  description: Edges which create the taxonomy tree for NCBI taxons.
+  properties:
+    id:
+      type: string
+      description: The id of the edge. This is the same as the from ID for NCBI.
+    from:
+      type: string
+      description: The child taxon.
+    to:
+      type: string
+      description: The parent taxon.
diff --git a/spec/collections/ncbi/ncbi_gene.yaml b/spec/collections/ncbi/ncbi_gene.yaml
new file mode 100644
index 00000000..3eef69df
--- /dev/null
+++ b/spec/collections/ncbi/ncbi_gene.yaml
@@ -0,0 +1,85 @@
+name: ncbi_gene
+type: vertex
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  additionalProperties: false
+  description: A component of a DNA sequence, such as a CDS, mRNA, etc.
+  required: [_key, type, location]
+  properties:
+    _key:
+      type: string
+      title: Content hash
+      description: Hash of the DNA sequence for this feature.
+    protein_translation:
+      type: string
+      title: Protein translation
+      description: Longest coded protein (representative protein for splice variants)
+    protein_translation_length:
+      type: integer
+      description: Length of protein_translation
+    md5_hash:
+      type: string
+      title: DNA content hash
+      description: md5 hash of the dna sequence that this feature encodes.
+    note:
+      type: string
+      description: Free-text description of this feature
+    functions:
+      type: array
+      title: Gene functions
+      items: {type: string}
+    functional_descriptions:
+      type: array
+      title: Gene function descriptions
+      items: {type: string}
+    type:
+      type: string
+      examples: [Gene, ncRNA, repeat, CDS, mRNA]
+    location:
+      type: array
+      description: A list of segments of sequence that comprise this feature
+      items:
+        type: object
+        properties:
+          contig:
+            type: string
+            description: Contig ID where this segment occurs
+          strand:
+            type: string
+            description: Strand where this segment occurs
+            enum: ["+", "-", "?"]
+          start:
+            type: integer
+            description: Index in the genome sequence where this segment of the feature starts
+          length:
+            type: integer
+            description: Length of this segment of the feature
+    flags:
+      type: array
+      description: Additional flags about the feature, such as trans_splicing
+      items: {type: string}
+    warnings:
+      type: array
+      description: Warnings generated by the uploader about this feature
+      items: {type: string}
+    dna_sequence:
+      type: string
+      description: Nucleotide sequence for this feature.
+    dna_sequence_length:
+      type: integer
+      description: Total character/nucleotide length of dna_sequence
+    db_xrefs:
+      title: Database cross-references
+      description: IDs for this feature in other databases, grouped by database
+      type: object
+      patternProperties:
+        ".*":
+          type: array
+          items: {type: string}
+    aliases:
+      description: Aliases for this feature, grouped by alias type. All values are arrays of strings.
+      type: object
+      patternProperties:
+        ".*":
+          type: array
+          items: {type: string}
diff --git a/spec/collections/ncbi/ncbi_gene_within_genome.yaml b/spec/collections/ncbi/ncbi_gene_within_genome.yaml
new file mode 100644
index 00000000..1d897984
--- /dev/null
+++ b/spec/collections/ncbi/ncbi_gene_within_genome.yaml
@@ -0,0 +1,13 @@
+name: ncbi_gene_within_genome
+type: edge
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  required: [_from, _to]
+  properties:
+    _from:
+      type: string
+      description: The ncbi_gene that is part of a genome.
+    _to:
+      type: string
+      description: The ncbi_genome that contains a gene.
diff --git a/spec/collections/ncbi/ncbi_genome.yaml b/spec/collections/ncbi/ncbi_genome.yaml
new file mode 100644
index 00000000..c22bb697
--- /dev/null
+++ b/spec/collections/ncbi/ncbi_genome.yaml
@@ -0,0 +1,100 @@
+name: ncbi_genome
+type: vertex
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  additionalProperties: false
+  description: Whole-genome metadata (genes are separate vertices)
+  required:
+    - _key
+    - scientific_name
+    - domain
+  properties:
+    _key:
+      type: string
+      description: Hash of the full set of data contained in this genome.
+    refseq_id:
+      type: string
+      examples:
+        - NC_008270.1
+      description: RefSeq database accession id
+    scientific_name:
+      type: string
+      examples:
+        - Haloferax volcanii
+    domain:
+      type: string
+      enum:
+        - Archaea
+        - Bacteria
+        - Eukarya
+        - Unknown
+    feature_counts:
+      type: object
+      additionalProperties: true
+      description: A count of the number of instances of each feature type, such as CDSs,
+        repeats, etc.
+      patternProperties:
+        ".*":
+          type: integer
+    dna_size:
+      type: integer
+      title: Nucleotide count
+    num_contigs:
+      type: integer
+      title: Number of contigs
+      description: Number of consensus regions of the DNA.
+    molecule_type:
+      type: string
+      title: Molecule type
+      examples:
+        - DNA
+      description: Can include genomic DNA, genomic RNA, precursor RNA, mRNA (cDNA),
+        ribosomal RNA, transfer RNA, small nuclear RNA, and small cytoplasmic RNA
+    contig_lengths:
+      type: array
+      description: Nucleotide length of each contig
+      items:
+        type: integer
+    contig_ids:
+      type: array
+      description: The ids of each contig in the associated assembly
+      items:
+        type: string
+    source:
+      type: string
+      description: The tool or database that produced the genome
+      examples:
+        - RefSeq
+        - Ensembl
+        - Phytozome
+        - RAST
+        - Prokka
+        - User_upload
+    source_id:
+      type: string
+      description: The ID assigned to the genome by that source
+    release:
+      type: string
+      description: The release version of the source database for this genome if applicable
+    taxonomy:
+      type: array
+      description: Full taxonomy parent-to-child linkage up to the domain
+      examples:
+        - - Bacteria
+          - Actinobacteria
+          - Corynebacteriales
+          - Nocardiaceae
+          - Rhodococcus
+      items:
+        type: string
+    gc_content:
+      type: number
+      description: Fraction of GC pairs in the genome
+    is_suspect:
+      type: boolean
+      description: Flag indicating that the genome has failed to pass one or more validation
+        tests
+    notes:
+      type: string
+      description: Free text notes from the genome upload
diff --git a/spec/collections/ncbi/ncbi_taxon.yaml b/spec/collections/ncbi/ncbi_taxon.yaml
new file mode 100644
index 00000000..18810eba
--- /dev/null
+++ b/spec/collections/ncbi/ncbi_taxon.yaml
@@ -0,0 +1,65 @@
+name: ncbi_taxon
+type: vertex
+delta: true
+
+indexes:
+  - type: fulltext
+    fields: [scientific_name]
+    minLength: 1
+  - type: persistent
+    fields: [id, expired, created]
+  - type: persistent
+    fields: [expired, created, last_version]
+
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  description: Template for a vertex entry in the NCBI taxonomy tree.
+  required: [id, scientific_name, rank, strain]
+  properties:
+    id:
+      type: string
+      description: NCBI Taxon id (positive integer)
+      examples: ['1', '2053699']
+    scientific_name:
+      type: string
+      title: Taxon name.
+      examples: ['Methylophilus methylotrophus', 'Bacteria', 'Firmicutes']
+    aliases:
+      type: array
+      description: Aliases
+      examples:
+        - - category: authority
+            name: Borreliella burgdorferi (Johnson et al. 1984) Adeolu and Gupta 2015
+          - category: genbank common name
+            name: Lyme disease spirochete
+          - category: synonym
+            name: Borrelia burgdorferi
+        - - category: common name
+            name: E. coli
+          - category: authority
+            name: '"Bacterium coli commune" Escherich 1885'
+          - category: synonym
+            name: Bacterium coli
+      items:
+        type: object
+        required: ['category', 'name']
+        properties:
+          category: {type: string}
+          name: {type: string}
+    rank:
+      type: string
+      title: Taxonomic rank
+      examples: ["Domain", "Phylum", "no rank"]
+    strain:
+      type: boolean
+      title: Strain flag
+      description: Whether this node corresponds to a strain. Strains are considered to be nodes
+        that have a rank of "no rank" and whose parents' rank is either species or subspecies or
+        where the parent's strain flag is true.
+    ncbi_taxon_id:
+      type: integer
+      title: The NCBI taxon ID as a number
+    gencode:
+      type: integer
+      title: The numeric ID of the genetic code for this organism.
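As a concrete sketch of an `ncbi_taxon` vertex, a conforming document might look like the following; E. coli's NCBI taxon id (562) and genetic code (11) are well-known values, but the alias set shown here is trimmed for illustration:

```yaml
# Illustrative ncbi_taxon document
id: '562'
scientific_name: Escherichia coli
rank: species
strain: false
aliases:
  - category: common name
    name: E. coli
ncbi_taxon_id: 562
gencode: 11
```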
diff --git a/spec/collections/ncbi/ncbi_taxon_merges.yaml b/spec/collections/ncbi/ncbi_taxon_merges.yaml
new file mode 100644
index 00000000..18d6b0fb
--- /dev/null
+++ b/spec/collections/ncbi/ncbi_taxon_merges.yaml
@@ -0,0 +1,35 @@
+name: ncbi_taxon_merges
+type: edge
+delta: true
+
+indexes:
+  - type: persistent
+    fields: [id, expired, created]
+  - type: persistent
+    fields: [expired, created, last_version]
+
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  title: ncbi_taxon_merges
+  type: object
+  description: An entry for merge edges in the NCBI hierarchy
+  properties:
+    id:
+      type: string
+      description: an edge ID, consisting of the ID of the child node
+      examples:
+        - '51633'
+    from:
+      type: string
+      description: NCBI id
+      examples:
+        - '51633'
+    to:
+      type: string
+      title: NCBI id
+      examples:
+        - '5467'
+  required:
+    - id
+    - from
+    - to
diff --git a/spec/collections/rdp/README.md b/spec/collections/rdp/README.md
new file mode 100644
index 00000000..1ac639d1
--- /dev/null
+++ b/spec/collections/rdp/README.md
@@ -0,0 +1,7 @@
+# Ribosomal Database Project
+
+KBase Relation Engine schemas for RDP taxonomy data
+
+References:
+
+* https://rdp.cme.msu.edu/
diff --git a/spec/collections/rdp/rdp_child_of_taxon.yaml b/spec/collections/rdp/rdp_child_of_taxon.yaml
new file mode 100644
index 00000000..d3d211a9
--- /dev/null
+++ b/spec/collections/rdp/rdp_child_of_taxon.yaml
@@ -0,0 +1,26 @@
+name: rdp_child_of_taxon
+type: edge
+delta: true
+
+indexes:
+  - type: persistent
+    fields: [id, expired, created]
+  - type: persistent
+    fields: [expired, created, last_version]
+
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  required: [from, to, id]
+  description: Edges which create the taxonomy tree for RDP taxons.
+  properties:
+    id:
+      type: string
+      description: The id of the edge. This is the id of the from node (e.g. the child node in
+        the tree) for the edge.
+    from:
+      type: string
+      description: The child taxon. The id of the from node for the edge.
+    to:
+      type: string
+      description: The parent taxon. The id of the to node for the edge.
diff --git a/spec/collections/rdp/rdp_taxon.yaml b/spec/collections/rdp/rdp_taxon.yaml
new file mode 100644
index 00000000..b5523db2
--- /dev/null
+++ b/spec/collections/rdp/rdp_taxon.yaml
@@ -0,0 +1,53 @@
+name: rdp_taxon
+type: vertex
+delta: true
+
+indexes:
+  - type: fulltext
+    fields: [name]
+    minLength: 1
+  - type: persistent
+    fields: [id, expired, created]
+  - type: persistent
+    fields: [expired, created, last_version]
+
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  description: Template for a vertex entry in the RDP taxonomy tree.
+  required: [id, name, rank, molecule, unclassified, incertae_sedis]
+  properties:
+    id:
+      type: string
+      description: RDP Taxon id. For a non-sequence node, this is the rank joined
+        with ':' and the taxon name with spaces and slashes replaced by underscores.
+        If the node is an incertae sedis insertion, ':is' is appended. For a sequence (e.g. leaf)
+        node, it is the locus ID.
+      examples: ['phylum:Actinobacteria', 'S000494589']
+    name:
+      type: string
+      description: The name of the taxon. For sequences this is the strain name. Some sequences
+        in RDP have no name, in which case the name will be an empty string.
+        If '[ _][Ii]ncertae[ _][Ss]edis' exists in the name for internal nodes, that
+        text is removed and the node is marked as an incertae sedis node (see below).
+      examples: ['Acidimicrobium', 'uncultured bacterium; YRM60L1D06060904']
+    rank:
+      type: string
+      title: Taxonomic rank. Sequence node rank is always 'sequence_example'.
+      examples: [domain, sequence_example]
+    molecule:
+      type: ['string', 'null']
+      description: The type of molecule for the RDP sequence. Either 16S or 28S. Null
+        for non-leaf nodes.
+      examples: [16S, 28S]
+    unclassified:
+      type: boolean
+      description: Denotes a sequence that does not have a full lineage. In the RDP files,
+        these sequences are denoted via a truncated lineage string where the last entry starts
+        with the string 'unclassified_'. The lineage string is a list of tuples of rank and name
+        all separated by semicolons, so if there are an odd number of entries in the lineage
+        string an unclassified organism is expected. Always false for internal nodes.
+    incertae_sedis:
+      type: ['boolean', 'null']
+      description: Denotes a taxon node that is an incertae sedis insertion. Always null for
+        sequence (e.g. leaf) nodes.
diff --git a/spec/collections/rxn/README.md b/spec/collections/rxn/README.md
new file mode 100644
index 00000000..4bac805e
--- /dev/null
+++ b/spec/collections/rxn/README.md
@@ -0,0 +1 @@
+# Reaction homology
diff --git a/spec/collections/rxn/rxn_compound.yaml b/spec/collections/rxn/rxn_compound.yaml
new file mode 100644
index 00000000..216f0c0f
--- /dev/null
+++ b/spec/collections/rxn/rxn_compound.yaml
@@ -0,0 +1,70 @@
+name: rxn_compound
+type: vertex
+schema:
+  "$schema": "http://json-schema.org/draft-07/schema#"
+  type: object
+  additionalProperties: true
+  description: Chemical compounds
+  required: [_key]
+  properties:
+    _key:
+      examples: [cpd02201]
+      pattern: ^cpd\d+$
+      title: ModelSeed ID
+      type: string
+    abbreviation:
+      examples: [phpyr]
+      type: string
+    aliases:
+      examples: ["AraCyc:PYRUVATE;BiGG:pyr;BrachyCyc:PYRUVATE;KEGG:C00022"]
+      type: string
+    charge:
+      examples: [-1]
+      type: integer
+    deltag:
+      description: The change in Free Energy of Formation
+      type: [number, 'null']
+    deltagerr:
+      description: The error associated with the Free Energy of Formation
+      type: [number, 'null']
+    formula:
+      examples: [C6H6]
+      type: string
+    id:
+      examples: [cpd02201]
+      pattern: ^cpd\d+$
+      title: ModelSeed ID
+      type: string
+    inchikey:
+      examples: [LCTONWCANYUPML-UHFFFAOYSA-M]
+      type: string
+    is_cofactor:
+      description: The compound is a cofactor
+      type: integer
+    is_core:
+      description: The compound is involved in core metabolism
+      type: integer
+    is_obsolete:
+      description: The compound is deprecated
+      type: integer
+    linked_compound:
+      description: If the compound is deprecated, the compound that supersedes this entry
+      type: [string, 'null']
+    mass:
+      description: Molecular mass of compound
+      type: [number, 'null']
+    name:
+      type: string
+    pka:
+      description: Acid dissociation constants of compound
+      type: string
+    pkb:
+      description: Base dissociation constants of compound
+      type: string
+    smiles:
+      description: Structure of the compound in Simplified Molecular Input Line Entry
+        System
+      type: string
+    source:
+      description: Does this compound come from a primary database or a metabolic model?
+      type: string
diff --git a/spec/collections/rxn/rxn_compound_linked_to_compound.yaml b/spec/collections/rxn/rxn_compound_linked_to_compound.yaml
new file mode 100644
index 00000000..6fbb3bea
--- /dev/null
+++ b/spec/collections/rxn/rxn_compound_linked_to_compound.yaml
@@ -0,0 +1,16 @@
+name: rxn_compound_linked_to_compound
+type: edge
+schema:
+  "$schema": "http://json-schema.org/draft-07/schema#"
+  type: object
+  required: [_from, _to]
+  description: Generally these linkages indicate that one compound has been made obsolete
+    and replaced with the linked compound. This may arise from duplicates in the database
+    or errors in the obsolete entity
+  properties:
+    _from:
+      type: string
+      description: A compound
+    _to:
+      type: string
+      description: Another compound
diff --git a/spec/collections/rxn/rxn_compound_within_reaction.yaml b/spec/collections/rxn/rxn_compound_within_reaction.yaml
new file mode 100644
index 00000000..3211e562
--- /dev/null
+++ b/spec/collections/rxn/rxn_compound_within_reaction.yaml
@@ -0,0 +1,18 @@
+name: rxn_compound_within_reaction
+type: edge
+schema:
+  "$schema": "http://json-schema.org/draft-07/schema#"
+  type: object
+  required: [_from, _to]
+  description: A compound is a member of a reaction
+  additionalProperties: true
+  properties:
+    _from:
+      type: string
+      description: The ID of the compound
+    _to:
+      type: string
+      description: The ID of the reaction
+    stoichiometry:
+      type: number
+      description: The stoichiometry of the compound in the reaction
diff --git a/spec/collections/rxn/rxn_gene_complex.yaml b/spec/collections/rxn/rxn_gene_complex.yaml
new file mode 100644
index 00000000..1202772f
--- /dev/null
+++ b/spec/collections/rxn/rxn_gene_complex.yaml
@@ -0,0 +1,24 @@
+name: rxn_gene_complex
+type: vertex
+schema:
+  "$schema": "http://json-schema.org/draft-07/schema#"
+  type: object
+  additionalProperties: false
+  description: Groups of genes that take part in producing a chemical reaction in the
+    cell.
+  required: [_key, genes]
+  properties:
+    _key:
+      type: string
+      description: Hash of the conjunctions.
+    genes:
+      type: array
+      examples: [[SO_0001, SO_0001]]
+      description: Array of genes.
+      items:
+        type: string
+        description: Gene vertex _key
+    source:
+      type: string
+      examples: [ModelSEED, KEGG]
+      description: The source of the gene complex information.
diff --git a/spec/collections/rxn/rxn_gene_within_complex.yaml b/spec/collections/rxn/rxn_gene_within_complex.yaml
new file mode 100644
index 00000000..04ba6f32
--- /dev/null
+++ b/spec/collections/rxn/rxn_gene_within_complex.yaml
@@ -0,0 +1,13 @@
+name: rxn_gene_within_complex
+type: edge
+schema:
+  "$schema": "http://json-schema.org/draft-07/schema#"
+  type: object
+  required: [_from, _to]
+  properties:
+    _from:
+      type: string
+      description: The ncbi_gene contained within a rxn_gene_complex.
+    _to:
+      type: string
+      description: The rxn_gene_complex that contains the gene.
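For the `rxn_compound_within_reaction` edges above, a conforming document might look like this sketch; the compound and reaction keys reuse the schema examples, and the sign convention on `stoichiometry` (negative for consumed compounds) is an assumption rather than something the schema specifies:

```yaml
# Illustrative rxn_compound_within_reaction edge
_from: rxn_compound/cpd02201
_to: rxn_reaction/rxn02201
stoichiometry: -1
```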
diff --git a/spec/collections/rxn/rxn_reaction.yaml b/spec/collections/rxn/rxn_reaction.yaml
new file mode 100644
index 00000000..a8af663c
--- /dev/null
+++ b/spec/collections/rxn/rxn_reaction.yaml
@@ -0,0 +1,53 @@
+name: rxn_reaction
+type: vertex
+schema:
+  "$schema": "http://json-schema.org/draft-07/schema#"
+  type: object
+  additionalProperties: true
+  required: [_key]
+  description: Chemical reactions
+  properties:
+    _key:
+      type: string
+      examples: [rxn02201]
+      title: ModelSeed ID
+      pattern: "^rxn\\d+$"
+    direction:
+      type: string
+      enum: [">", "<", "="]
+    name:
+      type: string
+      description: Chemical names
+      examples: ["trans-2-Methyl-5-isopropylhexa-2,5-dienal dehydrogenase_c0"]
+    gpr:
+      type: string
+      examples: [PGN_RS01070]
+    ec_number:
+      type: string
+      examples: ["2.7.3.7"]
+      title: Enzyme Commission Number
+      pattern: "^\\d+\\.\\d+\\.\\d+\\.\\d+$"
+    bbcwn:
+      type: number
+      examples: [-108]
+    equation:
+      type: string
+      description: Reaction formula using compound IDs (e.g. cpd00443)
+      examples:
+        - "(1) cpd00443[c0] + (1) cpd02920[c0] => (1) cpd00012[c0] + (1) cpd00067[c0] + (1) cpd00683[c0]"
+    definition:
+      type: string
+      description: Reaction formula. Same as equation, but with compound IDs replaced with chemical names.
+    bigg_id:
+      type: string
+      examples: [DHPS2]
+    kegg_id:
+      type: string
+      examples: [R03067]
+    kegg_pathways:
+      type: string
+      examples: ["Folate biosynthesis"]
+    metacyc_pathways:
+      type: array
+      items: {type: string}
+      examples: [["AMINE-DEG", "Creatinine-Degradation", "Degradation"]]
diff --git a/spec/collections/rxn/rxn_reaction_linked_to_reaction.yaml b/spec/collections/rxn/rxn_reaction_linked_to_reaction.yaml
new file mode 100644
index 00000000..44a78778
--- /dev/null
+++ b/spec/collections/rxn/rxn_reaction_linked_to_reaction.yaml
@@ -0,0 +1,17 @@
+name: rxn_reaction_linked_to_reaction
+type: edge
+schema:
+  "$schema": "http://json-schema.org/draft-07/schema#"
+  type: object
+  required: [_from, _to]
+  description: |
+    Generally these linkages indicate that one reaction has been made obsolete
+    and replaced with the linked reaction. This may arise from duplicates in the database
+    or errors in the obsolete entity
+  properties:
+    _from:
+      type: string
+      description: A reaction
+    _to:
+      type: string
+      description: Another reaction
diff --git a/spec/collections/rxn/rxn_reaction_within_complex.yaml b/spec/collections/rxn/rxn_reaction_within_complex.yaml
new file mode 100644
index 00000000..41706aab
--- /dev/null
+++ b/spec/collections/rxn/rxn_reaction_within_complex.yaml
@@ -0,0 +1,13 @@
+name: rxn_reaction_within_complex
+type: edge
+schema:
+  "$schema": "http://json-schema.org/draft-07/schema#"
+  type: object
+  required: [_from, _to]
+  properties:
+    _from:
+      type: string
+      description: The rxn_reaction contained within a rxn_gene_complex.
+    _to:
+      type: string
+      description: The rxn_gene_complex that produces a reaction.
diff --git a/spec/collections/rxn/rxn_similar_to_reaction.yaml b/spec/collections/rxn/rxn_similar_to_reaction.yaml
new file mode 100644
index 00000000..fb13b090
--- /dev/null
+++ b/spec/collections/rxn/rxn_similar_to_reaction.yaml
@@ -0,0 +1,15 @@
+name: rxn_similar_to_reaction
+type: edge
+schema:
+  "$schema": "http://json-schema.org/draft-07/schema#"
+  type: object
+  required: [_from, _to]
+  description: A generic similarity association between reactions.
+  additionalProperties: true
+  properties:
+    _from:
+      type: string
+      description: The ID of a vertex
+    _to:
+      type: string
+      description: The ID of a vertex
diff --git a/spec/collections/samples/sample_ontology_link.yaml b/spec/collections/samples/sample_ontology_link.yaml
new file mode 100644
index 00000000..8af7ae92
--- /dev/null
+++ b/spec/collections/samples/sample_ontology_link.yaml
@@ -0,0 +1,59 @@
+name: sample_ontology_link
+type: edge
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  required: [_from, _to, created, expired, sample_metadata_term]
+  description: minimum necessary terms for sample -> ontology edge link.
+  properties:
+    _from:
+      type: string
+      description: the sample node _id (as found in sample_nodes collection)
+      examples: ["samples_nodes/465b1476-3699-4e6c-a06b-8d384fcc41f3_6d5999ee-42fb-4bad-a3b9-901aa1b490c5_f4bc367798eb923f77d7405031723908"]
+    _to:
+      type: string
+      description: The _id of an ontology vertex, such as from ENVO, GO, etc.
+      examples: ["ENVO_terms/ENVO:01000221_v2019-03-14", 'GO_terms/GO:0047161_v2019-01-01']
+    created:
+      type: integer
+      description: unix epoch of when the link was created
+      minimum: 0
+    createdby:
+      type: string
+      description: Who made this sample-ontology link?
+    expired:
+      type: integer
+      description: unix epoch of when this link expires
+      minimum: 0
+    sample_id:
+      type: string
+      description: uuid identifier for sample object. corresponds to sample id provided by SampleService
+      examples: ['465b1476-3699-4e6c-a06b-8d384fcc41f3']
+    sample_version:
+      type: integer
+      description: integer version of sample object (1, 2, etc.)
+      examples: [1, 2, 3]
+    sample_version_uuid:
+      type: string
+      description: uuid identifier for sample object version in sample version collection
+      examples: ['6d5999ee-42fb-4bad-a3b9-901aa1b490c5']
+    sample_node_name:
+      type: string
+      description: name of sample node in Sample
+      examples: ['HRV003M16']
+    sample_node_uuid:
+      type: string
+      description: uuid identifier for sample node in sample nodes collection
+      examples: ['f4bc367798eb923f77d7405031723908']
+    sample_metadata_term:
+      type: string
+      description: metadata term in sample associated with ontology term
+      examples: ['biome', 'ENIGMA:material', 'feature']
+    ontology_term:
+      type: string
+      description: identifier for term in ontology_collection
+      examples: ['ENVO:01000221', 'GO:0047161']
+    ontology_collection:
+      type: string
+      description: name of collection containing ontology_term
+      examples: ['ENVO_terms', 'GO_terms']
diff --git a/spec/collections/silva/README.md b/spec/collections/silva/README.md
new file mode 100644
index 00000000..415d1202
--- /dev/null
+++ b/spec/collections/silva/README.md
@@ -0,0 +1,7 @@
+# SILVA rRNA Database Project
+
+KBase Relation Engine schemas for SILVA taxonomy data
+
+References:
+
+* https://www.arb-silva.de/
diff --git a/spec/collections/silva/silva_child_of_taxon.yaml b/spec/collections/silva/silva_child_of_taxon.yaml
new file mode 100644
index 00000000..d86d4a17
--- /dev/null
+++ b/spec/collections/silva/silva_child_of_taxon.yaml
@@ -0,0 +1,25 @@
+name: silva_child_of_taxon
+type: edge
+delta: true
+
+indexes:
+  - type: persistent
+    fields: [id, expired, created]
+  - type: persistent
+    fields: [expired, created, last_version]
+
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  required: [from, to, id]
+  description: Edges in the SILVA taxonomy tree in direction from leaf to root.
+  properties:
+    id:
+      type: string
+      description: The id of the edge, which is identical to the edge's `from` field
+    from:
+      type: string
+      description: The taxon id of the edge's source node
+    to:
+      type: string
+      description: The taxon id of the edge's sink node
diff --git a/spec/collections/silva/silva_taxon.yaml b/spec/collections/silva/silva_taxon.yaml
new file mode 100644
index 00000000..64915433
--- /dev/null
+++ b/spec/collections/silva/silva_taxon.yaml
@@ -0,0 +1,55 @@
+name: silva_taxon
+type: vertex
+delta: true
+
+indexes:
+  - type: fulltext
+    fields: [name]
+    minLength: 1
+  - type: persistent
+    fields: [id, expired, created]
+  - type: persistent
+    fields: [expired, created, last_version]
+
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  description: Template for a vertex entry in the SILVA SSU taxonomy tree.
+  required: [id, name, rank]
+  properties:
+    id:
+      type: string
+      description: For taxon nodes, the SILVA taxon id. These will be "mostly stable in upcoming
+        releases" as of SILVA 138. For sequence nodes, the INSDC primary accession identifier, and
+        the start and stop of the 16S gene within the entry. See SILVA documentation for more
+        details.
+      examples: ['2', '44', '50000', 'CP010838.1980157.1981698']
+    name:
+      type: string
+      description: For taxon nodes, the name of the taxon. For sequence nodes, the organism name
+        given to the sequence.
+      examples: ['Ewamiania TS0513', 'Methyloligellaceae', 'BCP clade', 'uncultured',
+        'Bordetella pertussis']
+    rank:
+      type: string
+      description: SILVA's taxonomic rank, with addition of 'root_rank' and 'sequence' for
+        root and sequence nodes, respectively
+      enum: ['superfamily', 'subphylum', 'subfamily', 'phylum', 'order', 'major_clade',
+        'infraclass', 'suborder', 'family', 'superkingdom', 'domain', 'superphylum', 'superorder',
+        'superclass', 'infraphylum', 'subclass', 'genus', 'class', 'kingdom', 'subkingdom',
+        'root_rank', 'sequence']
+    release:
+      type: number
+      description: SILVA release number, primarily for taxon nodes
+      examples: [138.1, 138, 132, 128, 123.1, 123, 119.1, 119]
+    sequence:
+      type: string
+      description: rRNA sequence for sequence nodes
+    dataset:
+      type: array
+      items:
+        type: string
+      description: The datasets that a sequence node is from. Composed of 'parc', 'ref', and
+        'nr99', corresponding to the Parc, Ref and Ref NR99 datasets, respectively.
+        Parc > Ref > Ref NR99, with > denoting superset.
+      enum: [['parc'], ['parc', 'ref'], ['parc', 'ref', 'nr99']]
diff --git a/spec/collections/test/test_edge.yaml b/spec/collections/test/test_edge.yaml
new file mode 100644
index 00000000..fab7ad6e
--- /dev/null
+++ b/spec/collections/test/test_edge.yaml
@@ -0,0 +1,10 @@
+name: test_edge
+type: edge
+schema:
+  "$schema": "http://json-schema.org/draft-07/schema#"
+  type: object
+  required: [_from, _to]
+  description: Example edge schema for testing.
+  properties:
+    _from: {type: string}
+    _to: {type: string}
diff --git a/spec/collections/test/test_vertex.yaml b/spec/collections/test/test_vertex.yaml
new file mode 100644
index 00000000..b2d34668
--- /dev/null
+++ b/spec/collections/test/test_vertex.yaml
@@ -0,0 +1,11 @@
+name: test_vertex
+type: vertex
+schema:
+  "$schema": "http://json-schema.org/draft-07/schema#"
+  type: object
+  required: [_key]
+  description: An example vertex schema for testing
+  properties:
+    _key: {type: string}
+    is_public: {type: boolean}
+    ws_id: {type: integer}
diff --git a/spec/collections/ws/README.md b/spec/collections/ws/README.md
new file mode 100644
index 00000000..a276aecb
--- /dev/null
+++ b/spec/collections/ws/README.md
@@ -0,0 +1,5 @@
+# Workspace schemas (full details)
+
+These schemas comprise a full, detailed sync of all the data from the KBase workspace.
+
+For import code, see: https://github.com/kbaseapps/relation_engine_sync
diff --git a/spec/collections/ws/ws_copied_from.yaml b/spec/collections/ws/ws_copied_from.yaml
new file mode 100644
index 00000000..2595db32
--- /dev/null
+++ b/spec/collections/ws/ws_copied_from.yaml
@@ -0,0 +1,10 @@
+name: ws_copied_from
+type: edge
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  required: [_from, _to]
+  description: The _from object was created as an exact copy of the _to object.
+  properties:
+    _from: {type: string}
+    _to: {type: string}
diff --git a/spec/collections/ws/ws_feature_has_GO_annotation.yaml b/spec/collections/ws/ws_feature_has_GO_annotation.yaml
new file mode 100644
index 00000000..f59b246f
--- /dev/null
+++ b/spec/collections/ws/ws_feature_has_GO_annotation.yaml
@@ -0,0 +1,26 @@
+name: ws_feature_has_GO_annotation
+type: edge
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  description: A feature in a workspace genome has a Gene Ontology annotation.
+  required: [_key, _from, _to, source]
+  properties:
+    _key:
+      type: string
+      examples: ['75:82:3_RSP_4039::GO:0000002_v2018-03-06::kbase_RE_indexer']
+      description: a unique ID for this edge, consisting of the keys of the feature and term
+        vertices and the source of the edge data. The latter allows other sources to add
+        the same edge with potentially different fields.
+    _from:
+      type: string
+      examples: ['ws_genome_feature/75:82:3_RSP_4039']
+      description: The unique, permanent ID of a genome feature in a version of a workspace object.
+    _to:
+      type: string
+      examples: ['GO_terms/GO:0000002_v2018-03-06']
+      description: A Gene Ontology term.
+    source:
+      type: string
+      examples: ['kbase_RE_indexer', 'user_name', 'user_name:app_name']
+      description: The source that created this edge
\ No newline at end of file
diff --git a/spec/collections/ws/ws_genome_features.yaml b/spec/collections/ws/ws_genome_features.yaml
new file mode 100644
index 00000000..be7d320a
--- /dev/null
+++ b/spec/collections/ws/ws_genome_features.yaml
@@ -0,0 +1,34 @@
+name: ws_genome_features
+type: vertex
+schema:
+  "$schema": "http://json-schema.org/draft-07/schema#"
+  type: object
+  required:
+    - _key
+    - feature_id
+    - workspace_id
+    - object_id
+    - version
+  properties:
+    _key:
+      type: string
+      description: The UPA and feature ID for this data
+      examples: ["35414:73:1_RSP_4039"]
+      # see https://www.arangodb.com/docs/stable/data-modeling-naming-conventions-document-keys.html
+      pattern: "^\\d+:\\d+:\\d+_[a-zA-Z0-9_\\-:\\.@\\(\\)\\+,=;\\$!\\*'%]*$"
+    feature_id:
+      type: string
+      description: The unique ID of the feature within the genome
+      examples: ["RSP_4039"]
+    workspace_id:
+      type: integer
+      description: The workspace ID for the genome containing this feature
+      minimum: 1
+    object_id:
+      type: integer
+      description: The permanent object id for the genome containing this feature
+      minimum: 1
+    version:
+      type: integer
+      description: The version of the object containing this feature
+      minimum: 1
diff --git a/spec/collections/ws/ws_genome_has_feature.yaml b/spec/collections/ws/ws_genome_has_feature.yaml
new file mode 100644
index 00000000..4fe3da88
--- /dev/null
+++ b/spec/collections/ws/ws_genome_has_feature.yaml
@@ -0,0 +1,22 @@
+name: ws_genome_has_feature
+type: edge
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  description: A workspace genome has a feature.
+  required: [_from, _to, _key]
+  properties:
+    _key:
+      type: string
+      examples: ['75:82:3_RSP_4039']
+      description: The unique, permanent ID of this edge. Identical to the feature _key entry.
+      # see https://www.arangodb.com/docs/stable/data-modeling-naming-conventions-document-keys.html
+      pattern: "^\\d+:\\d+:\\d+_[a-zA-Z0-9_\\-:\\.@\\(\\)\\+,=;\\$!\\*'%]*$"
+    _from:
+      type: string
+      examples: ['ws_object_version/75:82:3']
+      description: The unique, permanent ID of a version of a workspace object.
+    _to:
+      type: string
+      examples: ['ws_genome_feature/75:82:3_RSP_4039']
+      description: A genome feature
\ No newline at end of file
diff --git a/spec/collections/ws/ws_has_perm.yaml b/spec/collections/ws/ws_has_perm.yaml
new file mode 100644
index 00000000..8bfda1c3
--- /dev/null
+++ b/spec/collections/ws/ws_has_perm.yaml
@@ -0,0 +1,23 @@
+name: ws_has_perm
+type: edge
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  description: The user has permissions on a workspace.
+  required: [_from, _to, perm]
+  properties:
+    perm:
+      type: string
+      enum: [a, w, r]
+      title: Permissions
+      description: |
+        Represents the permissions a user has on a workspace, where 'a' is
+        'administrator', 'w' is read/write, 'r' is readonly.
+    _from:
+      type: string
+      examples: ['ws_user/jjeffryes']
+      description: A username
+    _to:
+      type: string
+      examples: ['ws_workspace/35414']
+      description: A workspace
diff --git a/spec/collections/ws/ws_latest_version_of.yaml b/spec/collections/ws/ws_latest_version_of.yaml
new file mode 100644
index 00000000..f461be94
--- /dev/null
+++ b/spec/collections/ws/ws_latest_version_of.yaml
@@ -0,0 +1,25 @@
+name: ws_latest_version_of
+type: edge
+schema:
+  "$schema": "http://json-schema.org/draft-07/schema#"
+  type: object
+  additionalProperties: false
+  required: [_from, _to]
+  properties:
+    _from:
+      type: string
+      examples:
+        - ws_type_version/KBaseGenomes.Genome-9.0
+        - ws_module_version/kb_uploadmethods:8ebb66e4f2c27bc4a9b7cddff7d7b0f27f4ee433
+      description: |
+        A versioned entity, representing the most recent version of an entity
+        in a group (most likely a workspace object, module, or workspace type).
+    _to:
+      type: string
+      examples:
+        - ws_type/KBaseGenomes.Genome
+        - ws_module/kb_uploadmethods
+      description: |
+        The non-versioned entity group, where all members of the group are
+        different versions of something (eg. a workspace object, module, or workspace
+        type)
diff --git a/spec/collections/ws/ws_method.yaml b/spec/collections/ws/ws_method.yaml
new file mode 100644
index 00000000..db6b4360
--- /dev/null
+++ b/spec/collections/ws/ws_method.yaml
@@ -0,0 +1,14 @@
+name: ws_method
+type: vertex
+schema:
+  "$schema": "http://json-schema.org/draft-07/schema#"
+  type: object
+  description: SDK module method (unversioned).
+  additionalProperties: false
+  required: [_key]
+  properties:
+    _key:
+      type: string
+      examples: ["kb_uploadmethods.import_fasta_as_assembly_from_staging"]
+      description: "<module_name>.<method_name>"
+      pattern: "^\\w+\\.\\w+$"
diff --git a/spec/collections/ws/ws_method_version.yaml b/spec/collections/ws/ws_method_version.yaml
new file mode 100644
index 00000000..59e02dc2
--- /dev/null
+++ b/spec/collections/ws/ws_method_version.yaml
@@ -0,0 +1,40 @@
+name: ws_method_version
+type: vertex
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  additionalProperties: false
+  description: A specific method within a version of an SDK module.
+  required: [_key, module_name, method_name, commit, ver, code_url]
+  properties:
+    _key:
+      type: string
+      examples:
+        - module_name:version_hash.method_name
+        - module_name:UNKNOWN.method_name
+        - ws_method_version/kb_uploadmethods:8ebb66e4f2c27bc4a9b7cddff7d7b0f27f4ee433.import_genbank_from_staging
+      description: "<module_name>:<version_hash>.<method_name>"
+      pattern: "^\\w+:\\w+\\.\\w+$"
+    module_name:
+      type: string
+      examples: [kb_uploadmethods]
+      pattern: "^\\w+$"
+    method_name:
+      type: string
+      examples: [import_genbank_from_staging]
+      pattern: "^\\w+$"
+    commit:
+      type: string
+      examples: [8ebb66e4f2c27bc4a9b7cddff7d7b0f27f4ee433]
+      title: Git commit hash
+      pattern: "^\\w+$"
+    ver:
+      type: string
+      examples: [1.0.13]
+      title: Version
+      description: Semantic version of the module
+      pattern: "^\\d+\\.\\d+\\.\\d+$"
+    code_url:
+      type: string
+      examples: ["https://github.com/kbaseapps/kb_uploadmethods"]
+      title: URL of source code
diff --git a/spec/collections/ws/ws_module.yaml b/spec/collections/ws/ws_module.yaml
new file mode 100644
index 00000000..26c3dd17
--- /dev/null
+++ b/spec/collections/ws/ws_module.yaml
@@ -0,0 +1,20 @@
+name: ws_module
+type: vertex
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  description: SDK module (unversioned).
+  additionalProperties: false
+  required: [_key, language, dynamic_service]
+  properties:
+    _key:
+      type: string
+      examples: [kb_uploadmethods]
+      pattern: "^\\w+$"
+    language:
+      type: string
+      enum: [python, perl, java, r]
+      description: The programming language the module is written in
+    dynamic_service:
+      type: boolean
+      description: Indicates if the module can be run as a webservice
diff --git a/spec/collections/ws/ws_module_contains_method.yaml b/spec/collections/ws/ws_module_contains_method.yaml
new file mode 100644
index 00000000..b254b7c8
--- /dev/null
+++ b/spec/collections/ws/ws_module_contains_method.yaml
@@ -0,0 +1,14 @@
+name: ws_module_contains_method
+type: edge
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  description: A module contains an SDK method
+  required: [_from, _to]
+  properties:
+    _from:
+      type: string
+      description: The module ID.
+    _to:
+      type: string
+      description: The SDK method ID
diff --git a/spec/collections/ws/ws_module_version.yaml b/spec/collections/ws/ws_module_version.yaml
new file mode 100644
index 00000000..e8a25e70
--- /dev/null
+++ b/spec/collections/ws/ws_module_version.yaml
@@ -0,0 +1,38 @@
+name: ws_module_version
+type: vertex
+schema:
+  "$schema": "http://json-schema.org/draft-07/schema#"
+  type: object
+  description: Versioned SDK Module.
+  additionalProperties: false
+  required: [_key, name, commit, ver, code_url]
+  properties:
+    _key:
+      type: string
+      examples:
+        - kb_uploadmethods:8ebb66e4f2c27bc4a9b7cddff7d7b0f27f4ee433
+      description: "<module_name>:<commit_hash>"
+      pattern: "^\\w+:\\w+$"
+    name:
+      type: string
+      title: Module name
+      examples:
+        - kb_uploadmethods
+      pattern: "^\\w+$"
+    commit:
+      type: string
+      examples:
+        - 8ebb66e4f2c27bc4a9b7cddff7d7b0f27f4ee433
+      description: Git commit hash
+      pattern: "^\\w+$"
+    ver:
+      type: string
+      examples:
+        - 1.0.13
+      description: Module semantic version
+      pattern: "^\\d+\\.\\d+\\.\\d+$"
+    code_url:
+      type: string
+      examples:
+        - https://github.com/kbaseapps/kb_uploadmethods
+      description: URL of source code
diff --git a/spec/collections/ws/ws_obj_created_with_method.yaml b/spec/collections/ws/ws_obj_created_with_method.yaml
new file mode 100644
index 00000000..4ab80b0d
--- /dev/null
+++ b/spec/collections/ws/ws_obj_created_with_method.yaml
@@ -0,0 +1,23 @@
+name: ws_obj_created_with_method
+type: edge
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  additionalProperties: false
+  required: [_from, _to, method_params]
+  description: The _from WS versioned object was created by the _to SDK versioned method.
+  properties:
+    _from:
+      type: string
+      examples:
+        - ws_object_version/35414:73:1
+      description: A versioned workspace object.
+    _to:
+      type: string
+      examples:
+        - ws_method_version/kb_uploadmethods:8ebb66e4f2c27bc4a9b7cddff7d7b0f27f4ee433.import_genbank_from_staging
+        - ws_method_version/kb_uploadmethods:UNKNOWN.import_genbank_from_staging
+      description: A version of a module with a method.
+    method_params:
+      type: [array, object, 'null']
+      description: The input parameters for the method used to create the object.
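Putting `ws_obj_created_with_method` together, an edge document might look like the sketch below; the `_from` and `_to` values reuse the schema's examples, while the parameter names inside `method_params` are hypothetical:

```yaml
# Illustrative ws_obj_created_with_method edge
_from: ws_object_version/35414:73:1
_to: ws_method_version/kb_uploadmethods:8ebb66e4f2c27bc4a9b7cddff7d7b0f27f4ee433.import_genbank_from_staging
method_params:
  - genome_name: my_genome            # hypothetical parameter
    staging_file_subdir_path: my.gbk  # hypothetical parameter
```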
diff --git a/spec/collections/ws/ws_obj_created_with_module.yaml b/spec/collections/ws/ws_obj_created_with_module.yaml new file mode 100644 index 00000000..2d00353b --- /dev/null +++ b/spec/collections/ws/ws_obj_created_with_module.yaml @@ -0,0 +1,16 @@ +name: ws_obj_created_with_module +type: edge +schema: + "$schema": http://json-schema.org/draft-07/schema# + type: object + additionalProperties: false + required: [_from, _to] + description: The _from WS versioned object was created with the _to SDK versioned + module. + properties: + _from: + type: string + description: The WS versioned object that was created. + _to: + type: string + description: The SDK versioned module that created the object. diff --git a/spec/collections/ws/ws_obj_instance_of_type.yaml b/spec/collections/ws/ws_obj_instance_of_type.yaml new file mode 100644 index 00000000..8adf6553 --- /dev/null +++ b/spec/collections/ws/ws_obj_instance_of_type.yaml @@ -0,0 +1,11 @@ +name: ws_obj_instance_of_type +type: edge +schema: + "$schema": http://json-schema.org/draft-07/schema# + type: object + additionalProperties: false + required: [_from, _to] + description: The _from WS versioned object is an instance of the _to versioned type. + properties: + _from: {type: string} + _to: {type: string} diff --git a/spec/collections/ws/ws_obj_version_has_taxon.yaml b/spec/collections/ws/ws_obj_version_has_taxon.yaml new file mode 100644 index 00000000..529141c0 --- /dev/null +++ b/spec/collections/ws/ws_obj_version_has_taxon.yaml @@ -0,0 +1,17 @@ +name: ws_obj_version_has_taxon +type: edge +schema: + "$schema": http://json-schema.org/draft-07/schema# + type: object + required: [_from, _to, assigned_by] + description: The _from WS versioned object is associated with the _to taxon. + properties: + assigned_by: + type: string + description: Who made this taxon assignment? + _from: + type: string + description: The WS versioned object to which the taxon is assigned. + _to: + type: string + description: The _id of a taxon vertex, such as from NCBI, GTDB, etc. 
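Each collection spec carries a draft-07 JSON schema under its `schema` key, so documents can be checked before import. A minimal sketch, assuming PyYAML and the `jsonschema` package are available; the taxon vertex `_id` and `assigned_by` value are purely illustrative:

```python
import yaml
import jsonschema

# Load the collection spec; the "schema" key holds a draft-07 JSON schema.
with open("spec/collections/ws/ws_obj_version_has_taxon.yaml") as f:
    spec = yaml.safe_load(f)

edge = {
    "_from": "ws_object_version/35414:73:1",
    "_to": "ncbi_taxon/562",  # hypothetical taxon vertex _id
    "assigned_by": "ncbi",    # hypothetical assigner
}

# Raises jsonschema.ValidationError if the document does not conform.
jsonschema.validate(instance=edge, schema=spec["schema"])
```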
diff --git a/spec/collections/ws/ws_object.yaml b/spec/collections/ws/ws_object.yaml new file mode 100644 index 00000000..bfbbf553 --- /dev/null +++ b/spec/collections/ws/ws_object.yaml @@ -0,0 +1,26 @@ +name: ws_object +type: vertex +schema: + "$schema": http://json-schema.org/draft-07/schema# + type: object + additionalProperties: false + required: [_key, workspace_id, object_id, deleted] + properties: + _key: + type: string + description: The wsid/objectid for this data + examples: ["35414:73"] + pattern: "^\\d+:\\d+$" + workspace_id: + type: integer + description: The workspace_id for this object + minimum: 1 + object_id: + type: integer + description: The permanent object id + minimum: 1 + deleted: + type: boolean + is_public: + type: boolean + default: false diff --git a/spec/collections/ws/ws_object_hash.yaml b/spec/collections/ws/ws_object_hash.yaml new file mode 100644 index 00000000..5db8b511 --- /dev/null +++ b/spec/collections/ws/ws_object_hash.yaml @@ -0,0 +1,16 @@ +name: ws_object_hash +type: vertex +schema: + "$schema": "http://json-schema.org/draft-07/schema#" + type: object + additionalProperties: false + required: [_key, type] + properties: + _key: + type: string + description: The key is the hash + examples: [2406642b28312b3ccbfb2e17e231e2c7] + type: + type: string + description: The hashing algorithm used + examples: [MD5] diff --git a/spec/collections/ws/ws_object_version.yaml b/spec/collections/ws/ws_object_version.yaml new file mode 100644 index 00000000..ed647df7 --- /dev/null +++ b/spec/collections/ws/ws_object_version.yaml @@ -0,0 +1,55 @@ +name: ws_object_version +type: vertex +schema: + "$schema": "http://json-schema.org/draft-07/schema#" + type: object + required: + - _key + - workspace_id + - object_id + - version + - name + - hash + - size + - epoch + - deleted + properties: + _key: + type: string + description: The UPA for this data + examples: ["35414:73:1"] + pattern: "^\\d+:\\d+:\\d+$" + workspace_id: + type: integer + description: The workspace_id for this object + minimum: 1 + object_id: + type: integer + description: The permanent object id + minimum: 1 + version: + type: integer + description: The object's version + minimum: 1 + name: + type: string + description: The user-supplied name for this object + hash: + type: string + description: The md5 hash of the workspace object + examples: [94edd584731298befa53119cb151d82e] + size: + type: integer + description: Size in bytes + default: 0 + minimum: 0 + epoch: + type: integer + description: Creation time as a UTC epoch timestamp + default: 0 + minimum: 0 + deleted: + type: boolean + is_public: + type: boolean + default: false diff --git a/spec/collections/ws/ws_owner_of.yaml b/spec/collections/ws/ws_owner_of.yaml new file mode 100644 index 00000000..3c3976fd --- /dev/null +++ b/spec/collections/ws/ws_owner_of.yaml @@ -0,0 +1,19 @@ +name: ws_owner_of +type: edge +schema: + "$schema": http://json-schema.org/draft-07/schema# + type: object + additionalProperties: false + description: The user is an owner of a workspace or type module. 
+ required: [_from, _to] + properties: + _from: + type: string + examples: ["ws_user/jjeffryes"] + description: A username + _to: + type: string + examples: + - ws_type_module/KBaseGenomes + - ws_workspace/35414 + description: A workspace or type module diff --git a/spec/collections/ws/ws_prov_descendant_of.yaml b/spec/collections/ws/ws_prov_descendant_of.yaml new file mode 100644 index 00000000..bc546eb6 --- /dev/null +++ b/spec/collections/ws/ws_prov_descendant_of.yaml @@ -0,0 +1,10 @@ +name: ws_prov_descendant_of +type: edge +schema: + "$schema": "http://json-schema.org/draft-07/schema#" + type: object + required: [_from, _to] + description: The _from object is a provenance descendant of the _to object (e.g. Assembly->Reads). + properties: + _from: {type: string} + _to: {type: string} diff --git a/spec/collections/ws/ws_refers_to.yaml b/spec/collections/ws/ws_refers_to.yaml new file mode 100644 index 00000000..8865ae80 --- /dev/null +++ b/spec/collections/ws/ws_refers_to.yaml @@ -0,0 +1,10 @@ +name: ws_refers_to +type: edge +schema: + "$schema": "http://json-schema.org/draft-07/schema#" + type: object + required: [_from, _to] + description: The _from object has a reference to the _to object (e.g. Genome->Assembly). + properties: + _from: {type: string} + _to: {type: string} diff --git a/spec/collections/ws/ws_type.yaml b/spec/collections/ws/ws_type.yaml new file mode 100644 index 00000000..f8026cc8 --- /dev/null +++ b/spec/collections/ws/ws_type.yaml @@ -0,0 +1,18 @@ +name: ws_type +type: vertex +schema: + "$schema": "http://json-schema.org/draft-07/schema#" + type: object + additionalProperties: false + required: [_key] + properties: + _key: + type: string + examples: [KBaseGenomes.Genome] + pattern: "^\\w+\\.\\w+$" + module_name: + type: string + examples: ['KBaseGenomes'] + type_name: + type: string + examples: ['Genome'] diff --git a/spec/collections/ws/ws_type_consumed_by_method.yaml b/spec/collections/ws/ws_type_consumed_by_method.yaml new file mode 100644 index 00000000..6ee25fc8 --- /dev/null +++ b/spec/collections/ws/ws_type_consumed_by_method.yaml @@ -0,0 +1,15 @@ +name: ws_type_consumed_by_method +type: edge +schema: + "$schema": "http://json-schema.org/draft-07/schema#" + type: object + required: [_from, _to] + description: The _from type is consumed by the _to SDK method + additionalProperties: true + properties: + _from: + type: string + description: The ID of the type that is consumed. + _to: + type: string + description: The ID of the SDK method that consumes the type. 
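Since a `ws_type` key follows `Module.Type` (pattern `^\w+\.\w+$`), the optional `module_name` and `type_name` fields are derivable from `_key`. A minimal sketch in plain Python (the helper name is illustrative):

```python
def split_type_key(key: str) -> dict:
    """Split a ws_type _key like "KBaseGenomes.Genome" into its fields."""
    module_name, type_name = key.split(".", 1)
    return {"_key": key, "module_name": module_name, "type_name": type_name}

print(split_type_key("KBaseGenomes.Genome"))
# {'_key': 'KBaseGenomes.Genome', 'module_name': 'KBaseGenomes', 'type_name': 'Genome'}
```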
diff --git a/spec/collections/ws/ws_type_module.yaml b/spec/collections/ws/ws_type_module.yaml new file mode 100644 index 00000000..ac5f527e --- /dev/null +++ b/spec/collections/ws/ws_type_module.yaml @@ -0,0 +1,12 @@ +name: ws_type_module +type: vertex +schema: + "$schema": "http://json-schema.org/draft-07/schema#" + type: object + additionalProperties: false + required: [_key] + properties: + _key: + type: string + examples: [KBaseGenomes] + pattern: "^\\w+$" diff --git a/spec/collections/ws/ws_type_version.yaml b/spec/collections/ws/ws_type_version.yaml new file mode 100644 index 00000000..8595de34 --- /dev/null +++ b/spec/collections/ws/ws_type_version.yaml @@ -0,0 +1,24 @@ +name: ws_type_version +type: vertex +schema: + "$schema": "http://json-schema.org/draft-07/schema#" + type: object + additionalProperties: false + required: [_key] + properties: + _key: + type: string + examples: ['KBaseGenomes.Genome-9.0'] + pattern: "^\\w+\\.\\w+-\\d+\\.\\d+$" + module_name: + type: string + examples: ['KBaseGenomes'] + type_name: + type: string + examples: ['Genome'] + maj_ver: + type: integer + minimum: 0 + min_ver: + type: integer + minimum: 0 diff --git a/spec/collections/ws/ws_user.yaml b/spec/collections/ws/ws_user.yaml new file mode 100644 index 00000000..4bf96834 --- /dev/null +++ b/spec/collections/ws/ws_user.yaml @@ -0,0 +1,14 @@ +name: ws_user +type: vertex +schema: + "$schema": "http://json-schema.org/draft-07/schema#" + type: object + additionalProperties: false + required: [_key] + properties: + _key: + type: string + description: The username for this user + examples: + - jjeffryes + - sean-mccorkle3 diff --git a/spec/collections/ws/ws_version_of.yaml b/spec/collections/ws/ws_version_of.yaml new file mode 100644 index 00000000..573d6943 --- /dev/null +++ b/spec/collections/ws/ws_version_of.yaml @@ -0,0 +1,28 @@ +name: ws_version_of +type: edge +schema: + "$schema": "http://json-schema.org/draft-07/schema#" + type: object + additionalProperties: false + description: The _from entity is a version of the _to entity (e.g. type, method, module). + required: [_from, _to] + properties: + _from: + type: string + examples: + - ws_type_version/KBaseGenomes.Genome-9.0 + - ws_module_version/kb_uploadmethods:8ebb66e4f2c27bc4a9b7cddff7d7b0f27f4ee433 + description: A versioned entity (e.g. a workspace object, module, or workspace type) + _to: + type: string + examples: + - ws_type/KBaseGenomes.Genome + - ws_module/kb_uploadmethods + description: | + The non-versioned entity group, where all members of the group are + different versions of something (e.g. a workspace object, module, or workspace + type). 
+ tag: + type: string + description: Tags for entities managed by catalog + enum: [release, beta, dev] diff --git a/spec/collections/ws/ws_workspace.yaml b/spec/collections/ws/ws_workspace.yaml new file mode 100644 index 00000000..4ed3f5c2 --- /dev/null +++ b/spec/collections/ws/ws_workspace.yaml @@ -0,0 +1,33 @@ +name: ws_workspace +type: vertex +schema: + "$schema": "http://json-schema.org/draft-07/schema#" + type: object + required: [_key, owner, max_obj_id, lock_status, name, mod_epoch, is_public, is_deleted, metadata] + properties: + _key: + type: string + description: The workspace ID for this workspace + examples: ['35414'] + pattern: "^\\d+$" + owner: + type: string + title: Username of workspace owner + max_obj_id: + type: integer + title: Maximum object ID in this workspace + lock_status: + type: string + title: Status of the workspace lock + name: + type: string + description: The workspace name for this workspace + examples: ["jjeffryes:narrative_1534187093329"] + mod_epoch: + type: integer + title: Modified date epoch + description: Timestamp of when the workspace was last modified + minimum: 0 + is_public: {type: boolean} + is_deleted: {type: boolean} + metadata: {type: object} diff --git a/spec/collections/ws/ws_workspace_contains_obj.yaml b/spec/collections/ws/ws_workspace_contains_obj.yaml new file mode 100644 index 00000000..45b41339 --- /dev/null +++ b/spec/collections/ws/ws_workspace_contains_obj.yaml @@ -0,0 +1,14 @@ +name: ws_workspace_contains_obj +type: edge +schema: + "$schema": http://json-schema.org/draft-07/schema# + type: object + description: A workspace contains an object + required: [_from, _to] + properties: + _from: + type: string + description: The ID of the workspace + _to: + type: string + description: The ID of the object diff --git a/spec/collections/wsprov/README.md b/spec/collections/wsprov/README.md new file mode 100644 index 00000000..d6154877 --- /dev/null +++ b/spec/collections/wsprov/README.md @@ -0,0 +1 @@ +# Simple workspace provenance data diff --git a/spec/collections/wsprov/wsprov_action.yaml b/spec/collections/wsprov/wsprov_action.yaml new file mode 100644 index 00000000..2ab86c82 --- /dev/null +++ b/spec/collections/wsprov/wsprov_action.yaml @@ -0,0 +1,18 @@ +name: wsprov_action +type: vertex +schema: + "$schema": http://json-schema.org/draft-07/schema# + type: object + required: [_key, workspace_id, runner] + properties: + _key: + type: string + description: Slugified name of the action with its timestamp and workspace id + examples: ['copy:123123123:42'] + workspace_id: + type: integer + description: The workspace_id in which this action was performed + minimum: 1 + runner: + type: string + description: The person who ran this action diff --git a/spec/collections/wsprov/wsprov_copied_into.yaml b/spec/collections/wsprov/wsprov_copied_into.yaml new file mode 100644 index 00000000..8bbc9b98 --- /dev/null +++ b/spec/collections/wsprov/wsprov_copied_into.yaml @@ -0,0 +1,14 @@ +name: wsprov_copied_into +type: edge +schema: + "$schema": http://json-schema.org/draft-07/schema# + type: object + required: [_from, _to] + description: The object was copied into another object + properties: + _from: + type: string + examples: ['wsprov_object/1:2:3'] + _to: + type: string + examples: ['wsprov_object/1:2:3'] diff --git a/spec/collections/wsprov/wsprov_input_in.yaml b/spec/collections/wsprov/wsprov_input_in.yaml new file mode 100644 index 00000000..97912af8 --- /dev/null +++ b/spec/collections/wsprov/wsprov_input_in.yaml @@ -0,0 +1,14 @@ +name: 
wsprov_input_in +type: edge +schema: + "$schema": http://json-schema.org/draft-07/schema# + type: object + required: [_from, _to] + description: The workspace object was an input to a provenance action + properties: + _from: + type: string + examples: ['wsprov_object/1:2:3'] + _to: + type: string + examples: ['wsprov_action/1:2:3'] diff --git a/spec/collections/wsprov/wsprov_links.yaml b/spec/collections/wsprov/wsprov_links.yaml new file mode 100644 index 00000000..a7610c8a --- /dev/null +++ b/spec/collections/wsprov/wsprov_links.yaml @@ -0,0 +1,14 @@ +name: wsprov_links +type: edge +schema: + "$schema": http://json-schema.org/draft-07/schema# + type: object + required: [_from, _to] + description: The object is linked to another object through references, provenance, etc. + properties: + _from: + type: string + examples: ['wsprov_object/1:2:3'] + _to: + type: string + examples: ['wsprov_object/1:2:3'] diff --git a/spec/collections/wsprov/wsprov_object.yaml b/spec/collections/wsprov/wsprov_object.yaml new file mode 100644 index 00000000..b40ead82 --- /dev/null +++ b/spec/collections/wsprov/wsprov_object.yaml @@ -0,0 +1,18 @@ +name: wsprov_object +type: vertex +schema: + "$schema": http://json-schema.org/draft-07/schema# + type: object + required: [_key, workspace_id, owner] + properties: + _key: + type: string + description: The workspace reference for this object + examples: ['1:2:3'] + workspace_id: + type: integer + description: The workspace_id for this object + minimum: 1 + owner: + type: string + description: The owner of this workspace object diff --git a/spec/collections/wsprov/wsprov_produced.yaml b/spec/collections/wsprov/wsprov_produced.yaml new file mode 100644 index 00000000..41c9e0d4 --- /dev/null +++ b/spec/collections/wsprov/wsprov_produced.yaml @@ -0,0 +1,14 @@ +name: wsprov_produced +type: edge +schema: + "$schema": http://json-schema.org/draft-07/schema# + type: object + required: [_from, _to] + description: The provenance action produced the workspace object + properties: + _from: + type: string + examples: ['wsprov_action/1:2:3'] + _to: + type: string + examples: ['wsprov_object/1:2:3'] diff --git a/spec/data_source_schema.yaml b/spec/data_source_schema.yaml new file mode 100644 index 00000000..49beaca1 --- /dev/null +++ b/spec/data_source_schema.yaml @@ -0,0 +1,27 @@ +name: data_source_schema +type: object +required: ['name', 'title', 'category'] +properties: + name: + type: string + title: Abbreviated data source name + format: regex + pattern: ^\w+$ + category: + type: string + title: Data source category + title: + type: string + title: Full data source name + home_url: + type: string + title: Data source home page + format: uri + data_url: + type: string + title: URL where data can be downloaded + format: uri + logo_path: + type: string + title: Path to logo +additionalProperties: false diff --git a/spec/data_sources/djornl.yaml b/spec/data_sources/djornl.yaml new file mode 100644 index 00000000..1fb27f23 --- /dev/null +++ b/spec/data_sources/djornl.yaml @@ -0,0 +1,5 @@ +name: djornl +category: network +title: Jacobson Lab Exascale Networking data +home_url: "https://github.com/kbase/exascale_data" +data_url: "https://github.com/kbase/exascale_data/releases/latest" diff --git a/spec/data_sources/envo_ontology.yaml b/spec/data_sources/envo_ontology.yaml new file mode 100644 index 00000000..03cce27f --- /dev/null +++ b/spec/data_sources/envo_ontology.yaml @@ -0,0 +1,6 @@ +name: envo_ontology +category: ontology +title: Environment Ontology +home_url: 
"http://www.obofoundry.org/ontology/envo.html" +data_url: "https://github.com/EnvironmentOntology/envo/releases" +logo_path: "/images/third-party-data-sources/envo/logo-119-64.png" diff --git a/spec/data_sources/gaz_ontology.yaml b/spec/data_sources/gaz_ontology.yaml new file mode 100644 index 00000000..f2c82185 --- /dev/null +++ b/spec/data_sources/gaz_ontology.yaml @@ -0,0 +1,5 @@ +category: ontology +data_url: http://purl.obolibrary.org/obo/gaz.obo +home_url: http://environmentontology.github.io/gaz/ +name: gaz_ontology +title: Gazetteer Ontology diff --git a/spec/data_sources/go_ontology.yaml b/spec/data_sources/go_ontology.yaml new file mode 100644 index 00000000..b47b00a8 --- /dev/null +++ b/spec/data_sources/go_ontology.yaml @@ -0,0 +1,6 @@ +name: go_ontology +category: ontology +title: Gene Ontology +home_url: "http://geneontology.org/" +data_url: "http://release.geneontology.org/" +logo_path: "/images/third-party-data-sources/go/logo-248-64.png" diff --git a/spec/data_sources/gtdb.yaml b/spec/data_sources/gtdb.yaml new file mode 100644 index 00000000..bbc39037 --- /dev/null +++ b/spec/data_sources/gtdb.yaml @@ -0,0 +1,6 @@ +name: gtdb +category: taxonomy +title: GTDB Taxonomy +home_url: "https://gtdb.ecogenomic.org" +data_url: "https://data.ace.uq.edu.au/public/gtdb/data/releases/" +logo_path: "/images/third-party-data-sources/gtdb/logo-128-64.png" diff --git a/spec/data_sources/ncbi_taxonomy.yaml b/spec/data_sources/ncbi_taxonomy.yaml new file mode 100644 index 00000000..781bd2d0 --- /dev/null +++ b/spec/data_sources/ncbi_taxonomy.yaml @@ -0,0 +1,6 @@ +name: ncbi_taxonomy +category: taxonomy +title: NCBI Taxonomy +home_url: "https://www.ncbi.nlm.nih.gov/taxonomy" +data_url: "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/" +logo_path: "/images/third-party-data-sources/ncbi/logo-51-64.png" diff --git a/spec/data_sources/po_ontology.yaml b/spec/data_sources/po_ontology.yaml new file mode 100644 index 00000000..c6206071 --- /dev/null +++ b/spec/data_sources/po_ontology.yaml @@ -0,0 +1,5 @@ +category: ontology +data_url: http://purl.obolibrary.org/obo/po.obo +home_url: http://browser.planteome.org/amigo +name: po_ontology +title: Plant Ontology diff --git a/spec/data_sources/rdp_taxonomy.yaml b/spec/data_sources/rdp_taxonomy.yaml new file mode 100644 index 00000000..953a0873 --- /dev/null +++ b/spec/data_sources/rdp_taxonomy.yaml @@ -0,0 +1,6 @@ +name: rdp_taxonomy +category: taxonomy +title: Ribosomal Database Project +home_url: "http://rdp.cme.msu.edu/taxomatic/main.spr" +data_url: "http://rdp.cme.msu.edu/misc/resources.jsp" +logo_path: "/images/third-party-data-sources/ncbi/logo-51-64.png" diff --git a/spec/data_sources/silva_taxonomy.yaml b/spec/data_sources/silva_taxonomy.yaml new file mode 100644 index 00000000..ad6c5664 --- /dev/null +++ b/spec/data_sources/silva_taxonomy.yaml @@ -0,0 +1,6 @@ +name: silva_taxonomy +category: taxonomy +title: SILVA Taxonomy +home_url: "https://arb-silva.de" +data_url: "https://arb-silva.de/no_cache/download/archive/" + diff --git a/spec/data_sources/uo_ontology.yaml b/spec/data_sources/uo_ontology.yaml new file mode 100644 index 00000000..547ed20b --- /dev/null +++ b/spec/data_sources/uo_ontology.yaml @@ -0,0 +1,5 @@ +category: ontology +data_url: http://purl.obolibrary.org/obo/uo.obo +home_url: https://github.com/bio-ontology-research-group/unit-ontology +name: uo_ontology +title: Units of measurement ontology diff --git a/spec/datasets/distance.yaml b/spec/datasets/distance.yaml new file mode 100644 index 00000000..20d35b40 --- /dev/null +++ 
b/spec/datasets/distance.yaml @@ -0,0 +1,7 @@ +name: distance +type: integer +title: Traversal Distance +description: How many hops to find neighbors and neighbors-of-neighbors +default: 0 +minimum: 0 +maximum: 100 diff --git a/spec/datasets/djornl/csv_cluster.yaml b/spec/datasets/djornl/csv_cluster.yaml new file mode 100644 index 00000000..9f36b08c --- /dev/null +++ b/spec/datasets/djornl/csv_cluster.yaml @@ -0,0 +1,15 @@ +$schema: http://json-schema.org/draft-07/schema# +name: csv_cluster +title: Cluster file syntax +description: Jacobson lab cluster ID to node ID mappings +type: object +required: [cluster_id, node_ids] +additionalProperties: false +properties: + cluster_id: + type: string + format: regex + pattern: ^Cluster\d+$ + # pre-transform node_ids + node_ids: + type: string diff --git a/spec/datasets/djornl/csv_edge.yaml b/spec/datasets/djornl/csv_edge.yaml new file mode 100644 index 00000000..a5495c75 --- /dev/null +++ b/spec/datasets/djornl/csv_edge.yaml @@ -0,0 +1,25 @@ +$schema: http://json-schema.org/draft-07/schema# +name: csv_edge +title: CSV edge file syntax +description: Jacobson lab Arabidopsis edge data file columns for generic node-to-node edges with scores +type: object +required: [node1, node2, score, edge_type] +properties: + node1: + $ref: definitions.yaml#/definitions/djornl_edge/_from + node2: + $ref: definitions.yaml#/definitions/djornl_edge/_to + score: + # pre-transformation, the parser treats this as a string + type: string + format: regex + pattern: ^\d+(\.\d+)?$ + edge_type: + $ref: edge_type.yaml + directed: + # pre-transform: parser treats this as a string + type: string + default: "0" + enum: + - "1" + - "0" diff --git a/spec/datasets/djornl/csv_node.yaml b/spec/datasets/djornl/csv_node.yaml new file mode 100644 index 00000000..99d33a68 --- /dev/null +++ b/spec/datasets/djornl/csv_node.yaml @@ -0,0 +1,56 @@ +$schema: http://json-schema.org/draft-07/schema# +name: csv_node +title: CSV node file syntax +description: Jacobson lab Arabidopsis gene and phenotype data file columns +type: object +required: [node_id, node_type] +additionalProperties: false +properties: + node_id: + $ref: definitions.yaml#/definitions/djornl_node/_key + node_type: + $ref: definitions.yaml#/definitions/djornl_node/node_type + # comma-separated array of cluster IDs + clusters: + type: string + format: regex + pattern: ^(\w+:\d+, ?)*(\w+:\d+)?$ + transcript: + $ref: definitions.yaml#/definitions/djornl_node/transcript + gene_symbol: + $ref: definitions.yaml#/definitions/djornl_node/gene_symbol + gene_full_name: + $ref: definitions.yaml#/definitions/djornl_node/gene_full_name + gene_model_type: + $ref: definitions.yaml#/definitions/djornl_node/gene_model_type + tair_computational_description: + $ref: definitions.yaml#/definitions/djornl_node/tair_computational_description + tair_curator_summary: + $ref: definitions.yaml#/definitions/djornl_node/tair_curator_summary + tair_short_description: + $ref: definitions.yaml#/definitions/djornl_node/tair_short_description + # comma-separated array of GO terms + go_terms: + type: string + format: regex + pattern: ^(GO:\d{7}, ?)*(GO:\d{7})?$ + go_description: + $ref: definitions.yaml#/definitions/djornl_node/go_description + mapman_bin: + $ref: definitions.yaml#/definitions/djornl_node/mapman_bin + mapman_name: + $ref: definitions.yaml#/definitions/djornl_node/mapman_name + mapman_description: + $ref: definitions.yaml#/definitions/djornl_node/mapman_description + pheno_aragwas_id: + $ref: 
definitions.yaml#/definitions/djornl_node/pheno_aragwas_id + pheno_description: + $ref: definitions.yaml#/definitions/djornl_node/pheno_description + pheno_pto_name: + $ref: definitions.yaml#/definitions/djornl_node/pheno_pto_name + pheno_pto_description: + $ref: definitions.yaml#/definitions/djornl_node/pheno_pto_description + pheno_reference: + $ref: definitions.yaml#/definitions/djornl_node/pheno_reference + user_notes: + $ref: definitions.yaml#/definitions/djornl_node/user_notes diff --git a/spec/datasets/djornl/definitions.yaml b/spec/datasets/djornl/definitions.yaml new file mode 100644 index 00000000..cc59e82a --- /dev/null +++ b/spec/datasets/djornl/definitions.yaml @@ -0,0 +1,135 @@ +$schema: "http://json-schema.org/draft-07/schema#" +name: definitions +title: DJORNL schema definitions +description: Node and edge metadata definitions for the Dan Jacobson Exascale dataset +definitions: + cluster_id: + type: string + format: regex + pattern: ^\w+:\d+$ + examples: ["markov_i2:1", "markov_i4:5", "markov_i6:3"] + go_term: + type: string + format: regex + pattern: ^GO:\d{7}$ + examples: ["GO:0003700", "GO:0005515"] + djornl_edge: + _key: + type: string + title: Key + format: regex + pattern: ^(\S+__){3}(\S+)$ + _from: + type: string + title: Node ID + format: regex + pattern: ^\S{2,}.*$ + _to: + type: string + title: Node ID + format: regex + pattern: ^\S{2,}.*$ + score: + title: Edge Score (Weight) + # (float) + type: number + edge_type: + $ref: edge_type.yaml + directed: + type: boolean + title: Directed edge + description: Whether or not the edge is directed + default: false + djornl_node: + _key: + type: string + title: Key + format: regex + pattern: ^\S{2,}.*$ + examples: ["AT1G01010", "As2"] + clusters: + type: array + title: Clusters + description: Clusters to which the node has been assigned + uniqueItems: true + items: + $ref: "#/definitions/cluster_id" + examples: [["markov_i2:1", "markov_i4:5"], ["markov_i6:3"]] + node_type: + $ref: node_type.yaml + transcript: + type: string + title: Transcript + examples: ["AT1G01010.1"] + gene_symbol: + type: string + title: Gene symbol + examples: ["NTL10"] + gene_full_name: + type: string + title: Gene full name + examples: ["NAC domain containing protein 1"] + gene_model_type: + type: string + title: Gene model type + examples: ["protein_coding"] + tair_computational_description: + type: string + title: TAIR computational description + examples: ["NAC domain containing protein 1;(source:Araport11)"] + tair_curator_summary: + type: string + title: TAIR curator summary + examples: ["Encodes a plasma membrane-localized amino acid transporter likely involved in amino acid export in the developing seed."] + tair_short_description: + type: string + title: TAIR short description + examples: ["NAC domain containing protein 1"] + go_description: + type: string + title: GO descriptions + examples: ["DNA-binding transcription factor activity"] + go_terms: + type: array + title: GO term IDs + uniqueItems: true + items: + $ref: "#/definitions/go_term" + mapman_bin: + type: string + title: Mapman bin + examples: ["15.5.17"] + mapman_name: + type: string + title: Mapman name + examples: [".RNA biosynthesis.transcriptional regulation.transcription factor (NAC)"] + mapman_description: + type: string + title: Mapman description + examples: ["transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 
description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96])"] + pheno_aragwas_id: + type: string + title: AraGWAS ID + examples: ["10.21958/phenotype:67"] + pheno_description: + type: string + title: Phenotype description + examples: ["Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008"] + pheno_pto_name: + type: string + title: PTO name + description: Plant Trait Ontology name + examples: ["arsenic concentration"] + pheno_pto_description: + type: string + title: PTO description + description: Plant Trait Ontology description + examples: ["A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik]"] + pheno_reference: + type: string + title: Phenotype reference + examples: ["Atwell et al., Nature 2010"] + user_notes: + type: string + title: User Notes + examples: ["flowering time related"] diff --git a/spec/datasets/djornl/edge_type.yaml b/spec/datasets/djornl/edge_type.yaml new file mode 100644 index 00000000..a5dd72a8 --- /dev/null +++ b/spec/datasets/djornl/edge_type.yaml @@ -0,0 +1,97 @@ +$schema: "http://json-schema.org/draft-07/schema#" +name: edge_type +title: Edge Type +description: Edge types in the Dan Jacobson Arabidopsis Exascale dataset +type: string +oneOf: + - const: phenotype-association_AraGWAS + title: AraGWAS phenotype association + description: GWAS associations produced by analyzing a subset of phenotypes and SNPs in the Arabidopsis 1001 Genomes database. Edge values are significant association scores after FDR correction. + + - const: pairwise-gene-coexpression_AraNet_v2 + title: AraNetv2 pairwise gene coexpression + description: A subset of pairwise gene coexpression values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated from Pearson correlation coefficients to normalize the data for comparison across studies and different types of data layers (Lee et al., 2015). + + - const: domain-co-occurrence_AraNet_v2 + title: AraNetv2 domain co-occurrence + description: A layer of protein domain co-occurrence values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated from weighted mutual information scores to normalize the data for comparison across studies and different types of data layers (Lee et al., 2015). + + - const: protein-protein-interaction_high-throughput_AraNet_v2 + title: AraNetv2 high-throughput protein-protein interaction + description: Log likelihood score. A layer of protein-protein interaction values derived from four high-throughput PPI screening experiments; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al., 2015). + + - const: protein-protein-interaction_literature-curated_AraNet_v2 + title: AraNetv2 literature-curated protein-protein interaction + description: A layer of protein-protein interaction values from literature-curated small- to medium-scale experimental data; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al., 2015). 
+ + - const: protein-protein-interaction_biogrid_date/release3.5.188 + title: BIOGRID ORGANISM Arabidopsis thaliana Columbia 3.5.188 tab3 PPI + description: BioGRID interactions are relationships between two proteins or genes; the term 'interaction' includes direct physical binding of two proteins, co-existence in a stable complex, and genetic interaction. https://wiki.thebiogrid.org/doku.php/experimental_systems. + + - const: protein-protein-interaction_AtPIN + title: AtPIN PPI + description: The interactions database includes all interactions present at the Arabidopsis thaliana Protein Interactome Database, the Predicted Interactome for Arabidopsis, Arabidopsis protein-protein interaction data curated from the literature by TAIR curators, BIOGRID and IntAct. https://atpin.bioinfoguy.net/cgi-bin/atpin.pl + + - const: protein-protein-interaction_Mentha_A_thaliana_3702_040319 + title: Mentha AT 3702 040319 PPI + description: Mentha archives evidence collected from different sources and presents these data in a complete and comprehensive way. Data comes from manually curated protein-protein interaction databases that have adhered to the IMEx consortium and assigns to each interaction a reliability score that takes into account all the supporting evidence. https://mentha.uniroma2.it/about.php + + - const: protein-protein-interaction_literature_curated_AraNet_v2_subnet + title: AraNetv2 subnet AT-LC PPI + description: Literature curated PPI of A. thaliana. https://www.inetbio.org/aranet/dl.php?f=AT-LC + + - const: phenotype-association_GWAS_gene_to_metabolite_10.1371/journal.pgen.1006363 + title: Wu2016 s015 Gene-to-Metab GeneToPhenotype + description: GWAS hits from a Gene-to-Metabolites GWAS. Phenotypes (metabolites) have a unique ID from the Wu 2016 study (doi:10.1371/journal.pgen.1006363) and need to be given our own UID for future use. + + - const: phenotype-association_AraGWAS_subnet_permsig_geni + title: AraGWAS subnet permsig geni GeneToPhenotype + description: Phenotypes mapped to SNP position and GeneID (if available) from the AraGWAS database, filtered for significance over the FDR threshold. + + - const: transcription-factor-regulatory-interaction_literature_curated_ATRM_01082020 + title: ATRM TF to Target LitCurated 01082020 TranscriptionFactorToGene + description: Contains literature mined and manually curated TF regulatory interactions for A. thaliana from 1701 TFs from PlantTFDB 2.0 and 4663 TF-associated interactions. These were manually filtered (e.g. FPs, PPI interactions removed). They then added some from other sources. Downloaded from http://atrm.cbi.pku.edu.cn/download.php + + - const: AT-UU-GO-05-AA-01 + title: GO + description: GeneA connects to GeneB if the two genes have semantically similar GO terms (with a similarity score > 0). This network is used to evaluate other networks for biological functional content. + + - const: AT-UU-KS-00-AA-01 + title: Knockout Similarity + description: GeneA connects to GeneB if the phenotypic effect of knocking out GeneA is similar to the phenotypic effect of knocking out GeneB. Similarity is based on Phenotype Ontology semantic similarity. + + - const: AT-UU-PX-01-AA-01 + title: PEN-Diversity + description: GeneA connects to GeneB if the expression vector of GeneA is an important predictor of the expression vector of GeneB in an iRF model, where the expression vectors of all other genes are included as covariates. The iRF model is a feature-selection version of Random Forest. 
+ + - const: AT-UU-GA-01-AA-01 + title: Coex Gene-Atlas + description: Coexpression network obtained from AtGenie.org. It uses expression array data from multiple tissues to calculate the correlation between genes. + + - const: AT-UU-PP-00-AA-01 + title: PPI-6merged + description: "GeneA connects to GeneB if their protein products have been shown to bind to or interact with each other, typically through experimental evidence. The PPI-6merged network is the union of 6 different A. thaliana PPI networks: AraNet2 LC, AraNet2 HT, AraPPInet2 0.60, BIOGRID 4.3.194 physical, AtPIN, Mentha. These 6 were all relatively high scoring with GOintersect. StringDB scored badly so was not included." + + - const: AT-UU-RE-00-AA-01 + title: Regulation-ATRM + description: GeneA connects to GeneB if GeneA is a Transcription Factor (TF) that is shown to interact with GeneB (which may or may not be a TF). This dataset contains literature mined and manually curated TF regulatory interactions for A. thaliana. Started from 1701 TFs from PlantTFDB 2.0 and retrieved 4663 TF-associated interactions. These were manually filtered (e.g. FPs, PPI interactions removed). They then added some from other sources. Final result is 1431 confirmed TF regulatory interactions, of which 637 are TF-TF. + + - const: AT-UU-RP-03-AA-01 + title: Regulation-Plantregmap + description: This network contains computationally predicted TF-Target relationships based on motifs, binding sites, and ChIP-seq data. + + - const: AT-UU-DU-67-AA-01 + title: CoEvolution-DUO + description: GeneA connects to GeneB if a SNP in GeneA is correlated with a SNP in GeneB using the DUO metric (https://doi.org/10.1101/2020.01.28.923730). SNP data is from the full 1001 Genomes. + + - const: AT-UU-CD-00-AA-01 + title: CoDomain + description: GeneA connects to GeneB if they share one or more common protein domains. Network was obtained from AraNet2. + + - const: AT-UU-RX-00-AA-01 + title: Metabolic-AraCyc + description: GeneA connects to GeneB if they are both enzymatic and are linked by a common substrate or product. E.g. RXNA (GeneA) → Compound1 → RXNB (GeneB). Here GeneA connects to GeneB due to Compound1. + + - const: AT-UU-PY-01-LF-01 + title: Predictive CG Methylation + description: GeneA connects to GeneB if the CG methylation vector of GeneA is an important predictor of the CG methylation vector of GeneB in an iRF model, where all other genes' CG methylation states are included as covariates. The iRF model is an extension of Random Forest, a feature-selection model. 
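Because `edge_type.yaml` is itself a draft-07 schema (a `oneOf` over `const` values), edge types can be checked directly against this controlled vocabulary. A minimal sketch, assuming PyYAML and the `jsonschema` package:

```python
import yaml
import jsonschema

# The whole file is a schema; jsonschema ignores the extra name/title keys.
with open("spec/datasets/djornl/edge_type.yaml") as f:
    edge_type_schema = yaml.safe_load(f)

jsonschema.validate("phenotype-association_AraGWAS", edge_type_schema)  # passes

try:
    jsonschema.validate("not-a-real-edge-type", edge_type_schema)
except jsonschema.ValidationError:
    print("rejected: value matches none of the oneOf consts")
```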
diff --git a/spec/datasets/djornl/edge_types_filter.yaml b/spec/datasets/djornl/edge_types_filter.yaml new file mode 100644 index 00000000..dae0bd71 --- /dev/null +++ b/spec/datasets/djornl/edge_types_filter.yaml @@ -0,0 +1,13 @@ +$schema: "http://json-schema.org/draft-07/schema#" +name: edge_types_filter +title: Edge Types +description: Edge types to filter on +type: array +items: + $ref: edge_type.yaml +default: [] +uniqueItems: true +examples: + - ['protein-protein-interaction_high-throughput_AraNet_v2', 'protein-protein-interaction_literature-curated_AraNet_v2'] + - ['phenotype-association_AraGWAS'] + - [] diff --git a/spec/datasets/djornl/manifest.schema.json b/spec/datasets/djornl/manifest.schema.json new file mode 100644 index 00000000..beb7309f --- /dev/null +++ b/spec/datasets/djornl/manifest.schema.json @@ -0,0 +1,102 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Exascale parser file manifest", + "type": "object", + "required": ["name", "file_list"], + "properties": { + "name": { + "title": "Dataset name", + "type": "string", + "description": "The name of the dataset", + "examples": ["Dan Jacobson Exascale dataset"] + }, + "release_date": { + "title": "Release date", + "type": "string", + "description": "Date of the dataset release, in YYYY-MM-DD format", + "format": "date" + }, + "notes": { + "type": "string", + "title": "Release notes", + "description": "Free text describing the release and any notes, or comments relevant to consumers of the data." + }, + "file_list": { + "type": "array", + "items": { + "type": "object", + "required": ["data_type", "path"], + "oneOf": [{ + "properties": { + "data_type": { + "enum": ["cluster"] + } + }, + "required": ["cluster_prefix"] + }, + { + "properties": { + "data_type": { + "enum": ["node", "edge"] + } + } + } + ], + "anyOf": [{ + "properties": { + "file_format": { + "enum": ["tsv", "csv"] + } + }, + "required": ["file_format"] + }, + { + "properties": { + "path": { + "format": "regex", + "pattern": ".[ct]sv" + } + }, + "required": ["path"] + } + ], + "properties": { + "path": { + "title": "File path", + "type": "string" + }, + "data_type": { + "title": "Data type", + "type": "string", + "enum": ["node", "edge", "cluster"] + }, + "file_format": { + "title": "File format", + "type": "string", + "enum": ["tsv", "csv"] + }, + "date": { + "title": "File creation date", + "description": "date of file creation in the format YYYY-MM-DD", + "type": "string", + "format": "date" + }, + "description": { + "title": "Description of the file contents", + "type": "string" + }, + "cluster_prefix": { + "title": "Prefix", + "type": "string", + "description": "The prefix to be used for clusters, e.g. markov_i2:4. Required for cluster data, not used for node or edge data" + }, + "cluster_title": { + "title": "Cluster set name", + "description": "Human-readable name of the cluster set. 
Not used for edge or node data", "type": "string" + } + } + } + } +} diff --git a/spec/datasets/djornl/node_type.yaml b/spec/datasets/djornl/node_type.yaml new file mode 100644 index 00000000..f81f0a96 --- /dev/null +++ b/spec/datasets/djornl/node_type.yaml @@ -0,0 +1,13 @@ +$schema: "http://json-schema.org/draft-07/schema#" +name: node_type +title: Node Type +description: Node types in the Dan Jacobson Exascale dataset +type: string +examples: + - gene + - pheno +oneOf: + - const: gene + title: Gene + - const: pheno + title: Phenotype diff --git a/spec/datasets/djornl/tsv_node.yaml b/spec/datasets/djornl/tsv_node.yaml new file mode 100644 index 00000000..35d6b4ad --- /dev/null +++ b/spec/datasets/djornl/tsv_node.yaml @@ -0,0 +1,45 @@ +$schema: http://json-schema.org/draft-07/schema# +name: tsv_node +title: TSV node file syntax +description: Jacobson lab Arabidopsis gene and phenotype data file columns +type: object +required: [gid] +additionalProperties: false +properties: + defline: + examples: ["NAC domain containing protein 1"] + title: Defline + type: string + gid: + examples: ["AT1G01010", "AT4G09995", "AT4G03060-CVI", "SU(RGN)"] + format: regex + pattern: ^(AT[0-9CM][0-9G]+|[- ().0-9A-Z]{,13})$ + title: Gene ID + type: string + go: + pattern: ^((NA|GO:\d{7})\|?)+$ + type: string + godesc: + pattern: ^((NA|[- ()+,'./0-9>:A-z_])\|?)+$ + type: string + ko_effect: + pattern: ^((NA|[- %,()/0-9:A-z])\|?)+$ + type: string + mapman_code: + pattern: ^((NA|[0-9]{,2}\.?))+$ + type: string + mapman_desc: + type: string + mapman_name: + type: string + names: + pattern: !!str "^\\|?(([\ - \\#&\"'()+,./0-9:;?A-Z\\[\\\\\\]_a-z{}\ \xa0\xad\xb3\xc2\xc3\xdf\xf3\ ])\\|?)*$" + type: string + symbols: + examples: ["ANAC001 | NAC001 | NTL10", "NA"] + format: regex + pattern: ^((NA|[- \#&'()*,./0-9:;A-Z\[\]_a-z{}])\|?)+$ + type: string diff --git a/spec/stored_queries/GO/GO_get_ancestors.yaml b/spec/stored_queries/GO/GO_get_ancestors.yaml new file mode 100644 index 00000000..c29f746a --- /dev/null +++ b/spec/stored_queries/GO/GO_get_ancestors.yaml @@ -0,0 +1,37 @@ +# Get all ancestors (all parents' parents) for this term + +name: GO_get_ancestors +params: + type: object + required: [id, ts] + properties: + id: + type: string + title: Document ID + description: GO id of the term you want to get all the ancestors of + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp +query_prefix: WITH GO_terms +query: | + FOR t in GO_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v, e, p IN 1..100 OUTBOUND t GO_edges + FILTER p.edges[*].created ALL <= @ts + AND p.edges[*].expired ALL >= @ts + AND p.edges[*].type ALL == "is_a" + SORT v.id ASC + LIMIT @offset, @limit + RETURN {term: v, edge: e} diff --git a/spec/stored_queries/GO/GO_get_associated_ws_features.yaml b/spec/stored_queries/GO/GO_get_associated_ws_features.yaml new file mode 100644 index 00000000..34f93aea --- /dev/null +++ b/spec/stored_queries/GO/GO_get_associated_ws_features.yaml @@ -0,0 +1,53 @@ +# Get the associated ws features of this term + +name: GO_get_associated_ws_features +params: + type: object + required: [id, ts] + properties: + id: + type: string + title: Document ID + description: GO id of the term you want to get the associated ws features of + obj_ref: + type: string + title: Workspace versioned object reference + 
limit: + type: integer + title: Maximum result limit + default: 20 + maximum: 1000 + offset: + type: integer + title: Result offset for pagination + default: 0 + maximum: 100000 + ts: + type: integer + title: Versioning timestamp +query_prefix: WITH ws_genome_features, ws_object_version +query: | + LET obj_ref_null=IS_NULL(@obj_ref) OR LENGTH(@obj_ref) == 0 + LET results=( + FOR t in GO_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v, e, p IN 2 INBOUND t ws_feature_has_GO_annotation, ws_genome_has_feature + FILTER p.edges[0].created <= @ts AND p.edges[0].expired >= @ts + FILTER v.is_public OR v.workspace_id IN ws_ids + FILTER obj_ref_null OR v._key == @obj_ref + SORT v.workspace_id ASC, p.vertices[1].feature_id ASC + RETURN DISTINCT { + ws_obj: KEEP(v, ['workspace_id', 'object_id', 'version', 'name']), + feature: KEEP(p.vertices[1], ['feature_id', 'updated_at']) + } + ) + LET total_count=COUNT(results) + LET limited=( + FOR r in results + LIMIT @offset, @limit + COLLECT ws_obj=r.ws_obj INTO features=r.feature + RETURN {ws_obj, features} + ) + RETURN {results: limited, total_count} diff --git a/spec/stored_queries/GO/GO_get_associated_ws_genomes.yaml b/spec/stored_queries/GO/GO_get_associated_ws_genomes.yaml new file mode 100644 index 00000000..195f53b3 --- /dev/null +++ b/spec/stored_queries/GO/GO_get_associated_ws_genomes.yaml @@ -0,0 +1,48 @@ +# Get the associated ws objects of this term + +name: GO_get_associated_ws_genomes +params: + type: object + required: [id, ts] + properties: + id: + type: string + title: Document ID + description: GO id of the term you want to get the ws object of + limit: + type: integer + title: Maximum result limit + default: 20 + maximum: 1000 + offset: + type: integer + title: Result offset for pagination + default: 0 + maximum: 100000 + ts: + type: integer + title: Versioning timestamp +query_prefix: WITH ws_genome_features, ws_object_version +query: | + LET results=( + FOR t in GO_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + LIMIT 1 + FOR v, e, p IN 2 INBOUND t ws_feature_has_GO_annotation, ws_genome_has_feature + FILTER p.edges[0].created <= @ts AND p.edges[0].expired >= @ts + FILTER v.is_public OR v.workspace_id IN ws_ids + SORT v.workspace_id ASC, p.vertices[1].feature_id ASC + COLLECT ws_obj = v WITH COUNT INTO feature_count + RETURN DISTINCT { + ws_obj: KEEP(ws_obj, ['workspace_id', 'object_id', 'version', 'name']), + feature_count + } + ) + LET total_count=COUNT(results) + LET limited=( + FOR r in results + LIMIT @offset, @limit + RETURN r + ) + RETURN {results: limited, total_count} diff --git a/spec/stored_queries/GO/GO_get_children.yaml b/spec/stored_queries/GO/GO_get_children.yaml new file mode 100644 index 00000000..93720e80 --- /dev/null +++ b/spec/stored_queries/GO/GO_get_children.yaml @@ -0,0 +1,36 @@ +# Get the children of this term + +name: GO_get_children +params: + type: object + required: [id, ts] + properties: + id: + type: string + title: Document ID + description: GO id of the term you want to get the children of + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp +query_prefix: WITH GO_terms +query: | + FOR t in GO_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v, e IN 1..1 INBOUND t GO_edges + FILTER e.created <= @ts AND 
e.expired >= @ts + FILTER e.type == "is_a" + SORT v.id ASC + LIMIT @offset, @limit + RETURN {term: v, edge: e} diff --git a/spec/stored_queries/GO/GO_get_descendants.yaml b/spec/stored_queries/GO/GO_get_descendants.yaml new file mode 100644 index 00000000..6d36a174 --- /dev/null +++ b/spec/stored_queries/GO/GO_get_descendants.yaml @@ -0,0 +1,36 @@ +# Get all descendants of this term +name: GO_get_descendants +params: + type: object + required: [id, ts] + properties: + id: + type: string + title: Document ID + description: GO id of the term you want to get all the descendants of + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp +query_prefix: WITH GO_terms +query: | + FOR t in GO_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v, e, p IN 1..100 INBOUND t GO_edges + FILTER p.edges[*].created ALL <= @ts + AND p.edges[*].expired ALL >= @ts + AND p.edges[*].type ALL == "is_a" + SORT v._key ASC + LIMIT @offset, @limit + RETURN {term: v, edge: e} diff --git a/spec/stored_queries/GO/GO_get_hierarchicalAncestors.yaml b/spec/stored_queries/GO/GO_get_hierarchicalAncestors.yaml new file mode 100644 index 00000000..26e0c33c --- /dev/null +++ b/spec/stored_queries/GO/GO_get_hierarchicalAncestors.yaml @@ -0,0 +1,40 @@ +# Get all hierarchical ancestor +# resources (all parents' parents) for this term. Hierarchical +# ancestors include is-a and other related parents, such as +# part-of/develops-from, that imply a hierarchical relationship + +name: GO_get_hierarchicalAncestors +params: + type: object + required: [id, ts] + properties: + id: + type: string + title: Document ID + description: GO id of the term you want to get all the hierarchical ancestors of + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp +query_prefix: WITH GO_terms +query: | + FOR t in GO_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v, e, p IN 1..100 OUTBOUND t GO_edges + FILTER p.edges[*].created ALL <= @ts + AND p.edges[*].expired ALL >= @ts + AND p.edges[*].type ALL != NULL + SORT v.id ASC + LIMIT @offset, @limit + RETURN {term: v, edge: e} diff --git a/spec/stored_queries/GO/GO_get_hierarchicalChildren.yaml b/spec/stored_queries/GO/GO_get_hierarchicalChildren.yaml new file mode 100644 index 00000000..af8f731c --- /dev/null +++ b/spec/stored_queries/GO/GO_get_hierarchicalChildren.yaml @@ -0,0 +1,35 @@ +# Get the direct hierarchical children for this term. 
Hierarchical children include is-a and other related children, such as part-of/develops-from, that imply a hierarchical relationship +name: GO_get_hierarchicalChildren +params: + type: object + required: [id, ts] + properties: + id: + type: string + title: Document ID + description: GO id of the term you want to get the direct hierarchical children of + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp +query_prefix: WITH GO_terms +query: | + FOR t in GO_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v, e IN 1..1 INBOUND t GO_edges + FILTER e.created <= @ts AND e.expired >= @ts + FILTER e.type != NULL + SORT v.id ASC + LIMIT @offset, @limit + RETURN {term: v, edge: e} diff --git a/spec/stored_queries/GO/GO_get_hierarchicalDescendants.yaml b/spec/stored_queries/GO/GO_get_hierarchicalDescendants.yaml new file mode 100644 index 00000000..fd2f555d --- /dev/null +++ b/spec/stored_queries/GO/GO_get_hierarchicalDescendants.yaml @@ -0,0 +1,39 @@ +# Get all hierarchical descendant +# resources for this term. Hierarchical descendants include is-a +# and other related children, such as part-of/develops-from, +# that imply a hierarchical relationship +name: GO_get_hierarchicalDescendants +params: + type: object + required: [id, ts] + properties: + id: + type: string + title: Document ID + description: GO id of the term you want to get all the hierarchical descendants of + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp +query_prefix: WITH GO_terms +query: | + FOR t in GO_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v, e, p IN 1..100 INBOUND t GO_edges + FILTER p.edges[*].created ALL <= @ts + AND p.edges[*].expired ALL >= @ts + AND p.edges[*].type ALL != NULL + SORT v.id ASC + LIMIT @offset, @limit + RETURN {term: v, edge: e} diff --git a/spec/stored_queries/GO/GO_get_hierarchicalParents.yaml b/spec/stored_queries/GO/GO_get_hierarchicalParents.yaml new file mode 100644 index 00000000..f8621c6f --- /dev/null +++ b/spec/stored_queries/GO/GO_get_hierarchicalParents.yaml @@ -0,0 +1,39 @@ +# Get the direct hierarchical parent +# resources for this term. Hierarchical parents include is-a and +# other related parents, such as part-of/develops-from, that imply +# a hierarchical relationship. 
+ +name: GO_get_hierarchicalParents +params: + type: object + required: [id, ts] + properties: + id: + type: string + title: Document ID + description: GO id of the term you want to get all the hierarchical parents of + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp +query_prefix: WITH GO_terms +query: | + FOR t in GO_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v, e IN 1..1 OUTBOUND t GO_edges + FILTER e.created <= @ts AND e.expired >= @ts + FILTER e.type != NULL + SORT v.id ASC + LIMIT @offset, @limit + RETURN {term: v, edge: e} diff --git a/spec/stored_queries/GO/GO_get_merges_from.yaml b/spec/stored_queries/GO/GO_get_merges_from.yaml new file mode 100644 index 00000000..84a65a4a --- /dev/null +++ b/spec/stored_queries/GO/GO_get_merges_from.yaml @@ -0,0 +1,24 @@ +# Get a set of 'replaced_by' merge edges for a set of terms by the term IDs, maximum 10000 + +name: GO_get_merges_from +params: + type: object + required: [froms] + properties: + froms: + type: array + items: + type: string + title: GO term IDs + description: The list of GO term IDs from which merge edges should emanate + maxItems: 10000 +# It'd be nice if there were a way to get the most recent edge for each _from, +# but that seems like something that's easy to do client-side and removes load from the db, if +# it's even possible. +# In any case, having more than one replaced_by edge per term is going to be extremely unlikely +query: | + FOR d IN GO_merges + FILTER d.from in @froms + FILTER d.type == 'replaced_by' + SORT d.from + RETURN d diff --git a/spec/stored_queries/GO/GO_get_metadata.yaml b/spec/stored_queries/GO/GO_get_metadata.yaml new file mode 100644 index 00000000..f09047d3 --- /dev/null +++ b/spec/stored_queries/GO/GO_get_metadata.yaml @@ -0,0 +1,21 @@ +# Get information/metadata of a particular ontology term (see spec for available fields) + +name: GO_get_metadata +params: + type: object + required: [id, ts] + properties: + id: + type: string + title: Document ID + description: GO id of the ontology term to fetch metadata for + ts: + type: integer + title: Versioning timestamp +query_prefix: WITH GO_terms +query: | + FOR t IN GO_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + RETURN t diff --git a/spec/stored_queries/GO/GO_get_parents.yaml b/spec/stored_queries/GO/GO_get_parents.yaml new file mode 100644 index 00000000..ec63c4eb --- /dev/null +++ b/spec/stored_queries/GO/GO_get_parents.yaml @@ -0,0 +1,36 @@ +# Get the direct parents for a specific term + +name: GO_get_parents +params: + type: object + required: [id, ts] + properties: + id: + type: string + title: Document ID + description: GO id of the term you want to get all the direct parents of + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp +query_prefix: WITH GO_terms +query: | + FOR t in GO_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v, e IN 1..1 OUTBOUND t GO_edges + FILTER e.created <= @ts AND e.expired >= @ts + FILTER e.type == "is_a" + SORT v.id ASC + LIMIT @offset, @limit + RETURN {term: v, edge: e} diff --git 
a/spec/stored_queries/GO/GO_get_related.yaml b/spec/stored_queries/GO/GO_get_related.yaml new file mode 100644 index 00000000..ceee04e4 --- /dev/null +++ b/spec/stored_queries/GO/GO_get_related.yaml @@ -0,0 +1,34 @@ +# Get all immediate related terms for this term +name: GO_get_related +params: + type: object + required: [id, ts] + properties: + id: + type: string + title: Document ID + description: GO id of the term you want to get all the directly related nodes of + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp +query_prefix: WITH GO_terms +query: | + FOR t in GO_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v, e IN 1 ANY t GO_edges + FILTER e.created <= @ts AND e.expired >= @ts + SORT v.id ASC + LIMIT @offset, @limit + RETURN {term: v, edge: e} diff --git a/spec/stored_queries/GO/GO_get_siblings.yaml b/spec/stored_queries/GO/GO_get_siblings.yaml new file mode 100644 index 00000000..db1cf541 --- /dev/null +++ b/spec/stored_queries/GO/GO_get_siblings.yaml @@ -0,0 +1,39 @@ +# Get all siblings of this term +name: GO_get_siblings +params: + type: object + required: [id, ts] + properties: + id: + type: string + title: Document ID + description: GO id of the term you want to get all the siblings of + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp +query_prefix: WITH GO_terms +query: | + FOR t in GO_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v_parent, e_parent IN 1..1 OUTBOUND t GO_edges + FILTER e_parent.created <= @ts AND e_parent.expired >= @ts + FILTER e_parent.type == "is_a" + FOR v_child, e_child in 1..1 INBOUND v_parent GO_edges + FILTER e_child.created <= @ts AND e_child.expired >= @ts + FILTER e_child.type == "is_a" + FILTER v_child != t + SORT v_child.id ASC + LIMIT @offset, @limit + RETURN v_child diff --git a/spec/stored_queries/GO/GO_get_terms.yaml b/spec/stored_queries/GO/GO_get_terms.yaml new file mode 100644 index 00000000..79efc1d0 --- /dev/null +++ b/spec/stored_queries/GO/GO_get_terms.yaml @@ -0,0 +1,22 @@ +# Get a set of terms by the term ID and a timestamp, maximum 10000 + +name: GO_get_terms +params: + type: object + required: [ids, ts] + properties: + ids: + type: array + items: + type: string + title: GO term IDs + description: The list of GO term IDs to be fetched + maxItems: 10000 + ts: + type: integer + title: Versioning timestamp in milliseconds since the Unix epoch +query: | + FOR d IN GO_terms + FILTER d.id in @ids + FILTER d.expired >= @ts AND d.created <= @ts + RETURN d diff --git a/spec/stored_queries/GO/GO_get_terms_from_ws_feature.yaml b/spec/stored_queries/GO/GO_get_terms_from_ws_feature.yaml new file mode 100644 index 00000000..531c23b6 --- /dev/null +++ b/spec/stored_queries/GO/GO_get_terms_from_ws_feature.yaml @@ -0,0 +1,47 @@ +# Get the terms from a feature + +name: GO_get_terms_from_ws_feature +params: + type: object + required: [feature_id, ts] + properties: + feature_id: + type: string + title: Workspace feature id + limit: + type: integer + title: Maximum result limit + default: 20 + maximum: 1000 + offset: + type: integer + title: Result offset for pagination + default: 0 + maximum: 100000 + ts: + 
type: integer + title: Versioning timestamp +query_prefix: WITH ws_object_version, GO_terms +query: | + LET go_term_results=( + FOR f in ws_genome_features + FILTER f._key == @feature_id + LIMIT 1 + FOR o, oe, op IN 1 INBOUND f ws_genome_has_feature + FILTER o.is_public OR o.workspace_id IN ws_ids + LIMIT 1 + FOR t, te, tp IN 1 OUTBOUND op.vertices[0] ws_feature_has_GO_annotation + FILTER te.created <= @ts AND te.expired >= @ts + FILTER t.created <= @ts AND t.expired >= @ts + LIMIT @offset, @limit + RETURN DISTINCT { + term: KEEP(t, 'id', 'name', 'namespace', 'alt_ids', 'def', 'comments', 'synonyms', 'xrefs', 'created', 'expired'), + feature: KEEP(f, ['feature_id', 'updated_at', 'workspace_id', 'object_id', 'version']) + } + ) + FOR r IN go_term_results + COLLECT feature=r.feature INTO terms=r.term + RETURN { + feature: feature, + terms: terms + } diff --git a/spec/stored_queries/GO/GO_get_terms_from_ws_object.yaml b/spec/stored_queries/GO/GO_get_terms_from_ws_object.yaml new file mode 100644 index 00000000..ba2a40a7 --- /dev/null +++ b/spec/stored_queries/GO/GO_get_terms_from_ws_object.yaml @@ -0,0 +1,45 @@ +# Get the terms from a workspace object reference + +name: GO_get_terms_from_ws_object +params: + type: object + required: [obj_ref, ts] + properties: + obj_ref: + type: string + title: Workspace versioned object reference + limit: + type: integer + title: Maximum result limit + default: 20 + maximum: 1000 + offset: + type: integer + title: Result offset for pagination + default: 0 + maximum: 100000 + ts: + type: integer + title: Versioning timestamp +query_prefix: WITH ws_genome_features, GO_terms +query: | + LET results=( + FOR o in ws_object_version + FILTER o._key == @obj_ref + FILTER o.is_public OR o.workspace_id IN ws_ids + LIMIT 1 + FOR v, e, p IN 2 OUTBOUND o ws_genome_has_feature, ws_feature_has_GO_annotation + FILTER p.edges[1].created <= @ts AND p.edges[1].expired >= @ts + FILTER v.created <= @ts AND v.expired >= @ts + LIMIT @offset, @limit + RETURN DISTINCT { + term: KEEP(v, 'id', 'name', 'namespace', 'alt_ids', 'def', 'comments', 'synonyms', 'xrefs', 'created', 'expired'), + feature: KEEP(p.vertices[1], ['feature_id', 'updated_at', 'workspace_id', 'object_id', 'version']) + } + ) + FOR r IN results + COLLECT feature=r.feature INTO terms=r.term + RETURN { + feature: feature, + terms: terms + } diff --git a/spec/stored_queries/README.md b/spec/stored_queries/README.md new file mode 100644 index 00000000..128a1b66 --- /dev/null +++ b/spec/stored_queries/README.md @@ -0,0 +1,15 @@ +# Relation Engine Stored Queries + +Stored queries are templated AQL queries that fetch data from the database. + +Variables in stored queries are prefixed with `@`. + +The params field in each stored query should be a JSON schema of the query's parameters. + +## Required format + +Each stored query file should have a set of comments at the top describing the purpose of the query. + +## Using stored queries from the API + +See the [API docs](https://github.com/kbase/relation_engine_api) for how to run these queries via the API. diff --git a/spec/stored_queries/djornl/djornl_fetch_all.yaml b/spec/stored_queries/djornl/djornl_fetch_all.yaml new file mode 100644 index 00000000..b190bb6b --- /dev/null +++ b/spec/stored_queries/djornl/djornl_fetch_all.yaml @@ -0,0 +1,29 @@ +name: djornl_fetch_all +description: Fetch all node and edge data, optionally filtering on edge type. 
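+# Example params (illustrative; values taken from the examples below). An empty +# edge_types array, the default, returns every edge: +#   {"edge_types": ["phenotype-association_AraGWAS"]} +# The response is a single {nodes, edges} document. 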
+params: + type: object + additionalProperties: false + properties: + edge_types: + title: Edge Types + description: Edge types to filter on + type: array + items: + $ref: ../../datasets/djornl/edge_type.yaml + default: [] + uniqueItems: true + examples: + - ['protein-protein-interaction_high-throughput_AraNet_v2', 'protein-protein-interaction_literature-curated_AraNet_v2'] + - ['phenotype-association_AraGWAS'] + - [] +query: | + LET nodes = ( + FOR v IN djornl_node + RETURN v + ) + LET edges = ( + FOR e IN djornl_edge + FILTER length(@edge_types) == 0 || e.edge_type IN @edge_types + RETURN e + ) + RETURN {nodes, edges} diff --git a/spec/stored_queries/djornl/djornl_fetch_clusters.yaml b/spec/stored_queries/djornl/djornl_fetch_clusters.yaml new file mode 100644 index 00000000..9442767d --- /dev/null +++ b/spec/stored_queries/djornl/djornl_fetch_clusters.yaml @@ -0,0 +1,58 @@ +name: djornl_fetch_clusters +description: Fetch all nodes that are members of the specified cluster(s), and the edges and nodes within the specified distance (number of hops) of those nodes. +params: + type: object + additionalProperties: false + required: [cluster_ids] + properties: + cluster_ids: + type: array + title: Cluster IDs + description: Fetch nodes by cluster ID, in the form "clustering_system_name:cluster_id" + items: + $ref: ../../datasets/djornl/definitions.yaml#/definitions/cluster_id + minItems: 1 + uniqueItems: true + examples: + - ['markov_i2:5', 'markov_i6:2'] + - ['markov_i6:1'] + distance: + type: integer + title: Traversal Distance + description: Number of hops to find neighbors and neighbors-of-neighbors + default: 0 + minimum: 0 + maximum: 100 + edge_types: + title: Edge Types + description: Edge types to filter on + type: array + items: + $ref: ../../datasets/djornl/edge_type.yaml + default: [] + uniqueItems: true + examples: + - ['protein-protein-interaction_high-throughput_AraNet_v2', 'protein-protein-interaction_literature-curated_AraNet_v2'] + - ['phenotype-association_AraGWAS'] + - [] +query: | + LET node_ids = ( + FOR n IN djornl_node + FILTER n.clusters ANY IN @cluster_ids + FOR v, e, p IN 0..@distance ANY n djornl_edge + OPTIONS {bfs: true, uniqueVertices: "global"} + FILTER length(@edge_types) == 0 || p.edges[*].edge_type ALL IN @edge_types + RETURN DISTINCT v._id + ) + LET edges = ( + FOR edge IN djornl_edge + FILTER edge._from IN node_ids AND edge._to IN node_ids + AND (length(@edge_types) == 0 || edge.edge_type IN @edge_types) + RETURN edge + ) + LET nodes = ( + FOR node IN djornl_node + FILTER node._id IN node_ids + RETURN node + ) + RETURN {nodes, edges} diff --git a/spec/stored_queries/djornl/djornl_fetch_genes.yaml b/spec/stored_queries/djornl/djornl_fetch_genes.yaml new file mode 100644 index 00000000..5f4ddb0e --- /dev/null +++ b/spec/stored_queries/djornl/djornl_fetch_genes.yaml @@ -0,0 +1,58 @@ +name: djornl_fetch_genes +description: Fetch a gene or list of genes by key, and the edges and nodes within the specified distance (number of hops) of those genes. 
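+# Example params (illustrative; gene keys come from the examples below): +#   {"gene_keys": ["AT1G01010"], "distance": 1} +# returns the matching gene node plus all nodes and edges within one hop. 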
+params: + type: object + additionalProperties: false + required: [gene_keys] + properties: + gene_keys: + type: array + title: Gene Keys + description: Fetch a gene or list of genes by ID + items: + $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/_key + minItems: 1 + uniqueItems: true + examples: + - ["AT1G01020","AT1G01070"] + - ["AT1G01010"] + distance: + type: integer + title: Traversal Distance + description: Number of hops to find neighbors and neighbors-of-neighbors + default: 0 + minimum: 0 + maximum: 100 + edge_types: + title: Edge Types + description: Edge types to filter on + type: array + items: + $ref: ../../datasets/djornl/edge_type.yaml + default: [] + uniqueItems: true + examples: + - ['protein-protein-interaction_high-throughput_AraNet_v2', 'protein-protein-interaction_literature-curated_AraNet_v2'] + - ['phenotype-association_AraGWAS'] + - [] +query: | + LET node_ids = ( + FOR n IN djornl_node + FILTER n._key IN @gene_keys AND n.node_type == 'gene' + FOR v, e, p IN 0..@distance ANY n djornl_edge + OPTIONS {bfs: true, uniqueVertices: "global"} + FILTER length(@edge_types) == 0 || p.edges[*].edge_type ALL IN @edge_types + RETURN DISTINCT v._id + ) + LET edges = ( + FOR edge IN djornl_edge + FILTER edge._from IN node_ids AND edge._to IN node_ids + AND (length(@edge_types) == 0 || edge.edge_type IN @edge_types) + RETURN edge + ) + LET nodes = ( + FOR node IN djornl_node + FILTER node._id IN node_ids + RETURN node + ) + RETURN {nodes, edges} diff --git a/spec/stored_queries/djornl/djornl_fetch_phenotypes.yaml b/spec/stored_queries/djornl/djornl_fetch_phenotypes.yaml new file mode 100644 index 00000000..13c79fed --- /dev/null +++ b/spec/stored_queries/djornl/djornl_fetch_phenotypes.yaml @@ -0,0 +1,58 @@ +name: djornl_fetch_phenotypes +description: Fetch a phenotype or list of phenotypes by key, and the edges and nodes within the specified distance (number of hops) of those phenotype nodes. 
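+# Example params (illustrative): +#   {"phenotype_keys": ["As2"], "distance": 1, "edge_types": ["phenotype-association_AraGWAS"]} +# returns the matching phenotype node and its one-hop neighborhood over the given edge type. 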
+params: + type: object + additionalProperties: false + required: [phenotype_keys] + properties: + phenotype_keys: + type: array + title: Phenotype Keys + description: Fetch a phenotype or list of phenotypes by ID + items: + $ref: ../../datasets/djornl/definitions.yaml#/definitions/djornl_node/_key + minItems: 1 + uniqueItems: true + examples: + - ["As2", "Na23"] + - ["As2"] + distance: + type: integer + title: Traversal Distance + description: Number of hops to find neighbors and neighbors-of-neighbors + default: 0 + minimum: 0 + maximum: 100 + edge_types: + title: Edge Types + description: Edge types to filter on + type: array + items: + $ref: ../../datasets/djornl/edge_type.yaml + default: [] + uniqueItems: true + examples: + - ['protein-protein-interaction_high-throughput_AraNet_v2', 'protein-protein-interaction_literature-curated_AraNet_v2'] + - ['phenotype-association_AraGWAS'] + - [] +query: | + LET node_ids = ( + FOR n IN djornl_node + FILTER n._key IN @phenotype_keys AND n.node_type == 'pheno' + FOR v, e, p IN 0..@distance ANY n djornl_edge + OPTIONS {bfs: true, uniqueVertices: "global"} + FILTER length(@edge_types) == 0 || p.edges[*].edge_type ALL IN @edge_types + RETURN DISTINCT v._id + ) + LET edges = ( + FOR edge IN djornl_edge + FILTER edge._from IN node_ids AND edge._to IN node_ids + AND (length(@edge_types) == 0 || edge.edge_type IN @edge_types) + RETURN edge + ) + LET nodes = ( + FOR node IN djornl_node + FILTER node._id IN node_ids + RETURN node + ) + RETURN {nodes, edges} diff --git a/spec/stored_queries/djornl/djornl_search_nodes.yaml b/spec/stored_queries/djornl/djornl_search_nodes.yaml new file mode 100644 index 00000000..7e377848 --- /dev/null +++ b/spec/stored_queries/djornl/djornl_search_nodes.yaml @@ -0,0 +1,68 @@ +name: djornl_search_nodes +description: Search for nodes using a simple fuzzy search on node metadata; return the matching nodes, and the edges and nodes within the specified distance (number of hops) of those nodes. 
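+# Example params (illustrative): +#   {"search_text": "organelle machinery", "distance": 1} +# The search is a phrase match against the node metadata fields listed in the query below. 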
+params: + type: object + additionalProperties: false + required: [search_text] + properties: + search_text: + type: string + title: Search text + description: Search nodes and their metadata for the search string + examples: + - 'GO:0005515' + - 'organelle machinery' + distance: + type: integer + title: Traversal Distance + description: Number of hops to find neighbors and neighbors-of-neighbors + default: 0 + minimum: 0 + maximum: 100 + edge_types: + title: Edge Types + description: Edge types to filter on + type: array + items: + $ref: ../../datasets/djornl/edge_type.yaml + default: [] + uniqueItems: true + examples: + - ['protein-protein-interaction_high-throughput_AraNet_v2', 'protein-protein-interaction_literature-curated_AraNet_v2'] + - ['phenotype-association_AraGWAS'] + - [] +query: | + LET node_ids = ( + FOR g IN djornl_node_view + SEARCH ANALYZER( + PHRASE(g.tair_computational_description, @search_text) OR + PHRASE(g.tair_short_description, @search_text) OR + PHRASE(g.mapman_description, @search_text) OR + PHRASE(g.go_description, @search_text) OR + PHRASE(g.mapman_name, @search_text) OR + PHRASE(g.pheno_description, @search_text) OR + PHRASE(g.pheno_pto_name, @search_text) OR + PHRASE(g.pheno_pto_description, @search_text) OR + PHRASE(g.user_notes, @search_text), + 'text_en' + ) + OR PHRASE(g.go_terms, @search_text) + OR PHRASE(g.transcript, @search_text) + FOR v, e, p IN 0..@distance ANY g djornl_edge + OPTIONS {bfs: true, uniqueVertices: "global"} + FILTER length(@edge_types) == 0 || p.edges[*].edge_type ALL IN @edge_types + RETURN DISTINCT v._id + ) + LET edges = ( + FOR edge IN djornl_edge + FILTER edge._from IN node_ids AND edge._to IN node_ids + AND (length(@edge_types) == 0 || edge.edge_type IN @edge_types) + RETURN edge + ) + LET nodes = ( + FOR node IN djornl_node + FILTER node._id IN node_ids + RETURN node + ) + RETURN {nodes, edges} diff --git a/spec/stored_queries/generic/fulltext_search.yaml b/spec/stored_queries/generic/fulltext_search.yaml new file mode 100644 index 00000000..6859add4 --- /dev/null +++ b/spec/stored_queries/generic/fulltext_search.yaml @@ -0,0 +1,94 @@ +# Should be REVISED or DEPRECATED. +# Is currently unused outside testing. +# +# Search a collection that has a fulltext index, given an attribute name and search text +# Also supports filtering by outer-level attributes +# Not recommended for fast searching because it can be very slow and can even time out at 60s +name: fulltext_search +params: + type: object + required: ["@coll", search_attrkey, search_text] + additionalProperties: false + properties: + "@coll": + type: string + title: Collection name + examples: [ncbi_taxon, gtdb_taxon] + search_attrkey: + type: string + title: Search attribute key + examples: [scientific_name, name] + search_text: + type: string + title: Search text + examples: [escherichia, es] + description: Text to search on the search attribute values + ts: + type: [integer, "null"] + title: Versioning timestamp + default: null + filter_attr_expr: + type: [array, "null"] + title: Filter by document attribute equality + items: + type: object + maxItems: 50 + examples: [ [{"rank": "species"}, {"rank": "strain"}, {"strain": true}], [{"rank": "species", "strain": false}] ] + default: null + description: | + An array of single-level objects. + A document matches an item object when it contains all of that object's attribute key-value pairs. 
A document is included in the results when it matches any item object in the array. + This works like a boolean sum of products: each key-value pair is a boolean value, each item object is a product term, and the array is the sum of those terms. + Null or empty arrays have no filtering effect. + offset: + type: [integer, "null"] + title: Paging offset + maximum: 100000 + default: 0 + limit: + type: [integer, "null"] + title: Max results to return + default: 20 + maximum: 1000 + select: + type: [string, array, "null"] + items: + type: string + examples: [scientific_name, [scientific_name, id]] + default: null + description: Document attributes to keep in the results +query: | + LET search_text__norm = REGEX_REPLACE(LOWER(TRIM(@search_text)), "\\s+", " ") + LET search_text__first_exact_tok = REGEX_SPLIT(search_text__norm, " ")[0] + LET search_text__icu_toks = TOKENS(@search_text, "icu_tokenize") /* db analyzer icu_tokenize */ + LET search_text__wordboundmod_icu_toks = ( + FOR tok IN search_text__icu_toks + RETURN REGEX_REPLACE(tok, ",.*", "") /* commas cannot be escaped */ + ) + LET search_text__fulltext = CONCAT_SEPARATOR(", ", + FOR tok IN search_text__wordboundmod_icu_toks + RETURN CONCAT("prefix:", tok) + ) + LET filter_attr_expr = @filter_attr_expr ? @filter_attr_expr : [] /* null to [] */ + LET search_text__wildcard = CONCAT("%", CONCAT_SEPARATOR("%", search_text__icu_toks), "%") /* e.g., %tok0%tok1%tokn% */ + FOR doc IN FULLTEXT(@@coll, @search_attrkey, search_text__fulltext) + FILTER @ts ? doc.created <= @ts AND doc.expired >= @ts : true + /* keep doc if any obj in filter_attr_expr is a sub-obj of doc */ + FILTER LENGTH(filter_attr_expr) > 0 ? ( + FOR term IN filter_attr_expr + RETURN MATCHES(doc, term) + ) ANY == true : true + LET attrval__norm = REGEX_REPLACE(LOWER(TRIM(doc.@search_attrkey)), "\\s+", " ") + LET attrval__icu_toks = TOKENS(doc.@search_attrkey, "icu_tokenize") + SORT LIKE(doc.@search_attrkey, search_text__wildcard, true) DESC, /* icu tok ordering */ + /* TODO - icu tok ordering with no insertions? */ + CONTAINS(attrval__icu_toks[0], search_text__icu_toks[0], true) == 0 DESC, /* first icu tok */ + CONTAINS(attrval__norm, search_text__first_exact_tok, true) == 0 DESC, /* first exact tok */ + CONTAINS(attrval__norm, search_text__norm, true) == 0 DESC, /* exact match */ + doc.@search_attrkey /* lexical */ + LIMIT @offset ? @offset : 0, @limit ? @limit : 20 + RETURN @select ? KEEP(doc, @select) : doc diff --git a/spec/stored_queries/list_genes_for_similar_reactions.yaml b/spec/stored_queries/list_genes_for_similar_reactions.yaml new file mode 100644 index 00000000..76d9d00b --- /dev/null +++ b/spec/stored_queries/list_genes_for_similar_reactions.yaml @@ -0,0 +1,52 @@ +# Return genes associated with reactions similar to a query reaction + +name: list_genes_for_similar_reactions + +params: + type: object + required: [sf_sim, df_sim, rid] + properties: + rid: + type: string + title: Reaction id (rxn_reaction vertex id) + sf_sim: + type: number + title: Minimum structural fingerprint similarity score + df_sim: + type: number + title: Minimum difference fingerprint similarity score + exclude_self: + type: boolean + description: If true, don't include the query reaction's genes + default: false + +query_prefix: WITH rxn_reaction +query: | + LET start = @exclude_self ? 
1 : 0 + LET rxns = ( + FOR v, e IN start..1 + ANY @rid rxn_similar_to_reaction + OPTIONS {uniqueVertices: "global", bfs: true} + FILTER !e || e.sf_similarity >= @sf_sim + FILTER !e || e.df_similarity >= @df_sim + RETURN {id: v._id, key: v._key, name: v.name, definition: v.definition, "structural similarity": e.sf_similarity, "difference similarity": e.df_similarity} + ) + LET rxn_ids = rxns[*].id + LET rxn_gene_links = ( + FOR e in rxn_reaction_within_complex + FILTER e._from in rxn_ids + LET linked_gene_ids = FLATTEN( + FOR c in rxn_gene_complex + FILTER c._id == e._to + RETURN c.genes + ) + COLLECT rxn_id = e._from INTO groups KEEP linked_gene_ids + RETURN {rxn_id: rxn_id, linked_gene_ids: UNIQUE(FLATTEN(groups[*].linked_gene_ids))} + ) + LET gene_ids = UNIQUE(FLATTEN(rxn_gene_links[*].linked_gene_ids)) + LET genes = ( + FOR g in ncbi_gene + FILTER g._key IN gene_ids + RETURN {key: g._key, product: g.product, function: CONCAT_SEPARATOR(', ', g.functions), sequence: g.protein_translation} + ) + RETURN {rxns: rxns, rxn_gene_links: rxn_gene_links, genes: genes} diff --git a/spec/stored_queries/ncbi_tax/ncbi_fetch_taxon.yaml b/spec/stored_queries/ncbi_tax/ncbi_fetch_taxon.yaml new file mode 100644 index 00000000..3a9c4170 --- /dev/null +++ b/spec/stored_queries/ncbi_tax/ncbi_fetch_taxon.yaml @@ -0,0 +1,18 @@ +# Fetch a taxon document by taxonomy ID +name: ncbi_fetch_taxon +params: + type: object + required: [id, ts] + properties: + id: + type: string + title: NCBI Taxonomy ID + ts: + type: integer + title: Versioning timestamp +query: | + for t in ncbi_taxon + filter t.id == @id + filter t.created <= @ts AND t.expired >= @ts + limit 1 + return t diff --git a/spec/stored_queries/ncbi_tax/ncbi_fetch_taxon_by_sciname.yaml b/spec/stored_queries/ncbi_tax/ncbi_fetch_taxon_by_sciname.yaml new file mode 100644 index 00000000..647495fd --- /dev/null +++ b/spec/stored_queries/ncbi_tax/ncbi_fetch_taxon_by_sciname.yaml @@ -0,0 +1,18 @@ +# Fetch a taxon document by exact match on sciname +name: ncbi_fetch_taxon_by_sciname +params: + type: object + required: [sciname, ts] + properties: + sciname: + type: string + title: NCBI scientific name + ts: + type: integer + title: Versioning timestamp +query: | + for t in ncbi_taxon + filter t.scientific_name == @sciname + filter t.created <= @ts AND t.expired >= @ts + limit 1 + return t diff --git a/spec/stored_queries/ncbi_tax/ncbi_taxon_get_associated_ws_objects.yaml b/spec/stored_queries/ncbi_tax/ncbi_taxon_get_associated_ws_objects.yaml new file mode 100644 index 00000000..5360d49d --- /dev/null +++ b/spec/stored_queries/ncbi_tax/ncbi_taxon_get_associated_ws_objects.yaml @@ -0,0 +1,70 @@ +# Get the workspace objects associated with a taxon + +name: ncbi_taxon_get_associated_ws_objects + +params: + type: object + required: [taxon_id, ts] + properties: + taxon_id: + type: string + title: NCBI Taxon ID + description: ID of the taxon vertex for which to find associated workspace objects + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp + select_obj: + type: [array, "null"] + items: {type: string} + title: WS obj fields to keep in the results + default: null + select_edge: + type: [array, "null"] + items: {type: string} + description: Taxon edge fields to keep in the results + default: null +query_prefix: WITH ws_object_version, ws_type_version, ws_workspace +query: | + LET count = COUNT( + FOR 
tax IN ncbi_taxon + FILTER tax.id == @taxon_id + FILTER tax.created <= @ts AND tax.expired >= @ts + LIMIT 1 + FOR obj IN 1..1 INBOUND tax ws_obj_version_has_taxon + RETURN 1 + ) + LET results = ( + FOR tax IN ncbi_taxon + FILTER tax.id == @taxon_id + FILTER tax.created <= @ts AND tax.expired >= @ts + LIMIT 1 + FOR obj, e IN 1 INBOUND tax ws_obj_version_has_taxon + FILTER obj.is_public OR obj.workspace_id IN ws_ids + LIMIT @offset, @limit + LET type = first( + FOR type IN 1 OUTBOUND obj ws_obj_instance_of_type + RETURN KEEP(type, ['_key', 'module_name', 'type_name', 'maj_ver', 'min_ver']) + ) + LET unver_id = CONCAT("ws_object/", TO_STRING(obj.workspace_id), ':', TO_STRING(obj.object_id)) + LET ws_info = FIRST( + FOR ws IN 1 INBOUND unver_id ws_workspace_contains_obj + FILTER !ws.is_deleted + RETURN KEEP(ws, ['owner', 'metadata', 'is_public', 'mod_epoch']) + ) + LET o = MERGE(obj, {type, ws_info}) + RETURN { + ws_obj: @select_obj ? KEEP(o, @select_obj) : o, + edge: @select_edge ? KEEP(e, @select_edge) : e + } + ) + RETURN {results, total_count: count} diff --git a/spec/stored_queries/ncbi_tax/ncbi_taxon_get_children.yaml b/spec/stored_queries/ncbi_tax/ncbi_taxon_get_children.yaml new file mode 100644 index 00000000..eaf5b9cc --- /dev/null +++ b/spec/stored_queries/ncbi_tax/ncbi_taxon_get_children.yaml @@ -0,0 +1,57 @@ +# Get the array of direct descendants for any taxon +name: ncbi_taxon_get_children +params: + type: object + required: [id, ts] + properties: + id: + type: string + title: Document ID + description: ID of the taxon vertex for which you want to find descendants + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + search_text: + type: string + description: Search scientific name + default: '' + ts: + type: integer + title: Versioning timestamp + select: + type: [array, "null"] + items: {type: string} + description: Taxon fields to keep in the results + default: null +query: | + // Fetch the child IDs using the edge attributes + let child_ids = ( + for e in ncbi_child_of_taxon + filter e.to == @id + filter e.created <= @ts AND e.expired >= @ts + return e.from + ) + // Sort and filter the children + // Should only get evaluated if search_text is truthy + let searched = ( + for tax in FULLTEXT(ncbi_taxon, "scientific_name", @search_text) + filter tax.id in child_ids + return tax.id + ) + let filtered = @search_text ? searched : child_ids + let results = ( + for tax in ncbi_taxon + filter tax.id in filtered + filter tax.created <= @ts AND tax.expired >= @ts + sort tax.scientific_name asc + limit @offset, @limit + return (@select ? 
KEEP(tax, @select) : tax) + ) + return {total_count: COUNT(filtered), results: results} diff --git a/spec/stored_queries/ncbi_tax/ncbi_taxon_get_children_cursor.yaml b/spec/stored_queries/ncbi_tax/ncbi_taxon_get_children_cursor.yaml new file mode 100644 index 00000000..9aefb751 --- /dev/null +++ b/spec/stored_queries/ncbi_tax/ncbi_taxon_get_children_cursor.yaml @@ -0,0 +1,25 @@ +# Get all children for a taxon, using a cursor +name: ncbi_taxon_get_children_cursor +params: + type: object + required: [id, ts] + properties: + id: + type: string + title: Document ID + description: ID of the taxon vertex for which you want to find descendants + ts: + type: integer + title: Versioning timestamp + select: + type: [array, "null"] + items: {type: string} + description: Taxon fields to keep in the results + default: null +query: | + for tax in ncbi_taxon + filter tax.id == @id + filter tax.created <= @ts AND tax.expired >= @ts + limit 1 + for child in 1..1 inbound tax ncbi_child_of_taxon + return @select ? KEEP(child, @select) : child diff --git a/spec/stored_queries/ncbi_tax/ncbi_taxon_get_lineage.yaml b/spec/stored_queries/ncbi_tax/ncbi_taxon_get_lineage.yaml new file mode 100644 index 00000000..7d2f9544 --- /dev/null +++ b/spec/stored_queries/ncbi_tax/ncbi_taxon_get_lineage.yaml @@ -0,0 +1,33 @@ +# Get the lineage array for a taxon +# Returns an array where the top-most (closest to the root) taxon is at the beginning +name: ncbi_taxon_get_lineage +params: + type: object + required: [id, ts] + properties: + id: + type: string + title: Document id + description: ID of the taxon vertex for which you want to find ancestors + ts: + type: integer + title: Versioning timestamp + select: + type: [array, "null"] + items: {type: string} + description: Taxon fields to keep in the results + default: null +query: | + let ps = ( + for t in ncbi_taxon + filter t.id == @id + filter t.created <= @ts AND t.expired >= @ts + limit 1 + for ancestor, e, path in 1..100 outbound t ncbi_child_of_taxon + options {bfs: true} + filter path.edges[*].created ALL <= @ts AND path.edges[*].expired ALL >= @ts + return (@select ? 
KEEP(ancestor, @select) : ancestor) + ) + // doing return reverse(ps) returns an array of an array for some reason, + // which we don't want + for d in reverse(ps) return d diff --git a/spec/stored_queries/ncbi_tax/ncbi_taxon_get_siblings.yaml b/spec/stored_queries/ncbi_tax/ncbi_taxon_get_siblings.yaml new file mode 100644 index 00000000..36ade8d5 --- /dev/null +++ b/spec/stored_queries/ncbi_tax/ncbi_taxon_get_siblings.yaml @@ -0,0 +1,55 @@ +# Get the array of siblings for a taxon +# Results are limited to 10k +name: ncbi_taxon_get_siblings +params: + type: object + required: [id, ts] + properties: + id: + type: string + title: Document id + description: ID of the taxon vertex for which you want to find siblings + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp + select: + type: [array, "null"] + items: {type: string} + description: Taxon fields to keep in the results + default: null +query: | + // Fetch the siblings + let parent_id = first( + for e in ncbi_child_of_taxon + filter e.from == @id + filter e.created <= @ts and e.expired >= @ts + limit 1 + return e.to + ) + let sibling_ids = ( + for e in ncbi_child_of_taxon + filter e.to == parent_id + filter e.created <= @ts and e.expired >= @ts + filter e.from != @id + return e.from + ) + // Apply sort and limits to the results + let siblings = ( + for tax in ncbi_taxon + filter tax.id in sibling_ids + filter tax.created <= @ts AND tax.expired >= @ts + sort tax.scientific_name asc + limit @offset, @limit + return (@select ? KEEP(tax, @select) : tax) + ) + return {total_count: COUNT(sibling_ids), results: siblings} diff --git a/spec/stored_queries/ncbi_tax/ncbi_taxon_get_taxon_from_ws_obj.yaml b/spec/stored_queries/ncbi_tax/ncbi_taxon_get_taxon_from_ws_obj.yaml new file mode 100644 index 00000000..c25450cc --- /dev/null +++ b/spec/stored_queries/ncbi_tax/ncbi_taxon_get_taxon_from_ws_obj.yaml @@ -0,0 +1,22 @@ +# Fetch a taxon document from a workspace object reference +name: ncbi_taxon_get_taxon_from_ws_obj +params: + type: object + required: [obj_ref, ts] + properties: + obj_ref: + type: string + title: Workspace versioned object reference + ts: + type: integer + title: Versioning timestamp +query_prefix: with ncbi_taxon +query: | + for obj in ws_object_version + filter obj._key == @obj_ref + filter obj.is_public or obj.workspace_id IN ws_ids + for tax in 1 outbound obj ws_obj_version_has_taxon + filter tax.created <= @ts AND tax.expired >= @ts + limit 1 + return tax + diff --git a/spec/stored_queries/ncbi_tax/ncbi_taxon_search_sci_name.yaml b/spec/stored_queries/ncbi_tax/ncbi_taxon_search_sci_name.yaml new file mode 100644 index 00000000..e81f74c0 --- /dev/null +++ b/spec/stored_queries/ncbi_tax/ncbi_taxon_search_sci_name.yaml @@ -0,0 +1,56 @@ +# Search for an ncbi taxon with a scientific name +# Offset is limited to 10k +name: ncbi_taxon_search_sci_name +params: + type: object + required: [search_text, ts] + properties: + search_text: + type: string + title: Search text + description: Text to search on for the scientific name + ranks: + description: Filter the query to include only these ranks. An empty array is ignored. + type: array + default: [] + items: + type: string + include_strains: + description: true to include strains in the result, regardless of the ranks field. false + to perform no special filtering on strains. 
+ type: boolean + default: false + offset: + type: integer + default: 0 + maximum: 100000 + limit: + type: integer + default: 20 + maximum: 1000 + ts: + type: integer + title: Versioning timestamp + select: + type: [array, "null"] + items: {type: string} + description: Taxon fields to keep in the results + default: null +query: | + // Search using the fulltext index on scientific_name + // Don't limit the results yet so we can get the total_count below + LET results = ( + FOR doc IN FULLTEXT(ncbi_taxon, "scientific_name", @search_text) + // Filter non-expired docs + FILTER doc.created <= @ts AND doc.expired >= @ts + FILTER LENGTH(@ranks) > 0 ? + (@include_strains ? (doc.rank in @ranks OR doc.strain) : doc.rank in @ranks) : true + RETURN doc + ) + // Limit the results + LET limited = ( + FOR r IN results + LIMIT @offset, @limit + RETURN @select ? KEEP(r, @select) : r + ) + RETURN {results: limited, total_count: COUNT(results)} diff --git a/spec/stored_queries/ontology/ontology_get_ancestors.yaml b/spec/stored_queries/ontology/ontology_get_ancestors.yaml new file mode 100644 index 00000000..01d65374 --- /dev/null +++ b/spec/stored_queries/ontology/ontology_get_ancestors.yaml @@ -0,0 +1,43 @@ +# Get all ancestors (all parents' parents) for this term + +name: ontology_get_ancestors +params: + type: object + required: [id, ts, "@onto_terms", "@onto_edges"] + properties: + id: + type: string + title: Document ID + description: Ontology id of the term you want to get all the ancestors of + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp + "@onto_terms": + type: string + title: Ontology terms collection name + "@onto_edges": + type: string + title: Ontology edges collection name +query_prefix: WITH @@onto_terms +query: | + FOR t in @@onto_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v, e, p IN 1..100 OUTBOUND t @@onto_edges + FILTER p.edges[*].created ALL <= @ts + AND p.edges[*].expired ALL >= @ts + AND p.edges[*].type ALL == "is_a" + SORT v.id ASC + LIMIT @offset, @limit + RETURN {term: v, edge: e} diff --git a/spec/stored_queries/ontology/ontology_get_associated_samples.yaml b/spec/stored_queries/ontology/ontology_get_associated_samples.yaml new file mode 100644 index 00000000..50ff2715 --- /dev/null +++ b/spec/stored_queries/ontology/ontology_get_associated_samples.yaml @@ -0,0 +1,66 @@ +# Get all samples that reference this term + +name: ontology_get_associated_samples +params: + type: object + required: [id, ts, user_id, "@onto_terms"] + properties: + id: + type: string + title: Document ID + description: Ontology ID of the term for which you want to get all associated samples + user_id: + type: string + title: User ID + description: User ID used for checking samples access control + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp + "@onto_terms": + type: string + title: Ontology terms collection name + description: the name of the vertex collection holding the ontology term data + examples: [ENVO_terms, GO_terms] +query_prefix: WITH samples_nodes, samples_version, samples_sample +query: | + LET results=( + FOR t in @@onto_terms + FILTER t.id == @id + FILTER t.created <= @ts AND 
t.expired > @ts + limit 1 + FOR v, e, p IN 3 OUTBOUND t INBOUND sample_ontology_link, samples_nodes_edge, samples_ver_edge + FILTER p.vertices[1].saved >= t.created AND p.vertices[1].saved < t.expired + AND p.edges[0].created <= @ts AND p.edges[0].expired > @ts + SORT p.vertices[1].id ASC + RETURN { + sample: p.vertices[1], + sample_metadata_key: p.edges[0].sample_metadata_term, + sample_access: v + } + ) + LET total_count=COUNT(results) + LET filtered=( + FOR r in results + FILTER @user_id == r.sample_access.acls.owner + OR @user_id IN r.sample_access.acls.admin + OR @user_id IN r.sample_access.acls.read + OR r.sample_access.acls.pubread + RETURN r + ) + LET total_accessible_count=COUNT(filtered) + LET limited=( + FOR r in filtered + LIMIT @offset, @limit + RETURN r + ) + RETURN {results: limited, total_count, total_accessible_count} diff --git a/spec/stored_queries/ontology/ontology_get_children.yaml b/spec/stored_queries/ontology/ontology_get_children.yaml new file mode 100644 index 00000000..06d6afb6 --- /dev/null +++ b/spec/stored_queries/ontology/ontology_get_children.yaml @@ -0,0 +1,42 @@ +# Get the children of this term + +name: ontology_get_children +params: + type: object + required: [id, ts, "@onto_terms", "@onto_edges"] + properties: + id: + type: string + title: Document ID + description: Ontology id of the term you want to get the children of + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp + "@onto_terms": + type: string + title: Ontology terms collection name + "@onto_edges": + type: string + title: Ontology edges collection name +query_prefix: WITH @@onto_terms +query: | + FOR t in @@onto_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v, e IN 1..1 INBOUND t @@onto_edges + FILTER e.created <= @ts AND e.expired >= @ts + FILTER e.type == "is_a" + SORT v.id ASC + LIMIT @offset, @limit + RETURN {term: v, edge: e} diff --git a/spec/stored_queries/ontology/ontology_get_descendants.yaml b/spec/stored_queries/ontology/ontology_get_descendants.yaml new file mode 100644 index 00000000..93ec7260 --- /dev/null +++ b/spec/stored_queries/ontology/ontology_get_descendants.yaml @@ -0,0 +1,43 @@ +# Get all descendants of this term + +name: ontology_get_descendants +params: + type: object + required: [id, ts, "@onto_terms", "@onto_edges"] + properties: + id: + type: string + title: Document ID + description: Ontology id of the term you want to get all the descendants of + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp + "@onto_terms": + type: string + title: Ontology terms collection name + "@onto_edges": + type: string + title: Ontology edges collection name +query_prefix: WITH @@onto_terms +query: | + FOR t in @@onto_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v, e, p IN 1..100 INBOUND t @@onto_edges + FILTER p.edges[*].created ALL <= @ts + AND p.edges[*].expired ALL >= @ts + AND p.edges[*].type ALL == "is_a" + SORT v._key ASC + LIMIT @offset, @limit + RETURN {term: v, edge: e} diff --git a/spec/stored_queries/ontology/ontology_get_hierarchicalAncestors.yaml 
b/spec/stored_queries/ontology/ontology_get_hierarchicalAncestors.yaml new file mode 100644 index 00000000..256459ac --- /dev/null +++ b/spec/stored_queries/ontology/ontology_get_hierarchicalAncestors.yaml @@ -0,0 +1,46 @@ +# Get all hierarchical ancestor +# (all parents' parents) resources for this term. Hierarchical +# ancestors include is-a and other related parents, such as +# part-of/develops-from, that imply a hierarchical relationship + +name: ontology_get_hierarchicalAncestors +params: + type: object + required: [id, ts, "@onto_terms", "@onto_edges"] + properties: + id: + type: string + title: Document ID + description: Ontology id of the term you want to get all the hierarchical ancestors of + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp + "@onto_terms": + type: string + title: Ontology terms collection name + "@onto_edges": + type: string + title: Ontology edges collection name +query_prefix: WITH @@onto_terms +query: | + FOR t in @@onto_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v, e, p IN 1..100 OUTBOUND t @@onto_edges + FILTER p.edges[*].created ALL <= @ts + AND p.edges[*].expired ALL >= @ts + AND p.edges[*].type ALL != NULL + SORT v.id ASC + LIMIT @offset, @limit + RETURN {term: v, edge: e} diff --git a/spec/stored_queries/ontology/ontology_get_hierarchicalChildren.yaml b/spec/stored_queries/ontology/ontology_get_hierarchicalChildren.yaml new file mode 100644 index 00000000..1702816a --- /dev/null +++ b/spec/stored_queries/ontology/ontology_get_hierarchicalChildren.yaml @@ -0,0 +1,41 @@ +# Get the direct hierarchical children for this term. Hierarchical children include is-a and other related children, such as part-of/develops-from, that imply a hierarchical relationship +name: ontology_get_hierarchicalChildren +params: + type: object + required: [id, ts, "@onto_terms", "@onto_edges"] + properties: + id: + type: string + title: Document ID + description: Ontology id of the term you want to get the direct hierarchical children of + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp + "@onto_terms": + type: string + title: Ontology terms collection name + "@onto_edges": + type: string + title: Ontology edges collection name +query_prefix: WITH @@onto_terms +query: | + FOR t in @@onto_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v, e IN 1..1 INBOUND t @@onto_edges + FILTER e.created <= @ts AND e.expired >= @ts + FILTER e.type != NULL + SORT v.id ASC + LIMIT @offset, @limit + RETURN {term: v, edge: e} diff --git a/spec/stored_queries/ontology/ontology_get_hierarchicalDescendants.yaml b/spec/stored_queries/ontology/ontology_get_hierarchicalDescendants.yaml new file mode 100644 index 00000000..8c749d8e --- /dev/null +++ b/spec/stored_queries/ontology/ontology_get_hierarchicalDescendants.yaml @@ -0,0 +1,45 @@ +# Get all hierarchical descendant +# resources for this term. 
Hierarchical descendants include is-a +# and other related descendants, such as part-of/develops-from, +# that imply a hierarchical relationship +name: ontology_get_hierarchicalDescendants +params: + type: object + required: [id, ts, "@onto_terms", "@onto_edges"] + properties: + id: + type: string + title: Document ID + description: Ontology id of the term you want to get all the hierarchical descendants of + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp + "@onto_terms": + type: string + title: Ontology terms collection name + "@onto_edges": + type: string + title: Ontology edges collection name +query_prefix: WITH @@onto_terms +query: | + FOR t in @@onto_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v, e, p IN 1..100 INBOUND t @@onto_edges + FILTER p.edges[*].created ALL <= @ts + AND p.edges[*].expired ALL >= @ts + AND p.edges[*].type ALL != NULL + SORT v.id ASC + LIMIT @offset, @limit + RETURN {term: v, edge: e} diff --git a/spec/stored_queries/ontology/ontology_get_hierarchicalParents.yaml b/spec/stored_queries/ontology/ontology_get_hierarchicalParents.yaml new file mode 100644 index 00000000..8fd2f71b --- /dev/null +++ b/spec/stored_queries/ontology/ontology_get_hierarchicalParents.yaml @@ -0,0 +1,45 @@ +# Get the direct hierarchical parent +# resources for this term. Hierarchical parents include is-a and +# other related parents, such as part-of/develops-from, that imply +# a hierarchical relationship. + +name: ontology_get_hierarchicalParents +params: + type: object + required: [id, ts, "@onto_terms", "@onto_edges"] + properties: + id: + type: string + title: Document ID + description: Ontology id of the term you want to get all the hierarchical parents of + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp + "@onto_terms": + type: string + title: Ontology terms collection name + "@onto_edges": + type: string + title: Ontology edges collection name +query_prefix: WITH @@onto_terms +query: | + FOR t in @@onto_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v, e IN 1..1 OUTBOUND t @@onto_edges + FILTER e.created <= @ts AND e.expired >= @ts + FILTER e.type != NULL + SORT v.id ASC + LIMIT @offset, @limit + RETURN {term: v, edge: e} diff --git a/spec/stored_queries/ontology/ontology_get_metadata.yaml b/spec/stored_queries/ontology/ontology_get_metadata.yaml new file mode 100644 index 00000000..56294cc9 --- /dev/null +++ b/spec/stored_queries/ontology/ontology_get_metadata.yaml @@ -0,0 +1,24 @@ +# Get information/metadata of a particular ontology term (see spec for available fields) + +name: ontology_get_metadata +params: + type: object + required: [id, ts, "@onto_terms"] + properties: + id: + type: string + title: Document ID + description: Get information/metadata of a particular ontology term + ts: + type: integer + title: Versioning timestamp + "@onto_terms": + type: string + title: Ontology terms collection name +query_prefix: WITH @@onto_terms +query: | + FOR t in @@onto_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + RETURN t diff --git 
a/spec/stored_queries/ontology/ontology_get_parents.yaml b/spec/stored_queries/ontology/ontology_get_parents.yaml new file mode 100644 index 00000000..dd3f4f1f --- /dev/null +++ b/spec/stored_queries/ontology/ontology_get_parents.yaml @@ -0,0 +1,42 @@ +# Get the direct parents for a specific term + +name: ontology_get_parents +params: + type: object + required: [id, ts, "@onto_terms", "@onto_edges"] + properties: + id: + type: string + title: Document ID + description: Ontology id of the term you want to get all the direct parents of + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp + "@onto_terms": + type: string + title: Ontology terms collection name + "@onto_edges": + type: string + title: Ontology edges collection name +query_prefix: WITH @@onto_terms +query: | + FOR t in @@onto_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v, e IN 1..1 OUTBOUND t @@onto_edges + FILTER e.created <= @ts AND e.expired >= @ts + FILTER e.type == "is_a" + SORT v.id ASC + LIMIT @offset, @limit + RETURN {term: v, edge: e} diff --git a/spec/stored_queries/ontology/ontology_get_related.yaml b/spec/stored_queries/ontology/ontology_get_related.yaml new file mode 100644 index 00000000..5d0adc9f --- /dev/null +++ b/spec/stored_queries/ontology/ontology_get_related.yaml @@ -0,0 +1,40 @@ +# Get all immediate related terms for this term +name: ontology_get_related +params: + type: object + required: [id, ts, "@onto_terms", "@onto_edges"] + properties: + id: + type: string + title: Document ID + description: Ontology id of the term you want to get all the directly related nodes of + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp + "@onto_terms": + type: string + title: Ontology terms collection name + "@onto_edges": + type: string + title: Ontology edges collection name +query_prefix: WITH @@onto_terms +query: | + FOR t in @@onto_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v, e IN 1 ANY t @@onto_edges + FILTER e.created <= @ts AND e.expired >= @ts + SORT v.id ASC + LIMIT @offset, @limit + RETURN {term: v, edge: e} diff --git a/spec/stored_queries/ontology/ontology_get_siblings.yaml b/spec/stored_queries/ontology/ontology_get_siblings.yaml new file mode 100644 index 00000000..9acf4a72 --- /dev/null +++ b/spec/stored_queries/ontology/ontology_get_siblings.yaml @@ -0,0 +1,46 @@ +# Get all siblings of this term + +name: ontology_get_siblings +params: + type: object + required: [id, ts, "@onto_terms", "@onto_edges"] + properties: + id: + type: string + title: Document ID + description: Ontology id of the term you want to get all the siblings of + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp + "@onto_terms": + type: string + title: Ontology terms collection name + "@onto_edges": + type: string + title: Ontology edges collection name +query_prefix: WITH @@onto_terms +query: | + FOR t in @@onto_terms + FILTER t.id == @id + FILTER t.created <= @ts AND t.expired >= 
@ts + limit 1 + FOR v_parent, e_parent IN 1..1 OUTBOUND t @@onto_edges + FILTER e_parent.created <= @ts AND e_parent.expired >= @ts + FILTER e_parent.type == "is_a" + FOR v_child, e_child in 1..1 INBOUND v_parent @@onto_edges + FILTER e_child.created <= @ts AND e_child.expired >= @ts + FILTER e_child.type == "is_a" + FILTER v_child != t + SORT v_child.id ASC + LIMIT @offset, @limit + RETURN v_child diff --git a/spec/stored_queries/ontology/ontology_get_term_by_name.yaml b/spec/stored_queries/ontology/ontology_get_term_by_name.yaml new file mode 100644 index 00000000..3cbe5aea --- /dev/null +++ b/spec/stored_queries/ontology/ontology_get_term_by_name.yaml @@ -0,0 +1,41 @@ +# Get ontology term by searching name + +name: ontology_get_term_by_name +params: + type: object + required: [name, ts, "@onto_terms", "@onto_edges"] + properties: + name: + type: string + title: Ontology term's name + description: Name of the Ontology term you want to get + examples: ["terrestrial biome"] + ancestor_term: + type: string + title: Ancestor ontology term + description: Optional ancestor ontology term + examples: ["ENVO:00000428"] + ts: + type: integer + title: Versioning timestamp + "@onto_terms": + type: string + title: Ontology terms collection name + examples: ["ENVO_terms"] + "@onto_edges": + type: string + title: Ontology edges collection name + examples: ["ENVO_edges"] +query_prefix: WITH @@onto_terms +query: | + LET ancestor_term_null=IS_NULL(@ancestor_term) OR LENGTH(@ancestor_term) == 0 + FOR t in @@onto_terms + FILTER LOWER(t.name) == LOWER(@name) + FILTER t.created <= @ts AND t.expired >= @ts + limit 1 + FOR v, e, p IN 1..100 OUTBOUND t @@onto_edges + FILTER ancestor_term_null OR v.id == @ancestor_term + FILTER p.edges[*].created ALL <= @ts + AND p.edges[*].expired ALL >= @ts + AND p.edges[*].type ALL == "is_a" + RETURN DISTINCT t diff --git a/spec/stored_queries/ontology/ontology_get_terms.yaml b/spec/stored_queries/ontology/ontology_get_terms.yaml new file mode 100644 index 00000000..69459be8 --- /dev/null +++ b/spec/stored_queries/ontology/ontology_get_terms.yaml @@ -0,0 +1,26 @@ +# Get a set of terms by the term ID and a timestamp, maximum 10000 + +name: ontology_get_terms +params: + type: object + required: [ids, ts, "@onto_terms"] + properties: + ids: + type: array + items: + type: string + title: Ontology term IDs + description: The list of ontology term IDs to be fetched + maxItems: 10000 + ts: + type: integer + title: Versioning timestamp in milliseconds since the Unix epoch + "@onto_terms": + type: string + title: Ontology terms collection name +query_prefix: WITH @@onto_terms +query: | + FOR t in @@onto_terms + FILTER t.id IN @ids + FILTER t.expired >= @ts AND t.created <= @ts + RETURN t diff --git a/spec/stored_queries/search_compounds.yaml b/spec/stored_queries/search_compounds.yaml new file mode 100644 index 00000000..4f3265a7 --- /dev/null +++ b/spec/stored_queries/search_compounds.yaml @@ -0,0 +1,35 @@ +# Use ArangoSearch to search documents in the compounds index. Returns documents by ascending id. +name: search_compounds +params: + type: object + properties: + search_text: + default: '' + type: string + description: text to match to document fields + all_documents: + default: false + type: boolean + description: Ignore search text and return all documents + include_obsolete: + default: false + type: boolean + description: should obsolete documents be included? 
+ offset: + default: 0 + type: integer + description: how many documents to skip + result_limit: + default: 10 + type: integer + description: maximum documents to return +query: | + FOR doc IN Compounds + SEARCH ANALYZER(PHRASE(doc.id, @search_text) + OR PHRASE(doc.name, @search_text) + OR PHRASE(doc.abbreviation, @search_text) + OR PHRASE(doc.aliases, @search_text), 'text_en') OR @all_documents + FILTER @include_obsolete || doc.is_obsolete == 0 + SORT doc.id + LIMIT @offset, @result_limit + RETURN doc diff --git a/spec/stored_queries/search_reactions.yaml b/spec/stored_queries/search_reactions.yaml new file mode 100644 index 00000000..14d43e2d --- /dev/null +++ b/spec/stored_queries/search_reactions.yaml @@ -0,0 +1,35 @@ +# Use ArangoSearch to search documents in the Reactions index. Returns documents by ascending id. +name: search_reactions +params: + type: object + properties: + search_text: + default: '' + type: string + description: text to match to document fields + all_documents: + type: boolean + description: ignore the search_text and return all documents + default: false + include_obsolete: + type: boolean + description: should obsolete documents be included + default: false + offset: + type: integer + description: how many documents to skip + default: 0 + result_limit: + type: integer + description: Maximum documents to return + default: 10 +query: | + FOR doc IN Reactions + SEARCH ANALYZER(PHRASE(doc.id, @search_text) + OR PHRASE(doc.name, @search_text) + OR PHRASE(doc.abbreviation, @search_text) + OR PHRASE(doc.aliases, @search_text), 'text_en') OR @all_documents + FILTER @include_obsolete || doc.is_obsolete == 0 + SORT doc.id + LIMIT @offset, @result_limit + RETURN doc diff --git a/spec/stored_queries/taxonomy/taxonomy_fetch_taxon.yaml b/spec/stored_queries/taxonomy/taxonomy_fetch_taxon.yaml new file mode 100644 index 00000000..67cf2ecc --- /dev/null +++ b/spec/stored_queries/taxonomy/taxonomy_fetch_taxon.yaml @@ -0,0 +1,22 @@ +# Fetch a taxon document by taxonomy ID +name: taxonomy_fetch_taxon +params: + type: object + required: [id, ts, "@taxon_coll"] + properties: + id: + type: string + title: NCBI Taxonomy ID + ts: + type: integer + title: Versioning timestamp + "@taxon_coll": + type: string + title: Taxon collection name + examples: [ncbi_taxon, gtdb_taxon] +query: | + for t in @@taxon_coll + filter t.id == @id + filter t.created <= @ts AND t.expired >= @ts + limit 1 + return t diff --git a/spec/stored_queries/taxonomy/taxonomy_fetch_taxon_by_sciname.yaml b/spec/stored_queries/taxonomy/taxonomy_fetch_taxon_by_sciname.yaml new file mode 100644 index 00000000..617a34b2 --- /dev/null +++ b/spec/stored_queries/taxonomy/taxonomy_fetch_taxon_by_sciname.yaml @@ -0,0 +1,26 @@ +# Fetch a taxon document by exact match on sciname +name: taxonomy_fetch_taxon_by_sciname +params: + type: object + required: [sciname, sciname_field, ts, "@taxon_coll"] + properties: + sciname: + type: string + title: NCBI scientific name + sciname_field: + type: string + title: Scientific name field name + examples: [scientific_name, name] + ts: + type: integer + title: Versioning timestamp + "@taxon_coll": + type: string + title: Taxon collection name + examples: [ncbi_taxon, gtdb_taxon] +query: | + for t in @@taxon_coll + filter t.@sciname_field == @sciname + filter t.created <= @ts AND t.expired >= @ts + limit 1 + return t diff --git a/spec/stored_queries/taxonomy/taxonomy_get_associated_ws_objects.yaml b/spec/stored_queries/taxonomy/taxonomy_get_associated_ws_objects.yaml new file mode 100644 index 
00000000..36540c0c --- /dev/null +++ b/spec/stored_queries/taxonomy/taxonomy_get_associated_ws_objects.yaml @@ -0,0 +1,74 @@ +# Get the workspace objects associated with a taxon + +name: taxonomy_get_associated_ws_objects + +params: + type: object + required: [taxon_id, ts, "@taxon_coll"] + properties: + "@taxon_coll": + type: string + title: Taxon collection name + examples: [ncbi_taxon, gtdb_taxon] + taxon_id: + type: string + title: NCBI Taxon ID + description: ID of the taxon vertex for which to find associated workspace objects + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + ts: + type: integer + title: Versioning timestamp + select_obj: + type: [array, "null"] + items: {type: string} + title: WS obj fields to keep in the results + default: null + select_edge: + type: [array, "null"] + items: {type: string} + description: Taxon edge fields to keep in the results + default: null +query_prefix: WITH ws_object_version, ws_type_version, ws_workspace +query: | + LET count = COUNT( + FOR tax IN @@taxon_coll + FILTER tax.id == @taxon_id + FILTER tax.created <= @ts AND tax.expired >= @ts + LIMIT 1 + FOR obj IN 1..1 INBOUND tax ws_obj_version_has_taxon + RETURN 1 + ) + LET results = ( + FOR tax IN @@taxon_coll + FILTER tax.id == @taxon_id + FILTER tax.created <= @ts AND tax.expired >= @ts + LIMIT 1 + FOR obj, e IN 1 INBOUND tax ws_obj_version_has_taxon + FILTER obj.is_public OR obj.workspace_id IN ws_ids + LIMIT @offset, @limit + LET type = first( + FOR type IN 1 OUTBOUND obj ws_obj_instance_of_type + RETURN KEEP(type, ['_key', 'module_name', 'type_name', 'maj_ver', 'min_ver']) + ) + LET unver_id = CONCAT("ws_object/", TO_STRING(obj.workspace_id), ':', TO_STRING(obj.object_id)) + LET ws_info = FIRST( + FOR ws IN 1 INBOUND unver_id ws_workspace_contains_obj + FILTER !ws.is_deleted + RETURN KEEP(ws, ['owner', 'metadata', 'is_public', 'mod_epoch']) + ) + LET o = MERGE(obj, {type, ws_info}) + RETURN { + ws_obj: @select_obj ? KEEP(o, @select_obj) : o, + edge: @select_edge ? 
KEEP(e, @select_edge) : e + } + ) + RETURN {results, total_count: count} diff --git a/spec/stored_queries/taxonomy/taxonomy_get_children.yaml b/spec/stored_queries/taxonomy/taxonomy_get_children.yaml new file mode 100644 index 00000000..94efa1be --- /dev/null +++ b/spec/stored_queries/taxonomy/taxonomy_get_children.yaml @@ -0,0 +1,69 @@ +# Get the array of direct descendants for any taxon +name: taxonomy_get_children +params: + type: object + required: [id, ts, sciname_field, "@taxon_coll", "@taxon_child_of"] + properties: + sciname_field: + type: string + title: Scientific name field name + examples: [scientific_name, name] + "@taxon_coll": + type: string + title: Taxon vertex collection name + examples: [ncbi_taxon, gtdb_taxon] + "@taxon_child_of": + type: string + title: Taxon edge collection name for parent-to-child relationship + examples: [ncbi_child_of_taxon, gtdb_child_of_taxon] + id: + type: string + title: Document ID + description: ID of the taxon vertex for which you want to find descendants + limit: + type: integer + default: 20 + description: Maximum result limit + maximum: 1000 + offset: + type: integer + default: 0 + description: Result offset for pagination + maximum: 100000 + search_text: + type: string + description: Search scientific name + default: '' + ts: + type: integer + title: Versioning timestamp + select: + type: [array, "null"] + items: {type: string} + description: Taxon fields to keep in the results + default: null +query: | + // Fetch the child IDs using the edge attributes + let child_ids = ( + for e in @@taxon_child_of + filter e.to == @id + filter e.created <= @ts AND e.expired >= @ts + return e.from + ) + // Sort and filter the children + // Should only get evaluated if search_text is truthy + let searched = ( + for tax in FULLTEXT(@@taxon_coll, @sciname_field, @search_text) + filter tax.id in child_ids + return tax.id + ) + let filtered = @search_text ? searched : child_ids + let results = ( + for tax in @@taxon_coll + filter tax.id in filtered + filter tax.created <= @ts AND tax.expired >= @ts + sort tax.@sciname_field asc + limit @offset, @limit + return (@select ? KEEP(tax, @select) : tax) + ) + return {total_count: COUNT(filtered), results: results} diff --git a/spec/stored_queries/taxonomy/taxonomy_get_children_cursor.yaml b/spec/stored_queries/taxonomy/taxonomy_get_children_cursor.yaml new file mode 100644 index 00000000..1ec28f00 --- /dev/null +++ b/spec/stored_queries/taxonomy/taxonomy_get_children_cursor.yaml @@ -0,0 +1,33 @@ +# Get all children for a taxon, using a cursor +name: taxonomy_get_children_cursor +params: + type: object + required: [id, ts, "@taxon_coll", "@taxon_child_of"] + properties: + "@taxon_coll": + type: string + title: Taxon vertex collection name + examples: [ncbi_taxon, gtdb_taxon] + "@taxon_child_of": + type: string + title: Taxon edge collection name for parent-to-child relationship + examples: [ncbi_child_of_taxon, gtdb_child_of_taxon] + id: + type: string + title: Document ID + description: ID of the taxon vertex for which you want to find descendants + ts: + type: integer + title: Versioning timestamp + select: + type: [array, "null"] + items: {type: string} + description: Taxon fields to keep in the results + default: null +query: | + for tax in @@taxon_coll + filter tax.id == @id + filter tax.created <= @ts AND tax.expired >= @ts + limit 1 + for child in 1..1 inbound tax @@taxon_child_of + return @select ? 
diff --git a/spec/stored_queries/taxonomy/taxonomy_get_lineage.yaml b/spec/stored_queries/taxonomy/taxonomy_get_lineage.yaml
new file mode 100644
index 00000000..11d92656
--- /dev/null
+++ b/spec/stored_queries/taxonomy/taxonomy_get_lineage.yaml
@@ -0,0 +1,41 @@
+# Get the lineage array for a taxon
+# Returns an array where the top-most (closest to the root) taxon is at the beginning
+name: taxonomy_get_lineage
+params:
+  type: object
+  required: [id, ts, "@taxon_coll", "@taxon_child_of"]
+  properties:
+    "@taxon_coll":
+      type: string
+      title: Taxon collection name
+      examples: [ncbi_taxon, gtdb_taxon]
+    "@taxon_child_of":
+      type: string
+      title: Taxon edge collection name for parent-to-child relationship
+      examples: [ncbi_child_of_taxon, gtdb_child_of_taxon]
+    id:
+      type: string
+      title: Document ID
+      description: ID of the taxon vertex for which you want to find ancestors
+    ts:
+      type: integer
+      title: Versioning timestamp
+    select:
+      type: [array, "null"]
+      items: {type: string}
+      description: Taxon fields to keep in the results
+      default: null
+query: |
+  let ps = (
+    for t in @@taxon_coll
+      filter t.id == @id
+      filter t.created <= @ts AND t.expired >= @ts
+      limit 1
+      for ancestor, e, path in 1..100 outbound t @@taxon_child_of
+        options {bfs: true}
+        filter path.edges[*].created ALL <= @ts AND path.edges[*].expired ALL >= @ts
+        return (@select ? KEEP(ancestor, @select) : ancestor)
+  )
+  // RETURN REVERSE(ps) would yield the reversed array nested inside a
+  // single-element result, so iterate over it and return each document instead
+  for d in reverse(ps) return d
diff --git a/spec/stored_queries/taxonomy/taxonomy_get_siblings.yaml b/spec/stored_queries/taxonomy/taxonomy_get_siblings.yaml
new file mode 100644
index 00000000..95bfa7ba
--- /dev/null
+++ b/spec/stored_queries/taxonomy/taxonomy_get_siblings.yaml
@@ -0,0 +1,67 @@
+# Get the array of siblings for a taxon
+# Results are limited to 10k
+name: taxonomy_get_siblings
+params:
+  type: object
+  required: [id, ts, sciname_field, "@taxon_coll", "@taxon_child_of"]
+  properties:
+    sciname_field:
+      type: string
+      title: Scientific name field name
+      examples: [scientific_name, name]
+    "@taxon_coll":
+      type: string
+      title: Taxon vertex collection name
+      examples: [ncbi_taxon, gtdb_taxon]
+    "@taxon_child_of":
+      type: string
+      title: Taxon edge collection name for parent-to-child relationship
+      examples: [ncbi_child_of_taxon, gtdb_child_of_taxon]
+    id:
+      type: string
+      title: Document ID
+      description: ID of the taxon vertex for which you want to find siblings
+    limit:
+      type: integer
+      default: 20
+      description: Maximum result limit
+      maximum: 1000
+    offset:
+      type: integer
+      default: 0
+      description: Result offset for pagination
+      maximum: 100000
+    ts:
+      type: integer
+      title: Versioning timestamp
+    select:
+      type: [array, "null"]
+      items: {type: string}
+      description: Taxon fields to keep in the results
+      default: null
+query: |
+  // Fetch the siblings
+  let parent_id = first(
+    for e in @@taxon_child_of
+      filter e.from == @id
+      filter e.created <= @ts and e.expired >= @ts
+      limit 1
+      return e.to
+  )
+  let sibling_ids = (
+    for e in @@taxon_child_of
+      filter e.to == parent_id
+      filter e.created <= @ts and e.expired >= @ts
+      filter e.from != @id
+      return e.from
+  )
+  // Apply sort and limits to the results
+  let siblings = (
+    for tax in @@taxon_coll
+      filter tax.id in sibling_ids
+      filter tax.created <= @ts AND tax.expired >= @ts
+      sort tax.@sciname_field asc
+      limit @offset, @limit
+      return (@select ? KEEP(tax, @select) : tax)
+  )
+  return {total_count: COUNT(sibling_ids), results: siblings}
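taxonomy_get_children, taxonomy_get_siblings, and taxonomy_get_associated_ws_objects all share the {total_count, results} envelope with offset/limit paging. A sketch of a generic pager over that contract; run_query is a hypothetical stand-in for whatever function actually executes the stored query:

```python
# Generic pager over the {total_count, results} envelope used by the paged
# stored queries above. `run_query(offset, limit)` is an assumed callable.
from typing import Callable, Iterator


def iter_pages(run_query: Callable[[int, int], dict], page_size: int = 100) -> Iterator[dict]:
    """Yield every result document, fetching page_size documents at a time."""
    offset = 0
    while True:
        page = run_query(offset, page_size)
        for doc in page["results"]:
            yield doc
        offset += page_size
        if offset >= page["total_count"]:
            return
```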
diff --git a/spec/stored_queries/taxonomy/taxonomy_get_taxon_from_ws_obj.yaml b/spec/stored_queries/taxonomy/taxonomy_get_taxon_from_ws_obj.yaml
new file mode 100644
index 00000000..d7bb93c1
--- /dev/null
+++ b/spec/stored_queries/taxonomy/taxonomy_get_taxon_from_ws_obj.yaml
@@ -0,0 +1,26 @@
+# Fetch a taxon document from a workspace object reference
+name: taxonomy_get_taxon_from_ws_obj
+params:
+  type: object
+  required: [obj_ref, ts, "@taxon_coll"]
+  properties:
+    "@taxon_coll":
+      type: string
+      title: Taxon collection name
+      examples: [ncbi_taxon, gtdb_taxon]
+    obj_ref:
+      type: string
+      title: Workspace versioned object reference
+    ts:
+      type: integer
+      title: Versioning timestamp
+query_prefix: with @@taxon_coll
+query: |
+  for obj in ws_object_version
+    filter obj._key == @obj_ref
+    filter obj.is_public or obj.workspace_id IN ws_ids
+    for tax in 1 outbound obj ws_obj_version_has_taxon
+      filter tax.created <= @ts and tax.expired >= @ts
+      limit 1
+      return tax
+
diff --git a/spec/stored_queries/taxonomy/taxonomy_search_sci_name.yaml b/spec/stored_queries/taxonomy/taxonomy_search_sci_name.yaml
new file mode 100644
index 00000000..8217fdf2
--- /dev/null
+++ b/spec/stored_queries/taxonomy/taxonomy_search_sci_name.yaml
@@ -0,0 +1,70 @@
+# Should be REVISED then DEPRECATED
+#
+# Search for a taxon with a scientific name
+# Offset is limited to 100k
+name: taxonomy_search_sci_name
+params:
+  type: object
+  required: [search_text, ts, "@taxon_coll", sciname_field]
+  properties:
+    "@taxon_coll":
+      type: string
+      title: Taxon collection name
+      examples: [ncbi_taxon, gtdb_taxon]
+    no_count:
+      type: boolean
+      default: false
+      description: Skip the calculation of a total count of search results
+    search_text:
+      type: string
+      title: Search text
+      description: Text to search on for the scientific name
+    ranks:
+      description: Filter the query to include only these ranks. An empty array is ignored.
+      type: array
+      default: []
+      items:
+        type: string
+    include_strains:
+      description: If true, include strains in the results regardless of the ranks filter. If
+        false, apply no special filtering to strains.
+      type: boolean
+      default: false
+    offset:
+      type: integer
+      default: 0
+      maximum: 100000
+    limit:
+      type: integer
+      default: 20
+      maximum: 1000
+    ts:
+      type: integer
+      title: Versioning timestamp
+    select:
+      type: [array, "null"]
+      items: {type: string}
+      description: Taxon fields to keep in the results
+      default: null
+    sciname_field:
+      type: string
+      title: Scientific name field name
+      examples: [scientific_name, name]
+query: |
+  // Search using the fulltext index on scientific_name
+  // Don't limit the results yet so we can get the total_count below
+  LET results = (
+    FOR doc IN FULLTEXT(@@taxon_coll, @sciname_field, @search_text)
+      // Filter non-expired docs
+      FILTER doc.created <= @ts AND doc.expired >= @ts
+      FILTER LENGTH(@ranks) > 0 ?
+        (@include_strains ? (doc.rank in @ranks OR doc.strain) : doc.rank in @ranks) : true
+      RETURN doc
+  )
+  // Limit the results
+  LET limited = (
+    FOR r IN results
+      LIMIT @offset, @limit
+      RETURN @select ? KEEP(r, @select) : r
+  )
+  RETURN @no_count ? {results: limited} : {results: limited, total_count: COUNT(results)}
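The nested ternary in the ranks filter above is easy to misread. The same decision, restated in Python (field and parameter names taken from the spec):

```python
# Python restatement of the rank/strain filter in taxonomy_search_sci_name.
def passes_rank_filter(doc: dict, ranks: list, include_strains: bool) -> bool:
    if not ranks:  # an empty ranks array disables the filter entirely
        return True
    if include_strains and doc.get("strain"):
        return True  # strains pass regardless of their rank
    return doc.get("rank") in ranks
```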
diff --git a/spec/stored_queries/taxonomy/taxonomy_search_species.yaml b/spec/stored_queries/taxonomy/taxonomy_search_species.yaml
new file mode 100644
index 00000000..0e5fbb82
--- /dev/null
+++ b/spec/stored_queries/taxonomy/taxonomy_search_species.yaml
@@ -0,0 +1,41 @@
+# DEPRECATED. See taxonomy_search_species_strain and taxonomy_search_species_strain_no_sort
+#
+# Search for a species/strain. Similar to search_sci_name, but simpler and quicker
+name: taxonomy_search_species
+params:
+  type: object
+  required: [search_text, ts, "@taxon_coll", sciname_field]
+  properties:
+    "@taxon_coll":
+      type: string
+      title: Taxon collection name
+      examples: [ncbi_taxon, gtdb_taxon]
+    search_text:
+      type: string
+      title: Search text
+      description: Text to search on for the scientific name
+    offset:
+      type: integer
+      default: 0
+      maximum: 100000
+    limit:
+      type: integer
+      default: 20
+      maximum: 1000
+    ts:
+      type: integer
+      title: Versioning timestamp
+    select:
+      type: [array, "null"]
+      items: {type: string}
+      description: Taxon fields to keep in the results
+      default: null
+    sciname_field:
+      type: string
+      title: Scientific name field name
+      examples: [scientific_name, name]
+query: |
+  FOR doc IN FULLTEXT(@@taxon_coll, @sciname_field, @search_text)
+    FILTER doc.created <= @ts AND doc.expired >= @ts AND (doc.rank == "species" OR doc.strain)
+    LIMIT @offset, @limit
+    RETURN @select ? KEEP(doc, @select) : doc
diff --git a/spec/stored_queries/taxonomy/taxonomy_search_species_strain.yaml b/spec/stored_queries/taxonomy/taxonomy_search_species_strain.yaml
new file mode 100644
index 00000000..6ad6ee75
--- /dev/null
+++ b/spec/stored_queries/taxonomy/taxonomy_search_species_strain.yaml
@@ -0,0 +1,63 @@
+# Search a taxon collection (e.g. ncbi_taxon) for species/strains by scientific name
+name: taxonomy_search_species_strain
+params:
+  type: object
+  required: ["@taxon_coll", sciname_field, search_text]
+  additionalProperties: false
+  properties:
+    "@taxon_coll":
+      type: string
+      title: Taxon collection name
+      examples: [ncbi_taxon, gtdb_taxon]
+    search_text:
+      type: string
+      title: Search text
+      examples: [escherichia, es]
+      description: Text to search on the search attribute values
+    sciname_field:
+      type: string
+      title: Scientific name field name
+      examples: [scientific_name, name]
+    ts:
+      type: [integer, "null"]
+      title: Versioning timestamp
+      default: null
+    offset:
+      type: [integer, "null"]
+      title: Paging offset
+      maximum: 100000
+      default: 0
+    limit:
+      type: [integer, "null"]
+      title: Max results to return
+      default: 20
+      maximum: 1000
+    select:
+      type: [string, array, "null"]
+      items:
+        type: string
+      examples: [scientific_name, [scientific_name, id]]
+      default: null
+      description: Document attributes to keep in the results
+query: |
+  LET search_text__norm = REGEX_REPLACE(LOWER(TRIM(@search_text)), "\\s+", " ")
+  LET search_text__first_exact_tok = REGEX_SPLIT(search_text__norm, " ")[0]
+  LET search_text__icu_toks = TOKENS(@search_text, "icu_tokenize") // analyzer
+  LET search_text__wordboundmod_icu_toks = (
+    FOR tok IN search_text__icu_toks
+      RETURN REGEX_REPLACE(tok, ",.*", "") // commas cannot be escaped in fulltext search
+  )
+  LET search_text__fulltext = CONCAT_SEPARATOR(", ", // comma delimit
+    FOR tok IN search_text__wordboundmod_icu_toks // prepend "prefix:"
+      RETURN CONCAT("prefix:", tok)
+  )
+  FOR doc IN FULLTEXT(@@taxon_coll, @sciname_field, search_text__fulltext)
+    FILTER @ts ? doc.created <= @ts AND doc.expired >= @ts : true
+    FILTER doc.rank IN ["species", "strain"] OR doc.strain
+    LET doc_sciname__norm = REGEX_REPLACE(LOWER(TRIM(doc.scientific_name)), "\\s+", " ") // for exact matching
+    LET contains_ind = CONTAINS(doc_sciname__norm, search_text__norm, true)
+    SORT contains_ind == 0 DESC, // prefix match
+      doc_sciname__norm == search_text__norm DESC, // exact match
+      doc.scientific_name // lexical
+    LIMIT @offset ? @offset : 0, @limit ? @limit : 20
+    RETURN @select ? KEEP(doc, @select) : doc
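The string manipulation above builds a FULLTEXT() search expression of the form "prefix:tok1, prefix:tok2". A rough Python equivalent of that construction; plain whitespace splitting stands in for ArangoDB's icu_tokenize analyzer, so this is an approximation rather than an exact port:

```python
# Approximate Python port of the fulltext search-string construction in
# taxonomy_search_species_strain. Whitespace split approximates icu_tokenize.
import re


def build_fulltext_search(search_text: str) -> str:
    toks = search_text.split()                    # stand-in for the icu_tokenize analyzer
    toks = [re.sub(r",.*", "", t) for t in toks]  # drop everything after a comma
    return ", ".join("prefix:" + t for t in toks if t)


assert build_fulltext_search("Escherichia col") == "prefix:Escherichia, prefix:col"
```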
diff --git a/spec/stored_queries/taxonomy/taxonomy_search_species_strain_no_sort.yaml b/spec/stored_queries/taxonomy/taxonomy_search_species_strain_no_sort.yaml
new file mode 100644
index 00000000..b9c0a56c
--- /dev/null
+++ b/spec/stored_queries/taxonomy/taxonomy_search_species_strain_no_sort.yaml
@@ -0,0 +1,58 @@
+# Search a taxon collection (e.g. ncbi_taxon) for species/strains by scientific name
+# Unlike taxonomy_search_species_strain, do not sort; just return the first matching documents
+# Useful for short prefixes (e.g., "s") that would be expensive yet not meaningful to sort
+name: taxonomy_search_species_strain_no_sort
+params:
+  type: object
+  required: ["@taxon_coll", sciname_field, search_text]
+  additionalProperties: false
+  properties:
+    "@taxon_coll":
+      type: string
+      title: Taxon collection name
+      examples: [ncbi_taxon, gtdb_taxon]
+    sciname_field:
+      type: string
+      title: Scientific name field name
+      examples: [scientific_name, name]
+    search_text:
+      type: string
+      title: Search text
+      examples: [escherichia, es]
+      description: Text to search on the search attribute values
+    ts:
+      type: [integer, "null"]
+      title: Versioning timestamp
+      default: null
+    offset:
+      type: [integer, "null"]
+      title: Paging offset
+      maximum: 100000
+      default: 0
+    limit:
+      type: [integer, "null"]
+      title: Max results to return
+      default: 20
+      maximum: 1000
+    select:
+      type: [string, array, "null"]
+      items:
+        type: string
+      examples: [scientific_name, [scientific_name, id]]
+      default: null
+      description: Document attributes to keep in the results
+query: |
+  LET search_text__icu_toks = TOKENS(@search_text, "icu_tokenize") // analyzer
+  LET search_text__wordboundmod_icu_toks = (
+    FOR tok IN search_text__icu_toks
+      RETURN REGEX_REPLACE(tok, ",.*", "") // commas cannot be escaped in fulltext search
+  )
+  LET search_text__fulltext = CONCAT_SEPARATOR(", ", // comma delimit
+    FOR tok IN search_text__wordboundmod_icu_toks // prepend "prefix:"
+      RETURN CONCAT("prefix:", tok)
+  )
+  FOR doc IN FULLTEXT(@@taxon_coll, @sciname_field, search_text__fulltext)
+    FILTER @ts ? doc.created <= @ts AND doc.expired >= @ts : true
+    FILTER doc.rank IN ["species", "strain"] OR doc.strain
+    LIMIT @offset ? @offset : 0, @limit ? @limit : 20
+    RETURN @select ? KEEP(doc, @select) : doc
diff --git a/spec/stored_queries/test/fetch_test_vertex.yaml b/spec/stored_queries/test/fetch_test_vertex.yaml
new file mode 100644
index 00000000..8845f4a1
--- /dev/null
+++ b/spec/stored_queries/test/fetch_test_vertex.yaml
@@ -0,0 +1,13 @@
+# Test query - fetch a single test vertex by ID
+name: fetch_test_vertex
+params:
+  type: object
+  required: [key]
+  properties:
+    key:
+      type: string
+      title: _key to match on
+query: |
+  FOR o IN test_vertex
+    FILTER o._key == @key
+    RETURN o
diff --git a/spec/stored_queries/test/list_test_vertices.yaml b/spec/stored_queries/test/list_test_vertices.yaml
new file mode 100644
index 00000000..5d027d78
--- /dev/null
+++ b/spec/stored_queries/test/list_test_vertices.yaml
@@ -0,0 +1,7 @@
+# Test query - List all test vertices
+# Has some simple auth against ws_ids
+name: list_test_vertices
+query: |
+  FOR o IN test_vertex
+    FILTER o.is_public || o.ws_id IN ws_ids
+    RETURN o
diff --git a/spec/stored_queries/ws/ws_fetch_related_data.yaml b/spec/stored_queries/ws/ws_fetch_related_data.yaml
new file mode 100644
index 00000000..67215233
--- /dev/null
+++ b/spec/stored_queries/ws/ws_fetch_related_data.yaml
@@ -0,0 +1,106 @@
+name: ws_fetch_related_data
+params:
+  type: object
+  required: [obj_key]
+  properties:
+    obj_key:
+      type: string
+      description: Key of the ws_object_version to search on
+query_prefix: WITH ws_type_version, ws_object, ws_workspace
+query: |
+  LET obj_id = concat('ws_object_version/', @obj_key)
+  FOR obj IN ws_object_version
+    FILTER obj._id == obj_id
+    LET prov = (
+      FOR v, e, p IN 1..10 ANY obj ws_prov_descendant_of
+        OPTIONS {bfs: true, uniqueVertices: 'global'}
+        LIMIT 1000
+        FILTER v && !v.deleted
+        // Check permissions
+        FILTER v.is_public || v.workspace_id IN ws_ids
+        // Fetch the type
+        LET t = FIRST(
+          FOR t IN 1 OUTBOUND v ws_obj_instance_of_type
+            LIMIT 1
+            RETURN t
+        )
+        // Fetch the parent unversioned object
+        LET parent = FIRST(
+          FOR parent IN 1 OUTBOUND v ws_version_of
+            LIMIT 1
+            RETURN parent
+        )
+        // Fetch the workspace for this object
+        LET ws = FIRST(
+          FOR ws IN 1 INBOUND parent ws_workspace_contains_obj
+            LIMIT 1
+            RETURN ws
+        )
+        RETURN {data: v, type: t, hops: COUNT(p.edges), ws}
+    )
+    LET refs = (
+      FOR v, e, p IN 1..10 ANY obj ws_refers_to
+        OPTIONS {bfs: true, uniqueVertices: 'global'}
+        LIMIT 1000
+        FILTER v && !v.deleted
+        // Check permissions
+        FILTER v.is_public || v.workspace_id IN ws_ids
+        // Fetch the type
+        LET t = FIRST(
+          FOR t IN 1 OUTBOUND v ws_obj_instance_of_type
+            LIMIT 1
+            RETURN t
+        )
+        // Fetch the parent unversioned object
+        LET parent = FIRST(
+          FOR parent IN 1 OUTBOUND v ws_version_of
+            LIMIT 1
+            RETURN parent
+        )
+        // Fetch the workspace for this object
+        LET ws = FIRST(
+          FOR ws IN 1 INBOUND parent ws_workspace_contains_obj
+            LIMIT 1
+            RETURN ws
+        )
+        RETURN {data: v, type: t, hops: COUNT(p.edges), ws}
+    )
+    LET copies = (
+      FOR v, e, p IN 1..10 ANY obj ws_copied_from
+        OPTIONS {bfs: true, uniqueVertices: 'global'}
+        LIMIT 1000
+        FILTER v && !v.deleted
+        // Check permissions
+        FILTER v.is_public || v.workspace_id IN ws_ids
+        // Fetch the type
+        LET t = FIRST(
+          FOR t IN 1 OUTBOUND v ws_obj_instance_of_type
+            LIMIT 1
+            RETURN t
+        )
+        // Fetch the parent unversioned object
+        LET parent = FIRST(
+          FOR parent IN 1 OUTBOUND v ws_version_of
+            LIMIT 1
+            RETURN parent
+        )
+        // Fetch the workspace for this object
+        LET ws = FIRST(
+          FOR ws IN 1 INBOUND parent ws_workspace_contains_obj
+            LIMIT 1
+            RETURN ws
+        )
+        RETURN {data: v, type: t, hops: COUNT(p.edges), ws}
+    )
+    LET type = FIRST(
+      FOR t IN 1 OUTBOUND obj ws_obj_instance_of_type
+        LIMIT 1
+        RETURN t
+    )
+    RETURN {
+      obj: obj,
+      obj_type: type,
+      copies: {data: copies, count: COUNT(copies)},
+      prov: {data: prov, count: COUNT(prov)},
+      refs: {data: refs, count: COUNT(refs)}
+    }
diff --git a/spec/stored_queries/wsprov/wsprov_count_linked_object_types.yaml b/spec/stored_queries/wsprov/wsprov_count_linked_object_types.yaml
new file mode 100644
index 00000000..0514b94b
--- /dev/null
+++ b/spec/stored_queries/wsprov/wsprov_count_linked_object_types.yaml
@@ -0,0 +1,60 @@
+# Fetch the counts by type of ws_objects that reference an object
+# Note: If both show_private and show_public are true, this will be treated as an OR
+name: wsprov_count_linked_object_types
+params:
+  type: object
+  required: [obj_key, type]
+  properties:
+    obj_key:
+      type: string
+      description: Key of the wsprov_object to search on
+    show_private:
+      type: boolean
+      description: limit to objects in workspaces that a user has access to
+      default: true
+    show_public:
+      type: boolean
+      description: limit to objects in public workspaces
+      default: true
+    type:
+      type: string
+      description: WS type to filter on
+    owners:
+      default: null
+      description: Array of usernames to filter by owner
+      anyOf:
+        - {type: "null"}
+        - type: array
+          items:
+            type: string
+query: |
+  WITH wsprov_object
+  LET obj_id = concat('wsprov_object/', @obj_key)

+  let out = (
+    for v, e, p in 1..100
+      OUTBOUND obj_id wsprov_links, wsprov_copied_into
+      OPTIONS {bfs: true, uniqueVertices: 'global'}
+      FILTER (!@type || v.ws_type == @type)
+      FILTER (!@owners || v.owner IN @owners)
+      FILTER (@show_private && @show_public)
+        ? (v.is_public || v.workspace_id IN ws_ids)
+        : (!@show_private || v.workspace_id IN ws_ids) && (!@show_public || v.is_public)
+      COLLECT type = v.ws_type with count into type_count
+      RETURN {type, type_count}
+  )

+  let inb = (
+    for v, e, p in 1..100
+      INBOUND obj_id wsprov_links, wsprov_copied_into
+      OPTIONS {bfs: true, uniqueVertices: 'global'}
+      FILTER (!@type || v.ws_type == @type)
+      FILTER (!@owners || v.owner IN @owners)
+      FILTER (@show_private && @show_public)
+        ? (v.is_public || v.workspace_id IN ws_ids)
+        : (!@show_private || v.workspace_id IN ws_ids) && (!@show_public || v.is_public)
+      COLLECT type = v.ws_type with count into type_count
+      RETURN {type, type_count}
+  )

+  return {out, inb}
diff --git a/spec/stored_queries/wsprov/wsprov_fetch_copies.yaml b/spec/stored_queries/wsprov/wsprov_fetch_copies.yaml
new file mode 100644
index 00000000..49d1e1be
--- /dev/null
+++ b/spec/stored_queries/wsprov/wsprov_fetch_copies.yaml
@@ -0,0 +1,48 @@
+# For a given object, fetch the objects that it has been copied from or
+# to, up to three levels deep (copies of copies of copies, forward or backward).
+# Paths that pass through a Taxon object are excluded (see the filter below).
+# Note: If both show_private and show_public are true, this will be treated as an OR
+name: wsprov_fetch_copies
+params:
+  type: object
+  required: [obj_key]
+  properties:
+    obj_key:
+      type: string
+      description: wsprov_object key to find links for
+    show_private:
+      type: boolean
+      description: limit to objects in workspaces that a user has access to
+      default: true
+    show_public:
+      type: boolean
+      description: limit to objects in public workspaces
+      default: true
+    result_limit:
+      default: 10
+      type: integer
+      description: result limit
+    offset:
+      default: 0
+      type: integer
+      description: result offset
+    types:
+      default: null
+      description: Optional array of WS types to filter on
+      anyOf:
+        - {type: 'null'}
+        - type: array
+          items:
+            type: string
+query: |
+  WITH wsprov_object
+  LET obj_id = CONCAT('wsprov_object/', @obj_key)
+  FOR v, e, p IN 1..3 ANY obj_id wsprov_copied_into
+    OPTIONS {uniqueVertices: 'global', bfs: true}
+    LET simple_type = SPLIT(SPLIT(v.ws_type, '-', 1)[0], '.')[1]
+    FILTER (@show_private && @show_public) ? (v.is_public || v.workspace_id IN ws_ids) :
+      (!@show_private || v.workspace_id IN ws_ids) && (!@show_public || v.is_public)
+    FILTER (!@types || simple_type IN @types)
+    FILTER p.vertices[*].ws_type none == "KBaseGenomeAnnotations.Taxon-1.0"
+    LIMIT @offset, @result_limit
+    RETURN v
diff --git a/spec/stored_queries/wsprov/wsprov_fetch_linked_objects.yaml b/spec/stored_queries/wsprov/wsprov_fetch_linked_objects.yaml
new file mode 100644
index 00000000..4d1533db
--- /dev/null
+++ b/spec/stored_queries/wsprov/wsprov_fetch_linked_objects.yaml
@@ -0,0 +1,95 @@
+# Find all objects linked to a given object
+# *** if both show_private and show_public are true, this will be treated as an OR ***
+name: wsprov_fetch_linked_objects
+params:
+  type: object
+  required: [obj_key]
+  properties:
+    obj_key:
+      type: string
+      description: key of the wsprov_object to find links for
+    show_private:
+      type: boolean
+      description: limit to objects in workspaces that a user has access to
+      default: true
+    show_public:
+      type: boolean
+      description: limit to objects in public workspaces
+      default: true
+    types:
+      description: list of ws types to filter on (set as null to disable)
+      default: null
+      anyOf:
+        - {type: 'null'}
+        - type: array
+          items: {type: string}
+    owners:
+      description: list of usernames to filter by owner (set as null to disable)
+      default: null
+      anyOf:
+        - {type: 'null'}
+        - type: array
+          items: {type: string}
+    results_limit:
+      default: 10
+      type: integer
+      description: limit of total results
+    offset:
+      default: 0
+      type: integer
+      description: result offset
+query: |
+  WITH wsprov_object
+  LET obj_id = CONCAT("wsprov_object/", @obj_key)

+  let out = (
+    FOR v, e, p IN 1..100
+      OUTBOUND obj_id wsprov_links, wsprov_copied_into
+      OPTIONS {uniqueVertices: "global", bfs: true}
+      FILTER (!@types || v.ws_type IN @types)
+      FILTER (!@owners || v.owner IN @owners)
+      FILTER (@show_private && @show_public)
+        ? (v.is_public || v.workspace_id IN ws_ids)
+        : (!@show_private || v.workspace_id IN ws_ids) && (!@show_public || v.is_public)
+      LIMIT @offset, @results_limit
+      RETURN {
+        vertex: {
+          _key: v._key,
+          is_public: v.is_public,
+          narr_name: v.narr_name,
+          obj_name: v.obj_name,
+          owner: v.owner,
+          save_date: v.save_date,
+          workspace_id: v.workspace_id,
+          ws_type: v.ws_type
+        },
+        path: p
+      }
+  )

+  let inb = (
+    FOR v, e, p IN 1..100
+      INBOUND obj_id wsprov_links, wsprov_copied_into
+      OPTIONS {uniqueVertices: "global", bfs: true}
+      FILTER (!@types || v.ws_type IN @types)
+      FILTER (!@owners || v.owner IN @owners)
+      FILTER (@show_private && @show_public)
+        ? (v.is_public || v.workspace_id IN ws_ids)
+        : (!@show_private || v.workspace_id IN ws_ids) && (!@show_public || v.is_public)
+      LIMIT @offset, @results_limit
+      RETURN {
+        vertex: {
+          _key: v._key,
+          is_public: v.is_public,
+          narr_name: v.narr_name,
+          obj_name: v.obj_name,
+          owner: v.owner,
+          save_date: v.save_date,
+          workspace_id: v.workspace_id,
+          ws_type: v.ws_type
+        },
+        path: p
+      }
+  )

+  return APPEND(out, inb)
diff --git a/spec/stored_queries/wsprov/wsprov_fetch_obj_field.yaml b/spec/stored_queries/wsprov/wsprov_fetch_obj_field.yaml
new file mode 100644
index 00000000..2d41de92
--- /dev/null
+++ b/spec/stored_queries/wsprov/wsprov_fetch_obj_field.yaml
@@ -0,0 +1,20 @@
+# Fetch a given field for each of an array of object ids
+name: wsprov_fetch_obj_field
+params:
+  type: object
+  required: [prop, obj_ids]
+  properties:
+    prop:
+      type: string
+      description: property name that you want to fetch
+    obj_ids:
+      type: array
+      items:
+        type: string
+      description: array of object ids
+query: |
+  with wsprov_object
+  for o in wsprov_object
+    filter o._id in @obj_ids
+    filter o.is_public || o.workspace_id IN ws_ids
+    return {key: o._key, @prop: o[@prop]}
diff --git a/spec/stored_queries/wsprov/wsprov_fetch_object.yaml b/spec/stored_queries/wsprov/wsprov_fetch_object.yaml
new file mode 100644
index 00000000..3f25a0b1
--- /dev/null
+++ b/spec/stored_queries/wsprov/wsprov_fetch_object.yaml
@@ -0,0 +1,14 @@
+# Fetch a wsprov_object
+name: wsprov_fetch_object
+params:
+  type: object
+  required: [key]
+  properties:
+    key:
+      type: string
+      description: key of the object to fetch
+query: |
+  for o in wsprov_object
+    filter o._key == @key
+    filter o.is_public || (o.workspace_id IN ws_ids)
+    return o
diff --git a/spec/stored_queries/wsprov/wsprov_fetch_paths_between_objects.yaml b/spec/stored_queries/wsprov/wsprov_fetch_paths_between_objects.yaml
new file mode 100644
index 00000000..cdb6bc06
--- /dev/null
+++ b/spec/stored_queries/wsprov/wsprov_fetch_paths_between_objects.yaml
@@ -0,0 +1,34 @@
+# Fetch the paths between two wsprov_objects in the RE
+# *** if both show_private and show_public are true this will be treated as an OR ***
+name: wsprov_fetch_paths_between_objects
+params:
+  type: object
+  required: [start_key, end_key]
+  properties:
+    start_key:
+      type: string
+      description: key of the object to start from
+    end_key:
+      type: string
+      description: key of the object to terminate with
+    show_private:
+      type: boolean
+      description: if present, limit to objects in workspaces that a user has access to
+      default: true
+    show_public:
+      type: boolean
+      description: if present, limit to objects in public workspaces
+      default: true
+    max_depth:
+      default: 10
+      type: integer
+      description: longest path to explore
+query: |
+  WITH wsprov_object
+  FOR v, e, path IN 1..@max_depth
+    ANY CONCAT('wsprov_object/', @start_key) wsprov_links
+    OPTIONS {'uniqueVertices': 'path', 'uniqueEdges': 'path'}
+    FILTER (@show_private && @show_public) ? (v.is_public || v.workspace_id IN ws_ids) :
+      (!@show_private || v.workspace_id IN ws_ids) && (!@show_public || v.is_public)
+    FILTER v._key == @end_key
+    RETURN path
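The show_private/show_public visibility check above recurs in nearly every wsprov query in this diff: when both flags are true the check becomes an OR, otherwise each enabled flag must pass on its own. The same truth table in Python:

```python
# Python restatement of the recurring show_private/show_public filter.
def visible(obj: dict, ws_ids: set, show_private: bool, show_public: bool) -> bool:
    if show_private and show_public:
        # Both flags on: object is visible if it passes either check
        return obj["is_public"] or obj["workspace_id"] in ws_ids
    # Otherwise each enabled flag must pass independently
    return ((not show_private or obj["workspace_id"] in ws_ids)
            and (not show_public or obj["is_public"]))
```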
diff --git a/spec/stored_queries/wsprov/wsprov_fetch_references.yaml b/spec/stored_queries/wsprov/wsprov_fetch_references.yaml
new file mode 100644
index 00000000..50dbf695
--- /dev/null
+++ b/spec/stored_queries/wsprov/wsprov_fetch_references.yaml
@@ -0,0 +1,26 @@
+# Fetch inbound references for an object, with ACL checks
+name: wsprov_fetch_references
+params:
+  type: object
+  required: [obj_key]
+  properties:
+    obj_key:
+      type: string
+      description: wsprov_object ._key field that you want to query against
+    result_limit:
+      default: 10
+      type: integer
+      description: limit of object results
+    offset:
+      default: 0
+      type: integer
+      description: result offset for pagination
+query: |
+  with wsprov_object
+  let obj_id = concat('wsprov_object/', @obj_key)
+  for v, e, p in 1..100 inbound obj_id wsprov_links
+    options {bfs: true, uniqueVertices: 'global'}
+    filter p.edges[*].type all == 'reference'
+    filter v.is_public || v.workspace_id IN ws_ids
+    limit @offset, @result_limit
+    return v
diff --git a/spec/stored_queries/wsprov/wsprov_list_referencing_type_counts.yaml b/spec/stored_queries/wsprov/wsprov_list_referencing_type_counts.yaml
new file mode 100644
index 00000000..ec6232f2
--- /dev/null
+++ b/spec/stored_queries/wsprov/wsprov_list_referencing_type_counts.yaml
@@ -0,0 +1,52 @@
+# Fetch the counts by type of ws_objects that reference an object
+# *** if both show_private and show_public are true this will be treated as an OR ***
+name: wsprov_list_referencing_type_counts
+params:
+  type: object
+  required: [key]
+  properties:
+    key:
+      type: string
+      description: key of the object to start from
+    show_private:
+      type: boolean
+      description: if present, limit to objects in workspaces that a user has access to
+      default: true
+    show_public:
+      type: boolean
+      description: if present, limit to objects in public workspaces
+      default: true
+    owners:
+      description: if present, limit to objects with owner in list
+      default: null
+      anyOf:
+        - {type: 'null'}
+        - type: array
+          items: {type: string}
+    simplify_type:
+      type: boolean
+      description: if true, strip out the module and version before collecting
+      default: false
+query: |
+  WITH wsprov_object
+  LET ws_objects = (
+    FOR v IN 1..10
+      INBOUND CONCAT('wsprov_object/', @key) wsprov_links
+      OPTIONS {'uniqueVertices': 'global', 'bfs':true}
+      // If both show_private and show_public, return if either is true
+      FILTER (@show_private && @show_public) ? (v.is_public || v.workspace_id IN ws_ids) :
+        (!@show_private || v.workspace_id IN ws_ids) && (!@show_public || v.is_public)
+      FILTER (!@owners || v.owner IN @owners)
+      return v
+  )
+  LET types = (
+    FOR v in ws_objects
+      LET ws_type = @simplify_type ? SPLIT(SPLIT(v.ws_type, '-', 1)[0], '.')[1] : v.ws_type
+      COLLECT type = ws_type WITH COUNT INTO type_count
+      RETURN {type, type_count}
+  )
+  LET narrs = (
+    FOR v in ws_objects
+      RETURN DISTINCT v.narr_name
+  )
+  RETURN PUSH(types, {'type': 'Narrative', 'type_count': COUNT(narrs)})
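The nested SPLIT() calls above reduce a versioned workspace type string to its bare type name before counting. A Python equivalent (the example type string is illustrative):

```python
# Python equivalent of the AQL SPLIT(SPLIT(v.ws_type, '-', 1)[0], '.')[1]
# used by simplify_type in wsprov_list_referencing_type_counts.
def simplify_ws_type(ws_type: str) -> str:
    """'KBaseGenomes.Genome-2.1' -> 'Genome'"""
    unversioned = ws_type.split("-", 1)[0]  # strip the "-2.1" version suffix
    return unversioned.split(".")[1]        # strip the "KBaseGenomes." module prefix


assert simplify_ws_type("KBaseGenomes.Genome-2.1") == "Genome"
```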
diff --git a/spec/stored_query_schema.yaml b/spec/stored_query_schema.yaml
new file mode 100644
index 00000000..62ce4d74
--- /dev/null
+++ b/spec/stored_query_schema.yaml
@@ -0,0 +1,20 @@
+name: stored_query_schema
+type: object
+required: ['query', 'name']
+properties:
+  name:
+    type: string
+  title:
+    type: string
+  description:
+    type: string
+  params:
+    type: object
+  query_prefix:
+    type: string
+  query:
+    type: string
+  $schema:
+    type: string
+    format: uri
+additionalProperties: false
diff --git a/spec/test/__init__.py b/spec/test/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/spec/test/collections/__init__.py b/spec/test/collections/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/spec/test/collections/test_djornl.py b/spec/test/collections/test_djornl.py
new file mode 100644
index 00000000..a517098c
--- /dev/null
+++ b/spec/test/collections/test_djornl.py
@@ -0,0 +1,67 @@
+"""
+Tests for the Dan Jacobson ORNL Arabidopsis collection schemas.
+
+Tests to ensure that specific elements of the collection schemas validate correctly.
+
+These tests run within the re_api docker image.
+"""
+import unittest
+from os.path import join as os_path_join
+from relation_engine_server.utils.config import get_config
+from relation_engine_server.utils.spec_loader import get_schema
+from relation_engine_server.utils.json_validation import get_schema_validator
+from jsonschema.exceptions import ValidationError
+
+_BASE_DIR = os_path_join("/app", "spec")
+
+
+class Test_DJORNL_Collections(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.maxDiff = None
+        cls.config = get_config()
+        cls.repo_path = cls.config["spec_paths"]["root"]
+        for key in cls.config["spec_paths"].keys():
+            if cls.repo_path in cls.config["spec_paths"][key]:
+                cls.config["spec_paths"][key] = cls.config["spec_paths"][key].replace(
+                    cls.repo_path, _BASE_DIR
+                )
+
+    @classmethod
+    def tearDownClass(cls):
+        # undo all the config changes
+        for key in cls.config["spec_paths"].keys():
+            if _BASE_DIR in cls.config["spec_paths"][key]:
+                cls.config["spec_paths"][key] = cls.config["spec_paths"][key].replace(
+                    _BASE_DIR, cls.repo_path
+                )
+
+    def test_node(self):
+        """ensure node data validates correctly"""
+
+        schema_file = get_schema("collection", "djornl_node", path_only=True)
+        validator = get_schema_validator(schema_file=schema_file, validate_at="/schema")
+
+        test_data = [
+            {
+                "data": {"_key": "AT1G01010", "go_terms": ["GO:0003700", "GO:0003677"]},
+                "valid": True,
+            },
+            {
+                "data": {"_key": "ABCDE", "node_type": "vertex"},
+                "valid": False,
+                "error": "'vertex' is not valid under any of the given schemas",
+            },
+            {
+                "data": {"_key": "ABCDE", "clusters": ["GO:0003700", "GO:0003700"]},
+                "valid": False,
+                "error": "\\['GO:0003700', 'GO:0003700'\\] has non-unique elements",
+            },
+        ]
+
+        for test in test_data:
+            if test["valid"]:
+                self.assertTrue(validator.is_valid(test["data"]))
+            else:
+                with self.assertRaisesRegex(ValidationError, test["error"]):
+                    validator.validate(test["data"])
diff --git a/spec/test/collections/test_silva.py b/spec/test/collections/test_silva.py
new file mode 100644
index 00000000..267db322
--- /dev/null
+++
b/spec/test/collections/test_silva.py @@ -0,0 +1,299 @@ +import re +from jsonschema.exceptions import ValidationError +from relation_engine_server.utils.json_validation import get_schema_validator +import os +import unittest + +cwd = os.path.dirname(os.path.abspath(__file__)) +yaml_drpth = os.path.join(cwd, "../../collections/silva") +node_yaml_flpth = os.path.join(yaml_drpth, "silva_taxon.yaml") +edge_yaml_flpth = os.path.join(yaml_drpth, "silva_child_of_taxon.yaml") + + +class SILVATreeJSONSchemaTest(unittest.TestCase): + """ + Test the API of the nodes and edges representing SILVA taxonomy tree + All information is from SILVA (arb-silva.de) + See their documentation for more details + """ + + @classmethod + def setUpClass(cls): + cls.validator_node = get_schema_validator( + schema_file=node_yaml_flpth, validate_at="/schema" + ) + cls.validator_edge = get_schema_validator( + schema_file=edge_yaml_flpth, validate_at="/schema" + ) + + cls.nodes_valid = [ + { + "id": "0", # Root's info is assigned by API, since SILVA doesn't seem to have a root node + "name": "Root", + "rank": "root_rank", + }, + { + "id": "2", + "name": "Archea", + "rank": "domain", + }, + { + "id": "47023", + "name": "BCP clade", + "rank": "major_clade", + "release": 138, + }, + { + "id": "42919", + "name": "Asgardarchaeota", + "rank": "phylum", + "release": 138, + }, + { + "id": "4155", + "name": "Amb-18S-504", + "rank": "order", + "release": 119.1, + }, + { + "id": "47162", + "name": "Japygoidea", + "rank": "superfamily", + "release": 138, + }, + { + "id": "47142", + "name": "Tantulocarida", + "rank": "subclass", + "release": 138, + }, + { + "id": "HM032797.1.1344", + "name": "Yeosuana aromativorans", + "rank": "sequence", + "sequence": "gattaca", + "dataset": ["parc", "ref", "nr99"], + }, + { + "id": "CRQV01000019.5091.6588", + "name": "Streptococcus penumoniae", + "rank": "sequence", + "sequence": "gattaca", + "dataset": ["parc", "ref"], # actually in nr99 + }, + { + "id": "HQ216288.1.1242", + "name": "uncultured bacterium", + "rank": "sequence", + "sequence": "gattaca", + "dataset": ["parc"], # actually in nr99 + }, + ] + + cls.nodes_invalid_errors = [ + ( + { + # missing + "id": "id", + "name": "name", + }, + "'rank' is a required property", + ), + ( + { + # missing + "id": "id", + "rank": "kingdom", + }, + "'name' is a required property", + ), + ( + { + # missing + "name": "name", + "rank": "major_clade", + }, + "'id' is a required property", + ), + ( + { + # type + "id": 1, + "name": "name", + "rank": "subphylum", + }, + "1 is not of type 'string'", + ), + ( + { + # type + "id": "id", + "name": 1, + "rank": "subkingdom", + }, + "1 is not of type 'string'", + ), + ( + { + # type + "id": "id", + "name": "name", + "rank": 1, + }, + "1 is not of type 'string'", + ), + ( + { + # type + "id": "id", + "name": "name", + "rank": "infraphylum", + "release": "119", + }, + "'119' is not of type 'number'", + ), + ( + { + # type + "id": "id", + "name": "name", + "rank": "sequence", + "sequence": 1, + }, + "1 is not of type 'string'", + ), + ( + { + # type + "id": "id", + "name": "name", + "rank": "subphylum", + "dataset": 1, + }, + "1 is not of type 'array'", + ), + ( + { + # enum + "id": "id", + "name": "name", + "rank": "fictional_rank", + }, + "'fictional_rank' is not one of ['superfamily', 'subphylum', 'subfamily', " + + "'phylum', 'order', 'major_clade', 'infraclass', 'suborder', 'family', " + + "'superkingdom', 'domain', 'superphylum', 'superorder', 'superclass', " + + "'infraphylum', 'subclass', 'genus', 'class', 
'kingdom', 'subkingdom', " + + "'root_rank', 'sequence']", + ), + ( + { + # enum + "id": "id", + "name": "name", + "rank": "superclass", + "dataset": ["nr99", "ref", "parc"], # array in wrong order + }, + "['nr99', 'ref', 'parc'] is not one of [['parc'], ['parc', 'ref'], ['parc', 'ref', 'nr99']]", + ), + ] + + cls.edges_valid = [ + { + "id": "2", + "from": "2", + "to": "0", + }, + { + "id": "42919", + "from": "42919", + "to": "2", + }, + { + "id": "HM032797.1.1344", + "from": "HM032797.1.1344", + "to": "44300", + }, + { + "id": "CRQV01000019.5091.6588", + "from": "CRQV01000019.5091.6588", + "to": "1853", + }, + ] + + cls.edges_invalid_errors = [ + ( + { + # missing + "from": "2", + "to": "0", + }, + "'id' is a required property", + ), + ( + { + # missing + "id": "2", + "to": "0", + }, + "'from' is a required property", + ), + ( + { + # missing + "id": "2", + "from": "2", + }, + "'to' is a required property", + ), + ( + { + # type + "id": 2, + "from": "2", + "to": "0", + }, + "2 is not of type 'string'", + ), + ( + { + # type + "id": "2", + "from": 2, + "to": "0", + }, + "2 is not of type 'string'", + ), + ( + { + # type + "id": "2", + "from": "2", + "to": 0, + }, + "0 is not of type 'string'", + ), + ] + + def _test_type(self, validator, insts_valid, insts_invalid_errors): + for inst in insts_valid: + with self.subTest(inst=inst): + validator.validate(inst) + + for inst, err_expected in insts_invalid_errors: + with self.subTest(inst=inst): + with self.assertRaisesRegex( + ValidationError, "^" + re.escape(err_expected) + "\n" + ): + validator.validate(inst) + + def test(self): + self._test_type( + self.validator_node, self.nodes_valid, self.nodes_invalid_errors + ) + self._test_type( + self.validator_edge, self.edges_valid, self.edges_invalid_errors + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/spec/test/data/ncbi_taxon.json b/spec/test/data/ncbi_taxon.json new file mode 100644 index 00000000..31866edb --- /dev/null +++ b/spec/test/data/ncbi_taxon.json @@ -0,0 +1,2426 @@ +[ + { + "_key": "863172_2021-02-01", + "_id": "ncbi_taxon/863172_2021-02-01", + "_rev": "_b2kgpUK---", + "id": "863172", + "scientific_name": "Influenza A virus (A/Pavia/2789/2009(H3N2))", + "rank": "no rank", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 863172, + "gencode": 1, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "1964177_2018-11-01", + "_id": "ncbi_taxon/1964177_2018-11-01", + "_rev": "_b2nMRlW--_", + "id": "1964177", + "scientific_name": "Inga virgultosa", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "authority", + "name": "Inga virgultosa (Vahl) Desv., 1826" + } + ], + "ncbi_taxon_id": 1964177, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "10247_2021-02-01", + "_id": "ncbi_taxon/10247_2021-02-01", + "_rev": "_b2i7Ehu---", + "id": "10247", + "scientific_name": "Vaccinia virus WR 65-16", + "rank": "no rank", + "strain": true, + "aliases": [ + { + "category": "equivalent name", + "name": "Vaccinia virus (strain WR 65-16)" + } + ], + "ncbi_taxon_id": 10247, + "gencode": 1, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + 
"release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "1863603_2018-11-01", + "_id": "ncbi_taxon/1863603_2018-11-01", + "_rev": "_b2m9Oj2--_", + "id": "1863603", + "scientific_name": "Cicadellidae sp. BOLD:ACL9911", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1863603, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "2314225_2018-11-01", + "_id": "ncbi_taxon/2314225_2018-11-01", + "_rev": "_b2nig46--_", + "id": "2314225", + "scientific_name": "Rhamphomyia sp. BIOUG24867-G07", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 2314225, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "279812_2018-11-01", + "_id": "ncbi_taxon/279812_2018-11-01", + "_rev": "_b2jVMUW--H", + "id": "279812", + "scientific_name": "Roseobacter sp. YS-57", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 279812, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1627_2021-02-01", + "_id": "ncbi_taxon/1627_2021-02-01", + "_rev": "_b2i6Xii---", + "id": "1627", + "scientific_name": "Lactobacillus sp. 'thermophilus'", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "authority", + "name": "\"Lactobacillus thermophilus\" Ayers and Johnson" + }, + { + "category": "synonym", + "name": "'Lactobacillus thermophilus'" + } + ], + "ncbi_taxon_id": 1627, + "gencode": 11, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "184426_2018-11-01", + "_id": "ncbi_taxon/184426_2018-11-01", + "_rev": "_b2jLY96--_", + "id": "184426", + "scientific_name": "environmental samples", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 184426, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "329529_2018-11-01", + "_id": "ncbi_taxon/329529_2018-11-01", + "_rev": "_b2jZwWS--_", + "id": "329529", + "scientific_name": "environmental samples", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 329529, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "2233698_2020-03-01", + "_id": "ncbi_taxon/2233698_2020-03-01", + "_rev": "_b2ngQMq--D", + "id": "2233698", + "scientific_name": "Porphyrogenes calathana", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "authority", + "name": "Ocyba calathana (Hewitson, 1868)" + }, + { + "category": "authority", + "name": "Porphyrogenes calathana (Hewitson, 1868)" + }, + { + "category": "synonym", + "name": "Ocyba calathana" + }, + { + 
"category": "includes", + "name": "Porphyrogenes sp. 1 WL-2018" + } + ], + "ncbi_taxon_id": 2233698, + "gencode": 1, + "first_version": "2020-03-01", + "last_version": "2021-02-01", + "created": 1584487952760, + "expired": 9007199254740991, + "release_created": 1583020800000, + "release_expired": 9007199254740991 + }, + { + "_key": "1231195_2018-11-01", + "_id": "ncbi_taxon/1231195_2018-11-01", + "_rev": "_b2ld-Iq--_", + "id": "1231195", + "scientific_name": "environmental samples", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 1231195, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1232375_2018-11-01", + "_id": "ncbi_taxon/1232375_2018-11-01", + "_rev": "_b2ld-iW--_", + "id": "1232375", + "scientific_name": "Streptomyces sp. 11719", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1232375, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1359250_2018-11-01", + "_id": "ncbi_taxon/1359250_2018-11-01", + "_rev": "_b2l4Yau--B", + "id": "1359250", + "scientific_name": "Calliandra calycina", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "authority", + "name": "Calliandra calycina Benth." + } + ], + "ncbi_taxon_id": 1359250, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "2452716_2018-12-01", + "_id": "ncbi_taxon/2452716_2018-12-01", + "_rev": "_b2nktie--_", + "id": "2452716", + "scientific_name": "Brachystomellidae sp. 
BIOUG28261-E12", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 2452716, + "gencode": 1, + "first_version": "2018-12-01", + "last_version": "2021-02-01", + "created": 1543622460000, + "expired": 9007199254740991, + "release_created": 1543622400000, + "release_expired": 9007199254740991 + }, + { + "_key": "576250_2018-11-01", + "_id": "ncbi_taxon/576250_2018-11-01", + "_rev": "_b2kAI_a--B", + "id": "576250", + "scientific_name": "environmental samples", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 576250, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1899338_2018-11-01", + "_id": "ncbi_taxon/1899338_2018-11-01", + "_rev": "_b2nAhaK--_", + "id": "1899338", + "scientific_name": "Hydrocotyle hydrophila", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "authority", + "name": "Hydrocotyle hydrophila Petrie" + } + ], + "ncbi_taxon_id": 1899338, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1449683_2018-11-01", + "_id": "ncbi_taxon/1449683_2018-11-01", + "_rev": "_b2mMzWe--_", + "id": "1449683", + "scientific_name": "environmental samples", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 1449683, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "562_2021-02-01", + "_id": "ncbi_taxon/562_2021-02-01", + "_rev": "_b2i6X-W--C", + "id": "562", + "scientific_name": "Escherichia coli", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "type material", + "name": "ATCC 11775" + }, + { + "category": "type material", + "name": "CCUG 24" + }, + { + "category": "type material", + "name": "CCUG 29300" + }, + { + "category": "type material", + "name": "CIP 54.8" + }, + { + "category": "type material", + "name": "DSM 30083" + }, + { + "category": "type material", + "name": "IAM 12119" + }, + { + "category": "type material", + "name": "JCM 1649" + }, + { + "category": "type material", + "name": "LMG 2092" + }, + { + "category": "type material", + "name": "LMG:2092" + }, + { + "category": "type material", + "name": "NBRC 102203" + }, + { + "category": "type material", + "name": "NCCB 54008" + }, + { + "category": "type material", + "name": "NCTC 9001" + }, + { + "category": "type material", + "name": "strain U5/41" + }, + { + "category": "authority", + "name": "\"Bacillus coli\" Migula 1895" + }, + { + "category": "authority", + "name": "\"Bacterium coli commune\" Escherich 1885" + }, + { + "category": "authority", + "name": "\"Bacterium coli\" (Migula 1895) Lehmann and Neumann 1896" + }, + { + "category": "authority", + "name": "Escherichia coli (Migula 1895) Castellani and Chalmers 1919" + }, + { + "category": "synonym", + "name": "Bacillus coli" + }, + { + "category": "synonym", + "name": "Bacterium coli commune" + }, + { + "category": "synonym", + "name": "Bacterium coli" + }, + { + "category": "synonym", + "name": "Enterococcus coli" + }, + { + "category": "includes", + "name": "bacterium 10a" + }, + { + "category": "includes", 
+ "name": "bacterium E3" + }, + { + "category": "includes", + "name": "Escherichia sp. 3_2_53FAA" + }, + { + "category": "includes", + "name": "Escherichia sp. MAR" + }, + { + "category": "common name", + "name": "E. coli" + }, + { + "category": "equivalent name", + "name": "Escherichia/Shigella coli" + } + ], + "ncbi_taxon_id": 562, + "gencode": 11, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "134371_2018-11-01", + "_id": "ncbi_taxon/134371_2018-11-01", + "_rev": "_b2jG1wS--D", + "id": "134371", + "scientific_name": "Influenza A virus PX8-XIII(A/USSR/90/77(H1N1)xA/Pintail Duck/Primorie/695/76(H2N3))", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 134371, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "699252_2020-03-01", + "_id": "ncbi_taxon/699252_2020-03-01", + "_rev": "_b2kULKW--B", + "id": "699252", + "scientific_name": "environmental samples", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 699252, + "gencode": 11, + "first_version": "2020-03-01", + "last_version": "2021-02-01", + "created": 1584487952760, + "expired": 9007199254740991, + "release_created": 1583020800000, + "release_expired": 9007199254740991 + }, + { + "_key": "576326_2018-11-01", + "_id": "ncbi_taxon/576326_2018-11-01", + "_rev": "_b2kAICO--_", + "id": "576326", + "scientific_name": "Sobarocephala atricornis", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "authority", + "name": "Sobarocephala atricornis Sabrosky, 1974" + } + ], + "ncbi_taxon_id": 576326, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "446434_2018-11-01", + "_id": "ncbi_taxon/446434_2018-11-01", + "_rev": "_b2jq3Su--_", + "id": "446434", + "scientific_name": "Cyrnus flavidus", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "authority", + "name": "Cyrnus flavidus McLachlan, 1864" + } + ], + "ncbi_taxon_id": 446434, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "364520_2018-11-01", + "_id": "ncbi_taxon/364520_2018-11-01", + "_rev": "_b2jee32--_", + "id": "364520", + "scientific_name": "environmental samples", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 364520, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "2653369_2020-03-01", + "_id": "ncbi_taxon/2653369_2020-03-01", + "_rev": "_b2nyp1i--_", + "id": "2653369", + "scientific_name": "Typhlodromus ernesti", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "authority", + "name": "Typhlodromus ernesti Ragusa & Swirski, 1978" + }, + { + "category": "synonym", + "name": "Typhlodromus (Typhlodromus) ernesti" + } + ], + "ncbi_taxon_id": 2653369, + 
"gencode": 1, + "first_version": "2020-03-01", + "last_version": "2021-02-01", + "created": 1584487952760, + "expired": 9007199254740991, + "release_created": 1583020800000, + "release_expired": 9007199254740991 + }, + { + "_key": "88974_2018-11-01", + "_id": "ncbi_taxon/88974_2018-11-01", + "_rev": "_b2jDItq--F", + "id": "88974", + "scientific_name": "uncultured alpha proteobacterium DCM-FREE-27", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 88974, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1794445_2018-11-01", + "_id": "ncbi_taxon/1794445_2018-11-01", + "_rev": "_b2m1vLe--D", + "id": "1794445", + "scientific_name": "Bacillus sp. 987B6_12ACASO", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1794445, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "82145_2018-11-01", + "_id": "ncbi_taxon/82145_2018-11-01", + "_rev": "_b2jCIFS--B", + "id": "82145", + "scientific_name": "Balanopaceae", + "rank": "family", + "strain": false, + "aliases": [ + { + "category": "authority", + "name": "Balanopaceae Benth. & Hook.f., 1880" + } + ], + "ncbi_taxon_id": 82145, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "2558586_2019-05-01", + "_id": "ncbi_taxon/2558586_2019-05-01", + "_rev": "_b2nrQEe--_", + "id": "2558586", + "scientific_name": "Streptococcus satellite phage Javan236", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 2558586, + "gencode": 11, + "first_version": "2019-05-01", + "last_version": "2021-02-01", + "created": 1556668860000, + "expired": 9007199254740991, + "release_created": 1556668800000, + "release_expired": 9007199254740991 + }, + { + "_key": "1194471_2018-11-01", + "_id": "ncbi_taxon/1194471_2018-11-01", + "_rev": "_b2lTKk6--D", + "id": "1194471", + "scientific_name": "Leptothrix sp. 
FH_36", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1194471, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "206622_2018-11-01", + "_id": "ncbi_taxon/206622_2018-11-01", + "_rev": "_b2jNhmm--B", + "id": "206622", + "scientific_name": "Cymopterus beckii", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "authority", + "name": "Cymopterus beckii S.L.Welsh & Goodrich" + } + ], + "ncbi_taxon_id": 206622, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "2689249_2021-02-01", + "_id": "ncbi_taxon/2689249_2021-02-01", + "_rev": "_b2n1hOG--I", + "id": "2689249", + "scientific_name": "Troglohyphantes jamatus", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "authority", + "name": "Troglohyphantes jamatus Roewer, 1931" + } + ], + "ncbi_taxon_id": 2689249, + "gencode": 1, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "844482_2018-11-01", + "_id": "ncbi_taxon/844482_2018-11-01", + "_rev": "_b2kgBA6--B", + "id": "844482", + "scientific_name": "Lepidoptera sp. BOLD:AAF9521", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 844482, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "406715_2018-11-01", + "_id": "ncbi_taxon/406715_2018-11-01", + "_rev": "_b2jmSNy--_", + "id": "406715", + "scientific_name": "Leptofauchea chiloensis", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 406715, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "179932_2018-11-01", + "_id": "ncbi_taxon/179932_2018-11-01", + "_rev": "_b2jLY-y--F", + "id": "179932", + "scientific_name": "Avenionia", + "rank": "genus", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 179932, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1378707_2018-11-01", + "_id": "ncbi_taxon/1378707_2018-11-01", + "_rev": "_b2l5hDu--_", + "id": "1378707", + "scientific_name": "Impatiens cf. 
drepanophora Suksathan 4681", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1378707, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "11566_2021-02-01", + "_id": "ncbi_taxon/11566_2021-02-01", + "_rev": "_b2i7S9y---", + "id": "11566", + "scientific_name": "Influenza C virus (C/PIG/Beijing/439/1982)", + "rank": "no rank", + "strain": true, + "aliases": [ + { + "category": "equivalent name", + "name": "Influenza C virus (STRAIN C/PIG/BEIJING/439/82)" + } + ], + "ncbi_taxon_id": 11566, + "gencode": 1, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "1388837_2018-11-01", + "_id": "ncbi_taxon/1388837_2018-11-01", + "_rev": "_b2l8ENu--B", + "id": "1388837", + "scientific_name": "Amynthas sp. GD201106-05", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1388837, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "707363_2018-11-01", + "_id": "ncbi_taxon/707363_2018-11-01", + "_rev": "_b2kUNni--B", + "id": "707363", + "scientific_name": "Ulva sp. EE2", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 707363, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "2412393_2018-11-01", + "_id": "ncbi_taxon/2412393_2018-11-01", + "_rev": "_b2nkL0m--D", + "id": "2412393", + "scientific_name": "Psychodidae sp. 
BIOUG23100-F11", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 2412393, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "11150_2021-02-01", + "_id": "ncbi_taxon/11150_2021-02-01", + "_rev": "_b2i7PPa---", + "id": "11150", + "scientific_name": "Porcine transmissible gastroenteritis coronavirus strain FS772/70", + "rank": "no rank", + "strain": true, + "aliases": [ + { + "category": "equivalent name", + "name": "Porcine transmissible gastroenteritis coronavirus (STRAIN FS772/70)" + } + ], + "ncbi_taxon_id": 11150, + "gencode": 1, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "1523177_2021-02-01", + "_id": "ncbi_taxon/1523177_2021-02-01", + "_rev": "_b2mWP9q---", + "id": "1523177", + "scientific_name": "Influenza A virus (A/Hangzhou/779/2010(H3N2))", + "rank": "no rank", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1523177, + "gencode": 1, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "2601701_2019-09-01", + "_id": "ncbi_taxon/2601701_2019-09-01", + "_rev": "_b2nxHvK--_", + "id": "2601701", + "scientific_name": "Habenaria macrandra", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "authority", + "name": "Habenaria macrandra Lindl., 1862" + } + ], + "ncbi_taxon_id": 2601701, + "gencode": 1, + "first_version": "2019-09-01", + "last_version": "2021-02-01", + "created": 1567296060000, + "expired": 9007199254740991, + "release_created": 1567296000000, + "release_expired": 9007199254740991 + }, + { + "_key": "2718644_2021-02-01", + "_id": "ncbi_taxon/2718644_2021-02-01", + "_rev": "_b2n6tHK--K", + "id": "2718644", + "scientific_name": "Listeria sp. FSL_L7-1519", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 2718644, + "gencode": 11, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "880422_2021-02-01", + "_id": "ncbi_taxon/880422_2021-02-01", + "_rev": "_b2kkp5i---", + "id": "880422", + "scientific_name": "Influenza A virus (A/Habana/14720/2010(H3N2))", + "rank": "no rank", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 880422, + "gencode": 1, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "1497341_2018-11-01", + "_id": "ncbi_taxon/1497341_2018-11-01", + "_rev": "_b2mUStW--D", + "id": "1497341", + "scientific_name": "Pseudomonas sp. 
enrichment culture clone PF1", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1497341, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "10272_2021-02-01", + "_id": "ncbi_taxon/10272_2021-02-01", + "_rev": "_b2i7Eym---", + "id": "10272", + "scientific_name": "Rabbit fibroma virus (strain Kasza)", + "rank": "no rank", + "strain": true, + "aliases": [ + { + "category": "equivalent name", + "name": "Shope fibroma virus (STRAIN KASZA)" + } + ], + "ncbi_taxon_id": 10272, + "gencode": 1, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "4758_2018-11-01", + "_id": "ncbi_taxon/4758_2018-11-01", + "_rev": "_b2i7oIW--_", + "id": "4758", + "scientific_name": "Neocallimastix patriciarum", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 4758, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "2713500_2021-02-01", + "_id": "ncbi_taxon/2713500_2021-02-01", + "_rev": "_b2n6rv----", + "id": "2713500", + "scientific_name": "Listeria sp. FSL_L7-0091", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 2713500, + "gencode": 11, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "1132808_2018-11-01", + "_id": "ncbi_taxon/1132808_2018-11-01", + "_rev": "_b2lFoQ2--_", + "id": "1132808", + "scientific_name": "Alocasia scalprum", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "authority", + "name": "Alocasia scalprum A.Hay" + } + ], + "ncbi_taxon_id": 1132808, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1231191_2018-11-01", + "_id": "ncbi_taxon/1231191_2018-11-01", + "_rev": "_b2ld-Ii--_", + "id": "1231191", + "scientific_name": "environmental samples", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 1231191, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "578129_2021-02-01", + "_id": "ncbi_taxon/578129_2021-02-01", + "_rev": "_b2kAKTW---", + "id": "578129", + "scientific_name": "Heteroliodon occipitalis", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "type material", + "name": "BMNH 1946.1.12.28" + }, + { + "category": "authority", + "name": "Pseudoxyrhopus occipitalis Boulenger, 1896" + }, + { + "category": "synonym", + "name": "Pseudoxyrhopus occipitalis" + } + ], + "ncbi_taxon_id": 578129, + "gencode": 1, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 
9007199254740991 + }, + { + "_key": "2727498_2021-02-01", + "_id": "ncbi_taxon/2727498_2021-02-01", + "_rev": "_b2n6up6--W", + "id": "2727498", + "scientific_name": "Caulastrocecis interstratella", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "authority", + "name": "Caulastrocecis interstratella (Christoph, 1872)" + } + ], + "ncbi_taxon_id": 2727498, + "gencode": 1, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "2651177_2020-03-01", + "_id": "ncbi_taxon/2651177_2020-03-01", + "_rev": "_b2nypS6--B", + "id": "2651177", + "scientific_name": "environmental samples", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 2651177, + "gencode": 1, + "first_version": "2020-03-01", + "last_version": "2021-02-01", + "created": 1584487952760, + "expired": 9007199254740991, + "release_created": 1583020800000, + "release_expired": 9007199254740991 + }, + { + "_key": "219650_2018-11-01", + "_id": "ncbi_taxon/219650_2018-11-01", + "_rev": "_b2jPl6W--_", + "id": "219650", + "scientific_name": "environmental samples", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 219650, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "10324_2021-02-01", + "_id": "ncbi_taxon/10324_2021-02-01", + "_rev": "_b2i7Fqu---", + "id": "10324", + "scientific_name": "Bovine herpesvirus type 1.1 (strain P8-2)", + "rank": "no rank", + "strain": true, + "aliases": [ + { + "category": "equivalent name", + "name": "Bovine herpesvirus type 1 (strain P8-2)" + } + ], + "ncbi_taxon_id": 10324, + "gencode": 1, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "1744050_2018-11-01", + "_id": "ncbi_taxon/1744050_2018-11-01", + "_rev": "_b2mxbKa--_", + "id": "1744050", + "scientific_name": "Spilogona sp. 
BOLD:ACC9483", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1744050, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "11065_2021-02-01", + "_id": "ncbi_taxon/11065_2021-02-01", + "_rev": "_b2i7N0m---", + "id": "11065", + "scientific_name": "Dengue virus 2 Thailand/NGS-C/1944", + "rank": "no rank", + "strain": true, + "aliases": [ + { + "category": "equivalent name", + "name": "Dengue virus NGC" + }, + { + "category": "equivalent name", + "name": "Dengue virus prototype strain New Guinea C (NGC)" + }, + { + "category": "equivalent name", + "name": "Dengue virus type 2 (NGC-prototype)" + }, + { + "category": "equivalent name", + "name": "Dengue virus type 2 (strain New Guinea C)" + }, + { + "category": "equivalent name", + "name": "Dengue virus type 2 Thailand/NGS-C/1944" + } + ], + "ncbi_taxon_id": 11065, + "gencode": 1, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "379362_2018-11-01", + "_id": "ncbi_taxon/379362_2018-11-01", + "_rev": "_b2jgWoK--_", + "id": "379362", + "scientific_name": "environmental samples", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 379362, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1356674_2018-11-01", + "_id": "ncbi_taxon/1356674_2018-11-01", + "_rev": "_b2l4Xc6--D", + "id": "1356674", + "scientific_name": "methanogenic prokaryote enrichment culture B19_144", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1356674, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "35683_2018-11-01", + "_id": "ncbi_taxon/35683_2018-11-01", + "_rev": "_b2i9AW6--B", + "id": "35683", + "scientific_name": "Pseudopedinella", + "rank": "genus", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 35683, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "819947_2018-11-01", + "_id": "ncbi_taxon/819947_2018-11-01", + "_rev": "_b2kfzke--B", + "id": "819947", + "scientific_name": "Diptera sp. 
BOLD:AAG2430", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 819947, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "2038696_2018-11-01", + "_id": "ncbi_taxon/2038696_2018-11-01", + "_rev": "_b2nVysS--_", + "id": "2038696", + "scientific_name": "Erica grandiflora", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "authority", + "name": "Erica grandiflora L.f., 1781" + } + ], + "ncbi_taxon_id": 2038696, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1220632_2018-11-01", + "_id": "ncbi_taxon/1220632_2018-11-01", + "_rev": "_b2lZJbu--B", + "id": "1220632", + "scientific_name": "Citrobacter sp. 003.13", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1220632, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "310254_2018-11-01", + "_id": "ncbi_taxon/310254_2018-11-01", + "_rev": "_b2jXGIO--_", + "id": "310254", + "scientific_name": "Elymus gmelinii", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "authority", + "name": "Elymus gmelinii (Ledeb.) Tzvelev, 1968" + } + ], + "ncbi_taxon_id": 310254, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "79696_2021-02-01", + "_id": "ncbi_taxon/79696_2021-02-01", + "_rev": "_b2jBeOO---", + "id": "79696", + "scientific_name": "Influenza B virus (B/Ann Arbor/1/1966 [cold-adapted and wild- type])", + "rank": "no rank", + "strain": true, + "aliases": [ + { + "category": "equivalent name", + "name": "Influenza B virus (strain B/Ann Arbor/1/66 [cold-adapted and wild- type])" + } + ], + "ncbi_taxon_id": 79696, + "gencode": 1, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "2333206_2018-11-01", + "_id": "ncbi_taxon/2333206_2018-11-01", + "_rev": "_b2nirGu--B", + "id": "2333206", + "scientific_name": "Phronia sp. JSDIP316-10", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 2333206, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "2310952_2018-11-01", + "_id": "ncbi_taxon/2310952_2018-11-01", + "_rev": "_b2niMzm--D", + "id": "2310952", + "scientific_name": "Gymnopternus sp. 
BIOUG25014-A04", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 2310952, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1474010_2018-11-01", + "_id": "ncbi_taxon/1474010_2018-11-01", + "_rev": "_b2mR25S--B", + "id": "1474010", + "scientific_name": "Sciaridae sp. BOLD:ACA7925", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1474010, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1230346_2018-11-01", + "_id": "ncbi_taxon/1230346_2018-11-01", + "_rev": "_b2lc90m--B", + "id": "1230346", + "scientific_name": "environmental samples", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 1230346, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1674892_2018-11-01", + "_id": "ncbi_taxon/1674892_2018-11-01", + "_rev": "_b2mra9O--D", + "id": "1674892", + "scientific_name": "Anaerolineales bacterium Chloro_03", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1674892, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "699250_2019-06-01", + "_id": "ncbi_taxon/699250_2019-06-01", + "_rev": "_b2kULKW--_", + "id": "699250", + "scientific_name": "environmental samples", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 699250, + "gencode": 11, + "first_version": "2019-06-01", + "last_version": "2021-02-01", + "created": 1559347260000, + "expired": 9007199254740991, + "release_created": 1559347200000, + "release_expired": 9007199254740991 + }, + { + "_key": "563569_2018-11-01", + "_id": "ncbi_taxon/563569_2018-11-01", + "_rev": "_b2j98-u--_", + "id": "563569", + "scientific_name": "Streptomyces sp. 13665B", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 563569, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1125656_2018-11-01", + "_id": "ncbi_taxon/1125656_2018-11-01", + "_rev": "_b2lFlwy--B", + "id": "1125656", + "scientific_name": "Tetrathiobacter sp. LC417", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1125656, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "157696_2018-11-01", + "_id": "ncbi_taxon/157696_2018-11-01", + "_rev": "_b2jI89G--B", + "id": "157696", + "scientific_name": "Alicyclobacillus sp. 
DSM 6481", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 157696, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "190758_2020-03-01", + "_id": "ncbi_taxon/190758_2020-03-01", + "_rev": "_b2jMhti--D", + "id": "190758", + "scientific_name": "Leucaena salvadorensis", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "common name", + "name": "aserillo" + }, + { + "category": "common name", + "name": "sepia vaina" + }, + { + "category": "authority", + "name": "Leucaena salvadorensis Standl. ex Britton & Rose" + } + ], + "ncbi_taxon_id": 190758, + "gencode": 1, + "first_version": "2020-03-01", + "last_version": "2021-02-01", + "created": 1584487952760, + "expired": 9007199254740991, + "release_created": 1583020800000, + "release_expired": 9007199254740991 + }, + { + "_key": "2051570_2018-11-01", + "_id": "ncbi_taxon/2051570_2018-11-01", + "_rev": "_b2nZ-GS--_", + "id": "2051570", + "scientific_name": "Influenza B virus (B/Brisbane/FSS700/2017)", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 2051570, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1424688_2018-11-01", + "_id": "ncbi_taxon/1424688_2018-11-01", + "_rev": "_b2mGzQi--B", + "id": "1424688", + "scientific_name": "Angraecum cf. moandense CM-2013", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1424688, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "2718636_2021-02-01", + "_id": "ncbi_taxon/2718636_2021-02-01", + "_rev": "_b2n6tHG--M", + "id": "2718636", + "scientific_name": "Listeria sp. FSL_L7-0091", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 2718636, + "gencode": 11, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "1652495_2018-11-01", + "_id": "ncbi_taxon/1652495_2018-11-01", + "_rev": "_b2mo7-q--B", + "id": "1652495", + "scientific_name": "Corynebacterium crudilactis", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "type material", + "name": "CCUG 69192" + }, + { + "category": "type material", + "name": "DSM 100882" + }, + { + "category": "type material", + "name": "LMG 29813" + }, + { + "category": "type material", + "name": "LMG:29813" + }, + { + "category": "type material", + "name": "strain JZ16" + }, + { + "category": "authority", + "name": "Corynebacterium crudilactis Zimmermann et al. 2016" + }, + { + "category": "includes", + "name": "Corynebacterium sp. DSM 100882" + }, + { + "category": "includes", + "name": "Corynebacterium sp. 
JZ16" + } + ], + "ncbi_taxon_id": 1652495, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "576247_2018-11-01", + "_id": "ncbi_taxon/576247_2018-11-01", + "_rev": "_b2kAI_W--D", + "id": "576247", + "scientific_name": "environmental samples", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 576247, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "2030720_2018-11-01", + "_id": "ncbi_taxon/2030720_2018-11-01", + "_rev": "_b2nVw-a--_", + "id": "2030720", + "scientific_name": "Deyeuxia ovata var. ovata", + "rank": "varietas", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 2030720, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1973755_2021-02-01", + "_id": "ncbi_taxon/1973755_2021-02-01", + "_rev": "_b2nMEP----", + "id": "1973755", + "scientific_name": "Influenza A virus (A/Connecticut/07/2017(H1N1))", + "rank": "no rank", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1973755, + "gencode": 1, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "106193_2021-02-01", + "_id": "ncbi_taxon/106193_2021-02-01", + "_rev": "_b2jDxa----", + "id": "106193", + "scientific_name": "Temnothorax luteus", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "synonym", + "name": "Leptothorax luteus" + }, + { + "category": "authority", + "name": "Temnothorax luteus (Forel, 1874)" + } + ], + "ncbi_taxon_id": 106193, + "gencode": 1, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "1231197_2018-11-01", + "_id": "ncbi_taxon/1231197_2018-11-01", + "_rev": "_b2ld-Iu--_", + "id": "1231197", + "scientific_name": "environmental samples", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 1231197, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "11064_2021-02-01", + "_id": "ncbi_taxon/11064_2021-02-01", + "_rev": "_b2i7Nze---", + "id": "11064", + "scientific_name": "Dengue virus 2 Jamaica/1409/1983", + "rank": "no rank", + "strain": true, + "aliases": [ + { + "category": "equivalent name", + "name": "Dengue virus type 2 Jamaica/1409/1983" + }, + { + "category": "equivalent name", + "name": "Dengue virus type 2 (strain Jamaica)" + } + ], + "ncbi_taxon_id": 11064, + "gencode": 1, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "133823_2018-11-01", + "_id": "ncbi_taxon/133823_2018-11-01", + "_rev": "_b2jG1pS--H", + 
"id": "133823", + "scientific_name": "environmental samples", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 133823, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "2418748_2018-11-01", + "_id": "ncbi_taxon/2418748_2018-11-01", + "_rev": "_b2nkTeS--_", + "id": "2418748", + "scientific_name": "Tanypodinae sp. BIOUG27705-C01", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 2418748, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "72806_2018-11-01", + "_id": "ncbi_taxon/72806_2018-11-01", + "_rev": "_b2jBN3a--_", + "id": "72806", + "scientific_name": "Williamsia maris", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "type material", + "name": "DSM 44693" + }, + { + "category": "type material", + "name": "JCM 12070" + }, + { + "category": "type material", + "name": "NCIMB 13945" + }, + { + "category": "type material", + "name": "strain SJS0289/JS1" + }, + { + "category": "includes", + "name": "Gordona sp. SJS0289-JS1" + }, + { + "category": "includes", + "name": "Gordonia sp. SJS0289-JS1" + }, + { + "category": "authority", + "name": "Williamsia maris Stach et al. 2004" + } + ], + "ncbi_taxon_id": 72806, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "41524_2018-11-01", + "_id": "ncbi_taxon/41524_2018-11-01", + "_rev": "_b2j-aOG--_", + "id": "41524", + "scientific_name": "Salmonella enterica subsp. houtenae serovar 16:z4,z32:--", + "rank": "no rank", + "strain": true, + "aliases": [ + { + "category": "synonym", + "name": "Salmonella enterica IV 16:z4,z32:--" + }, + { + "category": "synonym", + "name": "Salmonella enterica serovar IV 16:z4,z32:--" + } + ], + "ncbi_taxon_id": 41524, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "641932_2018-11-01", + "_id": "ncbi_taxon/641932_2018-11-01", + "_rev": "_b2kHS-a--_", + "id": "641932", + "scientific_name": "Streptomyces sp. ERI MA-01", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 641932, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "2569359_2020-03-01", + "_id": "ncbi_taxon/2569359_2020-03-01", + "_rev": "_b2nsRK6--_", + "id": "2569359", + "scientific_name": "Clibanarius sp. 
AY-2019", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 2569359, + "gencode": 1, + "first_version": "2020-03-01", + "last_version": "2021-02-01", + "created": 1584487952760, + "expired": 9007199254740991, + "release_created": 1583020800000, + "release_expired": 9007199254740991 + }, + { + "_key": "857244_2018-11-01", + "_id": "ncbi_taxon/857244_2018-11-01", + "_rev": "_b2kgXfy--_", + "id": "857244", + "scientific_name": "Pseudomonas sp. V219", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 857244, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "691007_2021-02-01", + "_id": "ncbi_taxon/691007_2021-02-01", + "_rev": "_b2kP8ue---", + "id": "691007", + "scientific_name": "Influenza A virus (A/chicken/West Java/Smi-M6/2008(H5N1))", + "rank": "no rank", + "strain": false, + "aliases": [ + { + "category": "equivalent name", + "name": "Influenza virus (A/chicken/West Java/Smi-M6/2008(H5N1))" + } + ], + "ncbi_taxon_id": 691007, + "gencode": 1, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "2713502_2021-02-01", + "_id": "ncbi_taxon/2713502_2021-02-01", + "_rev": "_b2n6rv---A", + "id": "2713502", + "scientific_name": "Listeria sp. FSL_L7-1519", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 2713502, + "gencode": 11, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "1148469_2018-11-01", + "_id": "ncbi_taxon/1148469_2018-11-01", + "_rev": "_b2lKR8q--B", + "id": "1148469", + "scientific_name": "Bradyrhizobium sp. SCNU 9", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1148469, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1935312_2018-11-01", + "_id": "ncbi_taxon/1935312_2018-11-01", + "_rev": "_b2nIRUW--D", + "id": "1935312", + "scientific_name": "Corticiaceae sp.", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1935312, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1635398_2018-11-01", + "_id": "ncbi_taxon/1635398_2018-11-01", + "_rev": "_b2mnjo2--C", + "id": "1635398", + "scientific_name": "Janthinobacterium sp. 
NA55", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1635398, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "292089_2018-11-01", + "_id": "ncbi_taxon/292089_2018-11-01", + "_rev": "_b2jWDpa--_", + "id": "292089", + "scientific_name": "Muraltia horrida", + "rank": "species", + "strain": false, + "aliases": [ + { + "category": "authority", + "name": "Muraltia horrida Diels" + } + ], + "ncbi_taxon_id": 292089, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "261389_2018-11-01", + "_id": "ncbi_taxon/261389_2018-11-01", + "_rev": "_b2jSxcW--_", + "id": "261389", + "scientific_name": "environmental samples", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 261389, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1636559_2019-06-01", + "_id": "ncbi_taxon/1636559_2019-06-01", + "_rev": "_b2mnkIK--B", + "id": "1636559", + "scientific_name": "Lactococcus phage 936 group phage Phi13.16", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1636559, + "gencode": 11, + "first_version": "2019-06-01", + "last_version": "2021-02-01", + "created": 1559347260000, + "expired": 9007199254740991, + "release_created": 1559347200000, + "release_expired": 9007199254740991 + }, + { + "_key": "1899424_2018-11-01", + "_id": "ncbi_taxon/1899424_2018-11-01", + "_rev": "_b2nAhcK--_", + "id": "1899424", + "scientific_name": "'Prunus dulcis' phytoplasma", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1899424, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1604534_2018-11-01", + "_id": "ncbi_taxon/1604534_2018-11-01", + "_rev": "_Zc0PA0q--B", + "id": "1604534", + "scientific_name": "Pseudogobio cf. esocinus CBM:ZF:12684", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1604534, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2019-06-01", + "created": 1541030460000, + "expired": 1561939259999, + "release_created": 1541030400000, + "release_expired": 1561939199999 + }, + { + "_key": "1329276_2018-11-01", + "_id": "ncbi_taxon/1329276_2018-11-01", + "_rev": "_ZcrE_9O--D", + "id": "1329276", + "scientific_name": "Klebsormidium sp. 
BIOTA 14615.5a", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1329276, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2019-01-01", + "created": 1541030460000, + "expired": 1548979259999, + "release_created": 1541030400000, + "release_expired": 1548979199999 + }, + { + "_key": "508775_2018-11-01", + "_id": "ncbi_taxon/508775_2018-11-01", + "_rev": "_b2n7Xqu--B", + "id": "508775", + "scientific_name": "Norovirus GII.9", + "rank": "no rank", + "strain": true, + "aliases": [ + { + "category": "synonym", + "name": "Norovirus genogroup GII.9" + } + ], + "ncbi_taxon_id": 508775, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2020-03-01", + "created": 1541030460000, + "expired": 1612915015846, + "release_created": 1541030400000, + "release_expired": 1612137599999 + }, + { + "_key": "338794_2018-11-01", + "_id": "ncbi_taxon/338794_2018-11-01", + "_rev": "_b2jbO4G--D", + "id": "338794", + "scientific_name": "low G+C Gram-positive bacterium HTA462", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 338794, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "586732_2018-11-01", + "_id": "ncbi_taxon/586732_2018-11-01", + "_rev": "_b2kB1gK--B", + "id": "586732", + "scientific_name": "Integrating expression vector pJEB403+drrA", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 586732, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1127597_2018-11-01", + "_id": "ncbi_taxon/1127597_2018-11-01", + "_rev": "_b2lFmce--B", + "id": "1127597", + "scientific_name": "Fusarium cf. solani 3+4-uuu DPGS-2011", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1127597, + "gencode": 1, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1173779_2018-11-01", + "_id": "ncbi_taxon/1173779_2018-11-01", + "_rev": "_b2lOxFa--_", + "id": "1173779", + "scientific_name": "Salmonella enterica subsp. diarizonae serovar 60:r:e,n,x,z15", + "rank": "no rank", + "strain": true, + "aliases": [], + "ncbi_taxon_id": 1173779, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1906029_2018-11-01", + "_id": "ncbi_taxon/1906029_2018-11-01", + "_rev": "_b2nDL5---_", + "id": "1906029", + "scientific_name": "Nostoc sp. 'Peltigera sp. 
\"hawaiensis\" P1236 cyanobiont'", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1906029, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1945188_2018-11-01", + "_id": "ncbi_taxon/1945188_2018-11-01", + "_rev": "_b2nJbF2--_", + "id": "1945188", + "scientific_name": "Reporter vector p1168hIL6mC/EBP-luc+", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1945188, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "1945295_2018-11-01", + "_id": "ncbi_taxon/1945295_2018-11-01", + "_rev": "_b2nJbIK--_", + "id": "1945295", + "scientific_name": "Vector pEntry-attR2-IRES-eGFP-luc+-pA-attL3", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 1945295, + "gencode": 11, + "first_version": "2018-11-01", + "last_version": "2021-02-01", + "created": 1541030460000, + "expired": 9007199254740991, + "release_created": 1541030400000, + "release_expired": 9007199254740991 + }, + { + "_key": "2727889_2021-02-01", + "_id": "ncbi_taxon/2727889_2021-02-01", + "_rev": "_b2n6us---A", + "id": "2727889", + "scientific_name": "Pleurocapsales cyanobacterium 'Beach rock 4+5\"'", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": 2727889, + "gencode": 11, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + }, + { + "_key": "fake_2021-02-01", + "_id": "ncbi_taxon/fake_2021-02-01", + "_rev": "fake", + "id": "fake", + "scientific_name": "|Fake|fake|fake| ||fake||", + "rank": "species", + "strain": false, + "aliases": [], + "ncbi_taxon_id": -1, + "gencode": 11, + "first_version": "2021-02-01", + "last_version": "2021-02-01", + "created": 1612915015847, + "expired": 9007199254740991, + "release_created": 1612137600000, + "release_expired": 9007199254740991 + } +] diff --git a/spec/test/djornl/col_count_errors/directed_edges.tsv b/spec/test/djornl/col_count_errors/directed_edges.tsv new file mode 100644 index 00000000..ff8713e3 --- /dev/null +++ b/spec/test/djornl/col_count_errors/directed_edges.tsv @@ -0,0 +1,8 @@ +node1 node2 score edge_descrip edge_type directed +As2 AT1G01040 5.422046084731258 AraGWAS-Association_score phenotype-association_AraGWAS 1 +As75 AT1G01020 39.98573324312915 AraGWAS-Association_score phenotype-association_AraGWAS 0 +AT1G01010 AT1G01020 2.39322646755088 AraNetv2_log-likelihood-score protein-protein-interaction_high-throughput_AraNet_v2 +AT1G01010 AT1G01030 2.39322646755088 AraNetv2_log-likelihood-score protein-protein-interaction_high-throughput_AraNet_v2 1 +AT1G01010 AT1G01040 2.39322646755088 +AT1G01030 AT1G01050 2.5494618241936697 AraNetv2_log-likelihood-score pairwise-gene-coexpression_AraNet_v2 1 +AT1G01050 AT1G01060 4.34242054808616 AraNetv2_log-likelihood-score protein-protein-interaction_literature-curated_AraNet_v2 1 diff --git a/spec/test/djornl/col_count_errors/edges.tsv b/spec/test/djornl/col_count_errors/edges.tsv new file mode 100644 index 00000000..51953490 --- /dev/null +++ b/spec/test/djornl/col_count_errors/edges.tsv @@ -0,0 +1,8 @@ +node1 node2 score edge_descrip 
edge_type +As2 AT1G01040 5.422046084731258 AraGWAS-Association_score phenotype-association_AraGWAS 1 +As75 AT1G01020 39.98573324312915 AraGWAS-Association_score phenotype-association_AraGWAS +AT1G01010 AT1G01020 2.39322646755088 AraNetv2_log-likelihood-score protein-protein-interaction_high-throughput_AraNet_v2 +AT1G01010 AT1G01030 2.39322646755088 AraNetv2_log-likelihood-score protein-protein-interaction_high-throughput_AraNet_v2 +AT1G01010 AT1G01040 2.39322646755088 +AT1G01030 AT1G01050 2.5494618241936697 AraNetv2_log-likelihood-score pairwise-gene-coexpression_AraNet_v2 +AT1G01050 AT1G01060 4.34242054808616 AraNetv2_log-likelihood-score protein-protein-interaction_literature-curated_AraNet_v2 diff --git a/spec/test/djornl/col_count_errors/manifest.yaml b/spec/test/djornl/col_count_errors/manifest.yaml new file mode 100644 index 00000000..e7d615e5 --- /dev/null +++ b/spec/test/djornl/col_count_errors/manifest.yaml @@ -0,0 +1,12 @@ +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + path: edges.tsv + + - data_type: edge + path: directed_edges.tsv + + - data_type: node + file_format: csv + path: nodes.csv diff --git a/spec/test/djornl/col_count_errors/nodes.csv b/spec/test/djornl/col_count_errors/nodes.csv new file mode 100644 index 00000000..4edd51f5 --- /dev/null +++ b/spec/test/djornl/col_count_errors/nodes.csv @@ -0,0 +1,14 @@ +node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_description,GO_terms,MapMan_bin,MapMan_name,MapMan_description,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_reference,User_Notes +As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010", +As75,pheno,,,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. 
al, Nature 2010", +AT1G01020,gene,AT1G01020.6,ARV1,,protein_coding,ARV1 family protein;(source:Araport11),,,molecular_function,GO:0003674,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:6788:8737:-1 gene:AT1G01020 transcript:AT1G01020.6 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:ARV1 description:ARV1 family protein [Source:UniProtKB/TrEMBL;Acc:Q5MK24]) & no description available(sp|q5mk24|arv1_arath : 99.4),,,,,, +AT1G01030,gene,AT1G01030.2,NGA3,NGATHA3,protein_coding,AP2/B3-like transcriptional factor family protein;(source:Araport11),,,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.5.3,.RNA biosynthesis.transcriptional regulation.B3 transcription factor superfamily.transcription factor (RAV/NGATHA),transcription factor (RAV/NGATHA) (original description: pep chromosome:TAIR10:1:11649:13714:-1 gene:AT1G01030 transcript:AT1G01030.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NGA3 description:B3 domain-containing transcription factor NGA3 [Source:UniProtKB/Swiss-Prot;Acc:Q9MAN1]),,,,,, +AT1G01040,gene,AT1G01040.2,SUS1,SUSPENSOR 1,protein_coding,dicer-like 1;(source:Araport11),"Encodes a Dicer homolog. Dicer is a RNA helicase involved in microRNA processing. Mutations in this locus can result in embryo lethality. Embryo shape at seed maturity is globular-elongate. Other mutants convert the floral meristems to an indeterminate state, others yet show defects in ovule development. mRNA is expressed in all shoot tissues. DCL1 is able to produce miRNAs and siRNAs. The mRNA is cell-to-cell mobile.",dicer-like 1,"metal ion binding, protein binding, ribonuclease III activity, ATP-dependent helicase activity, ATP binding, RNA binding, helicase activity, double-stranded RNA binding, DNA binding","GO:0046872, GO:0005515, GO:0004525, GO:0008026, GO:0005524, GO:0003723, GO:0004386, GO:0003725, GO:0003677",16.10.2.1.1,.RNA processing.mRNA silencing.miRNA pathway.DCL1-HYL1 miRNA biogenesis complex.endoribonuclease component DCL1,endoribonuclease component DCL1 of DCL1-HYL1 miRNA biogenesis complex (original description: pep chromosome:TAIR10:1:23416:31120:1 gene:AT1G01040 transcript:AT1G01040.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:DCL1 description:Dicer-like 1 [Source:UniProtKB/TrEMBL;Acc:F4HQG6]),,,,,, +AT1G01050,gene,AT1G01050.2,PPa1,pyrophosphorylase 1,protein_coding,pyrophosphorylase 1;(source:Araport11),,,inorganic diphosphatase activity,GO:0004427,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:31382:33009:-1 gene:AT1G01050 transcript:AT1G01050.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PPA1 description:Soluble inorganic pyrophosphatase 1 [Source:UniProtKB/Swiss-Prot;Acc:Q93V56]) & Soluble inorganic pyrophosphatase 1 OS=Arabidopsis thaliana (sp|q93v56|ipyr1_arath : 419.0),,,,,, +AT1G01060,gene,AT1G01060.8,LHY1,LATE ELONGATED HYPOCOTYL 1,protein_coding,Homeodomain-like superfamily protein;(source:Araport11),,,"DNA-binding transcription factor activity, DNA binding, transcription regulatory region DNA binding","GO:0003700, GO:0003677, GO:0044212",27.1.1,.Multi-process regulation.circadian clock system.core oscillator protein (LHY|CCA1),circadian clock core oscillator protein (LHY|CCA1) (original description: pep chromosome:TAIR10:1:33967:37230:-1 gene:AT1G01060 transcript:AT1G01060.8 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:LHY description:LHY1 
[Source:UniProtKB/TrEMBL;Acc:A0A178W761]),,,,,, +AT1G01070,gene,AT1G01070.2,UMAMIT28,Usually multiple acids move in and out Transporters 28,protein_coding,nodulin MtN21 /EamA-like transporter family protein;(source:Araport11),Encodes a plasma membrane-localized amino acid transporter likely involved in amino acid export in the developing seed.,nodulin MtN21 /EamA-like transporter family protein,L-glutamine transmembrane transporter activity,GO:0015186,24.2.1.5,.Solute transport.carrier-mediated transport.DMT superfamily.solute transporter (UmamiT),solute transporter (UmamiT) (original description: pep chromosome:TAIR10:1:38752:40945:-1 gene:AT1G01070 transcript:AT1G01070.2 gene_biotype:protein_coding transcript_biotype:protein_coding description:WAT1-related protein [Source:UniProtKB/TrEMBL;Acc:A0A178WFU3]),,,,,, +AT1G01080,gene,AT1G01080.3,,,protein_coding,RNA-binding (RRM/RBD/RNP motifs) family protein;(source:Araport11),,,"RNA binding, mRNA binding","GO:0003723, GO:0003729",35.1,not assigned.annotated,"(original description: pep chromosome:TAIR10:1:44970:47059:-1 gene:AT1G01080 transcript:AT1G01080.3 gene_biotype:protein_coding transcript_biotype:protein_coding description:RNA-binding (RRM/RBD/RNP motifs) family protein [Source:UniProtKB/TrEMBL;Acc:F4HQH8]) & 33 kDa ribonucleoprotein, chloroplastic OS=Nicotiana sylvestris (sp|p19684|roc5_nicsy : 109.0)",,,,,, +AT1G01090,gene,AT1G01090.1,PDH-E1 ALPHA,pyruvate dehydrogenase E1 alpha,protein_coding,pyruvate dehydrogenase E1 alpha;(source:Araport11),pyruvate dehydrogenase E1 alpha subunit,pyruvate dehydrogenase E1 alpha,pyruvate dehydrogenase (acetyl-transferring) activity,GO:0004739,5.1.2.2.1.1,.Lipid metabolism.fatty acid biosynthesis.acetyl-CoA generation.plastidial pyruvate dehydrogenase complex.E1 pyruvate dehydrogenase subcomplex.subunit alpha,subunit alpha of E1 pyruvate dehydrogenase component (original description: pep chromosome:TAIR10:1:47234:49304:-1 gene:AT1G01090 transcript:AT1G01090.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PDH-E1 ALPHA description:Pyruvate dehydrogenase E1 component subunit alpha [Source:UniProtKB/TrEMBL;Acc:A0A178W8A7]),,,,,, +AT1G01100,gene,AT1G01100.4,,,protein_coding,60S acidic ribosomal protein family;(source:Araport11),,60S acidic ribosomal protein family,"structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity","GO:0003735, GO:0043021, GO:0030295",17.1.2.1.46,.Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1,component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9]),,,,,, +Na23,pheno,,,,,,,,,,,,,10.21958/phenotype:5,"Sodium concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",sodium concentration,The total sodium ion concentration measured in a given volume of a plant or a plant part or plant extract. [GR:pj],"Atwell et. al, Nature 2010", +SDV,pheno,,,,,,,,,,,,,10.21958/phenotype:104,"Number of days following stratification to opening of first flower. 
The experiment was stopped at 200 d, and accessions that had not flowered at that point were assigned a value of 200",days to flowering trait,"A flowering time trait (TO:0002616)which is the number of days required for an individual flower (PO:0009046), a whole plant (PO:0000003) or a plant population to reach flowering stage (PO:0007616) from a predetermined time point (e.g. the date of seed sowing, seedling transplant, or seedling emergence). [GR:pj, TO:cooperl]","Atwell et. al, Nature 2010", diff --git a/spec/test/djornl/duplicate_data/I2_named.tsv b/spec/test/djornl/duplicate_data/I2_named.tsv new file mode 100644 index 00000000..bef48c38 --- /dev/null +++ b/spec/test/djornl/duplicate_data/I2_named.tsv @@ -0,0 +1,9 @@ +cluster_id node_ids +# data_type: cluster +# cluster_prefix: markov_i2 +# title: Markov clustering, inflation = 2 +Cluster1 AT1G01010,AT1G01030,AT1G01040 +Cluster2 AT1G01050,AT1G01060,AT1G01070 +Cluster3 AT1G01090 +# Cluster4 +Cluster5 AT1G01020 diff --git a/spec/test/djornl/duplicate_data/I4_named.tsv b/spec/test/djornl/duplicate_data/I4_named.tsv new file mode 100644 index 00000000..29b6cd36 --- /dev/null +++ b/spec/test/djornl/duplicate_data/I4_named.tsv @@ -0,0 +1,8 @@ +cluster_id node_ids +# cluster_prefix: markov_i4 +# title: Markov clustering, inflation = 4 +# data_type: cluster +# Cluster1 +# Cluster2 +Cluster3 AT1G01080 +# Cluster4 diff --git a/spec/test/djornl/duplicate_data/I6_copy.csv b/spec/test/djornl/duplicate_data/I6_copy.csv new file mode 100644 index 00000000..a21bd2af --- /dev/null +++ b/spec/test/djornl/duplicate_data/I6_copy.csv @@ -0,0 +1,8 @@ +cluster_id,node_ids +# data_type: cluster +# cluster_prefix: markov_i6 +# title: Markov clustering, inflation = 6 +Cluster1,"AT1G01040,AT1G01090" +Cluster2,AT1G01070 +Cluster3,"AT1G01010,AT1G01020,AT1G01030" +# Cluster4 diff --git a/spec/test/djornl/duplicate_data/I6_named.tsv b/spec/test/djornl/duplicate_data/I6_named.tsv new file mode 100644 index 00000000..e7688f17 --- /dev/null +++ b/spec/test/djornl/duplicate_data/I6_named.tsv @@ -0,0 +1,8 @@ +cluster_id node_ids +# data_type: cluster +# cluster_prefix: markov_i6 +# title: Markov clustering, inflation = 6 +Cluster1 AT1G01040,AT1G01090 +Cluster2 AT1G01070 +Cluster3 AT1G01010,AT1G01020,AT1G01030 +# Cluster4 diff --git a/spec/test/djornl/duplicate_data/edges.tsv b/spec/test/djornl/duplicate_data/edges.tsv new file mode 100644 index 00000000..270cab3d --- /dev/null +++ b/spec/test/djornl/duplicate_data/edges.tsv @@ -0,0 +1,17 @@ +node1 node2 score directed edge_type +As2 AT1G01020 8.4 0 phenotype-association_AraGWAS +As2 AT1G01040 5.4 0 phenotype-association_AraGWAS +As75 AT1G01020 39.9 0 phenotype-association_AraGWAS +AT1G01010 AT1G01040 2.5 0 domain-co-occurrence_AraNet_v2 +AT1G01010 AT1G01040 170.5 0 protein-protein-interaction_literature-curated_AraNet_v2 +AT1G01030 AT1G01050 2.6 0 pairwise-gene-coexpression_AraNet_v2 +AT1G01050 AT1G01060 2.7 0 protein-protein-interaction_literature-curated_AraNet_v2 +# duplicated line +AT1G01010 AT1G01040 2.5 0 domain-co-occurrence_AraNet_v2 +AT1G01080 AT1G01090 2.8 0 protein-protein-interaction_literature-curated_AraNet_v2 +# these are OK +SDV AT1G01100 8.4 0 protein-protein-interaction_literature-curated_AraNet_v2 +SDV AT1G01100 5.4 1 protein-protein-interaction_literature-curated_AraNet_v2 +AT1G01100 SDV 2.4 1 protein-protein-interaction_literature-curated_AraNet_v2 +# this is a dupe! 
+AT1G01100 SDV 8.5 0 protein-protein-interaction_literature-curated_AraNet_v2 diff --git a/spec/test/djornl/duplicate_data/extra_node.csv b/spec/test/djornl/duplicate_data/extra_node.csv new file mode 100644 index 00000000..9dbcdf54 --- /dev/null +++ b/spec/test/djornl/duplicate_data/extra_node.csv @@ -0,0 +1,5 @@ +# data_type: node +node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_description,GO_terms,MapMan_bin,MapMan_name,MapMan_description +AT1G01100,gene,AT1G01100.4,,,protein_coding,60S acidic ribosomal protein family;(source:Araport11),,60S acidic ribosomal protein family,"structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity","GO:0003735, GO:0043021, GO:0030295",17.1.2.1.46,.Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1,component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9]) +# duplicated line with alterations +AT1G01080,gene,AT1G01080.3,whatever!,,protein_coding,RNA-binding (RRM/RBD/RNP motifs) family protein;(source:Araport11),,,"RNA binding, mRNA binding","GO:0003723, GO:0003729",35.1,not assigned.annotated,"(original description: pep chromosome:TAIR10:1:44970:47059:-1 gene:AT1G01080 transcript:AT1G01080.3 gene_biotype:protein_coding transcript_biotype:protein_coding description:RNA-binding (RRM/RBD/RNP motifs) family protein [Source:UniProtKB/TrEMBL;Acc:F4HQH8]) & 33 kDa ribonucleoprotein, chloroplastic OS=Nicotiana sylvestris (sp|p19684|roc5_nicsy : 109.0)" diff --git a/spec/test/djornl/duplicate_data/hithruput-edges.csv b/spec/test/djornl/duplicate_data/hithruput-edges.csv new file mode 100644 index 00000000..94129f85 --- /dev/null +++ b/spec/test/djornl/duplicate_data/hithruput-edges.csv @@ -0,0 +1,12 @@ +node1,node2,score,edge_descrip,edge_type,directed +AT1G01010,AT1G01020,2.3,AraNetv2_log-likelihood-score,protein-protein-interaction_high-throughput_AraNet_v2,0 +AT1G01010,AT1G01030,2.4,AraNetv2_log-likelihood-score,protein-protein-interaction_high-throughput_AraNet_v2,0 +# potentially erroneous line +AT1G01010,AT1G01030,2.7,AraNetv2_log-likelihood-score,protein-protein-interaction_high-throughput_AraNet_v2,0 +# duplicated line from the other file +AT1G01060,AT1G01050,2.7,AraNetv2_log-likelihood-score,protein-protein-interaction_literature-curated_AraNet_v2,0 +# potentially erroneous duplication from the other file +AT1G01050,AT1G01030,2.6000001,AraNetv2_log-likelihood-score,pairwise-gene-coexpression_AraNet_v2,0 +# directed edge dupe +SDV,AT1G01100,2001,whatever,protein-protein-interaction_literature-curated_AraNet_v2,1 + diff --git a/spec/test/djornl/duplicate_data/manifest.yaml b/spec/test/djornl/duplicate_data/manifest.yaml new file mode 100644 index 00000000..0197cad7 --- /dev/null +++ b/spec/test/djornl/duplicate_data/manifest.yaml @@ -0,0 +1,42 @@ +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +home_url: "https://github.com/kbase/exascale_data" +file_list: + - data_type: edge + path: edges.tsv + date: "2020-12-25" + + - data_type: edge + path: hithruput-edges.csv + date: "2020-12-25" + + - data_type: node + date: "2019-01-01" + file_format: csv + path: nodes.csv + + - data_type: cluster + cluster_prefix: 
markov_i2 + path: I2_named.tsv + + - data_type: cluster + cluster_prefix: markov_i4 + path: I4_named.tsv + + - data_type: cluster + cluster_prefix: markov_i6 + path: I6_named.tsv + + - data_type: cluster + cluster_prefix: markov_i6 + path: I6_copy.csv + + - data_type: node + date: "2019-01-01" + file_format: csv + path: pheno_nodes.csv + + - data_type: node + date: "2019-01-01" + file_format: csv + path: extra_node.csv diff --git a/spec/test/djornl/duplicate_data/nodes.csv b/spec/test/djornl/duplicate_data/nodes.csv new file mode 100644 index 00000000..bfb26a70 --- /dev/null +++ b/spec/test/djornl/duplicate_data/nodes.csv @@ -0,0 +1,13 @@ +# data_type: node +node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_description,GO_terms,MapMan_bin,MapMan_name,MapMan_description,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_reference,User_Notes +AT1G01010,gene,AT1G01010.1,NTL10,NAC domain containing protein 1,protein_coding,NAC domain containing protein 1;(source:Araport11),,NAC domain containing protein 1,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.17,.RNA biosynthesis.transcriptional regulation.transcription factor (NAC),transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96]),,,,,, +AT1G01020,gene,AT1G01020.6,ARV1,,protein_coding,ARV1 family protein;(source:Araport11),,,molecular_function,GO:0003674,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:6788:8737:-1 gene:AT1G01020 transcript:AT1G01020.6 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:ARV1 description:ARV1 family protein [Source:UniProtKB/TrEMBL;Acc:Q5MK24]) & no description available(sp|q5mk24|arv1_arath : 99.4),,,,,, +AT1G01030,gene,AT1G01030.2,NGA3,NGATHA3,protein_coding,AP2/B3-like transcriptional factor family protein;(source:Araport11),,,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.5.3,.RNA biosynthesis.transcriptional regulation.B3 transcription factor superfamily.transcription factor (RAV/NGATHA),transcription factor (RAV/NGATHA) (original description: pep chromosome:TAIR10:1:11649:13714:-1 gene:AT1G01030 transcript:AT1G01030.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NGA3 description:B3 domain-containing transcription factor NGA3 [Source:UniProtKB/Swiss-Prot;Acc:Q9MAN1]),,,,,, +AT1G01040,gene,AT1G01040.2,SUS1,SUSPENSOR 1,protein_coding,dicer-like 1;(source:Araport11),"Encodes a Dicer homolog. Dicer is a RNA helicase involved in microRNA processing. Mutations in this locus can result in embryo lethality. Embryo shape at seed maturity is globular-elongate. Other mutants convert the floral meristems to an indeterminate state, others yet show defects in ovule development. mRNA is expressed in all shoot tissues. DCL1 is able to produce miRNAs and siRNAs. 
The mRNA is cell-to-cell mobile.",dicer-like 1,"metal ion binding, protein binding, ribonuclease III activity, ATP-dependent helicase activity, ATP binding, RNA binding, helicase activity, double-stranded RNA binding, DNA binding","GO:0046872, GO:0005515, GO:0004525, GO:0008026, GO:0005524, GO:0003723, GO:0004386, GO:0003725, GO:0003677",16.10.2.1.1,.RNA processing.mRNA silencing.miRNA pathway.DCL1-HYL1 miRNA biogenesis complex.endoribonuclease component DCL1,endoribonuclease component DCL1 of DCL1-HYL1 miRNA biogenesis complex (original description: pep chromosome:TAIR10:1:23416:31120:1 gene:AT1G01040 transcript:AT1G01040.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:DCL1 description:Dicer-like 1 [Source:UniProtKB/TrEMBL;Acc:F4HQG6]),,,,,, +AT1G01050,gene,AT1G01050.2,PPa1,pyrophosphorylase 1,protein_coding,pyrophosphorylase 1;(source:Araport11),,,inorganic diphosphatase activity,GO:0004427,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:31382:33009:-1 gene:AT1G01050 transcript:AT1G01050.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PPA1 description:Soluble inorganic pyrophosphatase 1 [Source:UniProtKB/Swiss-Prot;Acc:Q93V56]) & Soluble inorganic pyrophosphatase 1 OS=Arabidopsis thaliana (sp|q93v56|ipyr1_arath : 419.0),,,,,, +AT1G01060,gene,AT1G01060.8,LHY1,LATE ELONGATED HYPOCOTYL 1,protein_coding,Homeodomain-like superfamily protein;(source:Araport11),,,"DNA-binding transcription factor activity, DNA binding, transcription regulatory region DNA binding","GO:0003700, GO:0003677, GO:0044212",27.1.1,.Multi-process regulation.circadian clock system.core oscillator protein (LHY|CCA1),circadian clock core oscillator protein (LHY|CCA1) (original description: pep chromosome:TAIR10:1:33967:37230:-1 gene:AT1G01060 transcript:AT1G01060.8 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:LHY description:LHY1 [Source:UniProtKB/TrEMBL;Acc:A0A178W761]),,,,,, +AT1G01070,gene,AT1G01070.2,UMAMIT28,Usually multiple acids move in and out Transporters 28,protein_coding,nodulin MtN21 /EamA-like transporter family protein;(source:Araport11),Encodes a plasma membrane-localized amino acid transporter likely involved in amino acid export in the developing seed.,nodulin MtN21 /EamA-like transporter family protein,L-glutamine transmembrane transporter activity,GO:0015186,24.2.1.5,.Solute transport.carrier-mediated transport.DMT superfamily.solute transporter (UmamiT),solute transporter (UmamiT) (original description: pep chromosome:TAIR10:1:38752:40945:-1 gene:AT1G01070 transcript:AT1G01070.2 gene_biotype:protein_coding transcript_biotype:protein_coding description:WAT1-related protein [Source:UniProtKB/TrEMBL;Acc:A0A178WFU3]),,,,,, +AT1G01080,gene,AT1G01080.3,,,protein_coding,RNA-binding (RRM/RBD/RNP motifs) family protein;(source:Araport11),,,"RNA binding, mRNA binding","GO:0003723, GO:0003729",35.1,not assigned.annotated,"(original description: pep chromosome:TAIR10:1:44970:47059:-1 gene:AT1G01080 transcript:AT1G01080.3 gene_biotype:protein_coding transcript_biotype:protein_coding description:RNA-binding (RRM/RBD/RNP motifs) family protein [Source:UniProtKB/TrEMBL;Acc:F4HQH8]) & 33 kDa ribonucleoprotein, chloroplastic OS=Nicotiana sylvestris (sp|p19684|roc5_nicsy : 109.0)",,,,,, +AT1G01090,gene,AT1G01090.1,PDH-E1 ALPHA,pyruvate dehydrogenase E1 alpha,protein_coding,pyruvate dehydrogenase E1 alpha;(source:Araport11),pyruvate dehydrogenase E1 alpha subunit,pyruvate dehydrogenase E1 alpha,"pyruvate 
dehydrogenase (acetyl-transferring) activity, protein binding","GO:0004739, GO:0005515",5.1.2.2.1.1,.Lipid metabolism.fatty acid biosynthesis.acetyl-CoA generation.plastidial pyruvate dehydrogenase complex.E1 pyruvate dehydrogenase subcomplex.subunit alpha,subunit alpha of E1 pyruvate dehydrogenase component (original description: pep chromosome:TAIR10:1:47234:49304:-1 gene:AT1G01090 transcript:AT1G01090.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PDH-E1 ALPHA description:Pyruvate dehydrogenase E1 component subunit alpha [Source:UniProtKB/TrEMBL;Acc:A0A178W8A7]),,,,,, +# duplicated line +AT1G01050,gene,AT1G01050.2,PPa1,pyrophosphorylase 1,protein_coding,pyrophosphorylase 1;(source:Araport11),,,inorganic diphosphatase activity,GO:0004427,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:31382:33009:-1 gene:AT1G01050 transcript:AT1G01050.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PPA1 description:Soluble inorganic pyrophosphatase 1 [Source:UniProtKB/Swiss-Prot;Acc:Q93V56]) & Soluble inorganic pyrophosphatase 1 OS=Arabidopsis thaliana (sp|q93v56|ipyr1_arath : 419.0),,,,,, diff --git a/spec/test/djornl/duplicate_data/pheno_nodes.csv b/spec/test/djornl/duplicate_data/pheno_nodes.csv new file mode 100644 index 00000000..9add7b7d --- /dev/null +++ b/spec/test/djornl/duplicate_data/pheno_nodes.csv @@ -0,0 +1,5 @@ +node_id,node_type,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_reference,User_Notes +As2,pheno,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010", +As75,pheno,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010", +Na23,pheno,10.21958/phenotype:5,"Sodium concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",sodium concentration,The total sodium ion concentration measured in a given volume of a plant or a plant part or plant extract. [GR:pj],"Atwell et. al, Nature 2010", +SDV,pheno,10.21958/phenotype:104,"Number of days following stratification to opening of first flower. The experiment was stopped at 200 d, and accessions that had not flowered at that point were assigned a value of 200",days to flowering trait,"A flowering time trait (TO:0002616)which is the number of days required for an individual flower (PO:0009046), a whole plant (PO:0000003) or a plant population to reach flowering stage (PO:0007616) from a predetermined time point (e.g. the date of seed sowing, seedling transplant, or seedling emergence). [GR:pj, TO:cooperl]","Atwell et. 
al, Nature 2010", diff --git a/spec/test/djornl/empty_files/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv b/spec/test/djornl/empty_files/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv new file mode 100644 index 00000000..7fe64278 --- /dev/null +++ b/spec/test/djornl/empty_files/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv @@ -0,0 +1 @@ +node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_description,GO_terms,MapMan_bin,MapMan_name,MapMan_description,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_reference,User_Notes diff --git a/spec/test/djornl/empty_files/cluster_data/comment_only.tsv b/spec/test/djornl/empty_files/cluster_data/comment_only.tsv new file mode 100644 index 00000000..9ce2fbf1 --- /dev/null +++ b/spec/test/djornl/empty_files/cluster_data/comment_only.tsv @@ -0,0 +1 @@ +# what? diff --git a/spec/test/djornl/empty_files/cluster_data/headers_only.tsv b/spec/test/djornl/empty_files/cluster_data/headers_only.tsv new file mode 100644 index 00000000..3233ca40 --- /dev/null +++ b/spec/test/djornl/empty_files/cluster_data/headers_only.tsv @@ -0,0 +1,4 @@ +cluster_id node_ids +# comment +# comment +# comment diff --git a/spec/test/djornl/empty_files/cluster_data/no_content.tsv b/spec/test/djornl/empty_files/cluster_data/no_content.tsv new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/spec/test/djornl/empty_files/cluster_data/no_content.tsv @@ -0,0 +1 @@ + diff --git a/spec/test/djornl/empty_files/manifest.yaml b/spec/test/djornl/empty_files/manifest.yaml new file mode 100644 index 00000000..498ab523 --- /dev/null +++ b/spec/test/djornl/empty_files/manifest.yaml @@ -0,0 +1,21 @@ +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + path: merged_edges-AMW-060820_AF.tsv + + - data_type: node + file_format: csv + path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv + + - data_type: cluster + cluster_prefix: markov_i2 + path: cluster_data/headers_only.tsv + + - data_type: cluster + cluster_prefix: markov_i4 + path: cluster_data/no_content.tsv + + - data_type: cluster + cluster_prefix: markov_i6 + path: cluster_data/comment_only.tsv diff --git a/spec/test/djornl/empty_files/merged_edges-AMW-060820_AF.tsv b/spec/test/djornl/empty_files/merged_edges-AMW-060820_AF.tsv new file mode 100644 index 00000000..4b2bca02 --- /dev/null +++ b/spec/test/djornl/empty_files/merged_edges-AMW-060820_AF.tsv @@ -0,0 +1,4 @@ +# this line is a comment +# so is this line +# oh no +# there's no content in this file! 
diff --git a/spec/test/djornl/invalid_file/edges.tsv/empty b/spec/test/djornl/invalid_file/edges.tsv/empty new file mode 100644 index 00000000..e69de29b diff --git a/spec/test/djornl/invalid_file/manifest.yaml b/spec/test/djornl/invalid_file/manifest.yaml new file mode 100644 index 00000000..985f458a --- /dev/null +++ b/spec/test/djornl/invalid_file/manifest.yaml @@ -0,0 +1,13 @@ +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + path: edges.tsv + + - data_type: node + file_format: csv + path: nodes.csv + + - data_type: cluster + cluster_prefix: markov_i2 + path: clusters.tsv diff --git a/spec/test/djornl/invalid_file/merged_edges-AMW-060820_AF.tsv/empty b/spec/test/djornl/invalid_file/merged_edges-AMW-060820_AF.tsv/empty new file mode 100644 index 00000000..e69de29b diff --git a/spec/test/djornl/invalid_manifest/cluster_no_prefix.yaml b/spec/test/djornl/invalid_manifest/cluster_no_prefix.yaml new file mode 100644 index 00000000..b8993731 --- /dev/null +++ b/spec/test/djornl/invalid_manifest/cluster_no_prefix.yaml @@ -0,0 +1,18 @@ +# first cluster file has no prefix +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + path: edges.tsv + date: "2020-12-25" + + - data_type: node + path: nodes.csv + date: "2019-01-01" + + - data_type: cluster + path: I2_named.tsv + + - data_type: cluster + cluster_prefix: markov_i4 + path: I4_named.tsv diff --git a/spec/test/djornl/invalid_manifest/date_not_in_quotes.yaml b/spec/test/djornl/invalid_manifest/date_not_in_quotes.yaml new file mode 100644 index 00000000..d40e9e6f --- /dev/null +++ b/spec/test/djornl/invalid_manifest/date_not_in_quotes.yaml @@ -0,0 +1,17 @@ +# edge date is not quoted (pyyaml creates a datetime.date object) +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + date: 2020-12-25 + path: edge_data + file_format: csv + + - data_type: node + path: nodes.csv + date: "2019-01-01" + + - data_type: cluster + cluster_prefix: markov_i2 + file_format: tsv + path: clusters diff --git a/spec/test/djornl/invalid_manifest/invalid_format.yaml b/spec/test/djornl/invalid_manifest/invalid_format.yaml new file mode 100644 index 00000000..125227ce --- /dev/null +++ b/spec/test/djornl/invalid_manifest/invalid_format.yaml @@ -0,0 +1,17 @@ +# invalid node file format +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + date: "2020-12-25" + path: edge_data.tsv + + - data_type: node + path: nodes.csv + date: "2019-01-01" + file_format: txt + + - data_type: cluster + cluster_prefix: markov_i2 + file_format: tsv + path: clusters diff --git a/spec/test/djornl/invalid_manifest/manifest.yaml b/spec/test/djornl/invalid_manifest/manifest.yaml new file mode 100644 index 00000000..7abfa0e2 --- /dev/null +++ b/spec/test/djornl/invalid_manifest/manifest.yaml @@ -0,0 +1,14 @@ +# multiple errors +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + path: edges.tsv + + - data_type: node + + - data_type: cluster + path: clusters.tsv + + - data_type: ping-pong balls + path: where? 
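The invalid_manifest fixtures above (and the two that follow) are each built to fail manifest validation in a different way: a cluster entry without a cluster_prefix, an unquoted date, an unsupported file_format, a missing path, a missing file_list, and an unknown data_type. A hedged sketch of such a check using jsonschema; the schema below is abridged and hypothetical (the repo's real schema may differ, e.g. it presumably also requires cluster_prefix conditionally for cluster entries):

    import datetime
    import yaml
    from jsonschema import ValidationError, validate

    # Abridged, illustrative schema -- not the repo's actual one.
    MANIFEST_SCHEMA = {
        "type": "object",
        "required": ["name", "release_date", "file_list"],
        "properties": {
            "file_list": {
                "type": "array",
                "items": {
                    "type": "object",
                    "required": ["data_type", "path"],
                    "properties": {
                        "data_type": {"enum": ["node", "edge", "cluster"]},
                        "path": {"type": "string"},
                        "date": {"type": "string"},
                        "file_format": {"enum": ["csv", "tsv"]},
                        "cluster_prefix": {"type": "string"},
                    },
                },
            },
        },
    }

    with open("spec/test/djornl/invalid_manifest/date_not_in_quotes.yaml") as fh:
        manifest = yaml.safe_load(fh)

    # As the fixture comment notes, PyYAML parses the unquoted 2020-12-25
    # into a datetime.date rather than a string ...
    assert isinstance(manifest["file_list"][0]["date"], datetime.date)

    # ... so validation rejects it against {"type": "string"}.
    try:
        validate(manifest, MANIFEST_SCHEMA)
    except ValidationError as err:
        print("manifest rejected:", err.message)

The same schema shape also explains the other failures: a missing path or file_list trips "required", while "txt" and "ping-pong balls" fall outside their enums.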
diff --git a/spec/test/djornl/invalid_manifest/missing_path.yaml b/spec/test/djornl/invalid_manifest/missing_path.yaml new file mode 100644 index 00000000..c93bec17 --- /dev/null +++ b/spec/test/djornl/invalid_manifest/missing_path.yaml @@ -0,0 +1,21 @@ +# edge file path missing +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + date: "2020-12-25" + + - data_type: node + path: nodes.csv + date: "2019-01-01" + + - data_type: cluster + cluster_prefix: markov_i2 + + - data_type: cluster + cluster_prefix: markov_i4 + path: I4_named.tsv + + - data_type: cluster + cluster_prefix: markov_i6 + path: I6_named.tsv diff --git a/spec/test/djornl/invalid_manifest/no_file_format.yaml b/spec/test/djornl/invalid_manifest/no_file_format.yaml new file mode 100644 index 00000000..ee8b9082 --- /dev/null +++ b/spec/test/djornl/invalid_manifest/no_file_format.yaml @@ -0,0 +1,16 @@ +# edge file has no indicator of file format +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + date: "2020-12-25" + path: edge_data + + - data_type: node + path: nodes.csv + date: "2019-01-01" + + - data_type: cluster + cluster_prefix: markov_i2 + file_format: tsv + path: clusters diff --git a/spec/test/djornl/invalid_manifest/no_file_list.yaml b/spec/test/djornl/invalid_manifest/no_file_list.yaml new file mode 100644 index 00000000..ebaf9fb8 --- /dev/null +++ b/spec/test/djornl/invalid_manifest/no_file_list.yaml @@ -0,0 +1,3 @@ +# missing file_list +name: Dan Jacobson Exascale data +release_date: "2020-06-06" diff --git a/spec/test/djornl/invalid_types/directed_edges.tsv b/spec/test/djornl/invalid_types/directed_edges.tsv new file mode 100644 index 00000000..d5f80dfa --- /dev/null +++ b/spec/test/djornl/invalid_types/directed_edges.tsv @@ -0,0 +1,10 @@ +# data_type: edge +node1 node2 score edge_descrip edge_type directed +As2 AT1G01020 8.4 AraGWAS-Association_score phenotype-association_AraGWAS 1 +As2 AT1G01040 5.4 AraGWAS-Association_score phenotype-association_AraGWAS true +As75 AT1G01020 39.9 AraGWAS-Association_score phenotype-association_AraGWAS "" +AT1G01010 AT1G01040 2.5 AraNetv2_log-likelihood-score domain-co-occurrence_AraNet_v2 directed +AT1G01010 AT1G01040 170.5 AraNetv2_log-likelihood-score protein-protein-interaction_literature-curated_AraNet_v2 "0" +AT1G01030 AT1G01050 2.6 AraNetv2_log-likelihood-score pairwise-gene-coexpression_AraNet_v2 false +AT1G01050 AT1G01060 2.7 AraNetv2_log-likelihood-score protein-protein-interaction_literature-curated_AraNet_v2 "1" +AT1G01080 AT1G01090 2.8 AraNetv2_log-likelihood-score protein-protein-interaction_literature-curated_AraNet_v2 0 diff --git a/spec/test/djornl/invalid_types/edges.tsv b/spec/test/djornl/invalid_types/edges.tsv new file mode 100644 index 00000000..fdb7b00d --- /dev/null +++ b/spec/test/djornl/invalid_types/edges.tsv @@ -0,0 +1,10 @@ +# data_type: edge +node1 node2 score edge_descrip edge_type +As2 AT1G01020 8.422046084731258 AraGWAS-Association_score Same-Old-Stuff + AT1G01040 6 AraGWAS-Association_score phenotype-association_AraGWAS +As75 39.98573324312915 AraGWAS-Association_score phenotype-association_AraGWAS +AT1G01010 AT1G01020 2.39322646755088 +AT1G01010 AT1G01030 2. AraNetv2_log-likelihood-score protein-protein-interaction_high-throughput_AraNet_v2 +AT1G01010 AT1G01040 "2.39322646755088" AraNetv2_log-likelihood-score raNetv2-DC_ +AT1G01030 AT1G01050 25494618241936697 AraNetv2_log-likelihood-score pairwise-gene-coexpression_AraNet_v2 +AT1G01050 AT1G01060 score! 
AraNetv2_log-likelihood-score protein-protein-interaction_literature-curated_AraNet_v2 diff --git a/spec/test/djornl/invalid_types/manifest.yaml b/spec/test/djornl/invalid_types/manifest.yaml new file mode 100644 index 00000000..6b5d90e7 --- /dev/null +++ b/spec/test/djornl/invalid_types/manifest.yaml @@ -0,0 +1,20 @@ +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + path: edges.tsv + + - data_type: edge + path: directed_edges.tsv + + - data_type: node + file_format: csv + path: nodes.csv + + - data_type: cluster + path: markov2_named.tsv + cluster_prefix: markov_i2 + + - data_type: node + file_format: csv + path: pheno_nodes.csv diff --git a/spec/test/djornl/invalid_types/markov2_named.tsv b/spec/test/djornl/invalid_types/markov2_named.tsv new file mode 100644 index 00000000..f82190fd --- /dev/null +++ b/spec/test/djornl/invalid_types/markov2_named.tsv @@ -0,0 +1,9 @@ +cluster_id node_ids +# data_type: cluster +# cluster_prefix: markov_i2 +# title: Markov clustering, inflation = 2 +Cluster1 AT1G01010,AT1G01030,AT1G01040 +Cluster2 AT1G01050,AT1G01060,AT1G01070 +HoneyNutCluster3 AT1G01080,AT1G01090 +Cluster4 +Cluster5 AT1G01020 diff --git a/spec/test/djornl/invalid_types/nodes.csv b/spec/test/djornl/invalid_types/nodes.csv new file mode 100644 index 00000000..db64e077 --- /dev/null +++ b/spec/test/djornl/invalid_types/nodes.csv @@ -0,0 +1,16 @@ +node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_description,GO_terms,MapMan_bin,MapMan_name,MapMan_description,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_reference,User_Notes +# data_type: node +As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010", +As75,pheno,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. 
al, Nature 2010", +AT1G01010,Monkey,AT1G01010.1,NTL10,NAC domain containing protein 1,protein_coding,NAC domain containing protein 1;(source:Araport11),,NAC domain containing protein 1,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.17,.RNA biosynthesis.transcriptional regulation.transcription factor (NAC),transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96]),,,,,, +AT1G01020,gene,AT1G01020.6,ARV1,,protein_coding,ARV1 family protein;(source:Araport11),,,molecular_function,GO:0003674,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:6788:8737:-1 gene:AT1G01020 transcript:AT1G01020.6 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:ARV1 description:ARV1 family protein [Source:UniProtKB/TrEMBL;Acc:Q5MK24]) & no description available(sp|q5mk24|arv1_arath : 99.4),,,,,, +A,gene,AT1G01030.2,NGA3,NGATHA3,protein_coding,AP2/B3-like transcriptional factor family protein;(source:Araport11),,,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.5.3,.RNA biosynthesis.transcriptional regulation.B3 transcription factor superfamily.transcription factor (RAV/NGATHA),transcription factor (RAV/NGATHA) (original description: pep chromosome:TAIR10:1:11649:13714:-1 gene:AT1G01030 transcript:AT1G01030.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NGA3 description:B3 domain-containing transcription factor NGA3 [Source:UniProtKB/Swiss-Prot;Acc:Q9MAN1]),,,,,, +AT1G01040,gene,AT1G01040.2,SUS1,SUSPENSOR 1,protein_coding,dicer-like 1;(source:Araport11),"Encodes a Dicer homolog. Dicer is a RNA helicase involved in microRNA processing. Mutations in this locus can result in embryo lethality. Embryo shape at seed maturity is globular-elongate. Other mutants convert the floral meristems to an indeterminate state, others yet show defects in ovule development. mRNA is expressed in all shoot tissues. DCL1 is able to produce miRNAs and siRNAs. 
The mRNA is cell-to-cell mobile.",dicer-like 1,"metal ion binding, protein binding, ribonuclease III activity, ATP-dependent helicase activity, ATP binding, RNA binding, helicase activity, double-stranded RNA binding, DNA binding","GO:0046872, GO:0005515, GO:0004525, GO:0008026, GO:0005524, GO:0003723, GO:0004386, GO:0003725, GO:0003677",16.10.2.1.1,.RNA processing.mRNA silencing.miRNA pathway.DCL1-HYL1 miRNA biogenesis complex.endoribonuclease component DCL1,endoribonuclease component DCL1 of DCL1-HYL1 miRNA biogenesis complex (original description: pep chromosome:TAIR10:1:23416:31120:1 gene:AT1G01040 transcript:AT1G01040.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:DCL1 description:Dicer-like 1 [Source:UniProtKB/TrEMBL;Acc:F4HQG6]),,,,,, +AT1G01050,gene,AT1G01050.2,PPa1,pyrophosphorylase 1,protein_coding,pyrophosphorylase 1;(source:Araport11),,,inorganic diphosphatase activity,GO:0004427,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:31382:33009:-1 gene:AT1G01050 transcript:AT1G01050.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PPA1 description:Soluble inorganic pyrophosphatase 1 [Source:UniProtKB/Swiss-Prot;Acc:Q93V56]) & Soluble inorganic pyrophosphatase 1 OS=Arabidopsis thaliana (sp|q93v56|ipyr1_arath : 419.0),,,,,, +AT1G01060,gene,AT1G01060.8,LHY1,LATE ELONGATED HYPOCOTYL 1,protein_coding,Homeodomain-like superfamily protein;(source:Araport11),,,"DNA-binding transcription factor activity, DNA binding, transcription regulatory region DNA binding","GO:0003700, GO:0003677, GO:0044212",27.1.1,.Multi-process regulation.circadian clock system.core oscillator protein (LHY|CCA1),circadian clock core oscillator protein (LHY|CCA1) (original description: pep chromosome:TAIR10:1:33967:37230:-1 gene:AT1G01060 transcript:AT1G01060.8 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:LHY description:LHY1 [Source:UniProtKB/TrEMBL;Acc:A0A178W761]),,,,,, +AT1G01070,gene,AT1G01070.2,UMAMIT28,Usually multiple acids move in and out Transporters 28,protein_coding,nodulin MtN21 /EamA-like transporter family protein;(source:Araport11),Encodes a plasma membrane-localized amino acid transporter likely involved in amino acid export in the developing seed.,nodulin MtN21 /EamA-like transporter family protein,L-glutamine transmembrane transporter activity,GO:0015186,24.2.1.5,.Solute transport.carrier-mediated transport.DMT superfamily.solute transporter (UmamiT),solute transporter (UmamiT) (original description: pep chromosome:TAIR10:1:38752:40945:-1 gene:AT1G01070 transcript:AT1G01070.2 gene_biotype:protein_coding transcript_biotype:protein_coding description:WAT1-related protein [Source:UniProtKB/TrEMBL;Acc:A0A178WFU3]),,,,,, +AT1G01080,gene,AT1G01080.3,,,protein_coding,RNA-binding (RRM/RBD/RNP motifs) family protein;(source:Araport11),,,"RNA binding, mRNA binding","GO:0003723, GO:0003729",35.1,not assigned.annotated,"(original description: pep chromosome:TAIR10:1:44970:47059:-1 gene:AT1G01080 transcript:AT1G01080.3 gene_biotype:protein_coding transcript_biotype:protein_coding description:RNA-binding (RRM/RBD/RNP motifs) family protein [Source:UniProtKB/TrEMBL;Acc:F4HQH8]) & 33 kDa ribonucleoprotein, chloroplastic OS=Nicotiana sylvestris (sp|p19684|roc5_nicsy : 109.0)",,,,,, +AT1G01090,gene,AT1G01090.1,PDH-E1 ALPHA,pyruvate dehydrogenase E1 alpha,protein_coding,pyruvate dehydrogenase E1 alpha;(source:Araport11),pyruvate dehydrogenase E1 alpha subunit,pyruvate dehydrogenase E1 alpha,pyruvate 
dehydrogenase (acetyl-transferring) activity,GO:0004739,5.1.2.2.1.1,.Lipid metabolism.fatty acid biosynthesis.acetyl-CoA generation.plastidial pyruvate dehydrogenase complex.E1 pyruvate dehydrogenase subcomplex.subunit alpha,subunit alpha of E1 pyruvate dehydrogenase component (original description: pep chromosome:TAIR10:1:47234:49304:-1 gene:AT1G01090 transcript:AT1G01090.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PDH-E1 ALPHA description:Pyruvate dehydrogenase E1 component subunit alpha [Source:UniProtKB/TrEMBL;Acc:A0A178W8A7]),,,,,, +AT1G01100,gene,AT1G01100.4,,,protein_coding,60S acidic ribosomal protein family;(source:Araport11),,60S acidic ribosomal protein family,"structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity","GO:0003735, GO:0043021, GO:0030295",17.1.2.1.46,.Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1,component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9]),,,,,, +Na23,pheno,,,,,,,,,,,,,10.21958/phenotype:5,"Sodium concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",sodium concentration,The total sodium ion concentration measured in a given volume of a plant or a plant part or plant extract. [GR:pj],"Atwell et. al, Nature 2010", +SDV,pheno,,,,,,,,,,,,,10.21958/phenotype:104,"Number of days following stratification to opening of first flower. The experiment was stopped at 200 d, and accessions that had not flowered at that point were assigned a value of 200",days to flowering trait,"A flowering time trait (TO:0002616)which is the number of days required for an individual flower (PO:0009046), a whole plant (PO:0000003) or a plant population to reach flowering stage (PO:0007616) from a predetermined time point (e.g. the date of seed sowing, seedling transplant, or seedling emergence). [GR:pj, TO:cooperl]","Atwell et. 
al, Nature 2010", diff --git a/spec/test/djornl/invalid_types/pheno_nodes.csv b/spec/test/djornl/invalid_types/pheno_nodes.csv new file mode 100644 index 00000000..d695313d --- /dev/null +++ b/spec/test/djornl/invalid_types/pheno_nodes.csv @@ -0,0 +1,4 @@ +node_id,node_type,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_reference,User_Notes +# no +# valid +# data diff --git a/spec/test/djornl/missing_files/manifest.yaml b/spec/test/djornl/missing_files/manifest.yaml new file mode 100644 index 00000000..d79e76a7 --- /dev/null +++ b/spec/test/djornl/missing_files/manifest.yaml @@ -0,0 +1,12 @@ +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + path: edges.tsv + + - data_type: node + path: nodes.csv + + - data_type: cluster + cluster_prefix: markov_i2 + path: clusters.tsv diff --git a/spec/test/djornl/missing_required_headers/I2_named.tsv b/spec/test/djornl/missing_required_headers/I2_named.tsv new file mode 100644 index 00000000..3767347f --- /dev/null +++ b/spec/test/djornl/missing_required_headers/I2_named.tsv @@ -0,0 +1,8 @@ +cluster node_list +# data_type: cluster +# cluster_prefix: markov_i2 +# title: Markov clustering, inflation = 2 +Cluster1 AT1G01010,AT1G01030,AT1G01040 +Cluster2 AT1G01050,AT1G01060,AT1G01070 +Cluster3 AT1G01090 +Cluster5 AT1G01020 diff --git a/spec/test/djornl/missing_required_headers/I4_named.tsv b/spec/test/djornl/missing_required_headers/I4_named.tsv new file mode 100644 index 00000000..1fa92267 --- /dev/null +++ b/spec/test/djornl/missing_required_headers/I4_named.tsv @@ -0,0 +1,5 @@ +cluster_id node_ids other cool stuff +# cluster_prefix: markov_i4 +# title: Markov clustering, inflation = 4 +# data_type: cluster +Cluster3 AT1G01080 diff --git a/spec/test/djornl/missing_required_headers/I6_named.tsv b/spec/test/djornl/missing_required_headers/I6_named.tsv new file mode 100644 index 00000000..85b7aa81 --- /dev/null +++ b/spec/test/djornl/missing_required_headers/I6_named.tsv @@ -0,0 +1,8 @@ +cluster_id node_ids node_ids +# data_type: cluster +# cluster_prefix: markov_i6 +# title: Markov clustering, inflation = 6 +Cluster1 AT1G01040,AT1G01090 +Cluster2 AT1G01070 +Cluster3 AT1G01010,AT1G01020,AT1G01030 +# Cluster4 diff --git a/spec/test/djornl/missing_required_headers/edges.tsv b/spec/test/djornl/missing_required_headers/edges.tsv new file mode 100644 index 00000000..468172a4 --- /dev/null +++ b/spec/test/djornl/missing_required_headers/edges.tsv @@ -0,0 +1,9 @@ +node1 node2 edge edge_descrip edge_type +As2 AT1G01020 8.4 AraGWAS-Association_score phenotype-association_AraGWAS +As2 AT1G01040 5.4 AraGWAS-Association_score phenotype-association_AraGWAS +As75 AT1G01020 39.9 AraGWAS-Association_score phenotype-association_AraGWAS +AT1G01010 AT1G01040 2.5 AraNetv2_log-likelihood-score domain-co-occurrence_AraNet_v2 +AT1G01010 AT1G01040 170.5 AraNetv2_log-likelihood-score protein-protein-interaction_literature-curated_AraNet_v2 +AT1G01030 AT1G01050 2.6 AraNetv2_log-likelihood-score pairwise-gene-coexpression_AraNet_v2 +AT1G01050 AT1G01060 2.7 AraNetv2_log-likelihood-score protein-protein-interaction_literature-curated_AraNet_v2 +AT1G01080 AT1G01090 2.8 AraNetv2_log-likelihood-score protein-protein-interaction_literature-curated_AraNet_v2 diff --git a/spec/test/djornl/missing_required_headers/extra_node.csv b/spec/test/djornl/missing_required_headers/extra_node.csv new file mode 100644 index 00000000..f33c19f7 --- /dev/null +++ b/spec/test/djornl/missing_required_headers/extra_node.csv @@ -0,0 +1,3 
@@ +# data_type: node +node_id,node_types,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_description,GO_terms,MapMan_bin,MapMan_name,MapMan_description +AT1G01100,gene,AT1G01100.4,,,protein_coding,60S acidic ribosomal protein family;(source:Araport11),,60S acidic ribosomal protein family,"structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity","GO:0003735, GO:0043021, GO:0030295",17.1.2.1.46,.Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1,component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9]) diff --git a/spec/test/djornl/missing_required_headers/extra_node.tsv b/spec/test/djornl/missing_required_headers/extra_node.tsv new file mode 100644 index 00000000..d09b410f --- /dev/null +++ b/spec/test/djornl/missing_required_headers/extra_node.tsv @@ -0,0 +1,3 @@ +# data_type: node +node_id node_types transcript gene_symbol gene_full_name gene_model_type TAIR_Computational_description TAIR_Curator_summary TAIR_short_description GO_description GO_terms MapMan_bin MapMan_name MapMan_description +AT1G01100 gene AT1G01100.4 protein_coding 60S acidic ribosomal protein family;(source:Araport11) 60S acidic ribosomal protein family structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity GO:0003735, GO:0043021, GO:0030295 17.1.2.1.46 .Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1 component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9]) diff --git a/spec/test/djornl/missing_required_headers/hithruput-edges.csv b/spec/test/djornl/missing_required_headers/hithruput-edges.csv new file mode 100644 index 00000000..e57fae35 --- /dev/null +++ b/spec/test/djornl/missing_required_headers/hithruput-edges.csv @@ -0,0 +1,3 @@ +node1,node2,score,edge_descrip,layer_descrip +AT1G01010,AT1G01020,2.3,AraNetv2_log-likelihood-score,protein-protein-interaction_high-throughput_AraNet_v2 +AT1G01010,AT1G01030,2.4,AraNetv2_log-likelihood-score,protein-protein-interaction_high-throughput_AraNet_v2 diff --git a/spec/test/djornl/missing_required_headers/manifest.yaml b/spec/test/djornl/missing_required_headers/manifest.yaml new file mode 100644 index 00000000..0d761213 --- /dev/null +++ b/spec/test/djornl/missing_required_headers/manifest.yaml @@ -0,0 +1,38 @@ +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +home_url: "https://github.com/kbase/exascale_data" +file_list: + - data_type: edge + path: edges.tsv + date: "2020-12-25" + + - data_type: edge + path: hithruput-edges.csv + date: "2020-12-25" + + - data_type: node + date: "2019-01-01" + file_format: csv + path: nodes.csv + + - data_type: cluster + cluster_prefix: markov_i2 + path: I2_named.tsv + + - data_type: cluster + cluster_prefix: markov_i4 + path: I4_named.tsv + + - data_type: cluster + cluster_prefix: markov_i6 + path: I6_named.tsv + + - data_type: node + date: "2019-01-01" + 
file_format: csv + path: extra_node.csv + + - data_type: node + date: "2019-01-01" + file_format: csv + path: pheno_nodes.csv diff --git a/spec/test/djornl/missing_required_headers/nodes.csv b/spec/test/djornl/missing_required_headers/nodes.csv new file mode 100644 index 00000000..678a6657 --- /dev/null +++ b/spec/test/djornl/missing_required_headers/nodes.csv @@ -0,0 +1,11 @@ +# data_type: node +node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_description,GO_terms,MapMan_bin,MapMan_name,MapMan_description,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_reference,User_Notes +AT1G01010,gene,AT1G01010.1,NTL10,NAC domain containing protein 1,protein_coding,NAC domain containing protein 1;(source:Araport11),,NAC domain containing protein 1,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.17,.RNA biosynthesis.transcriptional regulation.transcription factor (NAC),transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96]),,,,,, +AT1G01020,gene,AT1G01020.6,ARV1,,protein_coding,ARV1 family protein;(source:Araport11),,,molecular_function,GO:0003674,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:6788:8737:-1 gene:AT1G01020 transcript:AT1G01020.6 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:ARV1 description:ARV1 family protein [Source:UniProtKB/TrEMBL;Acc:Q5MK24]) & no description available(sp|q5mk24|arv1_arath : 99.4),,,,,, +AT1G01030,gene,AT1G01030.2,NGA3,NGATHA3,protein_coding,AP2/B3-like transcriptional factor family protein;(source:Araport11),,,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.5.3,.RNA biosynthesis.transcriptional regulation.B3 transcription factor superfamily.transcription factor (RAV/NGATHA),transcription factor (RAV/NGATHA) (original description: pep chromosome:TAIR10:1:11649:13714:-1 gene:AT1G01030 transcript:AT1G01030.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NGA3 description:B3 domain-containing transcription factor NGA3 [Source:UniProtKB/Swiss-Prot;Acc:Q9MAN1]),,,,,, +AT1G01040,gene,AT1G01040.2,SUS1,SUSPENSOR 1,protein_coding,dicer-like 1;(source:Araport11),"Encodes a Dicer homolog. Dicer is a RNA helicase involved in microRNA processing. Mutations in this locus can result in embryo lethality. Embryo shape at seed maturity is globular-elongate. Other mutants convert the floral meristems to an indeterminate state, others yet show defects in ovule development. mRNA is expressed in all shoot tissues. DCL1 is able to produce miRNAs and siRNAs. 
The mRNA is cell-to-cell mobile.",dicer-like 1,"metal ion binding, protein binding, ribonuclease III activity, ATP-dependent helicase activity, ATP binding, RNA binding, helicase activity, double-stranded RNA binding, DNA binding","GO:0046872, GO:0005515, GO:0004525, GO:0008026, GO:0005524, GO:0003723, GO:0004386, GO:0003725, GO:0003677",16.10.2.1.1,.RNA processing.mRNA silencing.miRNA pathway.DCL1-HYL1 miRNA biogenesis complex.endoribonuclease component DCL1,endoribonuclease component DCL1 of DCL1-HYL1 miRNA biogenesis complex (original description: pep chromosome:TAIR10:1:23416:31120:1 gene:AT1G01040 transcript:AT1G01040.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:DCL1 description:Dicer-like 1 [Source:UniProtKB/TrEMBL;Acc:F4HQG6]),,,,,, +AT1G01050,gene,AT1G01050.2,PPa1,pyrophosphorylase 1,protein_coding,pyrophosphorylase 1;(source:Araport11),,,inorganic diphosphatase activity,GO:0004427,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:31382:33009:-1 gene:AT1G01050 transcript:AT1G01050.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PPA1 description:Soluble inorganic pyrophosphatase 1 [Source:UniProtKB/Swiss-Prot;Acc:Q93V56]) & Soluble inorganic pyrophosphatase 1 OS=Arabidopsis thaliana (sp|q93v56|ipyr1_arath : 419.0),,,,,, +AT1G01060,gene,AT1G01060.8,LHY1,LATE ELONGATED HYPOCOTYL 1,protein_coding,Homeodomain-like superfamily protein;(source:Araport11),,,"DNA-binding transcription factor activity, DNA binding, transcription regulatory region DNA binding","GO:0003700, GO:0003677, GO:0044212",27.1.1,.Multi-process regulation.circadian clock system.core oscillator protein (LHY|CCA1),circadian clock core oscillator protein (LHY|CCA1) (original description: pep chromosome:TAIR10:1:33967:37230:-1 gene:AT1G01060 transcript:AT1G01060.8 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:LHY description:LHY1 [Source:UniProtKB/TrEMBL;Acc:A0A178W761]),,,,,, +AT1G01070,gene,AT1G01070.2,UMAMIT28,Usually multiple acids move in and out Transporters 28,protein_coding,nodulin MtN21 /EamA-like transporter family protein;(source:Araport11),Encodes a plasma membrane-localized amino acid transporter likely involved in amino acid export in the developing seed.,nodulin MtN21 /EamA-like transporter family protein,L-glutamine transmembrane transporter activity,GO:0015186,24.2.1.5,.Solute transport.carrier-mediated transport.DMT superfamily.solute transporter (UmamiT),solute transporter (UmamiT) (original description: pep chromosome:TAIR10:1:38752:40945:-1 gene:AT1G01070 transcript:AT1G01070.2 gene_biotype:protein_coding transcript_biotype:protein_coding description:WAT1-related protein [Source:UniProtKB/TrEMBL;Acc:A0A178WFU3]),,,,,, +AT1G01080,gene,AT1G01080.3,,,protein_coding,RNA-binding (RRM/RBD/RNP motifs) family protein;(source:Araport11),,,"RNA binding, mRNA binding","GO:0003723, GO:0003729",35.1,not assigned.annotated,"(original description: pep chromosome:TAIR10:1:44970:47059:-1 gene:AT1G01080 transcript:AT1G01080.3 gene_biotype:protein_coding transcript_biotype:protein_coding description:RNA-binding (RRM/RBD/RNP motifs) family protein [Source:UniProtKB/TrEMBL;Acc:F4HQH8]) & 33 kDa ribonucleoprotein, chloroplastic OS=Nicotiana sylvestris (sp|p19684|roc5_nicsy : 109.0)",,,,,, +AT1G01090,gene,AT1G01090.1,PDH-E1 ALPHA,pyruvate dehydrogenase E1 alpha,protein_coding,pyruvate dehydrogenase E1 alpha;(source:Araport11),pyruvate dehydrogenase E1 alpha subunit,pyruvate dehydrogenase E1 alpha,"pyruvate 
dehydrogenase (acetyl-transferring) activity, protein binding","GO:0004739, GO:0005515",5.1.2.2.1.1,.Lipid metabolism.fatty acid biosynthesis.acetyl-CoA generation.plastidial pyruvate dehydrogenase complex.E1 pyruvate dehydrogenase subcomplex.subunit alpha,subunit alpha of E1 pyruvate dehydrogenase component (original description: pep chromosome:TAIR10:1:47234:49304:-1 gene:AT1G01090 transcript:AT1G01090.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PDH-E1 ALPHA description:Pyruvate dehydrogenase E1 component subunit alpha [Source:UniProtKB/TrEMBL;Acc:A0A178W8A7]),,,,,, diff --git a/spec/test/djornl/missing_required_headers/pheno_nodes.csv b/spec/test/djornl/missing_required_headers/pheno_nodes.csv new file mode 100644 index 00000000..f7ba6de1 --- /dev/null +++ b/spec/test/djornl/missing_required_headers/pheno_nodes.csv @@ -0,0 +1,5 @@ +id,node_type,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_ref,UserNotes +As2,pheno,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010", +As75,pheno,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010", +Na23,pheno,10.21958/phenotype:5,"Sodium concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",sodium concentration,The total sodium ion concentration measured in a given volume of a plant or a plant part or plant extract. [GR:pj],"Atwell et. al, Nature 2010", +SDV,pheno,10.21958/phenotype:104,"Number of days following stratification to opening of first flower. The experiment was stopped at 200 d, and accessions that had not flowered at that point were assigned a value of 200",days to flowering trait,"A flowering time trait (TO:0002616)which is the number of days required for an individual flower (PO:0009046), a whole plant (PO:0000003) or a plant population to reach flowering stage (PO:0007616) from a predetermined time point (e.g. the date of seed sowing, seedling transplant, or seedling emergence). [GR:pj, TO:cooperl]","Atwell et. 
al, Nature 2010", diff --git a/spec/test/djornl/results.json b/spec/test/djornl/results.json new file mode 100644 index 00000000..5b694916 --- /dev/null +++ b/spec/test/djornl/results.json @@ -0,0 +1,889 @@ +{ + "load_edges": { + "nodes": [ + {"_key": "As2"}, + {"_key": "AT1G01020"}, + {"_key": "AT1G01040"}, + {"_key": "As75"}, + {"_key": "AT1G01010"}, + {"_key": "AT1G01030"}, + {"_key": "AT1G01050"}, + {"_key": "AT1G01060"}, + {"_key": "AT1G01080"}, + {"_key": "AT1G01090"}, + {"_key": "AT1G01100"}, + {"_key": "SDV"} + ], + "edges": [ + { + "_key": "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "_from": "djornl_node/As2", + "_to": "djornl_node/AT1G01020", + "edge_type": "phenotype-association_AraGWAS", + "score": 8.4, + "directed": false + }, + { + "_key": "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4", + "_from": "djornl_node/As2", + "_to": "djornl_node/AT1G01040", + "edge_type": "phenotype-association_AraGWAS", + "score": 5.4, "directed": false + }, + { + "_key": "As75__AT1G01020__phenotype-association_AraGWAS__False__39.9", + "_from": "djornl_node/As75", + "_to": "djornl_node/AT1G01020", + "edge_type": "phenotype-association_AraGWAS", + "score": 39.9, + "directed": false + }, + { + "_key": "AT1G01010__AT1G01020__protein-protein-interaction_high-throughput_AraNet_v2__False__2.3", + "_from": "djornl_node/AT1G01010", + "_to": "djornl_node/AT1G01020", + "edge_type": "protein-protein-interaction_high-throughput_AraNet_v2", + "score": 2.3, + "directed": false + }, + { + "_key": "AT1G01010__AT1G01030__protein-protein-interaction_high-throughput_AraNet_v2__False__2.4", + "_from": "djornl_node/AT1G01010", + "_to": "djornl_node/AT1G01030", + "edge_type": "protein-protein-interaction_high-throughput_AraNet_v2", + "score": 2.4, + "directed": false + }, + { + "_key": "AT1G01010__AT1G01040__domain-co-occurrence_AraNet_v2__False__2.5", + "_from": "djornl_node/AT1G01010", + "_to": "djornl_node/AT1G01040", + "edge_type": "domain-co-occurrence_AraNet_v2", + "score": 2.5, + "directed": false + }, + { + "_key": "AT1G01010__AT1G01040__protein-protein-interaction_literature-curated_AraNet_v2__False__170.5", + "_from": "djornl_node/AT1G01010", + "_to": "djornl_node/AT1G01040", + "edge_type": "protein-protein-interaction_literature-curated_AraNet_v2", + "score": 170.5, + "directed": false + }, + { + "_key": "AT1G01030__AT1G01050__pairwise-gene-coexpression_AraNet_v2__False__2.6", + "_from": "djornl_node/AT1G01030", + "_to": "djornl_node/AT1G01050", + "edge_type": "pairwise-gene-coexpression_AraNet_v2", + "score": 2.6, + "directed": false + }, + { + "_key": "AT1G01050__AT1G01060__protein-protein-interaction_literature-curated_AraNet_v2__False__2.7", + "_from": "djornl_node/AT1G01050", + "_to": "djornl_node/AT1G01060", + "edge_type": "protein-protein-interaction_literature-curated_AraNet_v2", + "score": 2.7, + "directed": false + }, + { + "_key": "AT1G01080__AT1G01090__protein-protein-interaction_literature-curated_AraNet_v2__False__2.8", + "_from": "djornl_node/AT1G01080", + "_to": "djornl_node/AT1G01090", + "edge_type": "protein-protein-interaction_literature-curated_AraNet_v2", + "score": 2.8, + "directed": false + }, + { + "_key": "AT1G01100__SDV__protein-protein-interaction_literature-curated_AraNet_v2__True__2.4", + "_from": "djornl_node/AT1G01100", + "_to": "djornl_node/SDV", + "edge_type": "protein-protein-interaction_literature-curated_AraNet_v2", + "score": 2.4, + "directed": true + }, + { + "_key": 
"SDV__AT1G01100__protein-protein-interaction_literature-curated_AraNet_v2__False__8.4", + "_from": "djornl_node/SDV", + "_to": "djornl_node/AT1G01100", + "edge_type": "protein-protein-interaction_literature-curated_AraNet_v2", + "score": 8.4, + "directed": false + }, + { + "_key": "SDV__AT1G01100__protein-protein-interaction_literature-curated_AraNet_v2__True__5.4", + "_from": "djornl_node/SDV", + "_to": "djornl_node/AT1G01100", + "edge_type": "protein-protein-interaction_literature-curated_AraNet_v2", + "score": 5.4, + "directed": true + } + ] + }, + "load_clusters": { + "nodes": [ + {"_key": "AT1G01010", "clusters": ["markov_i2:1", "markov_i6:3"]}, + {"_key": "AT1G01030", "clusters": ["markov_i2:1", "markov_i6:3"]}, + {"_key": "AT1G01040", "clusters": ["markov_i2:1", "markov_i6:1"]}, + {"_key": "AT1G01050", "clusters": ["markov_i2:2"]}, + {"_key": "AT1G01060", "clusters": ["markov_i2:2"]}, + {"_key": "AT1G01070", "clusters": ["markov_i2:2", "markov_i6:2"]}, + {"_key": "AT1G01090", "clusters": ["markov_i2:3", "markov_i6:1"]}, + {"_key": "AT1G01020", "clusters": ["markov_i2:5", "markov_i6:3"]}, + {"_key": "AT1G01080", "clusters": ["markov_i4:3"]} + ] + }, + "load_nodes": { + "nodes": [ + {"_key": "As2", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_description": "", "tair_curator_summary": "", "tair_short_description": "", "go_description": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_description": "", "pheno_aragwas_id": "10.21958/phenotype:103", "pheno_description": "", "pheno_pto_name": "bacterial disease resistance", "pheno_pto_description": "The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj]", "pheno_reference": "Atwell et. al, Nature 2010", "user_notes": ""}, + {"_key": "As75", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_description": "", "tair_curator_summary": "", "tair_short_description": "", "go_description": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_description": "", "pheno_aragwas_id": "10.21958/phenotype:67", "pheno_description": "Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008", "pheno_pto_name": "arsenic concentration", "pheno_pto_description": "A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik]", "pheno_reference": "Atwell et. 
al, Nature 2010", "user_notes": ""}, + {"_key": "AT1G01010", "node_type": "gene", "transcript": "AT1G01010.1", "gene_symbol": "NTL10", "gene_full_name": "NAC domain containing protein 1", "gene_model_type": "protein_coding", "tair_computational_description": "NAC domain containing protein 1;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "NAC domain containing protein 1", "go_description": "DNA-binding transcription factor activity, DNA binding", "go_terms": ["GO:0003700", "GO:0003677"], "mapman_bin": "15.5.17", "mapman_name": ".RNA biosynthesis.transcriptional regulation.transcription factor (NAC)", "mapman_description": "transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_reference": "", "user_notes": ""}, + {"_key": "AT1G01020", "node_type": "gene", "transcript": "AT1G01020.6", "gene_symbol": "ARV1", "gene_full_name": "", "gene_model_type": "protein_coding", "tair_computational_description": "ARV1 family protein;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "", "go_description": "molecular_function", "go_terms": ["GO:0003674"], "mapman_bin": "35.1", "mapman_name": "not assigned.annotated", "mapman_description": "(original description: pep chromosome:TAIR10:1:6788:8737:-1 gene:AT1G01020 transcript:AT1G01020.6 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:ARV1 description:ARV1 family protein [Source:UniProtKB/TrEMBL;Acc:Q5MK24]) & no description available(sp|q5mk24|arv1_arath : 99.4)", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_reference": "", "user_notes": ""}, + {"_key": "AT1G01030", "node_type": "gene", "transcript": "AT1G01030.2", "gene_symbol": "NGA3", "gene_full_name": "NGATHA3", "gene_model_type": "protein_coding", "tair_computational_description": "AP2/B3-like transcriptional factor family protein;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "", "go_description": "DNA-binding transcription factor activity, DNA binding", "go_terms": ["GO:0003700", "GO:0003677"], "mapman_bin": "15.5.5.3", "mapman_name": ".RNA biosynthesis.transcriptional regulation.B3 transcription factor superfamily.transcription factor (RAV/NGATHA)", "mapman_description": "transcription factor (RAV/NGATHA) (original description: pep chromosome:TAIR10:1:11649:13714:-1 gene:AT1G01030 transcript:AT1G01030.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NGA3 description:B3 domain-containing transcription factor NGA3 [Source:UniProtKB/Swiss-Prot;Acc:Q9MAN1])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_reference": "", "user_notes": ""}, + {"_key": "AT1G01040", "node_type": "gene", "transcript": "AT1G01040.2", "gene_symbol": "SUS1", "gene_full_name": "SUSPENSOR 1", "gene_model_type": "protein_coding", "tair_computational_description": "dicer-like 1;(source:Araport11)", "tair_curator_summary": "Encodes a Dicer homolog. Dicer is a RNA helicase involved in microRNA processing. Mutations in this locus can result in embryo lethality. Embryo shape at seed maturity is globular-elongate. 
Other mutants convert the floral meristems to an indeterminate state, others yet show defects in ovule development. mRNA is expressed in all shoot tissues. DCL1 is able to produce miRNAs and siRNAs. The mRNA is cell-to-cell mobile.", "tair_short_description": "dicer-like 1", "go_description": "metal ion binding, protein binding, ribonuclease III activity, ATP-dependent helicase activity, ATP binding, RNA binding, helicase activity, double-stranded RNA binding, DNA binding", "go_terms": ["GO:0046872", "GO:0005515", "GO:0004525", "GO:0008026", "GO:0005524", "GO:0003723", "GO:0004386", "GO:0003725", "GO:0003677"], "mapman_bin": "16.10.2.1.1", "mapman_name": ".RNA processing.mRNA silencing.miRNA pathway.DCL1-HYL1 miRNA biogenesis complex.endoribonuclease component DCL1", "mapman_description": "endoribonuclease component DCL1 of DCL1-HYL1 miRNA biogenesis complex (original description: pep chromosome:TAIR10:1:23416:31120:1 gene:AT1G01040 transcript:AT1G01040.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:DCL1 description:Dicer-like 1 [Source:UniProtKB/TrEMBL;Acc:F4HQG6])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_reference": "", "user_notes": ""}, + {"_key": "AT1G01050", "node_type": "gene", "transcript": "AT1G01050.2", "gene_symbol": "PPa1", "gene_full_name": "pyrophosphorylase 1", "gene_model_type": "protein_coding", "tair_computational_description": "pyrophosphorylase 1;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "", "go_description": "inorganic diphosphatase activity", "go_terms": ["GO:0004427"], "mapman_bin": "35.1", "mapman_name": "not assigned.annotated", "mapman_description": "(original description: pep chromosome:TAIR10:1:31382:33009:-1 gene:AT1G01050 transcript:AT1G01050.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PPA1 description:Soluble inorganic pyrophosphatase 1 [Source:UniProtKB/Swiss-Prot;Acc:Q93V56]) & Soluble inorganic pyrophosphatase 1 OS=Arabidopsis thaliana (sp|q93v56|ipyr1_arath : 419.0)", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_reference": "", "user_notes": ""}, + {"_key": "AT1G01060", "node_type": "gene", "transcript": "AT1G01060.8", "gene_symbol": "LHY1", "gene_full_name": "LATE ELONGATED HYPOCOTYL 1", "gene_model_type": "protein_coding", "tair_computational_description": "Homeodomain-like superfamily protein;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "", "go_description": "DNA-binding transcription factor activity, DNA binding, transcription regulatory region DNA binding", "go_terms": ["GO:0003700", "GO:0003677", "GO:0044212"], "mapman_bin": "27.1.1", "mapman_name": ".Multi-process regulation.circadian clock system.core oscillator protein (LHY|CCA1)", "mapman_description": "circadian clock core oscillator protein (LHY|CCA1) (original description: pep chromosome:TAIR10:1:33967:37230:-1 gene:AT1G01060 transcript:AT1G01060.8 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:LHY description:LHY1 [Source:UniProtKB/TrEMBL;Acc:A0A178W761])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_reference": "", "user_notes": ""}, + {"_key": "AT1G01070", "node_type": "gene", "transcript": "AT1G01070.2", "gene_symbol": "UMAMIT28", "gene_full_name": "Usually multiple acids move in and out Transporters 28", "gene_model_type": "protein_coding", 
"tair_computational_description": "nodulin MtN21 /EamA-like transporter family protein;(source:Araport11)", "tair_curator_summary": "Encodes a plasma membrane-localized amino acid transporter likely involved in amino acid export in the developing seed.", "tair_short_description": "nodulin MtN21 /EamA-like transporter family protein", "go_description": "L-glutamine transmembrane transporter activity", "go_terms": ["GO:0015186"], "mapman_bin": "24.2.1.5", "mapman_name": ".Solute transport.carrier-mediated transport.DMT superfamily.solute transporter (UmamiT)", "mapman_description": "solute transporter (UmamiT) (original description: pep chromosome:TAIR10:1:38752:40945:-1 gene:AT1G01070 transcript:AT1G01070.2 gene_biotype:protein_coding transcript_biotype:protein_coding description:WAT1-related protein [Source:UniProtKB/TrEMBL;Acc:A0A178WFU3])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_reference": "", "user_notes": ""}, + {"_key": "AT1G01080", "node_type": "gene", "transcript": "AT1G01080.3", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "protein_coding", "tair_computational_description": "RNA-binding (RRM/RBD/RNP motifs) family protein;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "", "go_description": "RNA binding, mRNA binding", "go_terms": ["GO:0003723", "GO:0003729"], "mapman_bin": "35.1", "mapman_name": "not assigned.annotated", "mapman_description": "(original description: pep chromosome:TAIR10:1:44970:47059:-1 gene:AT1G01080 transcript:AT1G01080.3 gene_biotype:protein_coding transcript_biotype:protein_coding description:RNA-binding (RRM/RBD/RNP motifs) family protein [Source:UniProtKB/TrEMBL;Acc:F4HQH8]) & 33 kDa ribonucleoprotein, chloroplastic OS=Nicotiana sylvestris (sp|p19684|roc5_nicsy : 109.0)", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_reference": "", "user_notes": ""}, + {"_key": "AT1G01090", "node_type": "gene", "transcript": "AT1G01090.1", "gene_symbol": "PDH-E1 ALPHA", "gene_full_name": "pyruvate dehydrogenase E1 alpha", "gene_model_type": "protein_coding", "tair_computational_description": "pyruvate dehydrogenase E1 alpha;(source:Araport11)", "tair_curator_summary": "pyruvate dehydrogenase E1 alpha subunit", "tair_short_description": "pyruvate dehydrogenase E1 alpha", "go_description": "pyruvate dehydrogenase (acetyl-transferring) activity, protein binding", "go_terms": ["GO:0004739", "GO:0005515"], "mapman_bin": "5.1.2.2.1.1", "mapman_name": ".Lipid metabolism.fatty acid biosynthesis.acetyl-CoA generation.plastidial pyruvate dehydrogenase complex.E1 pyruvate dehydrogenase subcomplex.subunit alpha", "mapman_description": "subunit alpha of E1 pyruvate dehydrogenase component (original description: pep chromosome:TAIR10:1:47234:49304:-1 gene:AT1G01090 transcript:AT1G01090.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PDH-E1 ALPHA description:Pyruvate dehydrogenase E1 component subunit alpha [Source:UniProtKB/TrEMBL;Acc:A0A178W8A7])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_reference": "", "user_notes": ""}, + {"_key": "AT1G01100", "node_type": "gene", "transcript": "AT1G01100.4", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "protein_coding", "tair_computational_description": "60S acidic ribosomal protein family;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "60S 
acidic ribosomal protein family", "go_description": "structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity", "go_terms": ["GO:0003735", "GO:0043021", "GO:0030295"], "mapman_bin": "17.1.2.1.46", "mapman_name": ".Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1", "mapman_description": "component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_reference": "", "user_notes": ""}, + {"_key": "Na23", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_description": "", "tair_curator_summary": "", "tair_short_description": "", "go_description": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_description": "", "pheno_aragwas_id": "10.21958/phenotype:5", "pheno_description": "Sodium concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008", "pheno_pto_name": "sodium concentration", "pheno_pto_description": "The total sodium ion concentration measured in a given volume of a plant or a plant part or plant extract. [GR:pj]", "pheno_reference": "Atwell et. al, Nature 2010", "user_notes": ""}, + {"_key": "SDV", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_description": "", "tair_curator_summary": "", "tair_short_description": "", "go_description": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_description": "", "pheno_aragwas_id": "10.21958/phenotype:104", "pheno_description": "Number of days following stratification to opening of first flower. The experiment was stopped at 200 d, and accessions that had not flowered at that point were assigned a value of 200", "pheno_pto_name": "days to flowering trait", "pheno_pto_description": "A flowering time trait (TO:0002616)which is the number of days required for an individual flower (PO:0009046), a whole plant (PO:0000003) or a plant population to reach flowering stage (PO:0007616) from a predetermined time point (e.g. the date of seed sowing, seedling transplant, or seedling emergence). [GR:pj, TO:cooperl]", "pheno_reference": "Atwell et. 
al, Nature 2010", "user_notes": ""} + ] + }, + "queries": { + "djornl_fetch_phenotype": [ + { + "params": {"phenotype_keys": ["A", "B", "C"]}, + "error": { + "details": "Stored query 'djornl_fetch_phenotype' does not exist.", + "message": "Not found", + "name": "djornl_fetch_phenotype" + } + } + ], + "djornl_fetch_all": [ + { + "params": {}, + "results": { + "nodes": [ + "As2", + "As75", + "AT1G01010", + "AT1G01020", + "AT1G01030", + "AT1G01040", + "AT1G01050", + "AT1G01060", + "AT1G01070", + "AT1G01080", + "AT1G01090", + "AT1G01100", + "Na23", + "SDV" + ], + "edges": [ + "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4", + "As75__AT1G01020__phenotype-association_AraGWAS__False__39.9", + "AT1G01010__AT1G01020__protein-protein-interaction_high-throughput_AraNet_v2__False__2.3", + "AT1G01010__AT1G01030__protein-protein-interaction_high-throughput_AraNet_v2__False__2.4", + "AT1G01010__AT1G01040__domain-co-occurrence_AraNet_v2__False__2.5", + "AT1G01010__AT1G01040__protein-protein-interaction_literature-curated_AraNet_v2__False__170.5", + "AT1G01030__AT1G01050__pairwise-gene-coexpression_AraNet_v2__False__2.6", + "AT1G01050__AT1G01060__protein-protein-interaction_literature-curated_AraNet_v2__False__2.7", + "AT1G01080__AT1G01090__protein-protein-interaction_literature-curated_AraNet_v2__False__2.8", + "AT1G01100__SDV__protein-protein-interaction_literature-curated_AraNet_v2__True__2.4", + "SDV__AT1G01100__protein-protein-interaction_literature-curated_AraNet_v2__False__8.4", + "SDV__AT1G01100__protein-protein-interaction_literature-curated_AraNet_v2__True__5.4" + ] + } + }, + { + "params": {"musical": "Mary Poppins"}, + "error": { + "failed_validator": "additionalProperties", + "message": "Additional properties are not allowed ('musical' was unexpected)", + "path": [], + "value": {"musical": "Mary Poppins"} + } + }, + { + "params": {"edge_types": ["straight", "curved"]}, + "error": { + "failed_validator": "oneOf", + "message": "'straight' is not valid under any of the given schemas", + "path": ["edge_types", 0], + "value": "straight" + } + }, + { + "params": {"edge_types": []}, + "results": { + "nodes": [ + "As2", + "As75", + "AT1G01010", + "AT1G01020", + "AT1G01030", + "AT1G01040", + "AT1G01050", + "AT1G01060", + "AT1G01070", + "AT1G01080", + "AT1G01090", + "AT1G01100", + "Na23", + "SDV" + ], + "edges": [ + "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4", + "As75__AT1G01020__phenotype-association_AraGWAS__False__39.9", + "AT1G01010__AT1G01020__protein-protein-interaction_high-throughput_AraNet_v2__False__2.3", + "AT1G01010__AT1G01030__protein-protein-interaction_high-throughput_AraNet_v2__False__2.4", + "AT1G01010__AT1G01040__domain-co-occurrence_AraNet_v2__False__2.5", + "AT1G01010__AT1G01040__protein-protein-interaction_literature-curated_AraNet_v2__False__170.5", + "AT1G01030__AT1G01050__pairwise-gene-coexpression_AraNet_v2__False__2.6", + "AT1G01050__AT1G01060__protein-protein-interaction_literature-curated_AraNet_v2__False__2.7", + "AT1G01080__AT1G01090__protein-protein-interaction_literature-curated_AraNet_v2__False__2.8", + "AT1G01100__SDV__protein-protein-interaction_literature-curated_AraNet_v2__True__2.4", + "SDV__AT1G01100__protein-protein-interaction_literature-curated_AraNet_v2__False__8.4", + "SDV__AT1G01100__protein-protein-interaction_literature-curated_AraNet_v2__True__5.4" + ] + } + }, + { + "params": {"edge_types": 
["phenotype-association_AraGWAS"]}, + "results": { + "nodes": [ + "As2", + "As75", + "AT1G01010", + "AT1G01020", + "AT1G01030", + "AT1G01040", + "AT1G01050", + "AT1G01060", + "AT1G01070", + "AT1G01080", + "AT1G01090", + "AT1G01100", + "Na23", + "SDV" + ], + "edges": [ + "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4", + "As75__AT1G01020__phenotype-association_AraGWAS__False__39.9" + ] + } + }, + { + "params": {"edge_types": ["phenotype-association_AraGWAS", "protein-protein-interaction_high-throughput_AraNet_v2", "protein-protein-interaction_literature-curated_AraNet_v2"]}, + "results": { + "nodes": [ + "As2", + "As75", + "AT1G01010", + "AT1G01020", + "AT1G01030", + "AT1G01040", + "AT1G01050", + "AT1G01060", + "AT1G01070", + "AT1G01080", + "AT1G01090", + "AT1G01100", + "Na23", + "SDV" + ], + "edges": [ + "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4", + "As75__AT1G01020__phenotype-association_AraGWAS__False__39.9", + "AT1G01010__AT1G01020__protein-protein-interaction_high-throughput_AraNet_v2__False__2.3", + "AT1G01010__AT1G01030__protein-protein-interaction_high-throughput_AraNet_v2__False__2.4", + "AT1G01010__AT1G01040__protein-protein-interaction_literature-curated_AraNet_v2__False__170.5", + "AT1G01050__AT1G01060__protein-protein-interaction_literature-curated_AraNet_v2__False__2.7", + "AT1G01080__AT1G01090__protein-protein-interaction_literature-curated_AraNet_v2__False__2.8", + "AT1G01100__SDV__protein-protein-interaction_literature-curated_AraNet_v2__True__2.4", + "SDV__AT1G01100__protein-protein-interaction_literature-curated_AraNet_v2__False__8.4", + "SDV__AT1G01100__protein-protein-interaction_literature-curated_AraNet_v2__True__5.4" + ] + } + } + ], + "djornl_fetch_genes": [ + { + "params": { "distance": 0 }, + "error": { + "failed_validator": "required", + "message": "'gene_keys' is a required property", + "path": [], + "value": {"distance": 0} + } + }, + { + "params": {"gene_keys": []}, + "error": { + "failed_validator": "minItems", + "message": "[] is too short", + "path": ["gene_keys"], + "value": [] + } + }, + { + "params": { "gene_keys": ["Mary Poppins"], "phenotype_keys": 0 }, + "error": { + "failed_validator": "additionalProperties", + "message": "Additional properties are not allowed ('phenotype_keys' was unexpected)", + "path": [], + "value": {"gene_keys": ["Mary Poppins"], "phenotype_keys": 0} + } + }, + { + "params": { "gene_keys": ["Mary Poppins"], "distance": 0 }, + "results": {"nodes": [], "edges": []} + }, + { + "params": { "gene_keys": ["Mary Poppins"], "distance": 1 }, + "results": {"nodes": [], "edges": []} + }, + { + "params": { "gene_keys": ["Mary Poppins"], "distance": 5 }, + "results": {"nodes": [], "edges": []} + }, + { + "params": { "gene_keys": ["AT1G01010"], "distance": 0 }, + "results": { + "nodes": ["AT1G01010"], + "edges": [] + } + }, + { + "params": { "gene_keys": ["AT1G01010"], "distance": 1 }, + "results": { + "nodes": [ + "AT1G01010", + "AT1G01020", + "AT1G01030", + "AT1G01040" + ], + "edges": [ + "AT1G01010__AT1G01020__protein-protein-interaction_high-throughput_AraNet_v2__False__2.3", + "AT1G01010__AT1G01030__protein-protein-interaction_high-throughput_AraNet_v2__False__2.4", + "AT1G01010__AT1G01040__domain-co-occurrence_AraNet_v2__False__2.5", + "AT1G01010__AT1G01040__protein-protein-interaction_literature-curated_AraNet_v2__False__170.5" + ] + } + }, + { + "params": { "gene_keys": 
["AT1G01010"], "distance": 5 }, + "results": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"], + "edges": [ + "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4", + "As75__AT1G01020__phenotype-association_AraGWAS__False__39.9", + "AT1G01010__AT1G01020__protein-protein-interaction_high-throughput_AraNet_v2__False__2.3", + "AT1G01010__AT1G01030__protein-protein-interaction_high-throughput_AraNet_v2__False__2.4", + "AT1G01010__AT1G01040__domain-co-occurrence_AraNet_v2__False__2.5", + "AT1G01010__AT1G01040__protein-protein-interaction_literature-curated_AraNet_v2__False__170.5", + "AT1G01030__AT1G01050__pairwise-gene-coexpression_AraNet_v2__False__2.6", + "AT1G01050__AT1G01060__protein-protein-interaction_literature-curated_AraNet_v2__False__2.7" + ] + } + }, + { + "params": { "gene_keys": ["AT1G01010"], "distance": 5 }, + "results": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"], + "edges": [ + "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4", + "As75__AT1G01020__phenotype-association_AraGWAS__False__39.9", + "AT1G01010__AT1G01020__protein-protein-interaction_high-throughput_AraNet_v2__False__2.3", + "AT1G01010__AT1G01030__protein-protein-interaction_high-throughput_AraNet_v2__False__2.4", + "AT1G01010__AT1G01040__domain-co-occurrence_AraNet_v2__False__2.5", + "AT1G01010__AT1G01040__protein-protein-interaction_literature-curated_AraNet_v2__False__170.5", + "AT1G01030__AT1G01050__pairwise-gene-coexpression_AraNet_v2__False__2.6", + "AT1G01050__AT1G01060__protein-protein-interaction_literature-curated_AraNet_v2__False__2.7" + ] + } + }, + { + "params": {"gene_keys": ["AT1G01020", "AT1G01070"], "distance": 0 }, + "results": { + "nodes": ["AT1G01020", "AT1G01070"], + "edges": [] + } + }, + { + "params": {"gene_keys": ["AT1G01020", "AT1G01070"], "distance": 1 }, + "results": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"], + "edges": [ + "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "As75__AT1G01020__phenotype-association_AraGWAS__False__39.9", + "AT1G01010__AT1G01020__protein-protein-interaction_high-throughput_AraNet_v2__False__2.3" + ] + } + }, + { + "params": {"gene_keys": ["AT1G01020", "AT1G01070"], "distance": 1, "edge_types": ["phenotype-association_AraGWAS"] }, + "results": { + "nodes": ["As2", "As75", "AT1G01020", "AT1G01070"], + "edges": [ + "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "As75__AT1G01020__phenotype-association_AraGWAS__False__39.9" + ] + } + }, + { + "params": {"gene_keys": ["AT1G01020", "AT1G01070"], "distance": 5 }, + "results": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"], + "edges": [ + "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4", + "As75__AT1G01020__phenotype-association_AraGWAS__False__39.9", + "AT1G01010__AT1G01020__protein-protein-interaction_high-throughput_AraNet_v2__False__2.3", + "AT1G01010__AT1G01030__protein-protein-interaction_high-throughput_AraNet_v2__False__2.4", + "AT1G01010__AT1G01040__domain-co-occurrence_AraNet_v2__False__2.5", + "AT1G01010__AT1G01040__protein-protein-interaction_literature-curated_AraNet_v2__False__170.5", + 
"AT1G01030__AT1G01050__pairwise-gene-coexpression_AraNet_v2__False__2.6", + "AT1G01050__AT1G01060__protein-protein-interaction_literature-curated_AraNet_v2__False__2.7" + ] + } + }, + { + "params": { + "gene_keys": ["AT1G01020", "AT1G01070"], + "distance": 5, + "edge_types": ["pairwise-gene-coexpression_AraNet_v2", "domain-co-occurrence_AraNet_v2", "protein-protein-interaction_high-throughput_AraNet_v2"] + }, + "results": { + "nodes": ["AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01070"], + "edges": [ + "AT1G01010__AT1G01020__protein-protein-interaction_high-throughput_AraNet_v2__False__2.3", + "AT1G01010__AT1G01030__protein-protein-interaction_high-throughput_AraNet_v2__False__2.4", + "AT1G01010__AT1G01040__domain-co-occurrence_AraNet_v2__False__2.5", + "AT1G01030__AT1G01050__pairwise-gene-coexpression_AraNet_v2__False__2.6" + ] + } + } + ], + "djornl_fetch_phenotypes": [ + { + "params": {"phenotype_keys": "Mary Poppins"}, + "error": { + "failed_validator": "type", + "message": "'Mary Poppins' is not of type 'array'", + "path": ["phenotype_keys"], + "value": "Mary Poppins" + }, + "coerce": {"nodes": [], "edges": []} + }, + { + "params": {"phenotype_keys": ["Mary Poppins"], "distance": 0}, + "results": {"nodes": [], "edges": []} + }, + { + "params": {"phenotype_keys": ["Mary Poppins"], "distance": 1}, + "results": {"nodes": [], "edges": []} + }, + { + "params": {"phenotype_keys": ["Mary Poppins"], "distance": 5}, + "results": {"nodes": [], "edges": []} + }, + { + "params": {"phenotype_keys": ["As2"]}, + "results": { + "nodes": ["As2"], + "edges": [] + } + }, + { + "params": {"phenotype_keys": ["As2"], "distance": 0}, + "results": { + "nodes": ["As2"], + "edges": [] + } + }, + { + "params": {"phenotype_keys": ["As2"], "distance": 1}, + "results": { + "nodes": ["As2", "AT1G01020", "AT1G01040"], + "edges": [ + "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4" + ] + } + }, + { + "params": {"phenotype_keys": ["As2"], "distance": 5}, + "results": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"], + "edges": [ + "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4", + "As75__AT1G01020__phenotype-association_AraGWAS__False__39.9", + "AT1G01010__AT1G01020__protein-protein-interaction_high-throughput_AraNet_v2__False__2.3", + "AT1G01010__AT1G01030__protein-protein-interaction_high-throughput_AraNet_v2__False__2.4", + "AT1G01010__AT1G01040__domain-co-occurrence_AraNet_v2__False__2.5", + "AT1G01010__AT1G01040__protein-protein-interaction_literature-curated_AraNet_v2__False__170.5", + "AT1G01030__AT1G01050__pairwise-gene-coexpression_AraNet_v2__False__2.6", + "AT1G01050__AT1G01060__protein-protein-interaction_literature-curated_AraNet_v2__False__2.7" + ] + } + }, + { + "params": {"phenotype_keys": ["As2", "Na23"], "distance": 0}, + "results": { + "nodes": ["As2", "Na23"], + "edges": [] + } + }, + { + "params": {"phenotype_keys": ["As2", "Na23"], "distance": 1}, + "results": { + "nodes": ["As2", "Na23", "AT1G01020", "AT1G01040"], + "edges": [ + "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4" + ] + } + }, + { + "params": {"phenotype_keys": ["As2", "Na23"], "distance": 5}, + "results": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "Na23"], + "edges": [ 
+ "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4", + "As75__AT1G01020__phenotype-association_AraGWAS__False__39.9", + "AT1G01010__AT1G01020__protein-protein-interaction_high-throughput_AraNet_v2__False__2.3", + "AT1G01010__AT1G01030__protein-protein-interaction_high-throughput_AraNet_v2__False__2.4", + "AT1G01010__AT1G01040__domain-co-occurrence_AraNet_v2__False__2.5", + "AT1G01010__AT1G01040__protein-protein-interaction_literature-curated_AraNet_v2__False__170.5", + "AT1G01030__AT1G01050__pairwise-gene-coexpression_AraNet_v2__False__2.6", + "AT1G01050__AT1G01060__protein-protein-interaction_literature-curated_AraNet_v2__False__2.7" + ] + } + }, + { + "params": {"phenotype_keys": ["As2", "Na23"], "distance": 0}, + "results": { + "nodes": ["As2", "Na23"], + "edges": [] + } + }, + { + "params": { + "phenotype_keys": ["As2", "Na23"], + "distance": 5, + "edge_types": ["pairwise-gene-coexpression_AraNet_v2", "domain-co-occurrence_AraNet_v2", "protein-protein-interaction_high-throughput_AraNet_v2", "protein-protein-interaction_literature-curated_AraNet_v2"] + }, + "results": { + "nodes": ["As2", "Na23"], + "edges": [] + } + }, + { + "params": { + "phenotype_keys": ["As2", "Na23"], + "distance": 5, + "edge_types": ["phenotype-association_AraGWAS"] + }, + "results": { + "nodes": ["As2", "As75", "AT1G01020", "AT1G01040", "Na23"], + "edges": [ + "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4", + "As75__AT1G01020__phenotype-association_AraGWAS__False__39.9" + ] + } + } + ], + "djornl_search_nodes": [ + { + "params": {"search_text": "Mary Poppins", "distance": 500}, + "error": { + "failed_validator": "maximum", + "message": "500 is greater than the maximum of 100", + "path": ["distance"], + "value": 500 + } + }, + { + "params": "erm... what?", + "error": { + "failed_validator": "type", + "message": "'erm... what?' is not of type 'object'", + "path": [], + "value": "erm... what?" 
+ } + }, + { + "params": {"search_text": "Mary Poppins", "distance": 0}, + "results": {"nodes": [], "edges": []} + }, + { + "params": {"search_text": "Mary Poppins", "distance": 1}, + "results": {"nodes": [], "edges": []} + }, + { + "params": {"search_text": "Mary Poppins", "distance": 5}, + "results": {"nodes": [], "edges": []} + }, + { + "params": {"search_text": "GO:0005515", "distance": 0}, + "results": { + "nodes": ["AT1G01040", "AT1G01090"], + "edges": [] + } + }, + { + "params": {"search_text": "GO:0005515", "distance": 1}, + "results": { + "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"], + "edges": [ + "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4", + "AT1G01010__AT1G01040__domain-co-occurrence_AraNet_v2__False__2.5", + "AT1G01010__AT1G01040__protein-protein-interaction_literature-curated_AraNet_v2__False__170.5", + "AT1G01080__AT1G01090__protein-protein-interaction_literature-curated_AraNet_v2__False__2.8" + ] + } + }, + { + "params": {"search_text": "GO:0005515", "distance": 5}, + "results": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080", "AT1G01090"], + "edges": [ + "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4", + "As75__AT1G01020__phenotype-association_AraGWAS__False__39.9", + "AT1G01010__AT1G01020__protein-protein-interaction_high-throughput_AraNet_v2__False__2.3", + "AT1G01010__AT1G01030__protein-protein-interaction_high-throughput_AraNet_v2__False__2.4", + "AT1G01010__AT1G01040__domain-co-occurrence_AraNet_v2__False__2.5", + "AT1G01010__AT1G01040__protein-protein-interaction_literature-curated_AraNet_v2__False__170.5", + "AT1G01030__AT1G01050__pairwise-gene-coexpression_AraNet_v2__False__2.6", + "AT1G01050__AT1G01060__protein-protein-interaction_literature-curated_AraNet_v2__False__2.7", + "AT1G01080__AT1G01090__protein-protein-interaction_literature-curated_AraNet_v2__False__2.8" + ] + } + }, + { + "params": {"search_text": "GO:0005515", "distance": 0, "edge_types": ["phenotype-association_AraGWAS"]}, + "results": { + "nodes": ["AT1G01040", "AT1G01090"], + "edges": [] + } + }, + { + "params": {"search_text": "GO:0005515", "distance": 1, "edge_types": ["phenotype-association_AraGWAS"]}, + "results": { + "nodes": ["As2", "AT1G01040", "AT1G01090"], + "edges": [ + "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4" + ] + } + }, + { + "params": {"search_text": "GO:0005515", "distance": 5, "edge_types": ["phenotype-association_AraGWAS"]}, + "results": { + "nodes": ["As2", "As75", "AT1G01020", "AT1G01040", "AT1G01090"], + "edges": [ + "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4", + "As75__AT1G01020__phenotype-association_AraGWAS__False__39.9" + ] + } + } + ], + "djornl_fetch_clusters": [ + { + "params": {"cluster_ids": "Mary Poppins"}, + "error": { + "failed_validator": "type", + "message": "'Mary Poppins' is not of type 'array'", + "path": ["cluster_ids"], + "value": "Mary Poppins" + } + }, + { + "params": {"cluster_ids": ["Mary Poppins"]}, + "error": { + "failed_validator": "pattern", + "message": "'Mary Poppins' does not match '^\\\\w+:\\\\d+$'", + "path": ["cluster_ids", 0], + "value": "Mary Poppins" + } + }, + { + "params": {"cluster_ids": []}, + "error": { + "failed_validator": "minItems", + "message": "[] is too short", + "path": ["cluster_ids"], + "value": [] + } + }, + { + "params": {"cluster_ids": 
["MaryPoppins:1"], "distance": 0}, + "results": {"nodes": [], "edges": []} + }, + { + "params": {"cluster_ids": ["MaryPoppins:1"], "distance": 1}, + "results": {"nodes": [], "edges": []} + }, + { + "params": {"cluster_ids": ["MaryPoppins:1"], "distance": 5}, + "results": {"nodes": [], "edges": []} + }, + { + "params": {"cluster_ids": ["markov_i6:1"], "distance": 0}, + "results": { + "nodes": ["AT1G01040", "AT1G01090"], + "edges": [] + } + }, + { + "params": {"cluster_ids": ["markov_i6:1"], "distance": 1}, + "results": { + "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"], + "edges": [ + "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4", + "AT1G01010__AT1G01040__domain-co-occurrence_AraNet_v2__False__2.5", + "AT1G01010__AT1G01040__protein-protein-interaction_literature-curated_AraNet_v2__False__170.5", + "AT1G01080__AT1G01090__protein-protein-interaction_literature-curated_AraNet_v2__False__2.8" + ] + } + }, + { + "params": {"cluster_ids": ["markov_i6:1"], "distance": 5}, + "results": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080", "AT1G01090"], + "edges": [ + "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4", + "As75__AT1G01020__phenotype-association_AraGWAS__False__39.9", + "AT1G01010__AT1G01020__protein-protein-interaction_high-throughput_AraNet_v2__False__2.3", + "AT1G01010__AT1G01030__protein-protein-interaction_high-throughput_AraNet_v2__False__2.4", + "AT1G01010__AT1G01040__domain-co-occurrence_AraNet_v2__False__2.5", + "AT1G01010__AT1G01040__protein-protein-interaction_literature-curated_AraNet_v2__False__170.5", + "AT1G01030__AT1G01050__pairwise-gene-coexpression_AraNet_v2__False__2.6", + "AT1G01050__AT1G01060__protein-protein-interaction_literature-curated_AraNet_v2__False__2.7", + "AT1G01080__AT1G01090__protein-protein-interaction_literature-curated_AraNet_v2__False__2.8" + ] + } + }, + { + "params": {"cluster_ids": ["markov_i2:5", "markov_i6:2"]}, + "results": { + "nodes": ["AT1G01020", "AT1G01070"], + "edges": [] + } + }, + { + "params": {"cluster_ids": ["markov_i2:5", "markov_i6:2"], "distance": 0}, + "results": { + "nodes": ["AT1G01020", "AT1G01070"], + "edges": [] + } + }, + { + "params": {"cluster_ids": ["markov_i2:5", "markov_i6:2"], "distance": 1}, + "results": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"], + "edges": [ + "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "As75__AT1G01020__phenotype-association_AraGWAS__False__39.9", + "AT1G01010__AT1G01020__protein-protein-interaction_high-throughput_AraNet_v2__False__2.3" + ] + } + }, + { + "params": {"cluster_ids": ["markov_i2:5", "markov_i6:2"], "distance": 5}, + "results": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"], + "edges": [ + "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4", + "As75__AT1G01020__phenotype-association_AraGWAS__False__39.9", + "AT1G01010__AT1G01020__protein-protein-interaction_high-throughput_AraNet_v2__False__2.3", + "AT1G01010__AT1G01030__protein-protein-interaction_high-throughput_AraNet_v2__False__2.4", + "AT1G01010__AT1G01040__domain-co-occurrence_AraNet_v2__False__2.5", + "AT1G01010__AT1G01040__protein-protein-interaction_literature-curated_AraNet_v2__False__170.5", + "AT1G01030__AT1G01050__pairwise-gene-coexpression_AraNet_v2__False__2.6", 
+ "AT1G01050__AT1G01060__protein-protein-interaction_literature-curated_AraNet_v2__False__2.7" + ] + } + }, + { + "params": { + "cluster_ids": ["markov_i2:5", "markov_i6:2"], + "distance": 0, + "edge_types": ["protein-protein-interaction_high-throughput_AraNet_v2"] + }, + "results": { + "nodes": ["AT1G01020", "AT1G01070"], + "edges": [] + } + }, + { + "params": { + "cluster_ids": ["markov_i2:5", "markov_i6:2"], + "distance": 1, + "edge_types": ["protein-protein-interaction_high-throughput_AraNet_v2"] + }, + "results": { + "nodes": ["AT1G01010", "AT1G01020", "AT1G01070"], + "edges": [ + "AT1G01010__AT1G01020__protein-protein-interaction_high-throughput_AraNet_v2__False__2.3" + ] + } + }, + { + "params": { + "cluster_ids": ["markov_i2:5", "markov_i6:2"], + "distance": 5, + "edge_types": ["protein-protein-interaction_high-throughput_AraNet_v2"] + }, + "results": { + "nodes": ["AT1G01010", "AT1G01020", "AT1G01030", "AT1G01070"], + "edges": [ + "AT1G01010__AT1G01020__protein-protein-interaction_high-throughput_AraNet_v2__False__2.3", + "AT1G01010__AT1G01030__protein-protein-interaction_high-throughput_AraNet_v2__False__2.4" + ] + } + }, + { + "params": { + "cluster_ids": ["markov_i2:5", "markov_i6:2"], + "distance": 5, + "edge_types": [ + "phenotype-association_AraGWAS", + "protein-protein-interaction_high-throughput_AraNet_v2", + "protein-protein-interaction_literature-curated_AraNet_v2", + "pairwise-gene-coexpression_AraNet_v2" + ] + }, + "results": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"], + "edges": [ + "As2__AT1G01020__phenotype-association_AraGWAS__False__8.4", + "As2__AT1G01040__phenotype-association_AraGWAS__False__5.4", + "As75__AT1G01020__phenotype-association_AraGWAS__False__39.9", + "AT1G01010__AT1G01020__protein-protein-interaction_high-throughput_AraNet_v2__False__2.3", + "AT1G01010__AT1G01030__protein-protein-interaction_high-throughput_AraNet_v2__False__2.4", + "AT1G01010__AT1G01040__protein-protein-interaction_literature-curated_AraNet_v2__False__170.5", + "AT1G01030__AT1G01050__pairwise-gene-coexpression_AraNet_v2__False__2.6", + "AT1G01050__AT1G01060__protein-protein-interaction_literature-curated_AraNet_v2__False__2.7" + ] + } + } + ] + } +} diff --git a/spec/test/djornl/test_data/I2_named.tsv b/spec/test/djornl/test_data/I2_named.tsv new file mode 100644 index 00000000..c7a9c200 --- /dev/null +++ b/spec/test/djornl/test_data/I2_named.tsv @@ -0,0 +1,8 @@ +cluster_id node_ids +# data_type: cluster +# cluster_prefix: markov_i2 +# title: Markov clustering, inflation = 2 +Cluster1 AT1G01010,AT1G01030,AT1G01040 +Cluster2 AT1G01050,AT1G01060,AT1G01070 +Cluster3 AT1G01090 +Cluster5 AT1G01020 diff --git a/spec/test/djornl/test_data/I4_named.tsv b/spec/test/djornl/test_data/I4_named.tsv new file mode 100644 index 00000000..6e7d91e4 --- /dev/null +++ b/spec/test/djornl/test_data/I4_named.tsv @@ -0,0 +1,5 @@ +cluster_id node_ids +# cluster_prefix: markov_i4 +# title: Markov clustering, inflation = 4 +# data_type: cluster +Cluster3 AT1G01080 diff --git a/spec/test/djornl/test_data/I6_named.tsv b/spec/test/djornl/test_data/I6_named.tsv new file mode 100644 index 00000000..e7688f17 --- /dev/null +++ b/spec/test/djornl/test_data/I6_named.tsv @@ -0,0 +1,8 @@ +cluster_id node_ids +# data_type: cluster +# cluster_prefix: markov_i6 +# title: Markov clustering, inflation = 6 +Cluster1 AT1G01040,AT1G01090 +Cluster2 AT1G01070 +Cluster3 AT1G01010,AT1G01020,AT1G01030 +# Cluster4 diff --git 
a/spec/test/djornl/test_data/directed_edges.tsv b/spec/test/djornl/test_data/directed_edges.tsv new file mode 100644 index 00000000..83d970f5 --- /dev/null +++ b/spec/test/djornl/test_data/directed_edges.tsv @@ -0,0 +1,4 @@ +node1 node2 score edge_descrip edge_type directed +SDV AT1G01100 8.4 AraGWAS-Association_score protein-protein-interaction_literature-curated_AraNet_v2 0 +SDV AT1G01100 5.4 AraGWAS-Association_score protein-protein-interaction_literature-curated_AraNet_v2 1 +AT1G01100 SDV 2.4 AraGWAS-Association_score protein-protein-interaction_literature-curated_AraNet_v2 1 diff --git a/spec/test/djornl/test_data/edges.tsv b/spec/test/djornl/test_data/edges.tsv new file mode 100644 index 00000000..5924b991 --- /dev/null +++ b/spec/test/djornl/test_data/edges.tsv @@ -0,0 +1,9 @@ +node1 node2 score edge_descrip edge_type +As2 AT1G01020 8.4 AraGWAS-Association_score phenotype-association_AraGWAS +As2 AT1G01040 5.4 AraGWAS-Association_score phenotype-association_AraGWAS +As75 AT1G01020 39.9 AraGWAS-Association_score phenotype-association_AraGWAS +AT1G01010 AT1G01040 2.5 AraNetv2_log-likelihood-score domain-co-occurrence_AraNet_v2 +AT1G01010 AT1G01040 170.5 AraNetv2_log-likelihood-score protein-protein-interaction_literature-curated_AraNet_v2 +AT1G01030 AT1G01050 2.6 AraNetv2_log-likelihood-score pairwise-gene-coexpression_AraNet_v2 +AT1G01050 AT1G01060 2.7 AraNetv2_log-likelihood-score protein-protein-interaction_literature-curated_AraNet_v2 +AT1G01080 AT1G01090 2.8 AraNetv2_log-likelihood-score protein-protein-interaction_literature-curated_AraNet_v2 diff --git a/spec/test/djornl/test_data/extra_node.csv b/spec/test/djornl/test_data/extra_node.csv new file mode 100644 index 00000000..b9c0529c --- /dev/null +++ b/spec/test/djornl/test_data/extra_node.csv @@ -0,0 +1,3 @@ +# data_type: node +node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_description,GO_terms,MapMan_bin,MapMan_name,MapMan_description +AT1G01100,gene,AT1G01100.4,,,protein_coding,60S acidic ribosomal protein family;(source:Araport11),,60S acidic ribosomal protein family,"structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity","GO:0003735, GO:0043021, GO:0030295",17.1.2.1.46,.Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1,component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9]) diff --git a/spec/test/djornl/test_data/extra_node.tsv b/spec/test/djornl/test_data/extra_node.tsv new file mode 100644 index 00000000..de069d70 --- /dev/null +++ b/spec/test/djornl/test_data/extra_node.tsv @@ -0,0 +1,3 @@ +# data_type: node +node_id node_type transcript gene_symbol gene_full_name gene_model_type TAIR_Computational_description TAIR_Curator_summary TAIR_short_description GO_description GO_terms MapMan_bin MapMan_name MapMan_description +AT1G01100 gene AT1G01100.4 protein_coding 60S acidic ribosomal protein family;(source:Araport11) 60S acidic ribosomal protein family structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity GO:0003735, GO:0043021, GO:0030295 17.1.2.1.46 .Protein biosynthesis.ribosome biogenesis.large ribosomal subunit 
(LSU).LSU proteome.component RPP1 component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9]) diff --git a/spec/test/djornl/test_data/hithruput-edges.csv b/spec/test/djornl/test_data/hithruput-edges.csv new file mode 100644 index 00000000..79a7deba --- /dev/null +++ b/spec/test/djornl/test_data/hithruput-edges.csv @@ -0,0 +1,3 @@ +node1,node2,score,edge_descrip,edge_type,directed +AT1G01010,AT1G01020,2.3,AraNetv2_log-likelihood-score,protein-protein-interaction_high-throughput_AraNet_v2,0 +AT1G01010,AT1G01030,2.4,AraNetv2_log-likelihood-score,protein-protein-interaction_high-throughput_AraNet_v2,0 diff --git a/spec/test/djornl/test_data/manifest.yaml b/spec/test/djornl/test_data/manifest.yaml new file mode 100644 index 00000000..2bddf5f3 --- /dev/null +++ b/spec/test/djornl/test_data/manifest.yaml @@ -0,0 +1,42 @@ +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +home_url: "https://github.com/kbase/exascale_data" +file_list: + - data_type: edge + path: edges.tsv + date: "2020-12-25" + + - data_type: edge + path: hithruput-edges.csv + date: "2020-12-25" + + - data_type: edge + path: directed_edges.tsv + date: "2020-12-25" + + - data_type: node + date: "2019-01-01" + file_format: csv + path: nodes.csv + + - data_type: cluster + cluster_prefix: markov_i2 + path: I2_named.tsv + + - data_type: cluster + cluster_prefix: markov_i4 + path: I4_named.tsv + + - data_type: cluster + cluster_prefix: markov_i6 + path: I6_named.tsv + + - data_type: node + date: "2019-01-01" + file_format: csv + path: pheno_nodes.csv + + - data_type: node + date: "2019-01-01" + file_format: csv + path: extra_node.csv diff --git a/spec/test/djornl/test_data/nodes.csv b/spec/test/djornl/test_data/nodes.csv new file mode 100644 index 00000000..eef9e060 --- /dev/null +++ b/spec/test/djornl/test_data/nodes.csv @@ -0,0 +1,11 @@ +# data_type: node +User_Notes,node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_description,GO_terms,MapMan_bin,MapMan_name,MapMan_description,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_reference +,AT1G01010,gene,AT1G01010.1,NTL10,NAC domain containing protein 1,protein_coding,NAC domain containing protein 1;(source:Araport11),,NAC domain containing protein 1,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.17,.RNA biosynthesis.transcriptional regulation.transcription factor (NAC),transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96]),,,,, +,AT1G01020,gene,AT1G01020.6,ARV1,,protein_coding,ARV1 family protein;(source:Araport11),,,molecular_function,GO:0003674,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:6788:8737:-1 gene:AT1G01020 transcript:AT1G01020.6 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:ARV1 description:ARV1 family protein [Source:UniProtKB/TrEMBL;Acc:Q5MK24]) & no description available(sp|q5mk24|arv1_arath : 99.4),,,,, 
+,AT1G01030,gene,AT1G01030.2,NGA3,NGATHA3,protein_coding,AP2/B3-like transcriptional factor family protein;(source:Araport11),,,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.5.3,.RNA biosynthesis.transcriptional regulation.B3 transcription factor superfamily.transcription factor (RAV/NGATHA),transcription factor (RAV/NGATHA) (original description: pep chromosome:TAIR10:1:11649:13714:-1 gene:AT1G01030 transcript:AT1G01030.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NGA3 description:B3 domain-containing transcription factor NGA3 [Source:UniProtKB/Swiss-Prot;Acc:Q9MAN1]),,,,, +,AT1G01040,gene,AT1G01040.2,SUS1,SUSPENSOR 1,protein_coding,dicer-like 1;(source:Araport11),"Encodes a Dicer homolog. Dicer is a RNA helicase involved in microRNA processing. Mutations in this locus can result in embryo lethality. Embryo shape at seed maturity is globular-elongate. Other mutants convert the floral meristems to an indeterminate state, others yet show defects in ovule development. mRNA is expressed in all shoot tissues. DCL1 is able to produce miRNAs and siRNAs. The mRNA is cell-to-cell mobile.",dicer-like 1,"metal ion binding, protein binding, ribonuclease III activity, ATP-dependent helicase activity, ATP binding, RNA binding, helicase activity, double-stranded RNA binding, DNA binding","GO:0046872, GO:0005515, GO:0004525, GO:0008026, GO:0005524, GO:0003723, GO:0004386, GO:0003725, GO:0003677",16.10.2.1.1,.RNA processing.mRNA silencing.miRNA pathway.DCL1-HYL1 miRNA biogenesis complex.endoribonuclease component DCL1,endoribonuclease component DCL1 of DCL1-HYL1 miRNA biogenesis complex (original description: pep chromosome:TAIR10:1:23416:31120:1 gene:AT1G01040 transcript:AT1G01040.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:DCL1 description:Dicer-like 1 [Source:UniProtKB/TrEMBL;Acc:F4HQG6]),,,,, +,AT1G01050,gene,AT1G01050.2,PPa1,pyrophosphorylase 1,protein_coding,pyrophosphorylase 1;(source:Araport11),,,inorganic diphosphatase activity,GO:0004427,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:31382:33009:-1 gene:AT1G01050 transcript:AT1G01050.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PPA1 description:Soluble inorganic pyrophosphatase 1 [Source:UniProtKB/Swiss-Prot;Acc:Q93V56]) & Soluble inorganic pyrophosphatase 1 OS=Arabidopsis thaliana (sp|q93v56|ipyr1_arath : 419.0),,,,, +,AT1G01060,gene,AT1G01060.8,LHY1,LATE ELONGATED HYPOCOTYL 1,protein_coding,Homeodomain-like superfamily protein;(source:Araport11),,,"DNA-binding transcription factor activity, DNA binding, transcription regulatory region DNA binding","GO:0003700, GO:0003677, GO:0044212",27.1.1,.Multi-process regulation.circadian clock system.core oscillator protein (LHY|CCA1),circadian clock core oscillator protein (LHY|CCA1) (original description: pep chromosome:TAIR10:1:33967:37230:-1 gene:AT1G01060 transcript:AT1G01060.8 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:LHY description:LHY1 [Source:UniProtKB/TrEMBL;Acc:A0A178W761]),,,,, +,AT1G01070,gene,AT1G01070.2,UMAMIT28,Usually multiple acids move in and out Transporters 28,protein_coding,nodulin MtN21 /EamA-like transporter family protein;(source:Araport11),Encodes a plasma membrane-localized amino acid transporter likely involved in amino acid export in the developing seed.,nodulin MtN21 /EamA-like transporter family protein,L-glutamine transmembrane transporter activity,GO:0015186,24.2.1.5,.Solute 
transport.carrier-mediated transport.DMT superfamily.solute transporter (UmamiT),solute transporter (UmamiT) (original description: pep chromosome:TAIR10:1:38752:40945:-1 gene:AT1G01070 transcript:AT1G01070.2 gene_biotype:protein_coding transcript_biotype:protein_coding description:WAT1-related protein [Source:UniProtKB/TrEMBL;Acc:A0A178WFU3]),,,,, +,AT1G01080,gene,AT1G01080.3,,,protein_coding,RNA-binding (RRM/RBD/RNP motifs) family protein;(source:Araport11),,,"RNA binding, mRNA binding","GO:0003723, GO:0003729",35.1,not assigned.annotated,"(original description: pep chromosome:TAIR10:1:44970:47059:-1 gene:AT1G01080 transcript:AT1G01080.3 gene_biotype:protein_coding transcript_biotype:protein_coding description:RNA-binding (RRM/RBD/RNP motifs) family protein [Source:UniProtKB/TrEMBL;Acc:F4HQH8]) & 33 kDa ribonucleoprotein, chloroplastic OS=Nicotiana sylvestris (sp|p19684|roc5_nicsy : 109.0)",,,,, +,AT1G01090,gene,AT1G01090.1,PDH-E1 ALPHA,pyruvate dehydrogenase E1 alpha,protein_coding,pyruvate dehydrogenase E1 alpha;(source:Araport11),pyruvate dehydrogenase E1 alpha subunit,pyruvate dehydrogenase E1 alpha,"pyruvate dehydrogenase (acetyl-transferring) activity, protein binding","GO:0004739, GO:0005515",5.1.2.2.1.1,.Lipid metabolism.fatty acid biosynthesis.acetyl-CoA generation.plastidial pyruvate dehydrogenase complex.E1 pyruvate dehydrogenase subcomplex.subunit alpha,subunit alpha of E1 pyruvate dehydrogenase component (original description: pep chromosome:TAIR10:1:47234:49304:-1 gene:AT1G01090 transcript:AT1G01090.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PDH-E1 ALPHA description:Pyruvate dehydrogenase E1 component subunit alpha [Source:UniProtKB/TrEMBL;Acc:A0A178W8A7]),,,,, diff --git a/spec/test/djornl/test_data/pheno_nodes.csv b/spec/test/djornl/test_data/pheno_nodes.csv new file mode 100644 index 00000000..9add7b7d --- /dev/null +++ b/spec/test/djornl/test_data/pheno_nodes.csv @@ -0,0 +1,5 @@ +node_id,node_type,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_reference,User_Notes +As2,pheno,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010", +As75,pheno,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010", +Na23,pheno,10.21958/phenotype:5,"Sodium concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",sodium concentration,The total sodium ion concentration measured in a given volume of a plant or a plant part or plant extract. [GR:pj],"Atwell et. al, Nature 2010", +SDV,pheno,10.21958/phenotype:104,"Number of days following stratification to opening of first flower. 
The experiment was stopped at 200 d, and accessions that had not flowered at that point were assigned a value of 200",days to flowering trait,"A flowering time trait (TO:0002616)which is the number of days required for an individual flower (PO:0009046), a whole plant (PO:0000003) or a plant population to reach flowering stage (PO:0007616) from a predetermined time point (e.g. the date of seed sowing, seedling transplant, or seedling emergence). [GR:pj, TO:cooperl]","Atwell et. al, Nature 2010", diff --git a/spec/test/djornl/valid_manifest/no_file_ext.yaml b/spec/test/djornl/valid_manifest/no_file_ext.yaml new file mode 100644 index 00000000..3120e553 --- /dev/null +++ b/spec/test/djornl/valid_manifest/no_file_ext.yaml @@ -0,0 +1,40 @@ +# nodes does not have a file extension, so file_format must be used +name: Dan Jacobson Exascale data +release_date: "2020-08-06" +description: Preliminary Jacobson dataset +file_list: + - data_type: edge + path: merged_edges-AMW-060820_AF.tsv + file_format: tsv + description: Merged edge data. An FDR filter was applied to the AraGWAS phenotype-GWAS layer, removing some of the edges. The Aranetv2 coexpression layer network contains the top (highest log-likelihood scores) 15% coexpression edges to compensate for the decreased network size of the phenotype-GWAS layer. + date_created: "2020-06-08" + + - data_type: node + path: nodes + file_format: csv + description: Merged AraNet AraGWAS gene and phenotype data + date_created: "2019-09-13" + + - data_type: cluster + cluster_prefix: markov_i2 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv + file_format: tsv + title: Markov clustering, inflation 2 + description: Iterative random forest Markov clustering, inflation set to 2 + date_created: "2019-08-19" + + - data_type: cluster + cluster_prefix: markov_i4 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv + file_format: tsv + title: Markov clustering, inflation 4 + description: Iterative random forest Markov clustering, inflation set to 4 + date_created: "2019-08-19" + + - data_type: cluster + cluster_prefix: markov_i6 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv + file_format: tsv + title: Markov clustering, inflation 6 + description: Iterative random forest Markov clustering, inflation set to 6 + date_created: "2019-08-19" diff --git a/spec/test/djornl/valid_manifest/no_file_format.yaml b/spec/test/djornl/valid_manifest/no_file_format.yaml new file mode 100644 index 00000000..d3010deb --- /dev/null +++ b/spec/test/djornl/valid_manifest/no_file_format.yaml @@ -0,0 +1,39 @@ +# node file has no format specified, so the path must be parsed +name: Dan Jacobson Exascale data +release_date: "2020-08-06" +description: Preliminary Jacobson dataset +file_list: + - data_type: edge + path: merged_edges-AMW-060820_AF.tsv + file_format: tsv + description: Merged edge data. An FDR filter was applied to the AraGWAS phenotype-GWAS layer, removing some of the edges. The Aranetv2 coexpression layer network contains the top (highest log-likelihood scores) 15% coexpression edges to compensate for the decreased network size of the phenotype-GWAS layer.
+ date_created: "2020-06-08" + + - data_type: node + path: nodes.csv + description: Merged AraNet AraGWAS gene and phenotype data + date_created: "2019-09-13" + + - data_type: cluster + cluster_prefix: markov_i2 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv + file_format: tsv + title: Markov clustering, inflation 2 + description: Iterative random forest Markov clustering, inflation set to 2 + date_created: "2019-08-19" + + - data_type: cluster + cluster_prefix: markov_i4 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv + file_format: tsv + title: Markov clustering, inflation 4 + description: Iterative random forest Markov clustering, inflation set to 4 + date_created: "2019-08-19" + + - data_type: cluster + cluster_prefix: markov_i6 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv + file_format: tsv + title: Markov clustering, inflation 6 + description: Iterative random forest Markov clustering, inflation set to 6 + date_created: "2019-08-19" diff --git a/spec/test/djornl/valid_manifest/with_descriptions.yaml b/spec/test/djornl/valid_manifest/with_descriptions.yaml new file mode 100644 index 00000000..6c39a234 --- /dev/null +++ b/spec/test/djornl/valid_manifest/with_descriptions.yaml @@ -0,0 +1,39 @@ +name: Dan Jacobson Exascale data +release_date: "2020-08-06" +description: Preliminary Jacobson dataset +file_list: + - data_type: edge + path: merged_edges-AMW-060820_AF.tsv + file_format: tsv + description: Merged edge data. An FDR filter was applied to the AraGWAS phenotype-GWAS layer, removing some of the edges. The Aranetv2 coexpression layer network contains the top (highest log-likelihood scores) 15% coexpression edges to compensate for the decreased network size of the phenotype-GWAS layer.
+ date_created: "2020-06-08" + + - data_type: node + path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv + file_format: csv + description: Merged AraNet AraGWAS gene and phenotype data + date_created: "2019-09-13" + + - data_type: cluster + cluster_prefix: markov_i2 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv + file_format: tsv + title: Markov clustering, inflation 2 + description: Iterative random forest Markov clustering, inflation set to 2 + date_created: "2019-08-19" + + - data_type: cluster + cluster_prefix: markov_i4 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv + file_format: tsv + title: Markov clustering, inflation 4 + description: Iterative random forest Markov clustering, inflation set to 4 + date_created: "2019-08-19" + + - data_type: cluster + cluster_prefix: markov_i6 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv + file_format: tsv + title: Markov clustering, inflation 6 + description: Iterative random forest Markov clustering, inflation set to 6 + date_created: "2019-08-19" diff --git a/spec/test/helpers.py b/spec/test/helpers.py new file mode 100644 index 00000000..a01a7888 --- /dev/null +++ b/spec/test/helpers.py @@ -0,0 +1,122 @@ +""" +Test helpers +""" +import contextlib +import functools +import io +import json +import os +import shutil +import sys + +import requests + +from relation_engine_server.utils.wait_for import wait_for_api +from relation_engine_server.utils.pull_spec import download_specs +from relation_engine_server.utils.config import get_config as get_re_config + + +@functools.lru_cache(maxsize=1) +def get_config(): + """Return configuration data for tests.""" + return { + "re_api_url": os.environ["RE_API_URL"], + "re_query_results_url": os.environ["RE_API_URL"] + "/api/v1/query_results", + "db_url": os.environ["DB_URL"], + "db_auth": (os.environ["DB_USER"], os.environ.get("DB_PASS", "")), + } + + +def run_query(query_name, query_data=None): + """submit a database query""" + # use None rather than a mutable default argument; treat None as an empty query + query_results_url = get_config()["re_query_results_url"] + + return requests.post( + query_results_url, + params={"stored_query": query_name}, + data=json.dumps(query_data or {}), + ).json() + + +def assert_subset(testCls, subset, _dict): + """Replacement for the deprecated `assertDictContainsSubset` method.""" + for (key, val) in subset.items(): + testCls.assertEqual(val, _dict.get(key)) + + +def create_test_docs(coll_name, docs, update_on_dupe=False): + """Create a set of documents for use in tests.""" + body = "\n".join([json.dumps(d) for d in docs]) + params = {"overwrite": True, "collection": coll_name, "display_errors": "1"} + + if update_on_dupe: + del params["overwrite"] + params["on_duplicate"] = "update" + + conf = get_config() + + resp = requests.put( + conf["re_api_url"] + "/api/v1/documents", + params=params, + data=body, + headers={"Authorization": "admin_token"}, + ) + if not resp.ok: + raise RuntimeError(resp.text) + + return resp + + +def check_spec_test_env(): + """ensure that the environment is prepared for running the spec tests""" + if os.environ.get("SPEC_TEST_READY", None) is None: + wait_for_api() + _CONF = get_re_config() + # Remove the spec directory, ignoring if it is already missing, so we + # have a clean slate and avoid name conflicts + shutil.rmtree(_CONF["spec_paths"]["root"], ignore_errors=True) + # copy the contents of /app/spec into /spec; copytree creates the destination + shutil.copytree("/app/spec", _CONF["spec_paths"]["root"]) + download_specs() + os.environ.update({"SPEC_TEST_READY": "Done"}) + + +def capture_stdout(function, *args, **kwargs): + """capture and return the standard output from a function""" + io_stdout = io.StringIO() + sys.stdout = io_stdout + try: + function(*args, **kwargs) + finally: + # restore stdout even if the function raises + sys.stdout = sys.__stdout__ + return io_stdout.getvalue() + + +@contextlib.contextmanager +def modified_environ(*remove, **update): + """ + Temporarily updates the ``os.environ`` dictionary in-place. + + The ``os.environ`` dictionary is updated in-place so that the modification + is sure to work in all situations. + + :param remove: Environment variables to remove. + :param update: Dictionary of environment variables and values to add/update. + """ + env = os.environ + update = update or {} + remove = remove or [] + + # List of environment variables being updated or removed. + stomped = (set(update.keys()) | set(remove)) & set(env.keys()) + # Environment variables and values to restore on exit. + update_after = {k: env[k] for k in stomped} + # Environment variables and values to remove on exit. + remove_after = frozenset(k for k in update if k not in env) + + try: + env.update(update) + for k in remove: + env.pop(k, None) + yield + finally: + env.update(update_after) + for k in remove_after: + env.pop(k)
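+ + +# Example usage of modified_environ (an illustrative sketch only; the URL +# value below is a placeholder, not part of this module or its tests): +# +# with modified_environ("DB_PASS", RE_API_URL="http://localhost:5000"): +# # inside the block DB_PASS is unset and RE_API_URL is overridden +# run_query("djornl_fetch_all") +# # on exit, os.environ is restored to its previous state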
diff --git a/spec/test/sample_schemas/collections/edge_delta_missing_to_from.yaml b/spec/test/sample_schemas/collections/edge_delta_missing_to_from.yaml new file mode 100644 index 00000000..67fa7941 --- /dev/null +++ b/spec/test/sample_schemas/collections/edge_delta_missing_to_from.yaml @@ -0,0 +1,14 @@ +# Time-travel edge schemas must require "from" and "to" attributes +name: edge_delta_missing_to_from +delta: true +type: edge +schema: + "$schema": "http://json-schema.org/draft-07/schema#" + type: object + required: + - name + description: Example edge schema for testing. + properties: + name: {type: string} + _from: {type: string} + _to: {type: string} diff --git a/spec/test/sample_schemas/collections/edge_missing_to_from.yaml b/spec/test/sample_schemas/collections/edge_missing_to_from.yaml new file mode 100644 index 00000000..3b7bc780 --- /dev/null +++ b/spec/test/sample_schemas/collections/edge_missing_to_from.yaml @@ -0,0 +1,12 @@ +# Edge schemas must require "_from" and "_to" attributes +name: edge_missing_to_from +type: edge +schema: + "$schema": "http://json-schema.org/draft-07/schema#" + type: object + required: [_key, _from] + description: Example edge schema for testing.
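+ # NB: "required" lists _key and _from but omits _to, so spec validation is expected to reject this schema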
+ properties: + _key: {type: string} + _from: {type: string} + _to: {type: string} diff --git a/spec/test/sample_schemas/collections/extra_top_level_entries.yaml b/spec/test/sample_schemas/collections/extra_top_level_entries.yaml new file mode 100644 index 00000000..a4263ccf --- /dev/null +++ b/spec/test/sample_schemas/collections/extra_top_level_entries.yaml @@ -0,0 +1,12 @@ +name: extra_top_level_entries +title: Extra Top-Level Entries +type: vertex +delta: false +schema: + "$schema": http://json-schema.org/draft-07/schema# + type: object + required: [_key] + properties: + _key: + type: string + title: Key diff --git a/spec/test/sample_schemas/collections/not_a_schema.yaml b/spec/test/sample_schemas/collections/not_a_schema.yaml new file mode 100644 index 00000000..b909289d --- /dev/null +++ b/spec/test/sample_schemas/collections/not_a_schema.yaml @@ -0,0 +1,6 @@ +name: not_a_schema +type: edge +schema: + required: ['whatever'] + properties: + type: "17" diff --git a/spec/test/sample_schemas/collections/schema_not_object.yaml b/spec/test/sample_schemas/collections/schema_not_object.yaml new file mode 100644 index 00000000..78001d2f --- /dev/null +++ b/spec/test/sample_schemas/collections/schema_not_object.yaml @@ -0,0 +1,4 @@ +name: schema_not_object +type: vertex +delta: false +schema: http://json-schema.org/draft-07/schema# diff --git a/spec/test/sample_schemas/collections/test_delta_edge.yaml b/spec/test/sample_schemas/collections/test_delta_edge.yaml new file mode 100644 index 00000000..388b31f0 --- /dev/null +++ b/spec/test/sample_schemas/collections/test_delta_edge.yaml @@ -0,0 +1,11 @@ +name: test_delta_edge +type: edge +delta: true +schema: + "$schema": "http://json-schema.org/draft-07/schema#" + type: object + required: [from, to] + description: Example edge schema for testing. + properties: + from: {type: string} + to: {type: string} diff --git a/spec/test/sample_schemas/collections/test_delta_vertex.yaml b/spec/test/sample_schemas/collections/test_delta_vertex.yaml new file mode 100644 index 00000000..272e3188 --- /dev/null +++ b/spec/test/sample_schemas/collections/test_delta_vertex.yaml @@ -0,0 +1,11 @@ +name: test_delta_vertex +type: vertex +delta: true +schema: + "$schema": "http://json-schema.org/draft-07/schema#" + type: object + required: [id] + description: An example vertex schema for testing + properties: + id: {type: string} + quality: {type: string} diff --git a/spec/test/sample_schemas/collections/test_edge.yaml b/spec/test/sample_schemas/collections/test_edge.yaml new file mode 100644 index 00000000..fab7ad6e --- /dev/null +++ b/spec/test/sample_schemas/collections/test_edge.yaml @@ -0,0 +1,10 @@ +name: test_edge +type: edge +schema: + "$schema": "http://json-schema.org/draft-07/schema#" + type: object + required: [_from, _to] + description: Example edge schema for testing. 
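+  # A conforming edge document might look like the following (an
+  # illustration only; any two vertex keys would do):
+  #   {"_from": "test_vertex/a", "_to": "test_vertex/b"}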
+  properties:
+    _from: {type: string}
+    _to: {type: string}
diff --git a/spec/test/sample_schemas/collections/test_vertex.yaml b/spec/test/sample_schemas/collections/test_vertex.yaml
new file mode 100644
index 00000000..b2d34668
--- /dev/null
+++ b/spec/test/sample_schemas/collections/test_vertex.yaml
@@ -0,0 +1,11 @@
+name: test_vertex
+type: vertex
+schema:
+  "$schema": "http://json-schema.org/draft-07/schema#"
+  type: object
+  required: [_key]
+  description: An example vertex schema for testing
+  properties:
+    _key: {type: string}
+    is_public: {type: boolean}
+    ws_id: {type: integer}
diff --git a/spec/test/sample_schemas/collections/vertex_missing_id.yaml b/spec/test/sample_schemas/collections/vertex_missing_id.yaml
new file mode 100644
index 00000000..5275049c
--- /dev/null
+++ b/spec/test/sample_schemas/collections/vertex_missing_id.yaml
@@ -0,0 +1,13 @@
+# Time-travel vertex schemas must require the "id" attribute
+name: vertex_missing_id
+type: vertex
+delta: true
+schema:
+  "$schema": "http://json-schema.org/draft-07/schema#"
+  type: object
+  required: [_key]
+  description: An example vertex schema for testing
+  properties:
+    _key: {type: string}
+    is_public: {type: boolean}
+    ws_id: {type: integer}
diff --git a/spec/test/sample_schemas/collections/vertex_missing_key.yaml b/spec/test/sample_schemas/collections/vertex_missing_key.yaml
new file mode 100644
index 00000000..e94b8f14
--- /dev/null
+++ b/spec/test/sample_schemas/collections/vertex_missing_key.yaml
@@ -0,0 +1,13 @@
+# Vertex schemas must require the "_key" attribute
+name: vertex_missing_key
+type: vertex
+delta: false
+schema:
+  "$schema": "http://json-schema.org/draft-07/schema#"
+  type: object
+  required: [id]
+  description: An example vertex schema for testing
+  properties:
+    id: {type: string}
+    is_public: {type: boolean}
+    ws_id: {type: integer}
diff --git a/spec/test/sample_schemas/collections/wrong_name.yaml b/spec/test/sample_schemas/collections/wrong_name.yaml
new file mode 100644
index 00000000..29b6f98f
--- /dev/null
+++ b/spec/test/sample_schemas/collections/wrong_name.yaml
@@ -0,0 +1,11 @@
+name: test_nodes
+type: vertex
+delta: false
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  required: [_key]
+  properties:
+    _key:
+      type: string
+      title: Key
diff --git a/spec/test/sample_schemas/data_sources/invalid_additional_property.json b/spec/test/sample_schemas/data_sources/invalid_additional_property.json
new file mode 100644
index 00000000..3759e962
--- /dev/null
+++ b/spec/test/sample_schemas/data_sources/invalid_additional_property.json
@@ -0,0 +1,6 @@
+{
+  "name": "invalid_additional_property",
+  "type": "invalid",
+  "category": "something boring",
+  "title": "An invalid additional property"
+}
diff --git a/spec/test/sample_schemas/data_sources/minimal.yaml b/spec/test/sample_schemas/data_sources/minimal.yaml
new file mode 100644
index 00000000..7027a4e1
--- /dev/null
+++ b/spec/test/sample_schemas/data_sources/minimal.yaml
@@ -0,0 +1,3 @@
+name: minimal
+category: network
+title: Example minimal data source
diff --git a/spec/test/sample_schemas/data_sources/uri_validation.json b/spec/test/sample_schemas/data_sources/uri_validation.json
new file mode 100644
index 00000000..30b4a2f7
--- /dev/null
+++ b/spec/test/sample_schemas/data_sources/uri_validation.json
@@ -0,0 +1,6 @@
+{
+  "name": "uri_validation",
+  "category": "validator testing",
+  "title": "URI validation test",
+  "home_url": "this is not a valid URI"
+}
diff --git a/spec/test/sample_schemas/duplicate_names/ncbi/ncbi_taxon.yaml b/spec/test/sample_schemas/duplicate_names/ncbi/ncbi_taxon.yaml
new file mode 100644
index 00000000..39c97168
--- /dev/null
+++ b/spec/test/sample_schemas/duplicate_names/ncbi/ncbi_taxon.yaml
@@ -0,0 +1,64 @@
+name: ncbi_taxon
+type: vertex
+delta: true
+
+indexes:
+  - type: fulltext
+    fields: [scientific_name]
+  - type: persistent
+    fields: [id, expired, created]
+  - type: persistent
+    fields: [expired, created, last_version]
+
+schema:
+  "$schema": http://json-schema.org/draft-07/schema#
+  type: object
+  description: Template for a vertex entry in the NCBI taxonomy tree.
+  required: [id, scientific_name, rank, strain]
+  properties:
+    id:
+      type: string
+      description: NCBI Taxon id (positive integer)
+      examples: ['1', '2053699']
+    scientific_name:
+      type: string
+      title: Taxon name.
+      examples: ['Methylophilus methylotrophus', 'Bacteria', 'Firmicutes']
+    aliases:
+      type: array
+      description: Aliases
+      examples:
+        - - category: authority
+            name: Borreliella burgdorferi (Johnson et al. 1984) Adeolu and Gupta 2015
+          - category: genbank common name
+            name: Lyme disease spirochete
+          - category: synonym
+            name: Borrelia burgdorferi
+        - - category: common name
+            name: E. coli
+          - category: authority
+            name: '"Bacterium coli commune" Escherich 1885'
+          - category: synonym
+            name: Bacterium coli
+      items:
+        type: object
+        required: ['category', 'name']
+        properties:
+          category: {type: string}
+          name: {type: string}
+    rank:
+      type: string
+      title: Taxonomic rank
+      examples: ["Domain", "Phylum", "no rank"]
+    strain:
+      type: boolean
+      title: Strain flag
+      description: Whether this node corresponds to a strain. Strains are considered to be nodes
+        that have a rank of "no rank" and whose parents' rank is either species or subspecies or
+        where the parent's strain flag is true.
+    ncbi_taxon_id:
+      type: integer
+      title: The NCBI taxon ID as a number
+    gencode:
+      type: integer
+      title: The numeric ID of the genetic code for this organism.
diff --git a/spec/test/sample_schemas/duplicate_names/ncbi/test_vertex.yaml b/spec/test/sample_schemas/duplicate_names/ncbi/test_vertex.yaml
new file mode 100644
index 00000000..b2d34668
--- /dev/null
+++ b/spec/test/sample_schemas/duplicate_names/ncbi/test_vertex.yaml
@@ -0,0 +1,11 @@
+name: test_vertex
+type: vertex
+schema:
+  "$schema": "http://json-schema.org/draft-07/schema#"
+  type: object
+  required: [_key]
+  description: An example vertex schema for testing
+  properties:
+    _key: {type: string}
+    is_public: {type: boolean}
+    ws_id: {type: integer}
diff --git a/spec/test/sample_schemas/duplicate_names/test/test_edge.yaml b/spec/test/sample_schemas/duplicate_names/test/test_edge.yaml
new file mode 100644
index 00000000..fab7ad6e
--- /dev/null
+++ b/spec/test/sample_schemas/duplicate_names/test/test_edge.yaml
@@ -0,0 +1,10 @@
+name: test_edge
+type: edge
+schema:
+  "$schema": "http://json-schema.org/draft-07/schema#"
+  type: object
+  required: [_from, _to]
+  description: Example edge schema for testing.
+ properties: + _from: {type: string} + _to: {type: string} diff --git a/spec/test/sample_schemas/duplicate_names/test/test_vertex.yaml b/spec/test/sample_schemas/duplicate_names/test/test_vertex.yaml new file mode 100644 index 00000000..b2d34668 --- /dev/null +++ b/spec/test/sample_schemas/duplicate_names/test/test_vertex.yaml @@ -0,0 +1,11 @@ +name: test_vertex +type: vertex +schema: + "$schema": "http://json-schema.org/draft-07/schema#" + type: object + required: [_key] + description: An example vertex schema for testing + properties: + _key: {type: string} + is_public: {type: boolean} + ws_id: {type: integer} diff --git a/spec/test/sample_schemas/stored_queries/invalid_aql.yaml b/spec/test/sample_schemas/stored_queries/invalid_aql.yaml new file mode 100644 index 00000000..3fe1e85b --- /dev/null +++ b/spec/test/sample_schemas/stored_queries/invalid_aql.yaml @@ -0,0 +1,27 @@ +# Return genes associated with reactions similar to a query reaction +name: invalid_aql +params: + type: object + required: [sf_sim, df_sim, rid] + properties: + rid: + type: string + title: Reaction id (rxn_reaction vertex id) + sf_sim: + type: number + title: Minimum structural fingerprint similarity score + df_sim: + type: number + title: Minimum difference fingerprint similarity score + exclude_self: + type: boolean + description: If true, don't include the query reactions genes + default: false + +query_prefix: WITH rxn_reaction +query: | + LET start = @exclude_self ? 1 : 0 + LET us pray + FOR a + RETURN to + NORMALITY diff --git a/spec/test/sample_schemas/stored_queries/invalid_bind_params.yaml b/spec/test/sample_schemas/stored_queries/invalid_bind_params.yaml new file mode 100644 index 00000000..0dc1f7ac --- /dev/null +++ b/spec/test/sample_schemas/stored_queries/invalid_bind_params.yaml @@ -0,0 +1,32 @@ +name: invalid_bind_params +params: + type: object + required: [keys] + properties: + distance_to_nearest_star: + type: integer + default: 1 + minimum: 0 + maximum: 100 + keys: + type: array + items: {type: string} +query: | + LET node_ids = ( + FOR n IN djornl_node + FILTER n._key IN @door_keys AND n.node_type == 'gene' + FOR node IN 0..@distance ANY n djornl_edge + OPTIONS {bfs: true, uniqueVertices: "global"} + RETURN DISTINCT node._id + ) + LET edges = ( + FOR edge IN djornl_edge + FILTER edge._from IN node_ids AND edge._to IN node_ids + RETURN edge + ) + LET nodes = ( + FOR node IN djornl_node + FILTER node._id IN node_ids + RETURN node + ) + RETURN {nodes, edges} diff --git a/spec/test/sample_schemas/stored_queries/params_not_object.yaml b/spec/test/sample_schemas/stored_queries/params_not_object.yaml new file mode 100644 index 00000000..a5e6f937 --- /dev/null +++ b/spec/test/sample_schemas/stored_queries/params_not_object.yaml @@ -0,0 +1,3 @@ +name: params_not_object +query: whatever +params: false diff --git a/spec/test/sample_schemas/views/minimal.json b/spec/test/sample_schemas/views/minimal.json new file mode 100644 index 00000000..fdf8b8a7 --- /dev/null +++ b/spec/test/sample_schemas/views/minimal.json @@ -0,0 +1,4 @@ +{ + "name": "minimal", + "type": "arangosearch" +} diff --git a/spec/test/sample_schemas/views/wrong_type.json b/spec/test/sample_schemas/views/wrong_type.json new file mode 100644 index 00000000..49282bed --- /dev/null +++ b/spec/test/sample_schemas/views/wrong_type.json @@ -0,0 +1,4 @@ +{ + "name": "wrong_type", + "type": "from the shore" +} diff --git a/spec/test/stored_queries/__init__.py b/spec/test/stored_queries/__init__.py new file mode 100644 index 00000000..e69de29b diff 
--git a/spec/test/stored_queries/test_djornl.py b/spec/test/stored_queries/test_djornl.py new file mode 100644 index 00000000..eca29b1c --- /dev/null +++ b/spec/test/stored_queries/test_djornl.py @@ -0,0 +1,147 @@ +""" +Tests for the Dan Jacobson ORNL Arabidopsis stored queries. + +These tests run within the re_api docker image, and require access to the ArangoDB, auth, and workspace images. +""" +import json +import unittest +import os + +from spec.test.helpers import ( + get_config, + modified_environ, + create_test_docs, + run_query, + check_spec_test_env, +) +from importers.djornl.parser import DJORNL_Parser + +_CONF = get_config() +_TEST_DIR = "/app/spec/test" +_VERBOSE = 0 + + +def print_db_update(response, collection): + if not _VERBOSE: + return + print(f"Saved docs to collection {collection}!") + print(response.text) + print("=" * 80) + + +class Test_DJORNL_Stored_Queries(unittest.TestCase): + @classmethod + def setUpClass(cls): + + check_spec_test_env() + # import the results file + results_file = os.path.join(_TEST_DIR, "djornl", "results.json") + with open(results_file) as fh: + cls.json_data = json.load(fh) + + cls.no_results = {"nodes": [], "edges": []} + cls.maxDiff = None + + # load the DB + root_path = os.path.join(_TEST_DIR, "djornl", "test_data") + with modified_environ(RES_ROOT_DATA_PATH=root_path): + parser = DJORNL_Parser() + node_name = parser.config("node_name") + edge_name = parser.config("edge_name") + + edge_data = parser.load_edges() + r = create_test_docs(node_name, edge_data["nodes"]) + print_db_update(r, node_name) + r = create_test_docs(edge_name, edge_data["edges"]) + print_db_update(r, edge_name) + + node_metadata = parser.load_nodes() + r = create_test_docs(node_name, node_metadata["nodes"], True) + print_db_update(r, node_name) + + cluster_data = parser.load_clusters() + r = create_test_docs(node_name, cluster_data["nodes"], True) + print_db_update(r, node_name) + + def test_expected_results(self, query_name=None, test_data=None): + + # don't run the tests if they're being called automatically + if query_name is None: + self.assertTrue(True) + return + + # ensure we have either 'results' or 'error' in the test data + self.assertTrue("results" in test_data or "error" in test_data) + + params = {} + if "params" in test_data: + params = test_data["params"] + + response = run_query(query_name, params) + + if _VERBOSE: + print("Running query " + query_name) + if "params" in test_data: + print({"params": params}) + + # expecting an error response + if "error" in test_data: + if "error" not in response: + print({"response": response}) + + self.assertIn("error", response) + self.assertEqual(response["error"], test_data["error"]) + return response + + # expecting a valid response + if "results" not in response: + print({"response": response}) + + self.assertIn("results", response) + results = response["results"][0] + + self.assertEqual( + set([n["_key"] for n in results["nodes"]]), + set(test_data["results"]["nodes"]), + ) + + self.assertEqual( + set([e["_key"] for e in results["edges"]]), + set(test_data["results"]["edges"]), + ) + return response + + # indexing schema in results.json + # self.json_data['queries'][query_name] + # e.g. 
for fetch_clusters data: + # "djornl_fetch_clusters": { + # "params": { "cluster_ids": ["markov_i2:6", "markov_i4:3"], "distance": "1"}, + # "results": { + # "nodes": [ node IDs ], + # "edges": [ edge data ] + # } + # } + # nodes are represented as a list of node[_key] + # edges are objects with keys _to, _from, edge_type and score + + def test_fetch_all(self): + """Ensure that data returned by the fetch all query has all the information that we expect""" + response = self.test_expected_results( + "djornl_fetch_all", self.json_data["queries"]["djornl_fetch_all"][0] + ) + + # ensure that all the cluster data is returned OK + node_data = response["results"][0]["nodes"] + expected_node_data = self.json_data["load_clusters"]["nodes"] + self.assertEqual( + {n["_key"]: n["clusters"] for n in node_data if "clusters" in n}, + {n["_key"]: n["clusters"] for n in expected_node_data if "clusters" in n}, + ) + + def test_queries(self): + """Run parameterised queries and check for results or error messages""" + + for query in self.json_data["queries"].keys(): + for test in self.json_data["queries"][query]: + with self.subTest(query=query, params=test["params"]): + self.test_expected_results(query, test) diff --git a/spec/test/stored_queries/test_fulltext_search.py b/spec/test/stored_queries/test_fulltext_search.py new file mode 100644 index 00000000..99bd4d44 --- /dev/null +++ b/spec/test/stored_queries/test_fulltext_search.py @@ -0,0 +1,554 @@ +""" +Tests for stored queries involving a fulltext search: +* Generic fulltext_search (should be used with caution because it can be slow and timeout at 60s) +* Taxonomy taxonomy_search_species_strain +* Taxonomy taxonomy_search_species_strain_no_sort + +The latter two are switched between depending on the length of the search text. +These stored query tests are all bundled in one test file because their original purpose is to do a species/strain +name search on the ncbi_taxon collection + +These tests run within the re_api docker image, and require access to the ArangoDB, auth, and workspace images. +""" +import json +import time +import unittest +import requests +import os + +from spec.test.helpers import ( + get_config, + check_spec_test_env, + create_test_docs, +) + +_CONF = get_config() +_NOW = int(time.time() * 1000) +LIMIT = 20 # default + +TEST_DATA_DIR = os.path.normpath( + os.path.join(os.path.dirname(os.path.abspath(__file__)), "../data/") +) + +ncbi_taxon_fp = os.path.join(TEST_DATA_DIR, "ncbi_taxon.json") +with open(ncbi_taxon_fp) as fh: + ncbi_taxa = json.load(fh) + +# scinames_test_all are all the test scinames +# These are selected from the ncbi_taxon collection +scinames_test_all = [ + # --- Token preceded by punctuation --- + "Lactobacillus sp. 'thermophilus'", + "Rabbit fibroma virus (strain Kasza)", + "'Prunus dulcis' phytoplasma", + # --- Tokens joined by punctuation --- + "Lactococcus phage 936 group phage Phi13.16", + "Pseudogobio cf. esocinus CBM:ZF:12684", + "Klebsormidium sp. BIOTA 14615.5a", + # --- Misc gnarly --- + "Influenza C virus (C/PIG/Beijing/439/1982)", + "Bovine herpesvirus type 1.1 (strain P8-2)", + "Porcine transmissible gastroenteritis coronavirus strain FS772/70", + "Salmonella enterica subsp. 
houtenae serovar 16:z4,z32:--", + "Influenza A virus PX8-XIII(A/USSR/90/77(H1N1)xA/Pintail Duck/Primorie/695/76(H2N3))", + "Influenza B virus (B/Ann Arbor/1/1966 [cold-adapted and wild- type])", + # --- Prefix 1 --- + "Vaccinia virus WR 65-16", + "Dengue virus 2 Jamaica/1409/1983", + "Dengue virus 2 Thailand/NGS-C/1944", + # --- Escape chars ( ,:+-|"' ) --- + # --- TODO sample scinames with the escape chars in different variety of syntaxes --- + "Salmonella enterica subsp. diarizonae serovar 60:r:e,n,x,z15", + "Fusarium cf. solani 3+4-uuu DPGS-2011", + "Integrating expression vector pJEB403+drrA", + "Vector pEntry-attR2-IRES-eGFP-luc+-pA-attL3", + "low G+C Gram-positive bacterium HTA462", + "Reporter vector p1168hIL6mC/EBP-luc+", + "Pleurocapsales cyanobacterium 'Beach rock 4+5\"'", + "Nostoc sp. 'Peltigera sp. \"hawaiensis\" P1236 cyanobiont'", + "|Fake|fake|fake| ||fake||", + # --- Dups (technically only applicable to live data) --- + "environmental samples", + "Listeria sp. FSL_L7-0091", + "Listeria sp. FSL_L7-1519", + # --- Misc --- + "Norovirus GII.9", + "Corticiaceae sp.", + "Escherichia coli", +] +# scinames_test_latest are the test scinames that are not expired and +# compatible with a current timestamp +scinames_test_latest = [ + "Lactobacillus sp. 'thermophilus'", + "Rabbit fibroma virus (strain Kasza)", + "'Prunus dulcis' phytoplasma", + "Lactococcus phage 936 group phage Phi13.16", + "Influenza C virus (C/PIG/Beijing/439/1982)", + "Bovine herpesvirus type 1.1 (strain P8-2)", + "Porcine transmissible gastroenteritis coronavirus strain FS772/70", + "Salmonella enterica subsp. houtenae serovar 16:z4,z32:--", + "Influenza A virus PX8-XIII(A/USSR/90/77(H1N1)xA/Pintail Duck/Primorie/695/76(H2N3))", + "Influenza B virus (B/Ann Arbor/1/1966 [cold-adapted and wild- type])", + "Vaccinia virus WR 65-16", + "Dengue virus 2 Jamaica/1409/1983", + "Dengue virus 2 Thailand/NGS-C/1944", + "Salmonella enterica subsp. diarizonae serovar 60:r:e,n,x,z15", + "Fusarium cf. solani 3+4-uuu DPGS-2011", + "Integrating expression vector pJEB403+drrA", + "Vector pEntry-attR2-IRES-eGFP-luc+-pA-attL3", + "low G+C Gram-positive bacterium HTA462", + "Reporter vector p1168hIL6mC/EBP-luc+", + "Pleurocapsales cyanobacterium 'Beach rock 4+5\"'", + "Nostoc sp. 'Peltigera sp. \"hawaiensis\" P1236 cyanobiont'", + "|Fake|fake|fake| ||fake||", + "environmental samples", + "Listeria sp. FSL_L7-0091", + "Listeria sp. 
FSL_L7-1519", + "Corticiaceae sp.", + "Escherichia coli", +] + + +class TestTaxonomySearchSpeciesStrainStoredQueries(unittest.TestCase): + @classmethod + def setUpClass(cls): + check_spec_test_env() + create_test_docs("ncbi_taxon", ncbi_taxa) + + def test_ncbi_taxon_scinames(self): + """Happy path""" + for sciname in scinames_test_all: + _taxonomy_search_species_strain_queries( + self, + taxon_coll="ncbi_taxon", + sciname_field="scientific_name", + search_text=sciname, + ts=_NOW if sciname in scinames_test_latest else None, + offset=None, + limit=LIMIT, + select="scientific_name", + # --- + expect_error=False, + expect_hit=True, + ) + + def test_null_bind_params(self): + """Leave off parameters""" + for sciname in scinames_test_all: + _taxonomy_search_species_strain_queries( + self, + taxon_coll="ncbi_taxon", + sciname_field="scientific_name", + search_text=sciname, + ts=None, + offset=None, + limit=None, + select=None, + # --- + expect_error=False, + expect_hit=True, + ) + + def test_fully_specified_bind_params(self): + """Specify all parameters""" + for sciname in scinames_test_all: + _taxonomy_search_species_strain_queries( + self, + taxon_coll="ncbi_taxon", + sciname_field="scientific_name", + search_text=sciname, + ts=_NOW if sciname in scinames_test_latest else None, + offset=0, + limit=LIMIT, + select=["id", "scientific_name"], + # --- + expect_error=False, + expect_hit=True, + ) + + def test_extra_params(self): + """Extra params not in spec/aql""" + _taxonomy_search_species_strain_queries( + self, + taxon_coll="ncbi_taxon", + sciname_field="scientific_name", + search_text="esch", + ts=None, + offset=0, + limit=LIMIT, + select=["id", "scientific_name"], + extra_unused_param=42, + # --- + expect_error=("Additional properties are not allowed"), + ) + + def test_validation_fail(self): + _taxonomy_search_species_strain_queries( + self, + taxon_coll=[], + sciname_field=42, + search_text={"hi": 1}, + ts=None, + offset=None, + limit=None, + select=None, + # --- + expect_error="[] is not of type 'string'", + ) + + def test_aql_error(self): + for sciname in scinames_test_all: + _taxonomy_search_species_strain_queries( + self, + taxon_coll="ncbi_taxon", + sciname_field="fake_attrkey", + search_text=sciname, + ts=None, + offset=None, + limit=None, + select=None, + # --- + expect_error=True, + ) + + def test_no_hit(self): + for sciname in scinames_test_all: + _taxonomy_search_species_strain_queries( + self, + taxon_coll="ncbi_taxon", + sciname_field="scientific_name", + search_text=sciname[::-1], + ts=None, + offset=None, + limit=None, + select=None, + # --- + expect_error=False, + expect_hit=False, + expected_hits=[], + ) + + def test_prefix_hit(self): + """Test search text len being lte 3""" + _taxonomy_search_species_strain_queries( + self, + taxon_coll="ncbi_taxon", + sciname_field="scientific_name", + search_text="inf", + ts=None, + offset=None, + limit=None, + select=None, + # --- + expect_error=False, + expect_hit=False, + expected_hits=[ + "Influenza A virus PX8-XIII(A/USSR/90/77(H1N1)xA/Pintail Duck/Primorie/695/76(H2N3))", + "Influenza C virus (C/PIG/Beijing/439/1982)", + "Influenza B virus (B/Ann Arbor/1/1966 [cold-adapted and wild- type])", + "Influenza B virus (B/Brisbane/FSS700/2017)", + ], + ) + + +class TestFulltextSearchStoredQuery(unittest.TestCase): + @classmethod + def setUpClass(cls): + check_spec_test_env() + create_test_docs("ncbi_taxon", ncbi_taxa) + + def test_ncbi_taxon_scinames(self): + """Happy path""" + for sciname in scinames_test_all: + _fulltext_search_query( + 
self, + coll="ncbi_taxon", + search_attrkey="scientific_name", + search_text=sciname, + ts=_NOW if sciname in scinames_test_latest else None, + filter_attr_expr=[ + {"rank": "species"}, + {"rank": "strain"}, + {"strain": True}, + ], + offset=None, + limit=LIMIT, + select="scientific_name", + # --- + expect_error=False, + expect_hit=True, + ) + + def test_null_bind_params(self): + """Leave off parameters""" + for sciname in scinames_test_all: + _fulltext_search_query( + self, + coll="ncbi_taxon", + search_attrkey="scientific_name", + search_text=sciname, + ts=None, + filter_attr_expr=None, + offset=None, + limit=None, + select=None, + # --- + expect_error=False, + expect_hit=True, + ) + + def test_fully_specified_bind_params(self): + """Specify all parameters""" + for sciname in scinames_test_all: + _fulltext_search_query( + self, + coll="ncbi_taxon", + search_attrkey="scientific_name", + search_text=sciname, + ts=_NOW if sciname in scinames_test_latest else None, + filter_attr_expr=[ + {"rank": "species"}, + {"rank": "strain"}, + {"strain": True}, + ], + offset=0, + limit=LIMIT, + select=["id", "scientific_name"], + # --- + expect_error=False, + expect_hit=True, + ) + + def test_extra_params(self): + """Extra params not in spec/aql""" + _fulltext_search_query( + self, + coll="ncbi_taxon", + search_attrkey="scientific_name", + search_text="esch", + ts=None, + filter_attr_expr=[ + {"rank": "species"}, + {"rank": "strain"}, + {"strain": True}, + ], + offset=0, + limit=LIMIT, + select=["id", "scientific_name"], + extra_unused_param=42, + # --- + expect_error=("Additional properties are not allowed"), + ) + + def test_validation_fail(self): + _fulltext_search_query( + self, + coll=[], + search_attrkey=42, + search_text={"hi": 1}, + ts=None, + filter_attr_expr=None, + offset=None, + limit=None, + select=None, + # --- + expect_error="[] is not of type 'string'", + ) + + def test_aql_error(self): + for sciname in scinames_test_all: + _fulltext_search_query( + self, + coll="ncbi_taxon", + search_attrkey="fake_attrkey", + search_text=sciname, + ts=None, + filter_attr_expr=None, + offset=None, + limit=None, + select=None, + # --- + expect_error=True, + ) + + def test_no_hit(self): + for sciname in scinames_test_all: + _fulltext_search_query( + self, + coll="ncbi_taxon", + search_attrkey="scientific_name", + search_text=sciname[::-1], + ts=None, + filter_attr_expr=None, + offset=None, + limit=None, + select=None, + # --- + expect_error=False, + expect_hit=False, + expected_hits=[], + ) + + +# --- Test helpers --- + + +def _switch_taxonomy_search_species_strain_queries(search_text): + return ( + "taxonomy_search_species_strain_no_sort" + if len(search_text) <= 3 + else "taxonomy_search_species_strain" + ) + + +def _taxonomy_search_species_strain_queries( + self, + taxon_coll, + sciname_field, + search_text, + ts, + offset, + limit, + select, + expect_error=False, + expect_hit=True, + expected_hits=None, + **kw, # for testing passing disallowed properties +): + """ + Run query against ArangoDB server + """ + data = { + "@taxon_coll": taxon_coll, + "sciname_field": sciname_field, + "search_text": search_text, + "ts": ts, + "offset": offset, + "limit": limit, + "select": select, + **kw, + } + stored_query = _switch_taxonomy_search_species_strain_queries(search_text) + _check_query_results( + self, + data, + stored_query, + sciname_field, + search_text, + limit, + expect_error, + expect_hit, + expected_hits, + ) + + +def _fulltext_search_query( + self, + coll, + search_attrkey, + search_text, + ts, + 
filter_attr_expr,
+    offset,
+    limit,
+    select,
+    expect_error=False,
+    expect_hit=True,
+    expected_hits=None,
+    **kw,  # for testing passing disallowed properties
+):
+    """
+    Run query against ArangoDB server
+    """
+    data = {
+        "@coll": coll,
+        "search_attrkey": search_attrkey,
+        "search_text": search_text,
+        "ts": ts,
+        "filter_attr_expr": filter_attr_expr,
+        "offset": offset,
+        "limit": limit,
+        "select": select,
+        **kw,
+    }
+    stored_query = "fulltext_search"
+    _check_query_results(
+        self,
+        data,
+        stored_query,
+        search_attrkey,
+        search_text,
+        limit,
+        expect_error,
+        expect_hit,
+        expected_hits,
+    )
+
+
+def _check_query_results(
+    self,
+    data,
+    stored_query,
+    search_attrkey,
+    search_text,
+    limit,
+    expect_error,
+    expect_hit,
+    expected_hits,
+):
+    # Run the query twice: the second time with the null values filtered out,
+    # to see if their default values kick in properly
+    for payload in (data, {k: v for k, v in data.items() if v is not None}):
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": stored_query},
+            data=json.dumps(payload),
+        )
+
+        if expect_error:
+            self.assertIn("error", resp.json())
+            if isinstance(expect_error, str):
+                self.assertIn(expect_error, json.dumps(resp.json()))
+
+        else:
+            self.assertNotIn("error", resp.json(), json.dumps(resp.json(), indent=4))
+
+            docs = resp.json()["results"]
+            hits = [doc[search_attrkey] for doc in docs]
+            if expect_hit:
+                self.assertIn(search_text, hits, f"`{search_text}` not in `{hits}`")
+                # check not just overflowing with dups
+                self.assertFalse(len(hits) == limit and len(set(hits)) == 1)
+            else:
+                self.assertNotIn(search_text, hits)
+
+            if expected_hits is not None:
+                self.assertCountEqual(expected_hits, hits)
diff --git a/spec/test/stored_queries/test_list_test_vertices.py b/spec/test/stored_queries/test_list_test_vertices.py
new file mode 100644
index 00000000..e4f5ae0e
--- /dev/null
+++ b/spec/test/stored_queries/test_list_test_vertices.py
@@ -0,0 +1,79 @@
+"""
+Test the 'list_test_vertices' stored query (see
+relation_engine_server/test/spec_release/sample_spec_release/stored_queries/test for the query).
+
+These tests run within the re_api docker image, and require access to the ArangoDB, auth, and workspace images.
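+
+A sketch of a hand-run equivalent of these tests (RE_API_URL is read from the
+environment, and 'valid_token' is the mock auth token used below):
+
+    requests.post(
+        RE_API_URL + "/api/v1/query_results?view=list_test_vertices",
+        headers={"Authorization": "valid_token"},
+    ).json()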
+ +""" +import unittest +import requests + +from spec.test.helpers import create_test_docs, get_config, check_spec_test_env + +_CONF = get_config() +_QUERY_URL = _CONF["re_api_url"] + "/api/v1/query_results?view=list_test_vertices" + + +class TestListTestVertices(unittest.TestCase): + @classmethod + def setUpClass(cls): + check_spec_test_env() + + def test_valid(self): + """Test a valid query.""" + docs_created = create_test_docs( + "test_vertex", + [ + {"is_public": True, "_key": "a", "ws_id": 10}, # public access + {"is_public": False, "_key": "b", "ws_id": 1}, # private access + {"is_public": False, "_key": "c", "ws_id": 99}, # no access + ], + ) + self.assertEqual( + docs_created.json(), + { + "created": 3, + "details": [], + "empty": 0, + "error": False, + "errors": 0, + "ignored": 0, + "updated": 0, + }, + ) + resp = requests.post( + _QUERY_URL, + headers={ + "Authorization": "valid_token" + }, # gives access to workspaces [1,2,3] + ).json() + self.assertEqual(resp["count"], 2) + # 'c' is inaccessible + self.assertEqual({r["_key"] for r in resp["results"]}, {"a", "b"}) + + def test_no_auth(self): + """Test with blank auth.""" + docs_created = create_test_docs( + "test_vertex", + [ + {"is_public": True, "_key": "a", "ws_id": 10}, # public access + {"is_public": False, "_key": "b", "ws_id": 1}, # private access + {"is_public": False, "_key": "c", "ws_id": 99}, # no access + ], + ) + self.assertEqual( + docs_created.json(), + { + "created": 3, + "details": [], + "empty": 0, + "error": False, + "errors": 0, + "ignored": 0, + "updated": 0, + }, + ) + resp = requests.post(_QUERY_URL).json() + self.assertEqual(resp["count"], 1) + # 'b' and 'c' are inaccessible + self.assertEqual([r["_key"] for r in resp["results"]], ["a"]) diff --git a/spec/test/stored_queries/test_ncbi_tax.py b/spec/test/stored_queries/test_ncbi_tax.py new file mode 100644 index 00000000..b37a57a4 --- /dev/null +++ b/spec/test/stored_queries/test_ncbi_tax.py @@ -0,0 +1,632 @@ +""" +Tests for the ncbi taxonomy stored queries. + +These tests require access to the ArangoDB, auth, and workspace images. 
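+
+The fixtures below follow the time-travel ("delta") convention, so every
+document is stored with "created"/"expired" bounds added by
+_create_delta_test_docs; a sketch of one stored taxon vertex:
+
+    {"_key": "1", "id": "1", "scientific_name": "Bacteria", "rank": "Domain",
+     "strain": False, "created": 0, "expired": 9007199254740991}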
+""" +import json +import time +import unittest +import requests + +from spec.test.helpers import ( + get_config, + assert_subset, + create_test_docs, + check_spec_test_env, +) + +_CONF = get_config() +_NOW = int(time.time() * 1000) + + +class TestNcbiTax(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Create test documents""" + + check_spec_test_env() + + taxon_docs = [ + { + "_key": "1", + "scientific_name": "Bacteria", + "rank": "Domain", + "strain": False, + }, + { + "_key": "2", + "scientific_name": "Firmicutes", + "rank": "Phylum", + "strain": False, + }, + { + "_key": "3", + "scientific_name": "Bacilli", + "rank": "Class", + "strain": False, + }, + { + "_key": "4", + "scientific_name": "Proteobacteria", + "rank": "Phylum", + "strain": False, + }, + { + "_key": "5", + "scientific_name": "Alphaproteobacteria", + "rank": "Class", + "strain": False, + }, + { + "_key": "6", + "scientific_name": "Gammaproteobacteria", + "rank": "Class", + "strain": False, + }, + { + "_key": "7", + "scientific_name": "Deltaproteobacteria", + "rank": "Class", + "strain": False, + }, + { + "_key": "8", + "scientific_name": "Bacillus subtilis 168", + "rank": "no rank", + "strain": True, + }, + ] + child_docs = [ + { + "_from": "ncbi_taxon/2", + "_to": "ncbi_taxon/1", + "from": "2", + "to": "1", + "id": "2", + }, + { + "_from": "ncbi_taxon/4", + "_to": "ncbi_taxon/1", + "from": "4", + "to": "1", + "id": "4", + }, + { + "_from": "ncbi_taxon/3", + "_to": "ncbi_taxon/2", + "from": "3", + "to": "2", + "id": "3", + }, + { + "_from": "ncbi_taxon/5", + "_to": "ncbi_taxon/4", + "from": "5", + "to": "4", + "id": "5", + }, + { + "_from": "ncbi_taxon/6", + "_to": "ncbi_taxon/4", + "from": "6", + "to": "4", + "id": "6", + }, + { + "_from": "ncbi_taxon/7", + "_to": "ncbi_taxon/4", + "from": "7", + "to": "4", + "id": "7", + }, + # a few levels missing here + { + "_from": "ncbi_taxon/8", + "_to": "ncbi_taxon/3", + "from": "8", + "to": "3", + "id": "8", + }, + ] + obj_ver_docs = [ + _construct_ws_obj_ver(1, 1, 1, is_public=True), + _construct_ws_obj_ver(1, 1, 2, is_public=True), + _construct_ws_obj_ver(2, 1, 1, is_public=False), + ] + obj_docs = [ + _construct_ws_obj(1, 1, is_public=True), + _construct_ws_obj(2, 1, is_public=False), + ] + obj_to_taxa_docs = [ + { + "_from": "ws_object_version/1:1:1", + "_to": "ncbi_taxon/1", + "assigned_by": "assn1", + }, + { + "_from": "ws_object_version/1:1:2", + "_to": "ncbi_taxon/1", + "assigned_by": "assn2", + }, + { + "_from": "ws_object_version/2:1:1", + "_to": "ncbi_taxon/1", + "assigned_by": "assn2", + }, + ] + # Create workspace objects associated to taxa + ws_docs = [ + _ws_defaults({"_key": "1", "is_public": True}), + _ws_defaults({"_key": "2", "is_public": False}), + ] + ws_to_obj = [ + {"_from": "ws_workspace/1", "_to": "ws_object/1:1"}, + {"_from": "ws_workspace/2", "_to": "ws_object/2:1"}, + ] + ws_type_version_docs = [ + { + "_key": "KBaseGenomes.Genome-99.77", + "module_name": "KBaseGenomes", + "type_name": "Genome", + "maj_ver": 99, + "min_ver": 77, + } + ] + ws_obj_instance_of_type_docs = [ + { + "_from": "ws_object_version/1:1:1", + "_to": "ws_type_version/KBaseGenomes.Genome-99.77", + }, + { + "_from": "ws_object_version/1:1:2", + "_to": "ws_type_version/KBaseGenomes.Genome-99.77", + }, + ] + _create_delta_test_docs("ncbi_taxon", taxon_docs) + _create_delta_test_docs("ncbi_child_of_taxon", child_docs, edge=True) + create_test_docs("ws_obj_version_has_taxon", obj_to_taxa_docs) + create_test_docs("ws_object", obj_docs) + create_test_docs("ws_workspace", 
ws_docs)
+        create_test_docs("ws_workspace_contains_obj", ws_to_obj)
+        create_test_docs("ws_object_version", obj_ver_docs)
+        create_test_docs("ws_obj_instance_of_type", ws_obj_instance_of_type_docs)
+        create_test_docs("ws_type_version", ws_type_version_docs)
+
+    def test_get_lineage_valid(self):
+        """Test a valid query of taxon lineage."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "ncbi_taxon_get_lineage"},
+            data=json.dumps(
+                {"ts": _NOW, "id": "7", "select": ["rank", "scientific_name"]}
+            ),
+        ).json()
+        self.assertEqual(resp["count"], 2)
+        ranks = [r["rank"] for r in resp["results"]]
+        names = [r["scientific_name"] for r in resp["results"]]
+        self.assertEqual(ranks, ["Domain", "Phylum"])
+        self.assertEqual(names, ["Bacteria", "Proteobacteria"])
+
+    def test_get_children(self):
+        """Test a valid query of direct taxon children."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "ncbi_taxon_get_children"},
+            data=json.dumps(
+                {
+                    "id": "1",
+                    "ts": _NOW,
+                    "search_text": "firmicutes,|proteobacteria",
+                    "select": ["rank", "scientific_name"],
+                }
+            ),
+        ).json()
+        result = resp["results"][0]
+        self.assertEqual(result["total_count"], 2)
+        ranks = {r["rank"] for r in result["results"]}
+        names = [r["scientific_name"] for r in result["results"]]
+        self.assertEqual(ranks, {"Phylum"})
+        self.assertEqual(names, ["Firmicutes", "Proteobacteria"])
+
+    def test_get_children_cursor(self):
+        """Test a valid query to get children with a cursor."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "ncbi_taxon_get_children_cursor"},
+            data=json.dumps({"ts": _NOW, "id": "1"}),
+        ).json()
+        self.assertEqual(len(resp["results"]), 2)
+
+    def test_siblings_valid(self):
+        """Test a valid query for siblings."""
+        # Querying from "Alphaproteobacteria"
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "ncbi_taxon_get_siblings"},
+            data=json.dumps(
+                {"ts": _NOW, "id": "5", "select": ["rank", "scientific_name"]}
+            ),
+        ).json()
+        result = resp["results"][0]
+        self.assertEqual(result["total_count"], 2)
+        ranks = {r["rank"] for r in result["results"]}
+        names = [r["scientific_name"] for r in result["results"]]
+        self.assertEqual(ranks, {"Class"})
+        self.assertEqual(names, ["Deltaproteobacteria", "Gammaproteobacteria"])
+
+    def test_siblings_root(self):
+        """Test a query for siblings on the root node with no parent."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "ncbi_taxon_get_siblings"},
+            data=json.dumps({"ts": _NOW, "id": "1"}),  # Querying from "Bacteria"
+        ).json()
+        self.assertEqual(resp["results"][0]["total_count"], 0)
+
+    def test_siblings_nonexistent_node(self):
+        """Test a query for siblings on a nonexistent node."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "ncbi_taxon_get_siblings"},
+            data=json.dumps({"ts": _NOW, "id": "xyz"}),  # Nonexistent node
+        ).json()
+        self.assertEqual(resp["results"][0]["total_count"], 0)
+
+    def test_search_sciname_prefix(self):
+        """Test a query to search sciname."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "ncbi_taxon_search_sci_name"},
+            data=json.dumps(
+                {
+                    "ts": _NOW,
+                    "search_text": "prefix:bact",
+                    "select": ["scientific_name"],
+                }
+            ),
+        ).json()
+        result = resp["results"][0]
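+        # "prefix:" terms match against word prefixes: "prefix:bact" matches
+        # "Bacteria" here, while the broader "prefix:bac" used by the
+        # _run_search_sciname tests below also matches "Bacilli"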
+        self.assertEqual(result["total_count"], 1)
+        self.assertEqual(result["results"][0]["scientific_name"], "Bacteria")
+
+    def test_search_sciname_nonexistent(self):
+        """Test a query to search sciname for empty results."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "ncbi_taxon_search_sci_name"},
+            data=json.dumps({"ts": _NOW, "search_text": "xyzabc"}),
+        ).json()
+        self.assertEqual(resp["results"][0]["total_count"], 0)
+
+    def test_search_sciname_wrong_type(self):
+        """Test a query to search sciname with the wrong type for the search_text param."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "ncbi_taxon_search_sci_name"},
+            data=json.dumps({"ts": _NOW, "search_text": 123}),
+        )
+        self.assertEqual(resp.status_code, 400)
+        self.assertEqual(resp.json()["error"]["message"], "123 is not of type 'string'")
+
+    def test_search_sciname_missing_search(self):
+        """Test a query to search sciname with the search_text param missing."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "ncbi_taxon_search_sci_name"},
+            data=json.dumps({"ts": _NOW}),
+        )
+        self.assertEqual(resp.status_code, 400)
+        self.assertEqual(
+            resp.json()["error"]["message"], "'search_text' is a required property"
+        )
+
+    def test_search_sciname_more_complicated(self):
+        """Test a query to search sciname with some more keyword options."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "ncbi_taxon_search_sci_name"},
+            data=json.dumps(
+                {"ts": _NOW, "search_text": "prefix:gamma,|prefix:alpha,|prefix:delta"}
+            ),
+        ).json()
+        result = resp["results"][0]
+        self.assertEqual(result["total_count"], 3)
+        names = {r["scientific_name"] for r in result["results"]}
+        self.assertEqual(
+            names, {"Gammaproteobacteria", "Alphaproteobacteria", "Deltaproteobacteria"}
+        )
+
+    def test_search_sciname_offset_max(self):
+        """Test a query to search sciname with an invalid offset (greater than max)."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "ncbi_taxon_search_sci_name"},
+            data=json.dumps(
+                {"ts": _NOW, "search_text": "prefix:bact", "offset": 100001}
+            ),
+        )
+        self.assertEqual(resp.status_code, 400)
+        self.assertEqual(
+            resp.json()["error"]["message"],
+            "100001 is greater than the maximum of 100000",
+        )
+
+    def test_search_sciname_limit_max(self):
+        """Test a query to search sciname with an invalid limit (greater than max)."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "ncbi_taxon_search_sci_name"},
+            data=json.dumps({"ts": _NOW, "search_text": "prefix:bact", "limit": 1001}),
+        )
+        self.assertEqual(resp.status_code, 400)
+        self.assertEqual(
+            resp.json()["error"]["message"], "1001 is greater than the maximum of 1000"
+        )
+
+    def test_search_sciname_limit_ranks_implicit_defaults(self):
+        """Test queries where the results are limited by the rank or strain flag."""
+        _run_search_sciname(
+            self,
+            ranks=None,
+            include_strains=None,
+            expected_count=3,
+            expected_sci_names={"Bacteria", "Bacilli", "Bacillus subtilis 168"},
+        )
+
+    def test_search_sciname_limit_ranks_explicit_defaults(self):
+        """Test queries where the results are limited by the rank or strain flag."""
+        _run_search_sciname(
+            self,
+            ranks=[],
+            include_strains=False,
+            expected_count=3,
+            expected_sci_names={"Bacteria", "Bacilli", "Bacillus subtilis 168"},
+        )
+
+
def test_search_sciname_limit_ranks_2(self): + """Test queries where the results are limited by the rank or strain flag.""" + _run_search_sciname( + self, + ranks=["Domain", "Class"], + include_strains=None, + expected_count=2, + expected_sci_names={"Bacteria", "Bacilli"}, + ) + + def test_search_sciname_limit_ranks_1(self): + """Test queries where the results are limited by the rank or strain flag.""" + _run_search_sciname( + self, + ranks=["Class"], + include_strains=None, + expected_count=1, + expected_sci_names={"Bacilli"}, + ) + + def test_search_sciname_limit_ranks_1_with_strain(self): + """Test queries where the results are limited by the rank or strain flag.""" + _run_search_sciname( + self, + ranks=["Class"], + include_strains=True, + expected_count=2, + expected_sci_names={"Bacilli", "Bacillus subtilis 168"}, + ) + + def test_search_sciname_limit_ranks_1_with_false_strain(self): + """Test queries where the results are limited by the rank or strain flag.""" + _run_search_sciname( + self, + ranks=["Class"], + include_strains=False, + expected_count=1, + expected_sci_names={"Bacilli"}, + ) + + def test_select_fields(self): + """Test that the 'select' works properly for one query.""" + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ncbi_taxon_get_lineage"}, + data=json.dumps({"ts": _NOW, "id": "7", "select": ["rank"]}), + ).json() + self.assertEqual(resp["count"], 2) + self.assertEqual(resp["results"], [{"rank": "Domain"}, {"rank": "Phylum"}]) + + def test_fetch_taxon(self): + """Test a valid query to fetch a taxon.""" + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ncbi_fetch_taxon"}, + data=json.dumps({"ts": _NOW, "id": "1"}), + ).json() + self.assertEqual(resp["count"], 1) + self.assertEqual(resp["results"][0]["id"], "1") + + def test_get_associated_objs(self): + """ + Test a valid query to get associated objects for a taxon. + Two objects are public and one is private, so total_count will be 3 while only the public objects are returned. 
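+        A sketch of the expected response shape (keys taken from the
+        assertions below):
+
+            {"total_count": 3,
+             "results": [{"edge": {"assigned_by": ...}, "ws_obj": {...}}, ...]}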
+ """ + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ncbi_taxon_get_associated_ws_objects"}, + data=json.dumps( + { + "ts": _NOW, + "taxon_id": "1", + "select_obj": ["_id", "type", "ws_info"], + "select_edge": ["assigned_by"], + } + ), + ).json() + self.assertEqual(resp["count"], 1) + results = resp["results"][0] + self.assertEqual(results["total_count"], 3) + self.assertEqual(len(results["results"]), 2) + assignments = {ret["edge"]["assigned_by"] for ret in results["results"]} + ids = {ret["ws_obj"]["_id"] for ret in results["results"]} + self.assertEqual(assignments, {"assn1", "assn2"}) + self.assertEqual(ids, {"ws_object_version/1:1:1", "ws_object_version/1:1:2"}) + self.assertEqual( + results["results"][0]["ws_obj"]["type"], + { + "type_name": "Genome", + "module_name": "KBaseGenomes", + "maj_ver": 99, + "min_ver": 77, + "_key": "KBaseGenomes.Genome-99.77", + }, + ) + self.assertEqual( + results["results"][0]["ws_obj"]["ws_info"], + { + "owner": "owner", + "metadata": {"narrative_nice_name": "narrname"}, + "is_public": True, + "mod_epoch": 1, + }, + ) + + def test_get_taxon_from_ws_obj(self): + """Fetch the taxon vertex from a workspace versioned id.""" + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ncbi_taxon_get_taxon_from_ws_obj"}, + data=json.dumps({"ts": _NOW, "obj_ref": "1:1:1"}), + ).json() + self.assertEqual(resp["count"], 1) + assert_subset( + self, + {"id": "1", "scientific_name": "Bacteria", "rank": "Domain"}, + resp["results"][0], + ) + + def test_fetch_taxon_by_sciname(self): + """Test the ncbi_fetch_taxon_by_sciname query.""" + sciname = "Deltaproteobacteria" + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ncbi_fetch_taxon_by_sciname"}, + data=json.dumps({"ts": _NOW, "sciname": "Deltaproteobacteria"}), + ).json() + self.assertEqual(resp["count"], 1) + assert_subset( + self, + { + "id": "7", + "scientific_name": sciname, + "rank": "Class", + }, + resp["results"][0], + ) + + def test_fetch_taxon_by_sciname_failures(self): + """Test invalid cases for ncbi_fetch_taxon_by_sciname.""" + # No sciname + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ncbi_fetch_taxon_by_sciname"}, + data=json.dumps({"ts": _NOW}), + ).json() + self.assertEqual(resp["error"]["message"], "'sciname' is a required property") + # No ts + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ncbi_fetch_taxon_by_sciname"}, + data=json.dumps({"sciname": "Deltaproteobacteria"}), + ).json() + self.assertEqual(resp["error"]["message"], "'ts' is a required property") + # sciname not found + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ncbi_fetch_taxon_by_sciname"}, + data=json.dumps({"ts": _NOW, "sciname": "xyzabc"}), + ).json() + self.assertEqual(resp["count"], 0) + self.assertEqual(len(resp["results"]), 0) + + +# -- Test helpers + + +def _run_search_sciname( + self, ranks, include_strains, expected_count, expected_sci_names +): + """ + Helper to run the ncbi_taxon_search_sci_name query and make some standard + assertions on the response. 
+ """ + data = {"ts": _NOW, "search_text": "prefix:bac"} + if ranks is not None: + data["ranks"] = ranks + if include_strains is not None: + data["include_strains"] = include_strains + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ncbi_taxon_search_sci_name"}, + data=json.dumps(data), + ).json() + result = resp["results"][0] + self.assertEqual(result["total_count"], expected_count) + names = {r["scientific_name"] for r in result["results"]} + self.assertEqual(names, expected_sci_names) + + +def _ws_defaults(data): + """Set some defaults for the required workspace fields.""" + defaults = { + "owner": "owner", + "max_obj_id": 1, + "lock_status": "n", + "name": "wsname", + "mod_epoch": 1, + "is_public": True, + "is_deleted": False, + "metadata": {"narrative_nice_name": "narrname"}, + } + # Merge the data with the above defaults + return dict(defaults, **data) + + +def _construct_ws_obj_ver(wsid, objid, ver, is_public=False): + """Test helper to create a ws_object_version vertex.""" + return { + "_key": f"{wsid}:{objid}:{ver}", + "workspace_id": wsid, + "object_id": objid, + "version": ver, + "name": f"obj_name{objid}", + "hash": "xyz", + "size": 100, + "epoch": 0, + "deleted": False, + "is_public": is_public, + } + + +def _construct_ws_obj(wsid, objid, is_public=False): + """Test helper to create a ws_object vertex.""" + return { + "_key": f"{wsid}:{objid}", + "workspace_id": wsid, + "object_id": objid, + "deleted": False, + "is_public": is_public, + } + + +def _create_delta_test_docs(coll_name, docs, edge=False): + """Add in delta required fields.""" + if edge: + for doc in docs: + # Replicate the time-travel system by just setting 'from' and 'to' to the keys + doc["from"] = doc["_from"].split("/")[1] + doc["to"] = doc["_to"].split("/")[1] + else: + for doc in docs: + doc["id"] = doc["_key"] + for doc in docs: + doc["expired"] = 9007199254740991 + doc["created"] = 0 + create_test_docs(coll_name, docs) diff --git a/spec/test/stored_queries/test_ontology.py b/spec/test/stored_queries/test_ontology.py new file mode 100644 index 00000000..7f6f5263 --- /dev/null +++ b/spec/test/stored_queries/test_ontology.py @@ -0,0 +1,389 @@ +""" +Tests for the ontology stored queries. + +These tests run within the re_api docker image, and require access to the ArangoDB, auth, and workspace images. 
+""" +import json +import time +import unittest +import requests + +from spec.test.helpers import ( + get_config, + create_test_docs, + check_spec_test_env, +) + +_CONF = get_config() +_NOW = int(time.time() * 1000) + + +class TestOntology(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Create test documents""" + + check_spec_test_env() + term_docs = [ + { + "_key": "1", + "id": "ENVO:00000446", + "name": "terrestrial biome", + "type": "CLASS", + "namespace": "ENVO", + "alt_ids": [], + "def": {"val": ""}, + "comments": [], + "subsets": [], + "synonyms": [], + "xrefs": [], + }, + { + "_key": "2", + "id": "ENVO:00000428", + "name": "biome", + "type": "CLASS", + "namespace": "ENVO", + "alt_ids": [], + "def": {"val": ""}, + "comments": [], + "subsets": [], + "synonyms": [], + "xrefs": [], + }, + { + "_key": "3", + "id": "ENVO:01001110", + "name": "ecosystem", + "type": "CLASS", + "namespace": "ENVO", + "alt_ids": [], + "def": {"val": ""}, + "comments": [], + "subsets": [], + "synonyms": [], + "xrefs": [], + }, + { + "_key": "4", + "id": "ENVO:01000254", + "name": "environmental system", + "type": "CLASS", + "namespace": "ENVO", + "alt_ids": [], + "def": {"val": ""}, + "comments": [], + "subsets": [], + "synonyms": [], + "xrefs": [], + }, + { + "_key": "5", + "id": "ENVO:00002030", + "name": "aquatic biome", + "type": "CLASS", + "namespace": "ENVO", + "alt_ids": [], + "def": {"val": ""}, + "comments": [], + "subsets": [], + "synonyms": [], + "xrefs": [], + }, + ] + edge_docs = [ + { + "_from": "ENVO_terms/1", + "_to": "ENVO_terms/2", + "from": "1", + "to": "2", + "id": "1", + "type": "is_a", + }, + { + "_from": "ENVO_terms/2", + "_to": "ENVO_terms/3", + "from": "2", + "to": "3", + "id": "2", + "type": "is_a", + }, + { + "_from": "ENVO_terms/3", + "_to": "ENVO_terms/4", + "from": "3", + "to": "4", + "id": "3", + "type": "is_a", + }, + { + "_from": "ENVO_terms/5", + "_to": "ENVO_terms/2", + "from": "5", + "to": "2", + "id": "4", + "type": "is_a", + }, + ] + _create_delta_test_docs("ENVO_terms", term_docs) + _create_delta_test_docs("ENVO_edges", edge_docs, edge=True) + + def test_get_term_by_name(self): + """Test query of retrieving onotlogy term by searching name""" + resp1 = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ontology_get_term_by_name"}, + data=json.dumps( + { + "ts": _NOW, + "name": "terrestrial biome", + "ancestor_term": "ENVO:01001110", + "@onto_terms": "ENVO_terms", + "@onto_edges": "ENVO_edges", + } + ), + ).json() + self.assertEqual(resp1["count"], 1) + ids = [r["id"] for r in resp1["results"]] + self.assertEqual(ids, ["ENVO:00000446"]) + + resp2 = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ontology_get_term_by_name"}, + data=json.dumps( + { + "ts": _NOW, + "name": "terrestrial", + "ancestor_term": "ENVO:01001110", + "@onto_terms": "ENVO_terms", + "@onto_edges": "ENVO_edges", + } + ), + ).json() + self.assertEqual(resp2["count"], 0) + + resp3 = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ontology_get_term_by_name"}, + data=json.dumps( + { + "ts": _NOW, + "name": "terrestrial biome", + "ancestor_term": "ENVO:00002030", + "@onto_terms": "ENVO_terms", + "@onto_edges": "ENVO_edges", + } + ), + ).json() + self.assertEqual(resp3["count"], 0) + + resp4 = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ontology_get_term_by_name"}, + data=json.dumps( + { + "ts": _NOW, + "name": "terrestrial 
biome", + "ancestor_term": "", + "@onto_terms": "ENVO_terms", + "@onto_edges": "ENVO_edges", + } + ), + ).json() + self.assertEqual(resp4["count"], 1) + ids = [r["id"] for r in resp4["results"]] + self.assertEqual(ids, ["ENVO:00000446"]) + + def test_get_children(self): + """Test query of ontology children.""" + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ontology_get_children"}, + data=json.dumps( + { + "id": "ENVO:00000428", + "ts": _NOW, + "@onto_terms": "ENVO_terms", + "@onto_edges": "ENVO_edges", + } + ), + ).json() + self.assertEqual(resp["count"], 2) + ids = [r["term"]["id"] for r in resp["results"]] + self.assertCountEqual(ids, ["ENVO:00000446", "ENVO:00002030"]) + + def test_get_parents(self): + """Test query of ontology parents.""" + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ontology_get_parents"}, + data=json.dumps( + { + "id": "ENVO:00000428", + "ts": _NOW, + "@onto_terms": "ENVO_terms", + "@onto_edges": "ENVO_edges", + } + ), + ).json() + self.assertEqual(resp["count"], 1) + ids = [r["term"]["id"] for r in resp["results"]] + self.assertEqual(ids, ["ENVO:01001110"]) + + def test_get_descendants(self): + """Test query of ontology descendants.""" + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ontology_get_descendants"}, + data=json.dumps( + { + "id": "ENVO:01001110", + "ts": _NOW, + "@onto_terms": "ENVO_terms", + "@onto_edges": "ENVO_edges", + } + ), + ).json() + self.assertEqual(resp["count"], 3) + ids = [r["term"]["id"] for r in resp["results"]] + self.assertCountEqual(ids, ["ENVO:00000446", "ENVO:00000428", "ENVO:00002030"]) + + def test_get_ancestors(self): + """Test query of ontology ancestors.""" + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ontology_get_ancestors"}, + data=json.dumps( + { + "id": "ENVO:00000446", + "ts": _NOW, + "@onto_terms": "ENVO_terms", + "@onto_edges": "ENVO_edges", + } + ), + ).json() + self.assertEqual(resp["count"], 3) + ids = [r["term"]["id"] for r in resp["results"]] + self.assertCountEqual(ids, ["ENVO:00000428", "ENVO:01000254", "ENVO:01001110"]) + + def test_get_siblings(self): + """Test query of ontology siblings.""" + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ontology_get_siblings"}, + data=json.dumps( + { + "id": "ENVO:00000446", + "ts": _NOW, + "@onto_terms": "ENVO_terms", + "@onto_edges": "ENVO_edges", + } + ), + ).json() + self.assertEqual(resp["count"], 1) + ids = [r["id"] for r in resp["results"]] + self.assertEqual(ids, ["ENVO:00002030"]) + + def test_get_terms(self): + """Test query of ontology terms.""" + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ontology_get_terms"}, + data=json.dumps( + { + "ids": ["ENVO:00000446", "ENVO:00002030", "abcd"], + "ts": _NOW, + "@onto_terms": "ENVO_terms", + } + ), + ).json() + self.assertEqual(resp["count"], 2) + ids = [r["id"] for r in resp["results"]] + self.assertCountEqual(ids, ["ENVO:00000446", "ENVO:00002030"]) + + +# -- Test helpers + + +def _run_search_sciname( + self, ranks, include_strains, expected_count, expected_sci_names +): + """ + Helper to run the taxonomy_search_sci_name query and make some standard + assertions on the response. 
+ """ + data = { + "ts": _NOW, + "search_text": "prefix:bac", + "@taxon_coll": "ncbi_taxon", + "sciname_field": "scientific_name", + } + if ranks is not None: + data["ranks"] = ranks + if include_strains is not None: + data["include_strains"] = include_strains + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "taxonomy_search_sci_name"}, + data=json.dumps(data), + ).json() + result = resp["results"][0] + self.assertEqual(result["total_count"], expected_count) + names = {r["scientific_name"] for r in result["results"]} + self.assertEqual(names, expected_sci_names) + + +def _ws_defaults(data): + """Set some defaults for the required workspace fields.""" + defaults = { + "owner": "owner", + "max_obj_id": 1, + "lock_status": "n", + "name": "wsname", + "mod_epoch": 1, + "is_public": True, + "is_deleted": False, + "metadata": {"narrative_nice_name": "narrname"}, + } + # Merge the data with the above defaults + return dict(defaults, **data) + + +def _construct_ws_obj_ver(wsid, objid, ver, is_public=False): + """Test helper to create a ws_object_version vertex.""" + return { + "_key": f"{wsid}:{objid}:{ver}", + "workspace_id": wsid, + "object_id": objid, + "version": ver, + "name": f"obj_name{objid}", + "hash": "xyz", + "size": 100, + "epoch": 0, + "deleted": False, + "is_public": is_public, + } + + +def _construct_ws_obj(wsid, objid, is_public=False): + """Test helper to create a ws_object vertex.""" + return { + "_key": f"{wsid}:{objid}", + "workspace_id": wsid, + "object_id": objid, + "deleted": False, + "is_public": is_public, + } + + +def _create_delta_test_docs(coll_name, docs, edge=False): + """Add in delta required fields.""" + if edge: + for doc in docs: + # Replicate the time-travel system by just setting 'from' and 'to' to the keys + doc["from"] = doc["_from"].split("/")[1] + doc["to"] = doc["_to"].split("/")[1] + for doc in docs: + doc["expired"] = 9007199254740991 + doc["created"] = 0 + create_test_docs(coll_name, docs) diff --git a/spec/test/stored_queries/test_query.py b/spec/test/stored_queries/test_query.py new file mode 100644 index 00000000..e76d19f5 --- /dev/null +++ b/spec/test/stored_queries/test_query.py @@ -0,0 +1,631 @@ +""" +This script can be run from `make` +Essentially it was created to run stored queries against the ncbi_taxon collection +and collect data and stats. +""" + +import os +import unittest + +# Skip entire module if env var not set +# to avoid non-Docker-container imports or otherwise +# specific/costly operations in script +if not os.environ.get("DO_QUERY_TESTING"): + raise unittest.SkipTest( + "Env var DO_QUERY_TESTING not set. 
Skipping query testing module" + ) + +import traceback as tb # noqa E402 +import sys # noqa E402 +import json # noqa E402 +import datetime # noqa E402 +import time # noqa E402 +import random # noqa E402 +import textwrap # noqa E402 +import warnings # noqa E402 +import pytest # noqa E402 +from typing import Tuple, List # noqa E402 +from requests.exceptions import ReadTimeout # noqa E402 + +from arango import ArangoClient # noqa E402 +import numpy as np # noqa E402 +import pandas as pd # noqa E402 +import seaborn as sns # noqa E402 +import matplotlib.pyplot as plt # noqa E402 + +from relation_engine_server.utils import json_validation # noqa E402 + +warnings.filterwarnings("ignore") + +# Directories and files +ROOT_DIR = os.getcwd() +CURR_DIR = os.path.join(ROOT_DIR, "spec/test/stored_queries") +CONFIG_FP = os.path.join(ROOT_DIR, "arango_live_server_config.json") +TEST_DATA_DIR = os.path.join(CURR_DIR, "../data") +TMP_OUT_DIR = os.path.join(ROOT_DIR, "tmp") +SCINAMES_LATEST_FP = os.path.join(TMP_OUT_DIR, "ncbi_scinames_latest.json") +SAMPLINGS_FP = os.path.join(TMP_OUT_DIR, "samplings.json") +STORED_QUERY_FP = os.path.join( + ROOT_DIR, "spec/stored_queries/taxonomy/taxonomy_search_species_strain.yaml" +) +STORED_QUERY_NO_SORT_FP = os.path.join( + ROOT_DIR, "spec/stored_queries/taxonomy/taxonomy_search_species_strain_no_sort.yaml" +) +STORED_QUERY_OLD_FP = os.path.join( + ROOT_DIR, "spec/stored_queries/taxonomy/taxonomy_search_species.yaml" +) + +if not os.path.exists(TMP_OUT_DIR): + os.mkdir(TMP_OUT_DIR) + +# Read config +try: + with open(CONFIG_FP) as fh: + CONFIG = json.load(fh) + CLIENT = ArangoClient(hosts=CONFIG["host"]) + DB = CLIENT.db("ci", username=CONFIG["username"], password=CONFIG["password"]) +except Exception: + help_msg = """ +Please set host URL, username, and password in arango_live_server_config.json, e.g., +{ + "username": "doe_j", + "password": "cat-sat-hat", + "host": "http://10.58.1.211:8532" +} +Note: if you are on a local machine +you may have to proxy into the live ArangoDB server first, e.g., +`ssh -L 8532:10.58.1.211:8532 j_doe@login1.berkeley.kbase.us` +Then, the url would be `http://localhost:8532` +""" + print(help_msg) + raise + +# Get pointer to collection +NCBI_TAXON = DB.collection("ncbi_taxon") + +# Load the queries +QUERY = json_validation.load_json_yaml(STORED_QUERY_FP)["query"] +QUERY_NO_SORT = json_validation.load_json_yaml(STORED_QUERY_NO_SORT_FP)["query"] +QUERY_OLD = json_validation.load_json_yaml(STORED_QUERY_OLD_FP)["query"] + +# Set query bind parameters +LIMIT = 20 +NOW = time.time() * 1000 + +# Load/cache the scinames +# This probably won't work well and will need some fiddling/improvement +# because doing it this way can lead to a timeout on some machine setups +if os.path.isfile(SCINAMES_LATEST_FP): + with open(SCINAMES_LATEST_FP) as fh: + SCINAMES_LATEST = json.load(fh) +else: + print("Fetching latest NCBI scinames ...") + try: + taxa_all = list(NCBI_TAXON.all()) + except ReadTimeout: + print("Sorry, there is a read timeout. Please try again on a different machine") + sys.exit() + SCINAMES_LATEST = [ + taxa["scientific_name"] + for taxa in taxa_all + if (taxa["rank"] in ["species", "strain"] or taxa["strain"]) + and taxa["created"] <= NOW + and NOW <= taxa["expired"] + ] + # Cache latest scinames + with open(SCINAMES_LATEST_FP, "w") as fh: + json.dump(SCINAMES_LATEST, fh) + + +def use_sort(search_text): + """ + Determine whether to use the sorting or non-sorting stored query for the new query. 
+    Results for shorter search texts are not sorted.
+    """
+    return len(search_text) > 3
+
+
+def is_simple(search_text):
+    """
+    Somewhat arbitrary determination of whether a fulltext search text is
+    "simple", i.e., expected to be relatively quick to search.
+    """
+    return len(search_text.split()) == 2 and all(
+        [tok.isalnum() and len(tok) >= 3 for tok in search_text.split()]
+    )
+
+
+def jprint(jo, dry=False):
+    txt = json.dumps(jo, indent=3)
+    if dry:
+        return txt
+    else:
+        print(txt)
+
+
+def do_taxonomy_search_species_query(search_text):
+    """Do the old query"""
+    cursor = DB.aql.execute(
+        QUERY_OLD,
+        bind_vars={
+            "@taxon_coll": "ncbi_taxon",
+            "sciname_field": "scientific_name",
+            "search_text": "prefix:" + search_text,  # how the old query was set up
+            "ts": NOW,
+            "offset": None,
+            "limit": LIMIT,
+            "select": ["scientific_name"],
+        },
+    )
+    return {
+        "results": [e["scientific_name"] for e in list(cursor.batch())],
+        **cursor.statistics(),
+    }
+
+
+def do_taxonomy_search_species_strain_query(search_text):
+    """Do the new query"""
+    cursor = DB.aql.execute(
+        QUERY if use_sort(search_text) else QUERY_NO_SORT,
+        bind_vars={
+            "@taxon_coll": "ncbi_taxon",
+            "sciname_field": "scientific_name",
+            "search_text": search_text,
+            "ts": NOW,
+            "offset": None,
+            "limit": LIMIT,
+            "select": ["scientific_name"],
+        },
+    )
+    return {
+        "results": [e["scientific_name"] for e in list(cursor.batch())],
+        **cursor.statistics(),
+    }
+
+
+def get_search_text_samplings(
+    resample=True,
+    cap_scinames=1000,
+    cap_scinames_prefixes=1000,
+):
+    """
+    Get samplings of scinames, or prefixes thereof, to gauge execution time
+
+    Things to include:
+    * Simple genus/species epithets with two non-short words
+    * "Wild" scientific names, defined as the exclusion of the simple scientific names
+    * All prefixes of all the preceding, respectively, and deduplicated
+    * 36 alphanumeric characters
+    * Any edge cases?
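+      (Illustration: "Escherichia coli" would count as simple under
+      is_simple(), while a hypothetical three-word name like "Bacillus sp.
+      X-12" would fall into the wild set.)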
+ """ + # Read if cached + if not resample and os.path.isfile(SAMPLINGS_FP): + with open(SAMPLINGS_FP) as fh: + samplings = json.load(fh) + return samplings + + print("\nSampling search texts and prefixes thereof ...") + + def get_capped_samplings(styp: str) -> Tuple[list, list]: + """ + Randomly sample scinames + Then take all prefixes, deduplicated + "Wild" just means the exclusion of "simple" + """ + if styp not in ["simple", "wild"]: + raise RuntimeError(f"Unknown sampling type {styp}") + print(f"Sampling {styp} scinames ...") + + sampling = [ + sciname + for sciname in SCINAMES_LATEST + if is_simple(sciname) == (styp == "simple") + ] + random.shuffle(sampling) + sampling = sampling[ + :cap_scinames + ] # cap this first to avoid generating overabundant prefixes + + sampling_prefixes = list( + set([sciname[:i] for sciname in sampling for i in range(1, len(sciname))]) + ) + random.shuffle(sampling_prefixes) + sampling_prefixes = sampling_prefixes[:cap_scinames_prefixes] + + return sampling, sampling_prefixes + + scinames_simple, scinames_simple_prefixes = get_capped_samplings("simple") + scinames_wild, scinames_wild_prefixes = get_capped_samplings("wild") + alphanum_chars = list("abcdefghijklmnopqrstuvwxyz0123456789") + edge_cases = [ + "~!@#$%^&*()_+hi", + "hi~!@#$%^&*()_+", + ] # would cause AQL issue: "", "~!@#$%^&*()_+", "[", + + # Aggregate + samplings = { + "scinames_simple": scinames_simple, + "scinames_wild": scinames_wild, + "scinames_simple_prefixes": scinames_simple_prefixes, + "scinames_wild_prefixes": scinames_wild_prefixes, + "alphanum_chars": alphanum_chars, + "edge_cases": edge_cases, + } + + # Manual peek to stdout + peek_len = 10 + jprint( + { + styp: sampling[:peek_len] + (["..."] if len(sampling) > peek_len else []) + for styp, sampling in samplings.items() + } + ) + + # Cache samplings + with open(SAMPLINGS_FP, "w") as fh: + json.dump(samplings, fh) + + return samplings + + +def handle_err(msg, dat=None): + """ + During sampling/sciname/query loops, + if error arises, + log/record + """ + print(msg) + tb.print_exc() + if dat: + dat["failed"] = True + jprint(dat) + + +def update_print_timekeepers(i, t0, exe_times, sampling, num_failed): + """ + Calculate and print + * Running average time per iteration + * Running average time per query execition + * Running median time per query execution + + Precondition: t0, exe_times + """ + if i == 0: + tper_iter, tper_exe, tmed_exe, tmin_exe, tmax_exe = 0, 0, 0, 0, 0 + else: + tper_iter = (time.time() - t0) / i + tper_exe = np.nanmean(exe_times) + tmed_exe = np.nanmedian(exe_times) + tmin_exe = np.nanmin(exe_times) + tmax_exe = np.nanmax(exe_times) + print( + f"[{datetime.datetime.now().strftime('%b%d %H:%M').upper()}]", + "...", + f"{i}/{len(sampling)} search texts tested", + "...", + f"{'%.3fs' % tmin_exe} (min)", + "|", + f"{'%.3fs' % tper_exe} (mean)", + "|", + f"{'%.3fs' % tmed_exe} (median)", + "|", + f"{'%.3fs' % tmax_exe} (max) exe time", + "...", + f"{'%.3fs' % tper_iter} per round trip", + "...", + f"{'%d/%d' % (num_failed, i)} failed", + ) + + +######################################################################################################################## +######################################################################################################################## +def do_query_testing( + samplings: dict, + do_query_func=do_taxonomy_search_species_strain_query, + expect_hits: list = [ + "scinames_simple", + "scinames_wild", + "scinames_latest", + "scinames_latest_permute", + ], + permute: bool = 
True, + update_period: int = 100, +): + """ + Test search texts, gather statistics, and check for hits + Periodically outputs accumulated mean and median execution times + """ + # Permute since the scinames tend to start out simpler + if permute: + for styp, sampling in samplings.items(): + samplings[styp] = sampling[:] + random.shuffle(samplings[styp]) + + # Get some nice stats to print out + samplings_metadata = [ + {"styp": styp, "num": len(sampling)} for styp, sampling in samplings.items() + ] + total_num_queries = sum([len(sampling) for sampling in samplings.values()]) + + # Print some preliminary info + w = 120 + dec = "=" * w + prelude = textwrap.wrap( + ( + f"do_query_func={do_query_func.__name__}, " + f"samplings_num_queries={samplings_metadata}, " + f"total_num_queries={total_num_queries}, " + ), + width=w, + ) + print("\n\n") + print(dec) + print(dec) + print(*prelude, sep="\n") + print(dec) + print(dec) + print() + + # Data structures accumulating all info + data_all = dict() # For all queries + + try: + + for j, (styp, sampling) in enumerate(samplings.items()): + num_failed: int = 0 + data: List[dict] = [] + data_all[styp] = data + + t0 = time.time() # Wall clock start time for this sampling + exe_times: List[float] = [] # Query execution times for this sampling + + print( + f"\nTesting with sampling_metadata={samplings_metadata[j]},", + f"sampling_assert_hit={styp in expect_hits},", + "...", + ) + print(dec) + + # Traverse all samples in sampling + for i, search_text in enumerate(sampling): + # Calculate and print running time stats + if not i % update_period: + update_print_timekeepers(i, t0, exe_times, sampling, num_failed) + + dat = { + "i": i, + "search_text": search_text, + "failed": False, + } + data.append(dat) + + try: + query_res = do_query_func(search_text) + except Exception: + handle_err("Something went wrong in the query!", dat) + query_res = { + "execution_time": np.nan, + "results": [], + } + + exe_times.append(query_res["execution_time"]) + dat.update(query_res) + + # Set `has_results` + dat["has_results"] = len(query_res["results"]) > 0 + # Set `failed` + if styp in expect_hits: + hits = query_res["results"] + # Given that limit=20, + # test that sciname is in top 20, + # and they aren't >20 duplicates. 
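+                        # (i.e. the page must not just be the same name
+                        # duplicated LIMIT times).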
+ # Raise to get traceback in stdout + try: + assert search_text in hits # nosec B101 + assert not ( # nosec B101 + len(hits) == LIMIT + and all([hit == search_text for hit in hits]) + ) + except AssertionError: + num_failed += 1 + handle_err( + "Something went wrong in the expect hit assertion!", + dat, + ) + + # One last time after all of sampling has run + update_print_timekeepers(i + 1, t0, exe_times, sampling, num_failed) + + except Exception: + handle_err("Something went wrong in the samplings/scinames/query loops!") + + finally: + results_fp = os.path.join( + TMP_OUT_DIR, + ( + "res" + "__" + f"{datetime.datetime.now().strftime('%d%b%Y_%H:%M').upper()}" + "__" + f"{do_query_func.__name__}" + "__" + f"{len(samplings)}_samplings" + "__" + f"{total_num_queries}_search_texts" + ".json" + ), + ) + data_meta = { + "do_query_func": do_query_func.__name__, + "samplings": list(samplings.keys()), + "expect_hits": expect_hits, + "total_num_queries": total_num_queries, + "_sampling": styp, # where it may have + "_i": i, # stopped at + "data_all": data_all, + } + print(dec) + print(f"\nWriting results to {results_fp}") + print(dec) + with open(results_fp, "w") as fh: + json.dump(data_meta, fh, indent=3) + + return data_meta + + +######################################################################################################################## +######################################################################################################################## +@pytest.mark.skipif( + not os.environ.get("DO_QUERY_TESTING") == "full", + reason="This can take a couple days, and only needs to be ascertained sporadically", +) +def test_all_ncbi_latest_scinames(): + do_query_testing({"scinames_latest": SCINAMES_LATEST}) + + +@pytest.mark.skipif( + not os.environ.get("DO_QUERY_TESTING") == "sampling", + reason="This can take an hour or so, and only needs to be ascertained sporadically", +) +def test_samplings(): + do_query_testing( + samplings=get_search_text_samplings(resample=True), + do_query_func=do_taxonomy_search_species_strain_query, + ) + + +@pytest.mark.skipif( + not os.environ.get("DO_QUERY_TESTING") == "compare", + reason="This can take an hour or so, and only needs to be ascertained sporadically", +) +def test_compare_queries(): + do_query_testing( + samplings=get_search_text_samplings( + resample=True, cap_scinames=500, cap_scinames_prefixes=500 + ), + do_query_func=do_taxonomy_search_species_strain_query, + permute=False, + ) + do_query_testing( + samplings=get_search_text_samplings(resample=False), + do_query_func=do_taxonomy_search_species_query, + permute=False, + ) + + +def do_graph(data_new_fp, data_old_fp): + """ + { + "data_all": { + "styp0": [ + { + "i": int, # index in sampling + "search_text": str, + "failed": bool, + "results": [ # resulting scinames + ... + ], + "execution_time": float, # s + ... + }, + ... + ], + "styp1": [ + ... + ], + ... + }, + ... 
+    }
+    """
+    with open(data_new_fp) as fh:
+        data_new = json.load(fh)["data_all"]
+    with open(data_old_fp) as fh:
+        data_old = json.load(fh)["data_all"]
+
+    # Not meaningful/large enough to make the figure
+    if "edge_cases" in data_new:
+        del data_new["edge_cases"]
+    if "edge_cases" in data_old:
+        del data_old["edge_cases"]
+
+    # Count num queries where the old stored query `has_results`/`failed`
+    old_failed_counts = {
+        styp: (
+            len([1 for dat in data if not dat["failed"]]),
+            len([1 for dat in data if dat["failed"]]),
+        )
+        for styp, data in data_old.items()
+    }
+    old_has_results_counts = {
+        styp: (
+            len([1 for dat in data if not dat["results"]]),
+            len([1 for dat in data if dat["results"]]),
+        )
+        for styp, data in data_old.items()
+    }
+
+    # Sanity checks
+    # Should have same ordering in `styp` and `search_text`
+    for (styp0, data0), (styp1, data1) in zip(data_new.items(), data_old.items()):
+        assert styp0 == styp1  # nosec B101
+        assert len(data0) == len(data1)  # nosec B101
+        for dat0, dat1 in zip(data0, data1):
+            assert dat0["search_text"] == dat1["search_text"]  # nosec B101
+            assert not np.isnan(dat0["execution_time"])  # nosec B101
+            assert not np.isnan(dat1["execution_time"])  # nosec B101
+    # old_has_results and old_failed counts should add up
+    for counts in [old_failed_counts, old_has_results_counts]:
+        for styp, count in counts.items():
+            assert sum(count) == len(data_old[styp])  # nosec B101
+
+    df_data = []
+    df_columns = [
+        "exe_time_ms",
+        "stored_query",
+        "sampling",
+        "failed",
+        "has_results",
+        "old_failed",
+        "old_has_results",
+    ]
+    for sq, data_epoch in zip(["new", "old"], [data_new, data_old]):
+        for styp, data in data_epoch.items():
+            for i, dat in enumerate(data):
+                # Toggle the literal strings here in tandem with
+                # toggling the `hue` below
+                df_row = [
+                    int(dat["execution_time"] * 1000),
+                    sq,
+                    f"{styp}\nn = {len(data)} ({old_failed_counts[styp][0]}/{old_failed_counts[styp][1]})",
+                    # f"{styp}\nn = {len(data)} ({old_has_results_counts[styp][0]}/{old_has_results_counts[styp][1]})",
+                    dat["failed"],
+                    dat["has_results"],
+                    data_old[styp][i]["failed"],
+                    data_old[styp][i]["has_results"],
+                ]
+                df_data.append(df_row)
+
+    df = pd.DataFrame(df_data, columns=df_columns)
+
+    sns.catplot(
+        x="stored_query",
+        y="exe_time_ms",
+        hue="old_failed",  # Toggle the `hue` here in tandem with
+        # hue="old_has_results",  # toggling the literal strings in `df_row` above
+        scale="area",
+        scale_hue=False,
+        col="sampling",
+        data=df,
+        kind="violin",
+        split=True,
+        cut=0,
+        aspect=0.7,
+        bw=0.2,
+    )
+
+    plt.show()
+
+
+if __name__ == "__main__":
+    do_graph(sys.argv[1], sys.argv[2])
diff --git a/spec/test/stored_queries/test_taxonomy.py b/spec/test/stored_queries/test_taxonomy.py
new file mode 100644
index 00000000..4d307ca5
--- /dev/null
+++ b/spec/test/stored_queries/test_taxonomy.py
@@ -0,0 +1,791 @@
+"""
+Tests for the NCBI taxonomy stored queries.
+
+These tests run within the re_api docker image, and require access to the ArangoDB, auth, and workspace images.
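+
+The fixture data is a small fragment of the NCBI tree (the Bacteria domain,
+two phyla, four classes, and one strain), plus a few linked workspace objects.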
+""" +import json +import time +import unittest +import requests + +from spec.test.helpers import ( + get_config, + assert_subset, + create_test_docs, + check_spec_test_env, +) + +_CONF = get_config() +_NOW = int(time.time() * 1000) + + +class TestTaxonomy(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Create test documents""" + + check_spec_test_env() + taxon_docs = [ + { + "_key": "1", + "scientific_name": "Bacteria", + "rank": "Domain", + "strain": False, + }, + { + "_key": "2", + "scientific_name": "Firmicutes", + "rank": "Phylum", + "strain": False, + }, + { + "_key": "3", + "scientific_name": "Bacilli", + "rank": "Class", + "strain": False, + }, + { + "_key": "4", + "scientific_name": "Proteobacteria", + "rank": "Phylum", + "strain": False, + }, + { + "_key": "5", + "scientific_name": "Alphaproteobacteria", + "rank": "Class", + "strain": False, + }, + { + "_key": "6", + "scientific_name": "Gammaproteobacteria", + "rank": "Class", + "strain": False, + }, + { + "_key": "7", + "scientific_name": "Deltaproteobacteria", + "rank": "Class", + "strain": False, + }, + { + "_key": "8", + "scientific_name": "Bacillus subtilis 168", + "rank": "no rank", + "strain": True, + }, + ] + gtdb_taxon_docs = [ + {"_key": "1", "scientific_name": "Bacteria", "rank": "Domain"}, + ] + child_docs = [ + { + "_from": "ncbi_taxon/2", + "_to": "ncbi_taxon/1", + "from": "2", + "to": "1", + "id": "2", + }, + { + "_from": "ncbi_taxon/4", + "_to": "ncbi_taxon/1", + "from": "4", + "to": "1", + "id": "4", + }, + { + "_from": "ncbi_taxon/3", + "_to": "ncbi_taxon/2", + "from": "3", + "to": "2", + "id": "3", + }, + { + "_from": "ncbi_taxon/5", + "_to": "ncbi_taxon/4", + "from": "5", + "to": "4", + "id": "5", + }, + { + "_from": "ncbi_taxon/6", + "_to": "ncbi_taxon/4", + "from": "6", + "to": "4", + "id": "6", + }, + { + "_from": "ncbi_taxon/7", + "_to": "ncbi_taxon/4", + "from": "7", + "to": "4", + "id": "7", + }, + # a few levels missing here + { + "_from": "ncbi_taxon/8", + "_to": "ncbi_taxon/3", + "from": "8", + "to": "3", + "id": "8", + }, + ] + obj_ver_docs = [ + _construct_ws_obj_ver(1, 1, 1, is_public=True), + _construct_ws_obj_ver(1, 1, 2, is_public=True), + _construct_ws_obj_ver(2, 1, 1, is_public=False), + ] + obj_docs = [ + _construct_ws_obj(1, 1, is_public=True), + _construct_ws_obj(2, 1, is_public=False), + ] + obj_to_taxa_docs = [ + { + "_from": "ws_object_version/1:1:1", + "_to": "ncbi_taxon/1", + "assigned_by": "assn1", + }, + { + "_from": "ws_object_version/1:1:2", + "_to": "ncbi_taxon/1", + "assigned_by": "assn2", + }, + { + "_from": "ws_object_version/2:1:1", + "_to": "ncbi_taxon/1", + "assigned_by": "assn2", + }, + ] + # Create workspace objects associated to taxa + ws_docs = [ + _ws_defaults({"_key": "1", "is_public": True}), + _ws_defaults({"_key": "2", "is_public": False}), + ] + ws_to_obj = [ + {"_from": "ws_workspace/1", "_to": "ws_object/1:1"}, + {"_from": "ws_workspace/2", "_to": "ws_object/2:1"}, + ] + ws_type_version_docs = [ + { + "_key": "KBaseGenomes.Genome-99.77", + "module_name": "KBaseGenomes", + "type_name": "Genome", + "maj_ver": 99, + "min_ver": 77, + } + ] + ws_obj_instance_of_type_docs = [ + { + "_from": "ws_object_version/1:1:1", + "_to": "ws_type_version/KBaseGenomes.Genome-99.77", + }, + { + "_from": "ws_object_version/1:1:2", + "_to": "ws_type_version/KBaseGenomes.Genome-99.77", + }, + ] + _create_delta_test_docs("ncbi_taxon", taxon_docs) + _create_delta_test_docs("gtdb_taxon", gtdb_taxon_docs) + _create_delta_test_docs("ncbi_child_of_taxon", child_docs, 
edge=True)
+        create_test_docs("ws_obj_version_has_taxon", obj_to_taxa_docs)
+        create_test_docs("ws_object", obj_docs)
+        create_test_docs("ws_workspace", ws_docs)
+        create_test_docs("ws_workspace_contains_obj", ws_to_obj)
+        create_test_docs("ws_object_version", obj_ver_docs)
+        create_test_docs("ws_obj_instance_of_type", ws_obj_instance_of_type_docs)
+        create_test_docs("ws_type_version", ws_type_version_docs)
+
+    def test_get_lineage_valid(self):
+        """Test a valid query of taxon lineage."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "taxonomy_get_lineage"},
+            data=json.dumps(
+                {
+                    "ts": _NOW,
+                    "id": "7",
+                    "select": ["rank", "scientific_name"],
+                    "@taxon_coll": "ncbi_taxon",
+                    "@taxon_child_of": "ncbi_child_of_taxon",
+                }
+            ),
+        ).json()
+        self.assertEqual(resp["count"], 2)
+        ranks = [r["rank"] for r in resp["results"]]
+        names = [r["scientific_name"] for r in resp["results"]]
+        self.assertEqual(ranks, ["Domain", "Phylum"])
+        self.assertEqual(names, ["Bacteria", "Proteobacteria"])
+
+    def test_get_children(self):
+        """Test a valid query of taxon children."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "taxonomy_get_children"},
+            data=json.dumps(
+                {
+                    "id": "1",
+                    "ts": _NOW,
+                    "search_text": "firmicutes,|proteobacteria",
+                    "select": ["rank", "scientific_name"],
+                    "sciname_field": "scientific_name",
+                    "@taxon_coll": "ncbi_taxon",
+                    "@taxon_child_of": "ncbi_child_of_taxon",
+                }
+            ),
+        ).json()
+        result = resp["results"][0]
+        self.assertEqual(result["total_count"], 2)
+        ranks = {r["rank"] for r in result["results"]}
+        names = [r["scientific_name"] for r in result["results"]]
+        self.assertEqual(ranks, {"Phylum"})
+        self.assertEqual(names, ["Firmicutes", "Proteobacteria"])
+
+    def test_get_children_cursor(self):
+        """Test a valid query to get children with a cursor."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "taxonomy_get_children_cursor"},
+            data=json.dumps(
+                {
+                    "ts": _NOW,
+                    "id": "1",
+                    "@taxon_coll": "ncbi_taxon",
+                    "@taxon_child_of": "ncbi_child_of_taxon",
+                }
+            ),
+        ).json()
+        self.assertEqual(len(resp["results"]), 2)
+
+    def test_siblings_valid(self):
+        """Test a valid query for siblings."""
+        # Querying from "Alphaproteobacteria"
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "taxonomy_get_siblings"},
+            data=json.dumps(
+                {
+                    "ts": _NOW,
+                    "id": "5",
+                    "select": ["rank", "scientific_name"],
+                    "sciname_field": "scientific_name",
+                    "@taxon_coll": "ncbi_taxon",
+                    "@taxon_child_of": "ncbi_child_of_taxon",
+                }
+            ),
+        ).json()
+        result = resp["results"][0]
+        self.assertEqual(result["total_count"], 2)
+        ranks = {r["rank"] for r in result["results"]}
+        names = [r["scientific_name"] for r in result["results"]]
+        self.assertEqual(ranks, {"Class"})
+        self.assertEqual(names, ["Deltaproteobacteria", "Gammaproteobacteria"])
+
+    def test_siblings_root(self):
+        """Test a query for siblings on the root node with no parent."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "taxonomy_get_siblings"},
+            data=json.dumps(
+                {
+                    "ts": _NOW,
+                    "id": "1",
+                    "sciname_field": "scientific_name",
+                    "@taxon_coll": "ncbi_taxon",
+                    "@taxon_child_of": "ncbi_child_of_taxon",
+                }
+            ),  # Querying from "Bacteria"
+        ).json()
+        self.assertEqual(resp["results"][0]["total_count"], 0)
+
+    def test_siblings_nonexistent_node(self):
+        """Test a query for 
siblings on a nonexistent node."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "taxonomy_get_siblings"},
+            data=json.dumps(
+                {
+                    "ts": _NOW,
+                    "id": "xyz",  # Nonexistent node
+                    "sciname_field": "scientific_name",
+                    "@taxon_coll": "ncbi_taxon",
+                    "@taxon_child_of": "ncbi_child_of_taxon",
+                }
+            ),
+        ).json()
+        self.assertEqual(resp["results"][0]["total_count"], 0)
+
+    def test_search_sci_name_no_count(self):
+        """Test a valid query to search sciname without a count."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "taxonomy_search_sci_name"},
+            data=json.dumps(
+                {
+                    "ts": _NOW,
+                    "no_count": True,
+                    "search_text": "prefix:bact",
+                    "select": ["scientific_name"],
+                    "sciname_field": "scientific_name",
+                    "ranks": ["Domain"],
+                    "@taxon_coll": "ncbi_taxon",
+                }
+            ),
+        ).json()
+        result = resp["results"][0]
+        self.assertTrue("total_count" not in result)
+        self.assertEqual(result["results"][0]["scientific_name"], "Bacteria")
+
+    def test_search_sciname_prefix(self):
+        """Test a query to search sciname."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "taxonomy_search_sci_name"},
+            data=json.dumps(
+                {
+                    "ts": _NOW,
+                    "search_text": "prefix:bact",
+                    "select": ["scientific_name"],
+                    "sciname_field": "scientific_name",
+                    "@taxon_coll": "ncbi_taxon",
+                }
+            ),
+        ).json()
+        result = resp["results"][0]
+        self.assertEqual(result["total_count"], 1)
+        self.assertEqual(result["results"][0]["scientific_name"], "Bacteria")
+
+    def test_search_sciname_gtdb(self):
+        """Test a search on scientific name against the gtdb taxonomy."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "taxonomy_search_sci_name"},
+            data=json.dumps(
+                {
+                    "ts": _NOW,
+                    "search_text": "prefix:bact",
+                    "select": ["scientific_name"],
+                    "sciname_field": "scientific_name",
+                    "@taxon_coll": "gtdb_taxon",
+                }
+            ),
+        ).json()
+        result = resp["results"][0]
+        self.assertEqual(result["total_count"], 1)
+        self.assertEqual(result["results"][0]["scientific_name"], "Bacteria")
+
+    def test_search_sciname_nonexistent(self):
+        """Test a query to search sciname for empty results."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "taxonomy_search_sci_name"},
+            data=json.dumps(
+                {
+                    "ts": _NOW,
+                    "search_text": "xyzabc",
+                    "sciname_field": "scientific_name",
+                    "@taxon_coll": "ncbi_taxon",
+                }
+            ),
+        ).json()
+        self.assertEqual(resp["results"][0]["total_count"], 0)
+
+    def test_search_sciname_wrong_type(self):
+        """Test a query to search sciname with the wrong type for the search_text param."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "taxonomy_search_sci_name"},
+            data=json.dumps(
+                {
+                    "ts": _NOW,
+                    "search_text": 123,
+                    "@taxon_coll": "ncbi_taxon",
+                    "sciname_field": "scientific_name",
+                }
+            ),
+        )
+        self.assertEqual(resp.status_code, 400)
+        self.assertEqual(resp.json()["error"]["message"], "123 is not of type 'string'")
+
+    def test_search_sciname_missing_search(self):
+        """Test a query to search sciname with the search_text param missing."""
+        resp = requests.post(
+            _CONF["re_api_url"] + "/api/v1/query_results",
+            params={"stored_query": "taxonomy_search_sci_name"},
+            data=json.dumps({"ts": _NOW, "@taxon_coll": "ncbi_taxon"}),
+        )
+        self.assertEqual(resp.status_code, 400)
+        self.assertEqual(
+
resp.json()["error"]["message"], "'search_text' is a required property" + ) + + def test_search_sciname_more_complicated(self): + """Test a query to search sciname with some more keyword options.""" + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "taxonomy_search_sci_name"}, + data=json.dumps( + { + "ts": _NOW, + "search_text": "prefix:gamma,|prefix:alpha,|prefix:delta", + "sciname_field": "scientific_name", + "@taxon_coll": "ncbi_taxon", + } + ), + ).json() + result = resp["results"][0] + self.assertEqual(result["total_count"], 3) + names = {r["scientific_name"] for r in result["results"]} + self.assertEqual( + names, {"Gammaproteobacteria", "Alphaproteobacteria", "Deltaproteobacteria"} + ) + + def test_search_sciname_offset_max(self): + """Test a query to search sciname with an invalid offset (greater than max).""" + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "taxonomy_search_sci_name"}, + data=json.dumps( + { + "ts": _NOW, + "search_text": "prefix:bact", + "offset": 100001, + "@taxon_coll": "ncbi_taxon", + "sciname_field": "scientific_name", + } + ), + ) + self.assertEqual(resp.status_code, 400) + self.assertEqual( + resp.json()["error"]["message"], + "100001 is greater than the maximum of 100000", + ) + + def test_search_sciname_limit_max(self): + """Test a query to search sciname with an invalid offset (greater than max).""" + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "taxonomy_search_sci_name"}, + data=json.dumps( + { + "ts": _NOW, + "search_text": "prefix:bact", + "limit": 1001, + "@taxon_coll": "ncbi_taxon", + "sciname_field": "scientific_name", + } + ), + ) + self.assertEqual(resp.status_code, 400) + self.assertEqual( + resp.json()["error"]["message"], "1001 is greater than the maximum of 1000" + ) + + def test_search_sciname_limit_ranks_implicit_defaults(self): + """Test queries where the results are limited by the rank or strain flag.""" + _run_search_sciname( + self, + ranks=None, + include_strains=None, + expected_count=3, + expected_sci_names={"Bacteria", "Bacilli", "Bacillus subtilis 168"}, + ) + + def test_search_sciname_limit_ranks_explicit_defaults(self): + """Test queries where the results are limited by the rank or strain flag.""" + _run_search_sciname( + self, + ranks=[], + include_strains=False, + expected_count=3, + expected_sci_names={"Bacteria", "Bacilli", "Bacillus subtilis 168"}, + ) + + def test_search_sciname_limit_ranks_2(self): + """Test queries where the results are limited by the rank or strain flag.""" + _run_search_sciname( + self, + ranks=["Domain", "Class"], + include_strains=None, + expected_count=2, + expected_sci_names={"Bacteria", "Bacilli"}, + ) + + def test_search_sciname_limit_ranks_1(self): + """Test queries where the results are limited by the rank or strain flag.""" + _run_search_sciname( + self, + ranks=["Class"], + include_strains=None, + expected_count=1, + expected_sci_names={"Bacilli"}, + ) + + def test_search_sciname_limit_ranks_1_with_strain(self): + """Test queries where the results are limited by the rank or strain flag.""" + _run_search_sciname( + self, + ranks=["Class"], + include_strains=True, + expected_count=2, + expected_sci_names={"Bacilli", "Bacillus subtilis 168"}, + ) + + def test_search_sciname_limit_ranks_1_with_false_strain(self): + """Test queries where the results are limited by the rank or strain flag.""" + _run_search_sciname( + self, + ranks=["Class"], + 
include_strains=False, + expected_count=1, + expected_sci_names={"Bacilli"}, + ) + + def test_select_fields(self): + """Test that the 'select' works properly for one query.""" + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "taxonomy_get_lineage"}, + data=json.dumps( + { + "ts": _NOW, + "id": "7", + "select": ["rank"], + "@taxon_coll": "ncbi_taxon", + "@taxon_child_of": "ncbi_child_of_taxon", + } + ), + ).json() + self.assertEqual(resp["count"], 2) + self.assertEqual(resp["results"], [{"rank": "Domain"}, {"rank": "Phylum"}]) + + def test_fetch_taxon(self): + """Test a valid query to fetch a taxon.""" + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "taxonomy_fetch_taxon"}, + data=json.dumps({"ts": _NOW, "id": "1", "@taxon_coll": "ncbi_taxon"}), + ).json() + self.assertEqual(resp["count"], 1) + self.assertEqual(resp["results"][0]["id"], "1") + + def test_get_associated_objs(self): + """ + Test a valid query to get associated objects for a taxon. + Two objects are public and one is private, so total_count will be 3 while only the public objects are returned. + """ + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "taxonomy_get_associated_ws_objects"}, + data=json.dumps( + { + "ts": _NOW, + "taxon_id": "1", + "select_obj": ["_id", "type", "ws_info"], + "select_edge": ["assigned_by"], + "@taxon_coll": "ncbi_taxon", + } + ), + ).json() + self.assertEqual(resp["count"], 1) + results = resp["results"][0] + self.assertEqual(results["total_count"], 3) + self.assertEqual(len(results["results"]), 2) + assignments = {ret["edge"]["assigned_by"] for ret in results["results"]} + ids = {ret["ws_obj"]["_id"] for ret in results["results"]} + self.assertEqual(assignments, {"assn1", "assn2"}) + self.assertEqual(ids, {"ws_object_version/1:1:1", "ws_object_version/1:1:2"}) + self.assertEqual( + results["results"][0]["ws_obj"]["type"], + { + "type_name": "Genome", + "module_name": "KBaseGenomes", + "maj_ver": 99, + "min_ver": 77, + "_key": "KBaseGenomes.Genome-99.77", + }, + ) + self.assertEqual( + results["results"][0]["ws_obj"]["ws_info"], + { + "owner": "owner", + "metadata": {"narrative_nice_name": "narrname"}, + "is_public": True, + "mod_epoch": 1, + }, + ) + + def test_get_taxon_from_ws_obj(self): + """Fetch the taxon vertex from a workspace versioned id.""" + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "taxonomy_get_taxon_from_ws_obj"}, + data=json.dumps( + {"ts": _NOW, "obj_ref": "1:1:1", "@taxon_coll": "ncbi_taxon"} + ), + ).json() + self.assertEqual(resp["count"], 1) + assert_subset( + self, + {"id": "1", "scientific_name": "Bacteria", "rank": "Domain"}, + resp["results"][0], + ) + + def test_fetch_taxon_by_sciname(self): + """Test the ncbi_fetch_taxon_by_sciname query.""" + sciname = "Deltaproteobacteria" + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "taxonomy_fetch_taxon_by_sciname"}, + data=json.dumps( + { + "ts": _NOW, + "sciname": "Deltaproteobacteria", + "sciname_field": "scientific_name", + "@taxon_coll": "ncbi_taxon", + } + ), + ).json() + self.assertEqual(resp["count"], 1) + assert_subset( + self, + { + "id": "7", + "scientific_name": sciname, + "rank": "Class", + }, + resp["results"][0], + ) + + def test_fetch_taxon_by_sciname_failures(self): + """Test invalid cases for ncbi_fetch_taxon_by_sciname.""" + # No sciname + resp = requests.post( + 
_CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "taxonomy_fetch_taxon_by_sciname"}, + data=json.dumps( + { + "ts": _NOW, + "sciname_field": "scientific_name", + "@taxon_coll": "ncbi_taxon", + } + ), + ).json() + self.assertEqual(resp["error"]["message"], "'sciname' is a required property") + # No ts + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ncbi_fetch_taxon_by_sciname"}, + data=json.dumps( + { + "sciname": "Deltaproteobacteria", + "sciname_field": "scientific_name", + "@taxon_coll": "ncbi_taxon", + } + ), + ).json() + self.assertEqual(resp["error"]["message"], "'ts' is a required property") + # sciname not found + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "taxonomy_fetch_taxon_by_sciname"}, + data=json.dumps( + { + "ts": _NOW, + "sciname": "xyzabc", + "sciname_field": "scientific_name", + "@taxon_coll": "ncbi_taxon", + } + ), + ).json() + self.assertEqual(resp["count"], 0) + self.assertEqual(len(resp["results"]), 0) + + +# -- Test helpers + + +def _run_search_sciname( + self, ranks, include_strains, expected_count, expected_sci_names +): + """ + Helper to run the taxonomy_search_sci_name query and make some standard + assertions on the response. + """ + data = { + "ts": _NOW, + "search_text": "prefix:bac", + "@taxon_coll": "ncbi_taxon", + "sciname_field": "scientific_name", + } + if ranks is not None: + data["ranks"] = ranks + if include_strains is not None: + data["include_strains"] = include_strains + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "taxonomy_search_sci_name"}, + data=json.dumps(data), + ).json() + result = resp["results"][0] + self.assertEqual(result["total_count"], expected_count) + names = {r["scientific_name"] for r in result["results"]} + self.assertEqual(names, expected_sci_names) + + +def _ws_defaults(data): + """Set some defaults for the required workspace fields.""" + defaults = { + "owner": "owner", + "max_obj_id": 1, + "lock_status": "n", + "name": "wsname", + "mod_epoch": 1, + "is_public": True, + "is_deleted": False, + "metadata": {"narrative_nice_name": "narrname"}, + } + # Merge the data with the above defaults + return dict(defaults, **data) + + +def _construct_ws_obj_ver(wsid, objid, ver, is_public=False): + """Test helper to create a ws_object_version vertex.""" + return { + "_key": f"{wsid}:{objid}:{ver}", + "workspace_id": wsid, + "object_id": objid, + "version": ver, + "name": f"obj_name{objid}", + "hash": "xyz", + "size": 100, + "epoch": 0, + "deleted": False, + "is_public": is_public, + } + + +def _construct_ws_obj(wsid, objid, is_public=False): + """Test helper to create a ws_object vertex.""" + return { + "_key": f"{wsid}:{objid}", + "workspace_id": wsid, + "object_id": objid, + "deleted": False, + "is_public": is_public, + } + + +def _create_delta_test_docs(coll_name, docs, edge=False): + """Add in delta required fields.""" + if edge: + for doc in docs: + # Replicate the time-travel system by just setting 'from' and 'to' to the keys + doc["from"] = doc["_from"].split("/")[1] + doc["to"] = doc["_to"].split("/")[1] + else: + for doc in docs: + doc["id"] = doc["_key"] + for doc in docs: + doc["expired"] = 9007199254740991 + doc["created"] = 0 + create_test_docs(coll_name, docs) diff --git a/spec/test/stored_queries/test_ws.py b/spec/test/stored_queries/test_ws.py new file mode 100644 index 00000000..573a3981 --- /dev/null +++ b/spec/test/stored_queries/test_ws.py @@ 
-0,0 +1,132 @@ +""" +Tests for workspace stored queries under the ws* namespace + + +These tests run within the re_api docker image, and require access to the ArangoDB, auth, and workspace images. +""" +import unittest +import json +import requests +from spec.test.helpers import get_config, create_test_docs, check_spec_test_env + +_CONF = get_config() + + +def _ws_obj(wsid, objid, ver, is_public=True): + """Create data for a dummy test workspace obj""" + return { + "_key": ":".join((str(n) for n in (wsid, objid, ver))), + "name": "obj", + "workspace_id": wsid, + "object_id": objid, + "version": ver, + "hash": "x", + "size": 0, + "epoch": 0, + "deleted": False, + "is_public": is_public, + } + + +class TestWs(unittest.TestCase): + @classmethod + def setUpClass(cls): + """ + Create all test data. + """ + + check_spec_test_env() + + ws_object_version = [ + _ws_obj(1, 1, 1), # root/origin object + _ws_obj(1, 2, 1), # copy object + _ws_obj(1, 3, 1), # provenance object + _ws_obj(1, 4, 1), # reference object + _ws_obj(1, 5, 1, is_public=False), # private copy obj + _ws_obj(1, 6, 1, is_public=False), # private prov obj + _ws_obj(1, 7, 1, is_public=False), # private ref obj + ] + create_test_docs("ws_object_version", ws_object_version) + ws_type_version = [{"_key": "Module.Type1-1.0"}] + create_test_docs("ws_type_version", ws_type_version) + ws_obj_instance_of_type = [ + { + "_from": "ws_object_version/1:1:1", + "_to": "ws_type_version/Module.Type1-1.0", + }, + { + "_from": "ws_object_version/1:2:1", + "_to": "ws_type_version/Module.Type1-1.0", + }, + { + "_from": "ws_object_version/1:3:1", + "_to": "ws_type_version/Module.Type1-1.0", + }, + { + "_from": "ws_object_version/1:4:1", + "_to": "ws_type_version/Module.Type1-1.0", + }, + ] + create_test_docs("ws_obj_instance_of_type", ws_obj_instance_of_type) + ws_prov_descendant_of = [ + {"_from": "ws_object_version/1:1:1", "_to": "ws_object_version/1:3:1"}, + {"_from": "ws_object_version/1:1:1", "_to": "ws_object_version/1:6:1"}, + ] + create_test_docs("ws_prov_descendant_of", ws_prov_descendant_of) + ws_refers_to = [ + {"_from": "ws_object_version/1:1:1", "_to": "ws_object_version/1:4:1"}, + {"_from": "ws_object_version/1:1:1", "_to": "ws_object_version/1:7:1"}, + ] + create_test_docs("ws_refers_to", ws_refers_to) + ws_copied_from = [ + {"_from": "ws_object_version/1:1:1", "_to": "ws_object_version/1:2:1"}, + {"_from": "ws_object_version/1:1:1", "_to": "ws_object_version/1:5:1"}, + ] + create_test_docs("ws_copied_from", ws_copied_from) + + def test_fetch_related_data_valid(self): + """ + Test for the basic happy path. + This also covers the case of private-scope object results, which will be hidden from results. 
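+        The fixture links object 1:1:1 to one public and one private object
+        along each of the copy, provenance, and reference edges, so each
+        section below should report a count of 1 and return only the public
+        object.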
+ """ + resp = requests.post( + _CONF["re_api_url"] + "/api/v1/query_results", + params={"stored_query": "ws_fetch_related_data", "show_public": True}, + data=json.dumps({"obj_key": "1:1:1"}), + ).json() + self.assertEqual(resp["count"], 1) + self.assertEqual(resp["has_more"], False) + res = resp["results"][0] + # Check the root object results + self.assertEqual(res["obj"]["_key"], "1:1:1") + self.assertEqual(res["obj_type"]["_key"], "Module.Type1-1.0") + # Check the copy results + self.assertEqual(res["copies"]["count"], 1) + self.assertEqual(len(res["copies"]["data"]), 1) + self.assertEqual( + res["copies"]["data"][0]["data"]["_id"], "ws_object_version/1:2:1" + ) + self.assertEqual(res["copies"]["data"][0]["hops"], 1) + self.assertEqual( + res["copies"]["data"][0]["type"]["_id"], "ws_type_version/Module.Type1-1.0" + ) + # Check the provenance results + self.assertEqual(res["prov"]["count"], 1) + self.assertEqual(len(res["prov"]["data"]), 1) + self.assertEqual( + res["prov"]["data"][0]["data"]["_id"], "ws_object_version/1:3:1" + ) + self.assertEqual(res["prov"]["data"][0]["hops"], 1) + self.assertEqual( + res["prov"]["data"][0]["type"]["_id"], "ws_type_version/Module.Type1-1.0" + ) + # Check the ref results + self.assertEqual(res["refs"]["count"], 1) + self.assertEqual(len(res["refs"]["data"]), 1) + self.assertEqual( + res["refs"]["data"][0]["data"]["_id"], "ws_object_version/1:4:1" + ) + self.assertEqual(res["refs"]["data"][0]["hops"], 1) + self.assertEqual( + res["refs"]["data"][0]["type"]["_id"], "ws_type_version/Module.Type1-1.0" + ) diff --git a/spec/test/test_ensure_specs.py b/spec/test/test_ensure_specs.py new file mode 100644 index 00000000..3f95cad9 --- /dev/null +++ b/spec/test/test_ensure_specs.py @@ -0,0 +1,366 @@ +import unittest +from unittest import mock +import copy +import json + +from relation_engine_server.utils import arango_client +from relation_engine_server.utils.ensure_specs import ( + get_local_coll_indexes, + get_local_views, + get_local_analyzers, + ensure_indexes, + ensure_views, + ensure_analyzers, + ensure_all, + is_obj_subset_rec, + mod_obj_literal, + round_float, + excise_namespace, + get_names, +) +from spec.test.helpers import check_spec_test_env + + +def ensure_borked_indexes(): + """Get all the test server indexes, but with 1st one borked""" + coll_name_2_indexes_server = arango_client.get_all_indexes() + borked_coll_name = list(coll_name_2_indexes_server.keys())[0] + borked_index = coll_name_2_indexes_server[borked_coll_name][0] + borked_index["type"] = "fake_type" + borked_name = f"{borked_coll_name}/{borked_index['type']}/{borked_index['fields']}" + return ([borked_name], {borked_coll_name: [borked_index]}) + + +def ensure_borked_views(): + """Get all the test server views, but with 1st one borked""" + all_views_server = arango_client.get_all_views() + borked_view = all_views_server[0] + borked_view["type"] = "fake_type" + borked_name = f"{borked_view['name']}/{borked_view['type']}" + return ([borked_name], [borked_view]) + + +def ensure_borked_analyzers(): + """Get all the test server analyzers, but with 1st one borked""" + all_analyzers_server = arango_client.get_all_analyzers() + borked_analyzer = all_analyzers_server[0] + borked_analyzer["type"] = "fake_type" + borked_name = f"{borked_analyzer['name']}/{borked_analyzer['type']}" + return ([borked_name], [borked_analyzer]) + + +class TestEnsureSpecs(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.maxDiff = None + check_spec_test_env() + + def test_ensure_indexes(self): + 
failed_names, failed_specs = ensure_indexes() + self.assertFalse(len(failed_names)) + self.assertFalse(len(failed_specs)) + + def test_ensure_views(self): + failed_names, failed_specs = ensure_views() + self.assertFalse(len(failed_names)) + self.assertFalse(len(failed_specs)) + + def test_ensure_analyzers(self): + failed_names, failed_specs = ensure_analyzers() + self.assertFalse(len(failed_names)) + self.assertFalse(len(failed_specs)) + + def test_ensure_all(self): + failed_names = ensure_all() + self.assertEqual( + failed_names, + { + "indexes": [], + "views": [], + "analyzers": [], + }, + ) + + @mock.patch( + "relation_engine_server.utils.ensure_specs.ensure_indexes", + ensure_borked_indexes, + ) + @mock.patch( + "relation_engine_server.utils.ensure_specs.ensure_views", ensure_borked_views + ) + @mock.patch( + "relation_engine_server.utils.ensure_specs.ensure_analyzers", + ensure_borked_analyzers, + ) + def test_ensure_all__fail__mock_ensure_things(self): + """Mock ensure_things calls so that 1st spec is borked""" + borked_index_names, _ = ensure_borked_indexes() + borked_view_names, _ = ensure_borked_views() + borked_analyzer_names, _ = ensure_borked_analyzers() + failed_names = ensure_all() + + self.assertEqual( + { + "indexes": borked_index_names, + "views": borked_view_names, + "analyzers": borked_analyzer_names, + }, + failed_names, + ) + + @mock.patch( + "relation_engine_server.utils.arango_client.get_all_indexes", lambda: {} + ) + @mock.patch("relation_engine_server.utils.arango_client.get_all_views", lambda: []) + @mock.patch( + "relation_engine_server.utils.arango_client.get_all_analyzers", lambda: [] + ) + def test_ensure_all__fail__mock_arango_client_get_all_things(self): + """Mock more upstream in server spec fetches""" + failed_names = ensure_all() + + self.assertEqual( + { + "indexes": get_names(get_local_coll_indexes()[1], "indexes"), + "views": get_names(get_local_views()[1], "views"), + "analyzers": get_names(get_local_analyzers()[1], "analyzers"), + }, + failed_names, + ) + + # ------------------ + # --- Unit tests --- + # ------------------ + + def test_is_obj_subset_rec(self): + """ + For comparing JSON objects + Roughly check l <= r, with recursive checks done with dicts + """ + exp_pass = [ + ({"hi": 1}, {"hi": 1}), + ({"hi": 1}, {"hi": 1, "hello": 2}), + ({}, {}), + ({}, {"hi": 1}), + ( + {"hi": 1, "hello": {"cat": 3, "sat": 2}}, + { + "hi": 1, + "hello": {"cat": 3, "sat": 2, "hat": 3, "bat": {}, "map": []}, + "hey": 5, + "aloha": [{}], + }, + ), + ] + exp_fail = [ + ({"hi": 1}, {}), + ({"hi": {}}, {}), + ({"hi": 1}, {"hi": {}}), + ({"hi": 1, "hello": 2}, {"hi": 1}), + ( + {"hi": 1, "hello": {"cat": 3, "sat": 2, "hat": 3}}, + {"hi": 1, "hello": {"cat": 3, "sat": 2}}, + ), + ( + {"hi": 1, "hello": {"cat": 3, "sat": {}}}, + {"hi": 1, "hello": {"cat": 3, "sat": 2}}, + ), + ( + {"hi": 1, "hello": {"cat": 3}, "hey": 5}, + {"hi": 1, "hello": {"cat": 3, "sat": 2}}, + ), + ( + { + "hi": 1, + "hello": {"cat": 3, "sat": 2, "hat": 3}, + "hey": 5, + "howdy": 6, + }, + {"hi": 1, "hello": {"cat": 3, "sat": 2}}, + ), + ] + + for loc, srv in exp_pass: + self.assertTrue(is_obj_subset_rec(loc, srv)) + for loc, srv in exp_fail: + self.assertFalse(is_obj_subset_rec(loc, srv)) + + def test_is_obj_subset_rec__Reactions(self): + """ + Test the recursive subset functions using Reactions.json view spec + """ + # Local spec + local = [view for view in get_local_views()[1] if view["name"] == "Reactions"][ + 0 + ] + # Server spec + # From Aardvark, but with "name" key/field added + # as 
seems to happen with GET + server = json.loads( + """ +{ + "name": "Reactions", + "writebufferIdle": 64, + "writebufferActive": 0, + "type": "arangosearch", + "primarySort": [], + "writebufferSizeMax": 33554432, + "commitIntervalMsec": 1000, + "consolidationPolicy": { + "type": "bytes_accum", + "threshold": 0.10000000149011612 + }, + "globallyUniqueId": "h5455DEB9D2A1/9853332", + "cleanupIntervalStep": 10, + "id": "9853332", + "links": { + "rxn_reaction": { + "analyzers": [ + "identity" + ], + "fields": { + "name": { + "analyzers": [ + "text_en" + ] + }, + "aliases": { + "analyzers": [ + "text_en" + ] + }, + "id": { + "analyzers": [ + "text_en" + ] + } + }, + "includeAllFields": true, + "storeValues": "none", + "trackListPositions": false + } + }, + "consolidationIntervalMsec": 60000 +}""" + ) + mod_obj_literal(server, float, round_float) + self.assertTrue(is_obj_subset_rec(local, server)) + + def _copy_mod_obj_literal(self, obj, literal_type, func): + obj = copy.deepcopy(obj) + mod_obj_literal(obj, literal_type, func) + return obj + + def test_mod_obj_literal__round_float(self): + """Test recursively finding floats in obj to correct round off error""" + obj = { + "english": { + "hello": "hello", + "one": 1.00000, + }, + "spanish": { + "hello": "hola", + "one": 1.0000000089, + "_castilian": { + "hello": "hola", + "one": 1, + }, + }, + "japanese": { + "hello": "konichiwa", + "one": 0.999999999999, + }, + } + + exp = { + "english": { + "hello": "hello", + "one": 1.0, + }, + "spanish": { + "hello": "hola", + "one": 1.0, + "_castilian": { + "hello": "hola", + "one": 1, + }, + }, + "japanese": { + "hello": "konichiwa", + "one": 1.0, + }, + } + + self.assertEqual(exp, self._copy_mod_obj_literal(obj, float, round_float)) + + def test_mod_obj_literal__excise_namespace(self): + """Test recursively find namespace::name strings in obj to excise namespace prefix""" + obj = { + "english": { + "hello": "hello", + "thing": "thing", + }, + "spanish": { + "hello": "hola", + "thing": "spanish::cosa", + "_castilian": { + "hello": "hola", + "thing": "spanish_castilian::cosa", + }, + }, + "japanese": { + "hello": "konichiwa", + "thing": "japanese::mono", + }, + } + + exp = { + "english": { + "hello": "hello", + "thing": "thing", + }, + "spanish": { + "hello": "hola", + "thing": "cosa", + "_castilian": { + "hello": "hola", + "thing": "cosa", + }, + }, + "japanese": { + "hello": "konichiwa", + "thing": "mono", + }, + } + + self.assertEqual(exp, self._copy_mod_obj_literal(obj, str, excise_namespace)) + + def test_get_view_analyzer_names(self): + """Test getting names of list of analyzer/view properties""" + views_analyzers = [ + {"name": "thing0", "type": "type0"}, + {"name": "thing1", "type": "type1"}, + ] + + self.assertEqual( + get_names(views_analyzers, "views"), ["thing0/type0", "thing1/type1"] + ) + + def test_get_coll_names(self): + """Test getting names of dict of list of index properties""" + coll_names_2_indexes = { + "coll0": [ + {"type": "type00", "fields": ["fields000", "fields001"]}, + {"type": "type01", "fields": ["fields010"]}, + ], + "coll1": [{"type": "type10", "fields": ["fields100"]}], + } + + self.assertEqual( + get_names(coll_names_2_indexes, "indexes"), + [ + "coll0/type00/['fields000', 'fields001']", + "coll0/type01/['fields010']", + "coll1/type10/['fields100']", + ], + ) diff --git a/spec/test/test_manifest_schema.py b/spec/test/test_manifest_schema.py new file mode 100644 index 00000000..6c00bf2d --- /dev/null +++ b/spec/test/test_manifest_schema.py @@ -0,0 +1,79 @@ +""" +Tests for 
manifest.schema.json + +Ensure that the manifest schema correctly validates data. + +These tests run within the re_api docker image. +""" +import unittest +import os.path as os_path +from relation_engine_server.utils.json_validation import run_validator +from jsonschema.exceptions import ValidationError + +schema_file = os_path.join("/app", "spec", "datasets", "djornl", "manifest.schema.json") +_TEST_DIR = os_path.join("/app", "spec", "test", "djornl") + + +class Test_Manifest_Schema(unittest.TestCase): + def test_load_invalid_manifest(self): + """test an invalid manifest file""" + + invalid_dir = os_path.join(_TEST_DIR, "invalid_manifest") + + error_list = [ + { + # no file list provided + "file": "no_file_list", + "msg": "'file_list' is a required property", + }, + { + # a cluster file entry should have a prefix + "file": "cluster_no_prefix", + "msg": r"{'data_type': 'cluster', 'path': 'I2_named.tsv'} is not valid under any of the given schemas", + }, + { + # each file_list entry has to have a path + "file": "missing_path", + "msg": "'path' is a required property", + }, + { + # if the date is not quoted, pyyaml will turn it into a date object. Doh! + "file": "date_not_in_quotes", + "msg": "datetime.date\(2020, 12, 25\) is not of type 'string'", + }, + { + # file format is invalid + "file": "invalid_format", + "msg": "'txt' is not one of \['tsv', 'csv'\]", + }, + { + # there must be an indicator of file format + "file": "no_file_format", + "msg": r"{'data_type': 'edge', 'date': '2020-12-25', 'path': 'edge_data'}" + + " is not valid under any of the given schemas", + }, + ] + + for entry in error_list: + data_file = os_path.join(invalid_dir, entry["file"] + ".yaml") + print("looking at " + data_file) + + with self.assertRaisesRegex(ValidationError, entry["msg"]): + run_validator( + schema_file=schema_file, data_file=data_file, nicer_errors=True + ) + + def test_load_valid_manifests(self): + + valid_dir = os_path.join(_TEST_DIR, "valid_manifest") + file_list = ["with_descriptions", "no_file_ext", "no_file_format"] + + for file in file_list: + data_file = os_path.join(valid_dir, file + ".yaml") + print("looking at " + data_file) + + self.assertTrue( + run_validator( + schema_file=schema_file, data_file=data_file, nicer_errors=True + ) + ) diff --git a/spec/test/test_validate.py b/spec/test/test_validate.py new file mode 100644 index 00000000..70784406 --- /dev/null +++ b/spec/test/test_validate.py @@ -0,0 +1,216 @@ +""" +Tests for the schema validation functions + +These tests run within the re_api docker image, and require access to the ArangoDB image for validation of AQL strings. 
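+
+Sample schemas used by these tests (both valid and deliberately invalid) live
+under spec/test/sample_schemas/.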
+""" +import unittest +import os.path as os_path + +from spec.test.helpers import capture_stdout +from relation_engine_server.utils.wait_for import wait_for_arangodb +from jsonschema.exceptions import ValidationError +from spec.validate import ( + validate_schema, + validate_collection, + validate_stored_query, + validate_data_source, + validate_view, + validate_all, + validate_all_by_type, +) + +_TEST_DIR = "/app/spec/test/sample_schemas" + + +class TestValidate(unittest.TestCase): + @classmethod + def setUpClass(cls): + wait_for_arangodb() + + def test_validate_schema(self): + """Validate a single file using the generic validate_schema method""" + + err_msg = "No validation schema found for 'made-up_schema'" + with self.assertRaisesRegex(ValueError, err_msg): + validate_schema("/path/to/file", "made-up_schema") + + def test_validate_collection_errors(self): + """Testing collection-specific schema errors""" + + base_dir = os_path.join(_TEST_DIR, "collections") + + error_list = [ + { + "msg": "Name key should match filename: test_nodes vs wrong_name", + "file": "wrong_name.yaml", + "err": ValueError, + }, + { + "msg": "'http://json-schema.org/draft-07/schema#' is not of type 'object'", + "file": "schema_not_object.yaml", + }, + { + "msg": "Additional properties are not allowed \('title' was unexpected\)", + "file": "extra_top_level_entries.yaml", + }, + { + "msg": 'Time-travel edge schemas must require "from" and "to" attributes in ', + "file": "edge_delta_missing_to_from.yaml", + }, + { + "msg": 'Edge schemas must require "_from" and "_to" attributes in ', + "file": "edge_missing_to_from.yaml", + }, + { + "msg": 'Vertex schemas must require the "_key" attribute in ', + "file": "vertex_missing_key.yaml", + }, + { + "msg": 'Time-travel vertex schemas must require the "id" attribute in ', + "file": "vertex_missing_id.yaml", + }, + ] + + for entry in error_list: + err_type = entry["err"] if "err" in entry else ValidationError + # generic method, requires schema type + with self.assertRaisesRegex(err_type, entry["msg"]): + validate_schema(os_path.join(base_dir, entry["file"]), "collection") + # specific method + with self.assertRaisesRegex(err_type, entry["msg"]): + validate_collection(os_path.join(base_dir, entry["file"])) + + # TODO: add an example of a schema that validates but where data['schema'] is + # not a valid json schema. 
+
+    def test_validate_collection(self):
+        """Test that valid collection schemas pass and set the delta flag appropriately"""
+
+        base_dir = os_path.join(_TEST_DIR, "collections")
+
+        # valid schemas -- check delta is set appropriately
+        for coll_type in ["edge", "vertex"]:
+            data = validate_collection(os_path.join(base_dir, "test_" + coll_type + ".yaml"))
+            self.assertEqual(data["delta"], False)
+
+            # delta is true:
+            data = validate_collection(
+                os_path.join(base_dir, "test_delta_" + coll_type + ".yaml")
+            )
+            self.assertEqual(data["delta"], True)
+
+    def test_validate_data_source(self):
+        """Test data source validation, both valid and invalid"""
+
+        base_dir = os_path.join(_TEST_DIR, "data_sources")
+
+        # working example
+        output = validate_data_source(os_path.join(base_dir, "minimal.yaml"))
+        self.assertEqual(
+            output,
+            {
+                "name": "minimal",
+                "category": "network",
+                "title": "Example minimal data source",
+            },
+        )
+
+        error_list = [
+            {
+                "msg": r"Additional properties are not allowed \('type' was unexpected\)",
+                "file": "invalid_additional_property.json",
+            },
+            {
+                "msg": "'this is not a valid URI' is not a 'uri'",
+                "file": "uri_validation.json",
+            },
+        ]
+
+        for entry in error_list:
+            err_type = entry["err"] if "err" in entry else ValidationError
+
+            # generic method
+            with self.assertRaisesRegex(err_type, entry["msg"]):
+                validate_schema(os_path.join(base_dir, entry["file"]), "data_source")
+
+            # same thing as above via specific method
+            with self.assertRaisesRegex(err_type, entry["msg"]):
+                validate_data_source(os_path.join(base_dir, entry["file"]))
+
+    def test_validate_stored_query(self):
+        """Test stored query validation errors"""
+
+        base_dir = os_path.join(_TEST_DIR, "stored_queries")
+
+        err_str = "False is not of type 'object'"
+        with self.assertRaisesRegex(ValidationError, err_str):
+            validate_stored_query(os_path.join(base_dir, "params_not_object.yaml"))
+
+        # total nonsense instead of AQL
+        err_str = "syntax error, unexpected identifier, expecting assignment"
+        with self.assertRaisesRegex(ValueError, err_str):
+            validate_stored_query(os_path.join(base_dir, "invalid_aql.yaml"))
+
+        # invalid bind params
+        err_str = "Bind vars are invalid"
+        with self.assertRaisesRegex(ValueError, err_str):
+            validate_stored_query(os_path.join(base_dir, "invalid_bind_params.yaml"))
+
+    def test_validate_view(self):
+        """Test view validation, both valid and invalid"""
+
+        base_dir = os_path.join(_TEST_DIR, "views")
+        output = {
+            "name": "minimal",
+            "type": "arangosearch",
+        }
+
+        self.assertEqual(
+            validate_schema(os_path.join(base_dir, "minimal.json"), "view"), output
+        )
+
+        self.assertEqual(validate_view(os_path.join(base_dir, "minimal.json")), output)
+
+        err_str = r"'from the shore' is not one of \['arangosearch'\]"
+        with self.assertRaisesRegex(ValidationError, err_str):
+            validate_view(os_path.join(base_dir, "wrong_type.json"))
+
+    def test_validate_all(self):
+        """test all the files in a directory"""
+
+        with self.assertRaisesRegex(
+            ValueError, "No validation schema found for 'muffins'"
+        ):
+            validate_all("muffins")
+
+        def validate_all_duplicate_names(self):
+            with self.assertRaisesRegex(
+                ValidationError, "duplicate_names failed validation"
+            ):
+                validate_all("collection", os_path.join(_TEST_DIR, "duplicate_names"))
+
+        stdout = capture_stdout(validate_all_duplicate_names, self)
+        self.assertRegex(stdout, "Duplicate schemas named 'test_vertex'")
+
+        sample_schemas = {
+            "collection": "collections",
+            "stored_query": "stored_queries",
+            "view": "views",
+            "data_source": "data_sources",
+        }
+
+        for schema_type, directory in sample_schemas.items():
+            # n.b. this assumes all the schemas in /spec are valid!
+            stdout = capture_stdout(validate_all, schema_type)
+            self.assertRegex(stdout, r"\.\.\.all valid")
+
+            with self.assertRaises(Exception):
+                validate_all(schema_type, os_path.join(_TEST_DIR, directory))
+
+    def test_validate_all_by_type(self):
+        """test all files of all types from a root directory"""
+
+        # use value from config
+        n_errors = validate_all_by_type()
+        self.assertEqual(n_errors, 0)
+
+        # known dodgy dir
+        n_errors = validate_all_by_type(_TEST_DIR)
+        self.assertGreater(n_errors, 0)
diff --git a/spec/validate.py b/spec/validate.py
new file mode 100644
index 00000000..1a7064be
--- /dev/null
+++ b/spec/validate.py
@@ -0,0 +1,297 @@
+"""
+Validate everything in this repo: syntax, structure, etc.
+"""
+import sys
+import os
+import glob
+import requests
+import json
+from jsonschema.exceptions import ValidationError
+
+from relation_engine_server.utils.config import get_config
+from relation_engine_server.utils.wait_for import wait_for_arangodb
+from relation_engine_server.utils.json_validation import run_validator
+
+_CONF = get_config()
+_BASE_DIR = "/app/spec"
+
+_VALID_SCHEMA_TYPES = {
+    "data_source": {
+        "file": os.path.join(_BASE_DIR, "data_source_schema.yaml"),
+        "plural": "data_sources",
+    },
+    "stored_query": {
+        "file": os.path.join(_BASE_DIR, "stored_query_schema.yaml"),
+        "plural": "stored_queries",
+    },
+    "collection": {
+        "file": os.path.join(_BASE_DIR, "collection_schema.yaml"),
+        "plural": "collections",
+    },
+    "view": {
+        "file": os.path.join(_BASE_DIR, "view_schema.yaml"),
+        "plural": "views",
+    },
+    "analyzer": {
+        "file": os.path.join(_BASE_DIR, "analyzer_schema.yaml"),
+        "plural": "analyzers",
+    },
+}
+
+
+def get_schema_type_paths(schema_type, directory=None):
+    """Return the sorted paths of all .yaml and .json schema files of the given type"""
+    if schema_type not in _VALID_SCHEMA_TYPES:
+        raise ValueError(f"No validation schema found for '{schema_type}'")
+    if directory is None:
+        type_dir_name = _VALID_SCHEMA_TYPES[schema_type]["plural"]
+        directory = _CONF["spec_paths"][type_dir_name]
+
+    paths = []
+    for path in glob.iglob(os.path.join(directory, "**", "*.*"), recursive=True):
+        if path.endswith(".yaml") or path.endswith(".json"):
+            paths.append(path)
+
+    return sorted(paths)
+
+
+def validate_all(schema_type, directory=None):
+    """
+    Validate the syntax of all schemas of type schema_type in a specified directory
+
+    :param schema_type: (string) the schema type to validate
+    :param directory: (string) the directory to look in.
+        If not specified, the default directory for the schema_type
+        will be used.
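+
+    :raises ValueError: if schema_type is not a recognised schema type
+    :raises ValidationError: if any schema file fails validation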
+ """ + err_files = [] + n_files = 0 + names = set() # type: set + + print(f"Validating {schema_type} schemas in {directory}...") + + for path in get_schema_type_paths(schema_type, directory): + n_files += 1 + try: + data = validate_schema(path, schema_type) + # Check for any duplicate schema names + name = data["name"] + if name in names: + raise ValueError(f"Duplicate queries named '{name}'") + else: + names.add(name) + + except Exception as err: + print(f"✕ {path} failed validation") + print(err) + err_files.append([path, err]) + + if not n_files: + print("No schema files found") + return + + if err_files: + err_file_str = "\n".join([i[0] for i in err_files]) + raise ValidationError( + f"{directory} failed validation\n" f"files with errors:\n" f"{err_file_str}" + ) + + # all's well + print("...all valid.") + return + + +def validate_all_by_type(validation_base_dir=None): + """ + Validate the syntax of all schemas of all types in validation_base_dir + + Assumes that the schemas will be set up in parent directories named with the plural form + of the schema type name, i.e. all collection schemas in the 'collections' dir, all views + in the 'views' dir, etc. + + :param validation_base_dir: (string) the directory to look in. + If not specified, the default directory from the config + will be used + + :return n_errors: (int) the number of errors encountered + + """ + + n_errors = [] + for schema_type in sorted(_VALID_SCHEMA_TYPES.keys()): + try: + if validation_base_dir is None: + validate_all(schema_type) + else: + directory = os.path.join( + validation_base_dir, _VALID_SCHEMA_TYPES[schema_type]["plural"] + ) + validate_all(schema_type, directory) + except Exception as err: + n_errors.append(err) + print("\n") + + if n_errors: + print("Validation failed!\n") + print("\n\n".join([str(n) for n in n_errors])) + else: + print("Validation succeeded!") + + return len(n_errors) + + +def validate_schema(path, schema_type): + """Validate a single file against its schema""" + + if schema_type not in _VALID_SCHEMA_TYPES.keys(): + raise ValueError(f"No validation schema found for '{schema_type}'") + + return globals()["validate_" + schema_type](path) + + +def validate_collection(path): + print(f" validating {path}...") + + # JSON schema for vertex and edge collection schemas found in /schema + collection_schema_file = _VALID_SCHEMA_TYPES["collection"]["file"] + data = run_validator(schema_file=collection_schema_file, data_file=path) + namecheck_schema(path, data) + + # Make sure it can be used as a JSON schema + # If the schema is invalid, a SchemaError will get raised + # Otherwise, the schema will work and a ValidationError will get raised (what we want) + try: + run_validator(data={}, schema=data["schema"]) + except ValidationError: + pass + except Exception as err: + print("=" * 80) + print("Unable to load schema in " + path) + raise err + + required = data["schema"].get("required", []) + + # Edges must require _from and _to while vertices must require _key + has_edge_fields = "_from" in required and "_to" in required + has_delta_edge_fields = "from" in required and "to" in required + + if data["type"] == "edge" and data.get("delta") and not has_delta_edge_fields: + raise ValidationError( + 'Time-travel edge schemas must require "from" and "to" attributes in ' + + path + ) + elif data["type"] == "edge" and not data.get("delta") and not has_edge_fields: + raise ValidationError( + 'Edge schemas must require "_from" and "_to" attributes in ' + path + ) + elif data["type"] == "vertex" and 
data.get("delta") and "id" not in required: + raise ValidationError( + 'Time-travel vertex schemas must require the "id" attribute in ' + path + ) + elif data["type"] == "vertex" and not data.get("delta") and "_key" not in required: + raise ValidationError( + 'Vertex schemas must require the "_key" attribute in ' + path + ) + + print(f"✓ {path} is valid.") + return data + + +def validate_data_source(path): + print(f" validating {path}...") + + # JSON schema for data source files in /data_sources + data_source_schema_file = _VALID_SCHEMA_TYPES["data_source"]["file"] + data = run_validator(schema_file=data_source_schema_file, data_file=path) + namecheck_schema(path, data) + + print(f"✓ {path} is valid.") + return data + + +def validate_stored_query(path): + print(f" validating {path}...") + + stored_queries_schema_file = _VALID_SCHEMA_TYPES["stored_query"]["file"] + data = run_validator(schema_file=stored_queries_schema_file, data_file=path) + namecheck_schema(path, data) + + # Make sure `params` can be used as a JSON schema + if data.get("params"): + # If the schema is invalid, a SchemaError will get raised + # Otherwise, the schema will work and a ValidationError will get raised + try: + run_validator(data={}, schema=data["params"]) + except ValidationError: + pass + + # check that the query is valid AQL + validate_aql_on_arango(data) + + print(f"✓ {path} is valid.") + return data + + +def validate_view(path): + """Validate the structure and syntax of an arangodb view""" + print(f" validating {path}...") + + # JSON schema for /views + view_schema_file = _VALID_SCHEMA_TYPES["view"]["file"] + data = run_validator(data_file=path, schema_file=view_schema_file) + namecheck_schema(path, data) + + print(f"✓ {path} is valid.") + return data + + +def validate_analyzer(path): + """Validate ArangoDB analyzer config""" + print(f" validating {path}...") + + # JSON schema for /analyzers + analyzer_schema_file = _VALID_SCHEMA_TYPES["analyzer"]["file"] + data = run_validator(data_file=path, schema_file=analyzer_schema_file) + namecheck_schema(path, data) + + print(f"✓ {path} is valid.") + return data + + +def namecheck_schema(path, data): + """Ensure that the schema "name" is the same as the file name minus extensions""" + name = data["name"] + filename = os.path.splitext(os.path.basename(path))[0] + if name != filename: + raise ValueError(f"Name key should match filename: {name} vs {filename}") + + +def validate_aql_on_arango(data): + """Validate a string as valid AQL syntax by running it on the ArangoDB""" + query = data.get("query_prefix", "") + " " + data["query"] + url = _CONF["db_url"] + "/_api/query" + auth = (_CONF["db_user"], _CONF["db_pass"]) + + resp = requests.post(url, data=json.dumps({"query": query}), auth=auth) + parsed = resp.json() + if parsed["error"]: + raise ValueError(parsed["errorMessage"]) + query_bind_vars = set(parsed["bindVars"]) + params = set(data.get("params", {}).get("properties", {}).keys()) + if params != query_bind_vars: + raise ValueError( + "Bind vars are invalid.\n" + + f" Extra vars in query: {query_bind_vars - params}.\n" + + f" Extra params in schema: {params - query_bind_vars}" + ) + + +if __name__ == "__main__": + + validation_base_dir = None + if len(sys.argv) > 1: + validation_base_dir = sys.argv[1] + + wait_for_arangodb() + n_errors = validate_all_by_type(validation_base_dir) + exit_code = 0 if not n_errors else 1 + sys.exit(exit_code) diff --git a/spec/view_schema.yaml b/spec/view_schema.yaml new file mode 100644 index 00000000..1b03536f --- /dev/null +++ 
@@ -0,0 +1,12 @@
+name: view_schema
+type: object
+required: ['name', 'type']
+properties:
+  name:
+    type: string
+    title: View name
+    format: regex
+    pattern: ^\w+$
+  type:
+    type: string
+    enum: ['arangosearch']
diff --git a/spec/views/Compounds.json b/spec/views/Compounds.json
new file mode 100644
index 00000000..2fc4e82b
--- /dev/null
+++ b/spec/views/Compounds.json
@@ -0,0 +1,42 @@
+{
+    "name": "Compounds",
+    "type": "arangosearch",
+    "writebufferIdle": 64,
+    "writebufferActive": 0,
+    "primarySort": [],
+    "writebufferSizeMax": 33554432,
+    "commitIntervalMsec": 1000,
+    "consolidationPolicy": {
+        "type": "bytes_accum",
+        "threshold": 0.1
+    },
+    "cleanupIntervalStep": 10,
+    "links": {
+        "rxn_compound": {
+            "analyzers": [
+                "identity"
+            ],
+            "fields": {
+                "id": {
+                    "analyzers": [
+                        "text_en"
+                    ]
+                },
+                "abbreviation": {
+                    "analyzers": [
+                        "text_en"
+                    ]
+                },
+                "aliases": {
+                    "analyzers": [
+                        "text_en"
+                    ]
+                }
+            },
+            "includeAllFields": true,
+            "storeValues": "none",
+            "trackListPositions": false
+        }
+    },
+    "consolidationIntervalMsec": 60000
+}
diff --git a/spec/views/README.md b/spec/views/README.md
new file mode 100644
index 00000000..34b767ac
--- /dev/null
+++ b/spec/views/README.md
@@ -0,0 +1,3 @@
+# Views
+
+These are JSON files for ArangoDB views, which are required to perform searches on vertices and edges. The data in them is used by the [Relation Engine API](https://github.com/kbase/relation_engine) to create views via the `POST /_api/view#arangosearch` endpoint of the ArangoDB HTTP interface. Please [see the ArangoDB docs](https://www.arangodb.com/docs/3.5/http/views-arangosearch.html) for the full set of parameters available.
diff --git a/spec/views/Reactions.json b/spec/views/Reactions.json
new file mode 100644
index 00000000..8bf91caf
--- /dev/null
+++ b/spec/views/Reactions.json
@@ -0,0 +1,37 @@
+{
+    "name": "Reactions",
+    "type": "arangosearch",
+    "writebufferIdle": 64,
+    "writebufferActive": 0,
+    "primarySort": [],
+    "writebufferSizeMax": 33554432,
+    "commitIntervalMsec": 1000,
+    "consolidationPolicy": {
+        "type": "bytes_accum",
+        "threshold": 0.1
+    },
+    "cleanupIntervalStep": 10,
+    "links": {
+        "rxn_reaction": {
+            "analyzers": [
+                "identity"
+            ],
+            "fields": {
+                "name": {
+                    "analyzers": [
+                        "text_en"
+                    ]
+                },
+                "aliases": {
+                    "analyzers": [
+                        "text_en"
+                    ]
+                }
+            },
+            "includeAllFields": true,
+            "storeValues": "none",
+            "trackListPositions": false
+        }
+    },
+    "consolidationIntervalMsec": 60000
+}
diff --git a/spec/views/djornl/djornl_node_view.json b/spec/views/djornl/djornl_node_view.json
new file mode 100644
index 00000000..8dc30052
--- /dev/null
+++ b/spec/views/djornl/djornl_node_view.json
@@ -0,0 +1,76 @@
+{
+    "name": "djornl_node_view",
+    "type": "arangosearch",
+    "primarySort": [],
+    "cleanupIntervalStep": 2,
+    "commitIntervalMsec": 1000,
+    "consolidationPolicy": {
+        "type": "bytes_accum",
+        "threshold": 0.1
+    },
+    "writebufferIdle": 64,
+    "writebufferActive": 0,
+    "consolidationIntervalMsec": 60000,
+    "writebufferSizeMax": 33554432,
+    "links": {
+        "djornl_node": {
+            "analyzers": [
+                "identity"
+            ],
+            "fields": {
+                "transcript": {},
+                "tair_computational_description": {
+                    "analyzers": [
+                        "text_en"
+                    ]
+                },
+                "tair_short_description": {
+                    "analyzers": [
+                        "text_en"
+                    ]
+                },
+                "gene_model_type": {},
+                "go_terms": {},
+                "go_description": {
+                    "analyzers": [
+                        "text_en"
+                    ]
+                },
+                "mapman_name": {
+                    "analyzers": [
+                        "text_en"
+                    ]
+                },
+                "mapman_description": {
+                    "analyzers": [
+                        "text_en"
+                    ]
+                },
+                "pheno_description": {
+                    "analyzers": [
"text_en" + ] + }, + "pheno_pto_name": { + "analyzers": [ + "text_en" + ] + }, + "pheno_pto_description": { + "analyzers": [ + "text_en" + ] + }, + "pheno_reference": {}, + "user_notes": { + "analyzers": [ + "text_en" + ] + } + }, + "includeAllFields": false, + "storeValues": "none", + "trackListPositions": false + } + } +} diff --git a/tox.ini b/tox.ini new file mode 100644 index 00000000..6deafc26 --- /dev/null +++ b/tox.ini @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 120