From 3d1ccc0c7584c8ae69d6bebf9bc3bd45c332e3e0 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Tue, 28 Nov 2023 12:12:36 +0100 Subject: [PATCH 1/3] Remove V1/V2 distinction, separate migration tests --- src/main.py | 8 +- src/routers/mldcat_ap/dataset.py | 2 +- src/routers/{v1 => openml}/__init__.py | 0 src/routers/{v1 => openml}/datasets.py | 219 +++++++++++++----- src/routers/{v1 => openml}/qualities.py | 4 +- src/routers/{v1 => openml}/tasktype.py | 2 +- src/routers/v2/datasets.py | 186 --------------- .../{v1 => openml}/dataset_tag_test.py | 64 +---- .../datasets_list_datasets_test.py | 28 +-- tests/routers/{v2 => openml}/datasets_test.py | 4 +- .../migration/datasets_migration_test.py | 187 +++++++++++++++ .../routers/{v1 => openml}/qualities_test.py | 12 +- .../routers/{v1 => openml}/task_type_test.py | 6 +- tests/routers/v1/datasets_old_test.py | 98 -------- 14 files changed, 389 insertions(+), 431 deletions(-) rename src/routers/{v1 => openml}/__init__.py (100%) rename src/routers/{v1 => openml}/datasets.py (58%) rename src/routers/{v1 => openml}/qualities.py (95%) rename src/routers/{v1 => openml}/tasktype.py (98%) delete mode 100644 src/routers/v2/datasets.py rename tests/routers/{v1 => openml}/dataset_tag_test.py (60%) rename tests/routers/{v1 => openml}/datasets_list_datasets_test.py (94%) rename tests/routers/{v2 => openml}/datasets_test.py (91%) create mode 100644 tests/routers/openml/migration/datasets_migration_test.py rename tests/routers/{v1 => openml}/qualities_test.py (97%) rename tests/routers/{v1 => openml}/task_type_test.py (89%) delete mode 100644 tests/routers/v1/datasets_old_test.py diff --git a/src/main.py b/src/main.py index ffd2f7c6..db548f63 100644 --- a/src/main.py +++ b/src/main.py @@ -3,10 +3,9 @@ import uvicorn from fastapi import FastAPI from routers.mldcat_ap.dataset import router as mldcat_ap_router -from routers.v1.datasets import router as datasets_router_v1_format -from routers.v1.qualities import router as qualities_router -from routers.v1.tasktype import router as ttype_router -from routers.v2.datasets import router as datasets_router +from routers.openml.datasets import router as datasets_router +from routers.openml.qualities import router as qualities_router +from routers.openml.tasktype import router as ttype_router def _parse_args() -> argparse.Namespace: @@ -39,7 +38,6 @@ def create_api() -> FastAPI: app = FastAPI() app.include_router(datasets_router) - app.include_router(datasets_router_v1_format) app.include_router(qualities_router) app.include_router(mldcat_ap_router) app.include_router(ttype_router) diff --git a/src/routers/mldcat_ap/dataset.py b/src/routers/mldcat_ap/dataset.py index 8d9994b7..771edb83 100644 --- a/src/routers/mldcat_ap/dataset.py +++ b/src/routers/mldcat_ap/dataset.py @@ -5,7 +5,7 @@ from sqlalchemy import Connection from routers.dependencies import expdb_connection, userdb_connection -from routers.v2.datasets import get_dataset +from routers.openml.datasets import get_dataset router = APIRouter(prefix="/mldcat_ap/datasets", tags=["datasets"]) diff --git a/src/routers/v1/__init__.py b/src/routers/openml/__init__.py similarity index 100% rename from src/routers/v1/__init__.py rename to src/routers/openml/__init__.py diff --git a/src/routers/v1/datasets.py b/src/routers/openml/datasets.py similarity index 58% rename from src/routers/v1/datasets.py rename to src/routers/openml/datasets.py index 172890c7..65c18a61 100644 --- a/src/routers/v1/datasets.py +++ b/src/routers/openml/datasets.py @@ -2,22 +2,30 @@ We add separate endpoints for old-style JSON responses, so they don't clutter the schema of the new API, and are easily removed later. """ +import html import http.client -from enum import StrEnum +from collections import namedtuple +from enum import IntEnum, StrEnum from typing import Annotated, Any, Literal -from database.datasets import get_tags +from database.datasets import get_dataset as db_get_dataset +from database.datasets import ( + get_file, + get_latest_dataset_description, + get_latest_processing_update, + get_latest_status_update, + get_tags, +) from database.datasets import tag_dataset as db_tag_dataset -from database.users import APIKey, User, UserGroup +from database.users import APIKey, User, UserGroup, get_user_groups_for, get_user_id_for from fastapi import APIRouter, Body, Depends, HTTPException -from schemas.datasets.openml import DatasetStatus +from schemas.datasets.openml import DatasetFileFormat, DatasetMetadata, DatasetStatus, Visibility from sqlalchemy import Connection from routers.dependencies import Pagination, expdb_connection, fetch_user, userdb_connection from routers.types import CasualString128, IntegerRange, SystemString64, integer_range_regex -from routers.v2.datasets import get_dataset -router = APIRouter(prefix="/v1/datasets", tags=["datasets"]) +router = APIRouter(prefix="/datasets", tags=["datasets"]) @router.post( @@ -238,58 +246,161 @@ def quality_clause(quality: str, range_: str | None) -> str: return {"data": {"dataset": list(datasets.values())}} +class DatasetError(IntEnum): + NOT_FOUND = 111 + NO_ACCESS = 112 + NO_DATA_FILE = 113 + + +processing_info = namedtuple("processing_info", ["date", "warning", "error"]) + + +def _get_processing_information(dataset_id: int, connection: Connection) -> processing_info: + """Return processing information, if any. Otherwise, all fields `None`.""" + if not (data_processed := get_latest_processing_update(dataset_id, connection)): + return processing_info(date=None, warning=None, error=None) + + date_processed = data_processed["processing_date"] + warning = data_processed["warning"].strip() if data_processed["warning"] else None + error = data_processed["error"].strip() if data_processed["error"] else None + return processing_info(date=date_processed, warning=warning, error=error) + + +def _format_error(*, code: DatasetError, message: str) -> dict[str, str]: + """Formatter for JSON bodies of OpenML error codes.""" + return {"code": str(code), "message": message} + + +def _user_has_access( + dataset: dict[str, Any], + connection: Connection, + api_key: APIKey | None = None, +) -> bool: + """Determine if user of `api_key` has the right to view `dataset`.""" + if dataset["visibility"] == Visibility.PUBLIC: + return True + if not api_key: + return False + + if not (user_id := get_user_id_for(api_key=api_key, connection=connection)): + return False + + if user_id == dataset["uploader"]: + return True + + user_groups = get_user_groups_for(user_id=user_id, connection=connection) + ADMIN_GROUP = 1 + return ADMIN_GROUP in user_groups + + +def _format_parquet_url(dataset: dict[str, Any]) -> str | None: + if dataset["format"].lower() != DatasetFileFormat.ARFF: + return None + + minio_base_url = "https://openml1.win.tue.nl" + return f"{minio_base_url}/dataset{dataset['did']}/dataset_{dataset['did']}.pq" + + +def _format_dataset_url(dataset: dict[str, Any]) -> str: + base_url = "https://test.openml.org" + filename = f"{html.escape(dataset['name'])}.{dataset['format'].lower()}" + return f"{base_url}/data/v1/download/{dataset['file_id']}/{filename}" + + +def _safe_unquote(text: str | None) -> str | None: + """Remove any open and closing quotes and return the remainder if non-empty.""" + if not text: + return None + return text.strip("'\"") or None + + +def _csv_as_list(text: str | None, *, unquote_items: bool = True) -> list[str]: + """Return comma-separated values in `text` as list, optionally remove quotes.""" + if not text: + return [] + chars_to_strip = "'\"\t " if unquote_items else "\t " + return [item.strip(chars_to_strip) for item in text.split(",")] + + @router.get( path="/{dataset_id}", - description="Get old-style wrapped meta-data for dataset with ID `dataset_id`.", + description="Get meta-data for dataset with ID `dataset_id`.", ) -def get_dataset_wrapped( +def get_dataset( dataset_id: int, api_key: APIKey | None = None, user_db: Annotated[Connection, Depends(userdb_connection)] = None, expdb_db: Annotated[Connection, Depends(expdb_connection)] = None, -) -> dict[str, dict[str, Any]]: - try: - dataset = get_dataset( - user_db=user_db, - expdb_db=expdb_db, - dataset_id=dataset_id, - api_key=api_key, - ).model_dump(by_alias=True) - except HTTPException as e: - raise HTTPException( - status_code=http.client.PRECONDITION_FAILED, - detail=e.detail, - ) from None - if processing_data := dataset.get("processing_date"): - dataset["processing_date"] = str(processing_data).replace("T", " ") - if parquet_url := dataset.get("parquet_url"): - dataset["parquet_url"] = str(parquet_url).replace("https", "http") - if minio_url := dataset.get("minio_url"): - dataset["minio_url"] = str(minio_url).replace("https", "http") - - manual = [] - # ref test.openml.org/d/33 (contributor) and d/34 (creator) - # contributor/creator in database is '""' - # json content is [] - for field in ["contributor", "creator"]: - if dataset[field] == [""]: - dataset[field] = [] - manual.append(field) - - if isinstance(dataset["original_data_url"], list): - dataset["original_data_url"] = ", ".join(str(url) for url in dataset["original_data_url"]) - - for field, value in list(dataset.items()): - if field in manual: - continue - if isinstance(value, int): - dataset[field] = str(value) - elif isinstance(value, list) and len(value) == 1: - dataset[field] = str(value[0]) - if not dataset[field]: - del dataset[field] - - if "description" not in dataset: - dataset["description"] = [] - - return {"data_set_description": dataset} +) -> DatasetMetadata: + if not (dataset := db_get_dataset(dataset_id, expdb_db)): + error = _format_error(code=DatasetError.NOT_FOUND, message="Unknown dataset") + raise HTTPException(status_code=http.client.NOT_FOUND, detail=error) + + if not _user_has_access(dataset=dataset, connection=user_db, api_key=api_key): + error = _format_error(code=DatasetError.NO_ACCESS, message="No access granted") + raise HTTPException(status_code=http.client.FORBIDDEN, detail=error) + + if not (dataset_file := get_file(dataset["file_id"], user_db)): + error = _format_error( + code=DatasetError.NO_DATA_FILE, + message="No data file found", + ) + raise HTTPException(status_code=http.client.PRECONDITION_FAILED, detail=error) + + tags = get_tags(dataset_id, expdb_db) + description = get_latest_dataset_description(dataset_id, expdb_db) + processing_result = _get_processing_information(dataset_id, expdb_db) + status = get_latest_status_update(dataset_id, expdb_db) + + status_ = DatasetStatus(status["status"]) if status else DatasetStatus.IN_PREPARATION + + description_ = "" + if description: + description_ = description["description"].replace("\r", "").strip() + + dataset_url = _format_dataset_url(dataset) + parquet_url = _format_parquet_url(dataset) + + contributors = _csv_as_list(dataset["contributor"], unquote_items=True) + creators = _csv_as_list(dataset["creator"], unquote_items=True) + ignore_attribute = _csv_as_list(dataset["ignore_attribute"], unquote_items=True) + row_id_attribute = _csv_as_list(dataset["row_id_attribute"], unquote_items=True) + original_data_url = _csv_as_list(dataset["original_data_url"], unquote_items=True) + + # Not sure which properties are set by this bit: + # foreach( $this->xml_fields_dataset['csv'] as $field ) { + # $dataset->{$field} = getcsv( $dataset->{$field} ); + # } + + return DatasetMetadata( + id=dataset["did"], + visibility=dataset["visibility"], + status=status_, + name=dataset["name"], + licence=dataset["licence"], + version=dataset["version"], + version_label=dataset["version_label"] or "", + language=dataset["language"] or "", + creator=creators, + contributor=contributors, + citation=dataset["citation"] or "", + upload_date=dataset["upload_date"], + processing_date=processing_result.date, + warning=processing_result.warning, + error=processing_result.error, + description=description_, + description_version=description["version"] if description else 0, + tag=tags, + default_target_attribute=_safe_unquote(dataset["default_target_attribute"]), + ignore_attribute=ignore_attribute, + row_id_attribute=row_id_attribute, + url=dataset_url, + parquet_url=parquet_url, + minio_url=parquet_url, + file_id=dataset["file_id"], + format=dataset["format"].lower(), + paper_url=dataset["paper_url"] or None, + original_data_url=original_data_url, + collection_date=dataset["collection_date"], + md5_checksum=dataset_file["md5_hash"], + ) diff --git a/src/routers/v1/qualities.py b/src/routers/openml/qualities.py similarity index 95% rename from src/routers/v1/qualities.py rename to src/routers/openml/qualities.py index 366e208d..569f5d03 100644 --- a/src/routers/v1/qualities.py +++ b/src/routers/openml/qualities.py @@ -8,9 +8,9 @@ from sqlalchemy import Connection, text from routers.dependencies import expdb_connection, fetch_user -from routers.v2.datasets import DatasetError +from routers.openml.datasets import DatasetError -router = APIRouter(prefix="/v1/datasets", tags=["datasets"]) +router = APIRouter(prefix="/datasets", tags=["datasets"]) @router.get("/qualities/list") diff --git a/src/routers/v1/tasktype.py b/src/routers/openml/tasktype.py similarity index 98% rename from src/routers/v1/tasktype.py rename to src/routers/openml/tasktype.py index 49930843..909cea5f 100644 --- a/src/routers/v1/tasktype.py +++ b/src/routers/openml/tasktype.py @@ -9,7 +9,7 @@ from routers.dependencies import expdb_connection -router = APIRouter(prefix="/v1/tasktype", tags=["tasks"]) +router = APIRouter(prefix="/tasktype", tags=["tasks"]) def _normalize_task_type(task_type: dict[str, str | int]) -> dict[str, str | list[Any]]: diff --git a/src/routers/v2/datasets.py b/src/routers/v2/datasets.py deleted file mode 100644 index 864a9eef..00000000 --- a/src/routers/v2/datasets.py +++ /dev/null @@ -1,186 +0,0 @@ -import html -import http.client -from collections import namedtuple -from enum import IntEnum -from typing import Annotated, Any - -from database.datasets import get_dataset as db_get_dataset -from database.datasets import ( - get_file, - get_latest_dataset_description, - get_latest_processing_update, - get_latest_status_update, - get_tags, -) -from database.users import APIKey, get_user_groups_for, get_user_id_for -from fastapi import APIRouter, Depends, HTTPException -from routers.dependencies import expdb_connection, userdb_connection -from schemas.datasets.openml import ( - DatasetFileFormat, - DatasetMetadata, - DatasetStatus, - Visibility, -) -from sqlalchemy import Connection - -router = APIRouter(prefix="/v2/datasets", tags=["datasets"]) - - -class DatasetError(IntEnum): - NOT_FOUND = 111 - NO_ACCESS = 112 - NO_DATA_FILE = 113 - - -processing_info = namedtuple("processing_info", ["date", "warning", "error"]) - - -def _get_processing_information(dataset_id: int, connection: Connection) -> processing_info: - """Return processing information, if any. Otherwise, all fields `None`.""" - if not (data_processed := get_latest_processing_update(dataset_id, connection)): - return processing_info(date=None, warning=None, error=None) - - date_processed = data_processed["processing_date"] - warning = data_processed["warning"].strip() if data_processed["warning"] else None - error = data_processed["error"].strip() if data_processed["error"] else None - return processing_info(date=date_processed, warning=warning, error=error) - - -def _format_error(*, code: DatasetError, message: str) -> dict[str, str]: - """Formatter for JSON bodies of OpenML error codes.""" - return {"code": str(code), "message": message} - - -def _user_has_access( - dataset: dict[str, Any], - connection: Connection, - api_key: APIKey | None = None, -) -> bool: - """Determine if user of `api_key` has the right to view `dataset`.""" - if dataset["visibility"] == Visibility.PUBLIC: - return True - if not api_key: - return False - - if not (user_id := get_user_id_for(api_key=api_key, connection=connection)): - return False - - if user_id == dataset["uploader"]: - return True - - user_groups = get_user_groups_for(user_id=user_id, connection=connection) - ADMIN_GROUP = 1 - return ADMIN_GROUP in user_groups - - -def _format_parquet_url(dataset: dict[str, Any]) -> str | None: - if dataset["format"].lower() != DatasetFileFormat.ARFF: - return None - - minio_base_url = "https://openml1.win.tue.nl" - return f"{minio_base_url}/dataset{dataset['did']}/dataset_{dataset['did']}.pq" - - -def _format_dataset_url(dataset: dict[str, Any]) -> str: - base_url = "https://test.openml.org" - filename = f"{html.escape(dataset['name'])}.{dataset['format'].lower()}" - return f"{base_url}/data/v1/download/{dataset['file_id']}/{filename}" - - -def _safe_unquote(text: str | None) -> str | None: - """Remove any open and closing quotes and return the remainder if non-empty.""" - if not text: - return None - return text.strip("'\"") or None - - -def _csv_as_list(text: str | None, *, unquote_items: bool = True) -> list[str]: - """Return comma-separated values in `text` as list, optionally remove quotes.""" - if not text: - return [] - chars_to_strip = "'\"\t " if unquote_items else "\t " - return [item.strip(chars_to_strip) for item in text.split(",")] - - -@router.get( - path="/{dataset_id}", - description="Get meta-data for dataset with ID `dataset_id`.", -) -def get_dataset( - dataset_id: int, - api_key: APIKey | None = None, - user_db: Annotated[Connection, Depends(userdb_connection)] = None, - expdb_db: Annotated[Connection, Depends(expdb_connection)] = None, -) -> DatasetMetadata: - if not (dataset := db_get_dataset(dataset_id, expdb_db)): - error = _format_error(code=DatasetError.NOT_FOUND, message="Unknown dataset") - raise HTTPException(status_code=http.client.NOT_FOUND, detail=error) - - if not _user_has_access(dataset=dataset, connection=user_db, api_key=api_key): - error = _format_error(code=DatasetError.NO_ACCESS, message="No access granted") - raise HTTPException(status_code=http.client.FORBIDDEN, detail=error) - - if not (dataset_file := get_file(dataset["file_id"], user_db)): - error = _format_error( - code=DatasetError.NO_DATA_FILE, - message="No data file found", - ) - raise HTTPException(status_code=http.client.PRECONDITION_FAILED, detail=error) - - tags = get_tags(dataset_id, expdb_db) - description = get_latest_dataset_description(dataset_id, expdb_db) - processing_result = _get_processing_information(dataset_id, expdb_db) - status = get_latest_status_update(dataset_id, expdb_db) - - status_ = DatasetStatus(status["status"]) if status else DatasetStatus.IN_PREPARATION - - description_ = "" - if description: - description_ = description["description"].replace("\r", "").strip() - - dataset_url = _format_dataset_url(dataset) - parquet_url = _format_parquet_url(dataset) - - contributors = _csv_as_list(dataset["contributor"], unquote_items=True) - creators = _csv_as_list(dataset["creator"], unquote_items=True) - ignore_attribute = _csv_as_list(dataset["ignore_attribute"], unquote_items=True) - row_id_attribute = _csv_as_list(dataset["row_id_attribute"], unquote_items=True) - original_data_url = _csv_as_list(dataset["original_data_url"], unquote_items=True) - - # Not sure which properties are set by this bit: - # foreach( $this->xml_fields_dataset['csv'] as $field ) { - # $dataset->{$field} = getcsv( $dataset->{$field} ); - # } - - return DatasetMetadata( - id=dataset["did"], - visibility=dataset["visibility"], - status=status_, - name=dataset["name"], - licence=dataset["licence"], - version=dataset["version"], - version_label=dataset["version_label"] or "", - language=dataset["language"] or "", - creator=creators, - contributor=contributors, - citation=dataset["citation"] or "", - upload_date=dataset["upload_date"], - processing_date=processing_result.date, - warning=processing_result.warning, - error=processing_result.error, - description=description_, - description_version=description["version"] if description else 0, - tag=tags, - default_target_attribute=_safe_unquote(dataset["default_target_attribute"]), - ignore_attribute=ignore_attribute, - row_id_attribute=row_id_attribute, - url=dataset_url, - parquet_url=parquet_url, - minio_url=parquet_url, - file_id=dataset["file_id"], - format=dataset["format"].lower(), - paper_url=dataset["paper_url"] or None, - original_data_url=original_data_url, - collection_date=dataset["collection_date"], - md5_checksum=dataset_file["md5_hash"], - ) diff --git a/tests/routers/v1/dataset_tag_test.py b/tests/routers/openml/dataset_tag_test.py similarity index 60% rename from tests/routers/v1/dataset_tag_test.py rename to tests/routers/openml/dataset_tag_test.py index b3d0e250..f868f2cf 100644 --- a/tests/routers/v1/dataset_tag_test.py +++ b/tests/routers/openml/dataset_tag_test.py @@ -21,7 +21,7 @@ def test_dataset_tag_rejects_unauthorized(key: ApiKey, api_client: TestClient) - response = cast( httpx.Response, api_client.post( - f"/v1/datasets/tag{apikey}", + f"/datasets/tag{apikey}", json={"data_id": constants.PRIVATE_DATASET_ID, "tag": "test"}, ), ) @@ -39,7 +39,7 @@ def test_dataset_tag(key: ApiKey, expdb_test: Connection, api_client: TestClient response = cast( httpx.Response, api_client.post( - f"/v1/datasets/tag?api_key={key}", + f"/datasets/tag?api_key={key}", json={"data_id": dataset_id, "tag": tag}, ), ) @@ -55,7 +55,7 @@ def test_dataset_tag_returns_existing_tags(api_client: TestClient) -> None: response = cast( httpx.Response, api_client.post( - f"/v1/datasets/tag?api_key={ApiKey.ADMIN}", + f"/datasets/tag?api_key={ApiKey.ADMIN}", json={"data_id": dataset_id, "tag": tag}, ), ) @@ -68,7 +68,7 @@ def test_dataset_tag_fails_if_tag_exists(api_client: TestClient) -> None: response = cast( httpx.Response, api_client.post( - f"/v1/datasets/tag?api_key={ApiKey.ADMIN}", + f"/datasets/tag?api_key={ApiKey.ADMIN}", json={"data_id": dataset_id, "tag": tag}, ), ) @@ -95,64 +95,10 @@ def test_dataset_tag_invalid_tag_is_rejected( new = cast( httpx.Response, api_client.post( - f"/v1/datasets/tag?api_key{ApiKey.ADMIN}", + f"/datasets/tag?api_key{ApiKey.ADMIN}", json={"data_id": 1, "tag": tag}, ), ) assert new.status_code == http.client.UNPROCESSABLE_ENTITY assert ["body", "tag"] == new.json()["detail"][0]["loc"] - - -@pytest.mark.php() -@pytest.mark.parametrize( - "dataset_id", - list(range(1, 10)) + [101], -) -@pytest.mark.parametrize( - "api_key", - [ApiKey.ADMIN, ApiKey.REGULAR_USER, ApiKey.OWNER_USER], - ids=["Administrator", "regular user", "possible owner"], -) -@pytest.mark.parametrize( - "tag", - ["study_14", "totally_new_tag_for_migration_testing"], - ids=["typically existing tag", "new tag"], -) -def test_dataset_tag_response_is_identical( - dataset_id: int, - tag: str, - api_key: str, - api_client: TestClient, -) -> None: - original = httpx.post( - "http://server-api-php-api-1:80/api/v1/json/data/tag", - data={"api_key": api_key, "tag": tag, "data_id": dataset_id}, - ) - if ( - original.status_code == http.client.PRECONDITION_FAILED - and original.json()["error"]["message"] == "An Elastic Search Exception occured." - ): - pytest.skip("Encountered Elastic Search error.") - if original.status_code == http.client.OK: - # undo the tag, because we don't want to persist this change to the database - httpx.post( - "http://server-api-php-api-1:80/api/v1/json/data/untag", - data={"api_key": api_key, "tag": tag, "data_id": dataset_id}, - ) - new = cast( - httpx.Response, - api_client.post( - f"/v1/datasets/tag?api_key={api_key}", - json={"data_id": dataset_id, "tag": tag}, - ), - ) - - assert original.status_code == new.status_code, original.json() - if new.status_code != http.client.OK: - assert original.json()["error"] == new.json()["detail"] - return - - original = original.json() - new = new.json() - assert original == new diff --git a/tests/routers/v1/datasets_list_datasets_test.py b/tests/routers/openml/datasets_list_datasets_test.py similarity index 94% rename from tests/routers/v1/datasets_list_datasets_test.py rename to tests/routers/openml/datasets_list_datasets_test.py index 0de8e83b..e341735d 100644 --- a/tests/routers/v1/datasets_list_datasets_test.py +++ b/tests/routers/openml/datasets_list_datasets_test.py @@ -20,7 +20,7 @@ def _assert_empty_result( def test_list(api_client: TestClient) -> None: - response = api_client.get("/v1/datasets/list/") + response = api_client.get("/datasets/list/") assert response.status_code == http.client.OK assert "data" in response.json() assert "dataset" in response.json()["data"] @@ -40,7 +40,7 @@ def test_list(api_client: TestClient) -> None: ) def test_list_filter_active(status: str, amount: int, api_client: TestClient) -> None: response = api_client.post( - "/v1/datasets/list", + "/datasets/list", json={"status": status, "pagination": {"limit": constants.NUMBER_OF_DATASETS}}, ) assert response.status_code == http.client.OK, response.json() @@ -60,7 +60,7 @@ def test_list_filter_active(status: str, amount: int, api_client: TestClient) -> def test_list_accounts_privacy(api_key: ApiKey | None, amount: int, api_client: TestClient) -> None: key = f"?api_key={api_key}" if api_key else "" response = api_client.post( - f"/v1/datasets/list{key}", + f"/datasets/list{key}", json={"status": "all", "pagination": {"limit": 1000}}, ) assert response.status_code == http.client.OK, response.json() @@ -75,7 +75,7 @@ def test_list_accounts_privacy(api_key: ApiKey | None, amount: int, api_client: def test_list_data_name_present(name: str, count: int, api_client: TestClient) -> None: # The second iris dataset is private, so we need to authenticate. response = api_client.post( - f"/v1/datasets/list?api_key={ApiKey.ADMIN}", + f"/datasets/list?api_key={ApiKey.ADMIN}", json={"status": "all", "data_name": name}, ) assert response.status_code == http.client.OK @@ -90,7 +90,7 @@ def test_list_data_name_present(name: str, count: int, api_client: TestClient) - ) def test_list_data_name_absent(name: str, api_client: TestClient) -> None: response = api_client.post( - f"/v1/datasets/list?api_key={ApiKey.ADMIN}", + f"/datasets/list?api_key={ApiKey.ADMIN}", json={"status": "all", "data_name": name}, ) _assert_empty_result(response) @@ -116,7 +116,7 @@ def test_list_pagination(limit: int | None, offset: int | None, api_client: Test offset_body = {} if offset is None else {"offset": offset} limit_body = {} if limit is None else {"limit": limit} filters = {"status": "all", "pagination": offset_body | limit_body} - response = api_client.post("/v1/datasets/list", json=filters) + response = api_client.post("/datasets/list", json=filters) if offset in [130, 200]: _assert_empty_result(response) @@ -133,7 +133,7 @@ def test_list_pagination(limit: int | None, offset: int | None, api_client: Test ) def test_list_data_version(version: int, count: int, api_client: TestClient) -> None: response = api_client.post( - f"/v1/datasets/list?api_key={ApiKey.ADMIN}", + f"/datasets/list?api_key={ApiKey.ADMIN}", json={"status": "all", "data_version": version}, ) assert response.status_code == http.client.OK @@ -144,7 +144,7 @@ def test_list_data_version(version: int, count: int, api_client: TestClient) -> def test_list_data_version_no_result(api_client: TestClient) -> None: response = api_client.post( - f"/v1/datasets/list?api_key={ApiKey.ADMIN}", + f"/datasets/list?api_key={ApiKey.ADMIN}", json={"status": "all", "data_version": 4}, ) _assert_empty_result(response) @@ -160,7 +160,7 @@ def test_list_data_version_no_result(api_client: TestClient) -> None: ) def test_list_uploader(user_id: int, count: int, key: str, api_client: TestClient) -> None: response = api_client.post( - f"/v1/datasets/list?api_key={key}", + f"/datasets/list?api_key={key}", json={"status": "all", "uploader": user_id}, ) # The dataset of user 16 is private, so can not be retrieved by other users. @@ -179,7 +179,7 @@ def test_list_uploader(user_id: int, count: int, key: str, api_client: TestClien ) def test_list_data_id(data_id: list[int], api_client: TestClient) -> None: response = api_client.post( - "/v1/datasets/list", + "/datasets/list", json={"status": "all", "data_id": data_id}, ) @@ -195,7 +195,7 @@ def test_list_data_id(data_id: list[int], api_client: TestClient) -> None: ) def test_list_data_tag(tag: str, count: int, api_client: TestClient) -> None: response = api_client.post( - "/v1/datasets/list", + "/datasets/list", # study_14 has 100 datasets, we overwrite the default `limit` because otherwise # we don't know if the results are limited by filtering on the tag. json={"status": "all", "tag": tag, "pagination": {"limit": 101}}, @@ -207,7 +207,7 @@ def test_list_data_tag(tag: str, count: int, api_client: TestClient) -> None: def test_list_data_tag_empty(api_client: TestClient) -> None: response = api_client.post( - "/v1/datasets/list", + "/datasets/list", json={"status": "all", "tag": "not-a-tag"}, ) _assert_empty_result(response) @@ -228,7 +228,7 @@ def test_list_data_tag_empty(api_client: TestClient) -> None: ) def test_list_data_quality(quality: str, range_: str, count: int, api_client: TestClient) -> None: response = api_client.post( - "/v1/datasets/list", + "/datasets/list", json={"status": "all", quality: range_}, ) assert response.status_code == http.client.OK, response.json() @@ -276,7 +276,7 @@ def test_list_data_identical( new_style["pagination"]["offset"] = offset response = api_client.post( - f"/v1/datasets/list{api_key_query}", + f"/datasets/list{api_key_query}", json=new_style, ) diff --git a/tests/routers/v2/datasets_test.py b/tests/routers/openml/datasets_test.py similarity index 91% rename from tests/routers/v2/datasets_test.py rename to tests/routers/openml/datasets_test.py index 3b538290..1aff3dc2 100644 --- a/tests/routers/v2/datasets_test.py +++ b/tests/routers/openml/datasets_test.py @@ -19,7 +19,7 @@ def test_error_unknown_dataset( response_code: int, api_client: TestClient, ) -> None: - response = cast(httpx.Response, api_client.get(f"v2/datasets/{dataset_id}")) + response = cast(httpx.Response, api_client.get(f"/datasets/{dataset_id}")) assert response.status_code == response_code assert {"code": "111", "message": "Unknown dataset"} == response.json()["detail"] @@ -38,7 +38,7 @@ def test_private_dataset_no_user_no_access( response_code: int, ) -> None: query = f"?api_key={api_key}" if api_key else "" - response = cast(httpx.Response, api_client.get(f"v2/datasets/130{query}")) + response = cast(httpx.Response, api_client.get(f"/datasets/130{query}")) assert response.status_code == response_code assert {"code": "112", "message": "No access granted"} == response.json()["detail"] diff --git a/tests/routers/openml/migration/datasets_migration_test.py b/tests/routers/openml/migration/datasets_migration_test.py new file mode 100644 index 00000000..0266e5be --- /dev/null +++ b/tests/routers/openml/migration/datasets_migration_test.py @@ -0,0 +1,187 @@ +import http.client +import json +from typing import Any, cast + +import httpx +import pytest +from starlette.testclient import TestClient + +from tests.conftest import ApiKey + + +@pytest.mark.php() +@pytest.mark.parametrize( + "dataset_id", + range(1, 132), +) +def test_dataset_response_is_identical(dataset_id: int, api_client: TestClient) -> None: + original = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/{dataset_id}") + new = api_client.get(f"/datasets/{dataset_id}") + + if new.status_code == http.client.FORBIDDEN: + assert original.status_code == http.client.PRECONDITION_FAILED + else: + assert original.status_code == new.status_code + + if new.status_code != http.client.OK: + assert original.json()["error"] == new.json()["detail"] + return + + try: + original = original.json()["data_set_description"] + except json.decoder.JSONDecodeError: + pytest.skip("A PHP error occurred on the test server.") + + if "div" in original: + pytest.skip("A PHP error occurred on the test server.") + + # There are a few changes between the old API and the new API, so we convert here: + # The new API has normalized `format` field: + original["format"] = original["format"].lower() + + # There is odd behavior in the live server that I don't want to recreate: + # when the creator is a list of csv names, it can either be a str or a list + # depending on whether the names are quoted. E.g.: + # '"Alice", "Bob"' -> ["Alice", "Bob"] + # 'Alice, Bob' -> 'Alice, Bob' + if ( + "creator" in original + and isinstance(original["creator"], str) + and len(original["creator"].split(",")) > 1 + ): + original["creator"] = [name.strip() for name in original["creator"].split(",")] + + new_body = new.json() + if processing_data := new_body.get("processing_date"): + new_body["processing_date"] = str(processing_data).replace("T", " ") + if parquet_url := new_body.get("parquet_url"): + new_body["parquet_url"] = str(parquet_url).replace("https", "http") + if minio_url := new_body.get("minio_url"): + new_body["minio_url"] = str(minio_url).replace("https", "http") + + manual = [] + # ref test.openml.org/d/33 (contributor) and d/34 (creator) + # contributor/creator in database is '""' + # json content is [] + for field in ["contributor", "creator"]: + if new_body[field] == [""]: + new_body[field] = [] + manual.append(field) + + if isinstance(new_body["original_data_url"], list): + new_body["original_data_url"] = ", ".join(str(url) for url in new_body["original_data_url"]) + + for field, value in list(new_body.items()): + if field in manual: + continue + if isinstance(value, int): + new_body[field] = str(value) + elif isinstance(value, list) and len(value) == 1: + new_body[field] = str(value[0]) + if not new_body[field]: + del new_body[field] + + if "description" not in new_body: + new_body["description"] = [] + assert original == new_body + + +@pytest.mark.parametrize( + "dataset_id", + [-1, 138, 100_000], +) +def test_error_unknown_dataset( + dataset_id: int, + api_client: TestClient, +) -> None: + response = cast(httpx.Response, api_client.get(f"/datasets/{dataset_id}")) + + # The new API has "404 Not Found" instead of "412 PRECONDITION_FAILED" + assert response.status_code == http.client.NOT_FOUND + assert {"code": "111", "message": "Unknown dataset"} == response.json()["detail"] + + +@pytest.mark.parametrize( + "api_key", + [None, "a" * 32], +) +def test_private_dataset_no_user_no_access( + api_client: TestClient, + api_key: str | None, +) -> None: + query = f"?api_key={api_key}" if api_key else "" + response = cast(httpx.Response, api_client.get(f"/datasets/130{query}")) + + # New response is 403: Forbidden instead of 412: PRECONDITION FAILED + assert response.status_code == http.client.FORBIDDEN + assert {"code": "112", "message": "No access granted"} == response.json()["detail"] + + +@pytest.mark.skip("Not sure how to include apikey in test yet.") +def test_private_dataset_owner_access( + api_client: TestClient, + dataset_130: dict[str, Any], +) -> None: + response = cast(httpx.Response, api_client.get("/datasets/130?api_key=...")) + assert response.status_code == http.client.OK + assert dataset_130 == response.json() + + +@pytest.mark.skip("Not sure how to include apikey in test yet.") +def test_private_dataset_admin_access(api_client: TestClient) -> None: + cast(httpx.Response, api_client.get("/datasets/130?api_key=...")) + # test against cached response + + +@pytest.mark.php() +@pytest.mark.parametrize( + "dataset_id", + list(range(1, 10)) + [101], +) +@pytest.mark.parametrize( + "api_key", + [ApiKey.ADMIN, ApiKey.REGULAR_USER, ApiKey.OWNER_USER], + ids=["Administrator", "regular user", "possible owner"], +) +@pytest.mark.parametrize( + "tag", + ["study_14", "totally_new_tag_for_migration_testing"], + ids=["typically existing tag", "new tag"], +) +def test_dataset_tag_response_is_identical( + dataset_id: int, + tag: str, + api_key: str, + api_client: TestClient, +) -> None: + original = httpx.post( + "http://server-api-php-api-1:80/api/v1/json/data/tag", + data={"api_key": api_key, "tag": tag, "data_id": dataset_id}, + ) + if ( + original.status_code == http.client.PRECONDITION_FAILED + and original.json()["error"]["message"] == "An Elastic Search Exception occured." + ): + pytest.skip("Encountered Elastic Search error.") + if original.status_code == http.client.OK: + # undo the tag, because we don't want to persist this change to the database + httpx.post( + "http://server-api-php-api-1:80/api/v1/json/data/untag", + data={"api_key": api_key, "tag": tag, "data_id": dataset_id}, + ) + new = cast( + httpx.Response, + api_client.post( + f"/datasets/tag?api_key={api_key}", + json={"data_id": dataset_id, "tag": tag}, + ), + ) + + assert original.status_code == new.status_code, original.json() + if new.status_code != http.client.OK: + assert original.json()["error"] == new.json()["detail"] + return + + original = original.json() + new = new.json() + assert original == new diff --git a/tests/routers/v1/qualities_test.py b/tests/routers/openml/qualities_test.py similarity index 97% rename from tests/routers/v1/qualities_test.py rename to tests/routers/openml/qualities_test.py index d2de98b6..eb30b442 100644 --- a/tests/routers/v1/qualities_test.py +++ b/tests/routers/openml/qualities_test.py @@ -30,14 +30,14 @@ def _remove_quality_from_database(quality_name: str, expdb_test: Connection) -> @pytest.mark.php() def test_list_qualities_identical(api_client: TestClient) -> None: original = httpx.get("http://server-api-php-api-1:80/api/v1/json/data/qualities/list") - new = api_client.get("/v1/datasets/qualities/list") + new = api_client.get("/datasets/qualities/list") assert original.status_code == new.status_code assert original.json() == new.json() # To keep the test idempotent, we cannot test if reaction to database changes is identical def test_list_qualities(api_client: TestClient, expdb_test: Connection) -> None: - response = api_client.get("/v1/datasets/qualities/list") + response = api_client.get("/datasets/qualities/list") assert response.status_code == http.client.OK expected = { "data_qualities_list": { @@ -157,13 +157,13 @@ def test_list_qualities(api_client: TestClient, expdb_test: Connection) -> None: deleted = expected["data_qualities_list"]["quality"].pop() _remove_quality_from_database(quality_name=deleted, expdb_test=expdb_test) - response = api_client.get("/v1/datasets/qualities/list") + response = api_client.get("/datasets/qualities/list") assert response.status_code == http.client.OK assert expected == response.json() def test_get_quality(api_client: TestClient) -> None: - response = api_client.get("/v1/datasets/qualities/1") + response = api_client.get("/datasets/qualities/1") assert response.status_code == http.client.OK expected = [ {"name": "AutoCorrelation", "value": 0.6064659977703456}, @@ -284,7 +284,7 @@ def test_get_quality(api_client: TestClient) -> None: ) def test_get_quality_identical(data_id: int, api_client: TestClient) -> None: php_response = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/qualities/{data_id}") - python_response = api_client.get(f"/v1/datasets/qualities/{data_id}") + python_response = api_client.get(f"/datasets/qualities/{data_id}") assert python_response.status_code == php_response.status_code expected = [ @@ -308,7 +308,7 @@ def test_get_quality_identical_error(data_id: int, api_client: TestClient) -> No if data_id in [116]: pytest.skip("Detailed error for code 362 (no qualities) not yet supported.") php_response = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/qualities/{data_id}") - python_response = api_client.get(f"/v1/datasets/qualities/{data_id}") + python_response = api_client.get(f"/datasets/qualities/{data_id}") assert python_response.status_code == php_response.status_code # The "dataset unknown" error currently has a separate code in PHP depending on # where it occurs (e.g., get dataset->113 get quality->361) diff --git a/tests/routers/v1/task_type_test.py b/tests/routers/openml/task_type_test.py similarity index 89% rename from tests/routers/v1/task_type_test.py rename to tests/routers/openml/task_type_test.py index bfc2381b..2d91cab8 100644 --- a/tests/routers/v1/task_type_test.py +++ b/tests/routers/openml/task_type_test.py @@ -8,7 +8,7 @@ @pytest.mark.php() def test_list_task_type(api_client: TestClient) -> None: - response = api_client.get("/v1/tasktype/list") + response = api_client.get("/tasktype/list") original = httpx.get("http://server-api-php-api-1:80/api/v1/json/tasktype/list") assert response.status_code == original.status_code assert response.json() == original.json() @@ -20,7 +20,7 @@ def test_list_task_type(api_client: TestClient) -> None: list(range(1, 12)), ) def test_get_task_type(ttype_id: int, api_client: TestClient) -> None: - response = api_client.get(f"/v1/tasktype/{ttype_id}") + response = api_client.get(f"/tasktype/{ttype_id}") original = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/tasktype/{ttype_id}") assert response.status_code == original.status_code @@ -37,6 +37,6 @@ def test_get_task_type(ttype_id: int, api_client: TestClient) -> None: def test_get_task_type_unknown(api_client: TestClient) -> None: - response = api_client.get("/v1/tasktype/1000") + response = api_client.get("/tasktype/1000") assert response.status_code == http.client.PRECONDITION_FAILED assert response.json() == {"detail": {"code": "241", "message": "Unknown task type."}} diff --git a/tests/routers/v1/datasets_old_test.py b/tests/routers/v1/datasets_old_test.py deleted file mode 100644 index dff98368..00000000 --- a/tests/routers/v1/datasets_old_test.py +++ /dev/null @@ -1,98 +0,0 @@ -import http.client -import json.decoder -from typing import Any, cast - -import httpx -import pytest -from starlette.testclient import TestClient - - -@pytest.mark.php() -@pytest.mark.parametrize( - "dataset_id", - range(1, 132), -) -def test_dataset_response_is_identical(dataset_id: int, api_client: TestClient) -> None: - original = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/{dataset_id}") - new = cast(httpx.Response, api_client.get(f"/v1/datasets/{dataset_id}")) - assert original.status_code == new.status_code - assert new.json() - - if new.status_code != http.client.OK: - assert original.json()["error"] == new.json()["detail"] - return - - assert "data_set_description" in new.json() - - try: - original = original.json()["data_set_description"] - except json.decoder.JSONDecodeError: - pytest.skip("A PHP error occurred on the test server.") - - new = new.json()["data_set_description"] - - if "div" in original: - pytest.skip("A PHP error occurred on the test server.") - - # The new API has normalized `format` field: - original["format"] = original["format"].lower() - - # There is odd behavior in the live server that I don't want to recreate: - # when the creator is a list of csv names, it can either be a str or a list - # depending on whether the names are quoted. E.g.: - # '"Alice", "Bob"' -> ["Alice", "Bob"] - # 'Alice, Bob' -> 'Alice, Bob' - if ( - "creator" in original - and isinstance(original["creator"], str) - and len(original["creator"].split(",")) > 1 - ): - original["creator"] = [name.strip() for name in original["creator"].split(",")] - - # The remainder of the fields should be identical: - assert original == new - - -@pytest.mark.parametrize( - "dataset_id", - [-1, 138, 100_000], -) -def test_error_unknown_dataset( - dataset_id: int, - api_client: TestClient, -) -> None: - response = cast(httpx.Response, api_client.get(f"v1/datasets/{dataset_id}")) - - assert response.status_code == http.client.PRECONDITION_FAILED - assert {"code": "111", "message": "Unknown dataset"} == response.json()["detail"] - - -@pytest.mark.parametrize( - "api_key", - [None, "a" * 32], -) -def test_private_dataset_no_user_no_access( - api_client: TestClient, - api_key: str | None, -) -> None: - query = f"?api_key={api_key}" if api_key else "" - response = cast(httpx.Response, api_client.get(f"v1/datasets/130{query}")) - - assert response.status_code == http.client.PRECONDITION_FAILED - assert {"code": "112", "message": "No access granted"} == response.json()["detail"] - - -@pytest.mark.skip("Not sure how to include apikey in test yet.") -def test_private_dataset_owner_access( - api_client: TestClient, - dataset_130: dict[str, Any], -) -> None: - response = cast(httpx.Response, api_client.get("/v1/datasets/130?api_key=...")) - assert response.status_code == http.client.OK - assert dataset_130 == response.json() - - -@pytest.mark.skip("Not sure how to include apikey in test yet.") -def test_private_dataset_admin_access(api_client: TestClient) -> None: - cast(httpx.Response, api_client.get("/v1/datasets/130?api_key=...")) - # test against cached response From 33b6e42354845b1490815e9d4043da8d67ffec2c Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Tue, 28 Nov 2023 15:56:35 +0100 Subject: [PATCH 2/3] Use a httpx.client interface for calling the php server This puts the configuration in one place (the fixture) and makes it easier to add new calls / understand where the calls go. --- tests/conftest.py | 9 ++- tests/routers/openml/dataset_tag_test.py | 20 +++--- .../openml/datasets_list_datasets_test.py | 61 ++++++++++--------- tests/routers/openml/datasets_test.py | 16 ++--- .../migration/datasets_migration_test.py | 39 ++++++------ tests/routers/openml/qualities_test.py | 32 +++++----- tests/routers/openml/task_type_test.py | 16 ++--- 7 files changed, 105 insertions(+), 88 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 287b947a..6ccad271 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import Any, Generator +import httpx import pytest from database.setup import expdb_database, user_database from fastapi.testclient import TestClient @@ -40,7 +41,13 @@ def user_test() -> Connection: @pytest.fixture() -def api_client(expdb_test: Connection, user_test: Connection) -> TestClient: +def php_api() -> httpx.Client: + with httpx.Client(base_url="http://server-api-php-api-1:80/api/v1/json") as client: + yield client + + +@pytest.fixture() +def py_api(expdb_test: Connection, user_test: Connection) -> TestClient: app = create_api() # We use the lambda definitions because fixtures may not be called directly. app.dependency_overrides[expdb_connection] = lambda: expdb_test diff --git a/tests/routers/openml/dataset_tag_test.py b/tests/routers/openml/dataset_tag_test.py index f868f2cf..f315ffbe 100644 --- a/tests/routers/openml/dataset_tag_test.py +++ b/tests/routers/openml/dataset_tag_test.py @@ -16,11 +16,11 @@ [None, ApiKey.INVALID], ids=["no authentication", "invalid key"], ) -def test_dataset_tag_rejects_unauthorized(key: ApiKey, api_client: TestClient) -> None: +def test_dataset_tag_rejects_unauthorized(key: ApiKey, py_api: TestClient) -> None: apikey = "" if key is None else f"?api_key={key}" response = cast( httpx.Response, - api_client.post( + py_api.post( f"/datasets/tag{apikey}", json={"data_id": constants.PRIVATE_DATASET_ID, "tag": "test"}, ), @@ -34,11 +34,11 @@ def test_dataset_tag_rejects_unauthorized(key: ApiKey, api_client: TestClient) - [ApiKey.ADMIN, ApiKey.REGULAR_USER, ApiKey.OWNER_USER], ids=["administrator", "non-owner", "owner"], ) -def test_dataset_tag(key: ApiKey, expdb_test: Connection, api_client: TestClient) -> None: +def test_dataset_tag(key: ApiKey, expdb_test: Connection, py_api: TestClient) -> None: dataset_id, tag = constants.PRIVATE_DATASET_ID, "test" response = cast( httpx.Response, - api_client.post( + py_api.post( f"/datasets/tag?api_key={key}", json={"data_id": dataset_id, "tag": tag}, ), @@ -50,11 +50,11 @@ def test_dataset_tag(key: ApiKey, expdb_test: Connection, api_client: TestClient assert tag in tags -def test_dataset_tag_returns_existing_tags(api_client: TestClient) -> None: +def test_dataset_tag_returns_existing_tags(py_api: TestClient) -> None: dataset_id, tag = 1, "test" response = cast( httpx.Response, - api_client.post( + py_api.post( f"/datasets/tag?api_key={ApiKey.ADMIN}", json={"data_id": dataset_id, "tag": tag}, ), @@ -63,11 +63,11 @@ def test_dataset_tag_returns_existing_tags(api_client: TestClient) -> None: assert {"data_tag": {"id": str(dataset_id), "tag": ["study_14", tag]}} == response.json() -def test_dataset_tag_fails_if_tag_exists(api_client: TestClient) -> None: +def test_dataset_tag_fails_if_tag_exists(py_api: TestClient) -> None: dataset_id, tag = 1, "study_14" # Dataset 1 already is tagged with 'study_14' response = cast( httpx.Response, - api_client.post( + py_api.post( f"/datasets/tag?api_key={ApiKey.ADMIN}", json={"data_id": dataset_id, "tag": tag}, ), @@ -90,11 +90,11 @@ def test_dataset_tag_fails_if_tag_exists(api_client: TestClient) -> None: ) def test_dataset_tag_invalid_tag_is_rejected( tag: str, - api_client: TestClient, + py_api: TestClient, ) -> None: new = cast( httpx.Response, - api_client.post( + py_api.post( f"/datasets/tag?api_key{ApiKey.ADMIN}", json={"data_id": 1, "tag": tag}, ), diff --git a/tests/routers/openml/datasets_list_datasets_test.py b/tests/routers/openml/datasets_list_datasets_test.py index e341735d..92f70557 100644 --- a/tests/routers/openml/datasets_list_datasets_test.py +++ b/tests/routers/openml/datasets_list_datasets_test.py @@ -19,8 +19,8 @@ def _assert_empty_result( assert response.json()["detail"] == {"code": "372", "message": "No results"} -def test_list(api_client: TestClient) -> None: - response = api_client.get("/datasets/list/") +def test_list(py_api: TestClient) -> None: + response = py_api.get("/datasets/list/") assert response.status_code == http.client.OK assert "data" in response.json() assert "dataset" in response.json()["data"] @@ -38,8 +38,8 @@ def test_list(api_client: TestClient) -> None: ("all", constants.NUMBER_OF_DATASETS - constants.NUMBER_OF_PRIVATE_DATASETS), ], ) -def test_list_filter_active(status: str, amount: int, api_client: TestClient) -> None: - response = api_client.post( +def test_list_filter_active(status: str, amount: int, py_api: TestClient) -> None: + response = py_api.post( "/datasets/list", json={"status": status, "pagination": {"limit": constants.NUMBER_OF_DATASETS}}, ) @@ -57,9 +57,9 @@ def test_list_filter_active(status: str, amount: int, api_client: TestClient) -> (None, constants.NUMBER_OF_DATASETS - constants.NUMBER_OF_PRIVATE_DATASETS), ], ) -def test_list_accounts_privacy(api_key: ApiKey | None, amount: int, api_client: TestClient) -> None: +def test_list_accounts_privacy(api_key: ApiKey | None, amount: int, py_api: TestClient) -> None: key = f"?api_key={api_key}" if api_key else "" - response = api_client.post( + response = py_api.post( f"/datasets/list{key}", json={"status": "all", "pagination": {"limit": 1000}}, ) @@ -72,9 +72,9 @@ def test_list_accounts_privacy(api_key: ApiKey | None, amount: int, api_client: ("name", "count"), [("abalone", 1), ("iris", 2)], ) -def test_list_data_name_present(name: str, count: int, api_client: TestClient) -> None: +def test_list_data_name_present(name: str, count: int, py_api: TestClient) -> None: # The second iris dataset is private, so we need to authenticate. - response = api_client.post( + response = py_api.post( f"/datasets/list?api_key={ApiKey.ADMIN}", json={"status": "all", "data_name": name}, ) @@ -88,8 +88,8 @@ def test_list_data_name_present(name: str, count: int, api_client: TestClient) - "name", ["ir", "long_name_without_overlap"], ) -def test_list_data_name_absent(name: str, api_client: TestClient) -> None: - response = api_client.post( +def test_list_data_name_absent(name: str, py_api: TestClient) -> None: + response = py_api.post( f"/datasets/list?api_key={ApiKey.ADMIN}", json={"status": "all", "data_name": name}, ) @@ -102,7 +102,7 @@ def test_list_quality_filers() -> None: @pytest.mark.parametrize("limit", [None, 5, 10, 200]) @pytest.mark.parametrize("offset", [None, 0, 5, 129, 130, 200]) -def test_list_pagination(limit: int | None, offset: int | None, api_client: TestClient) -> None: +def test_list_pagination(limit: int | None, offset: int | None, py_api: TestClient) -> None: all_ids = [ did for did in range(1, 1 + constants.NUMBER_OF_DATASETS) @@ -116,7 +116,7 @@ def test_list_pagination(limit: int | None, offset: int | None, api_client: Test offset_body = {} if offset is None else {"offset": offset} limit_body = {} if limit is None else {"limit": limit} filters = {"status": "all", "pagination": offset_body | limit_body} - response = api_client.post("/datasets/list", json=filters) + response = py_api.post("/datasets/list", json=filters) if offset in [130, 200]: _assert_empty_result(response) @@ -131,8 +131,8 @@ def test_list_pagination(limit: int | None, offset: int | None, api_client: Test ("version", "count"), [(1, 100), (2, 6), (5, 1)], ) -def test_list_data_version(version: int, count: int, api_client: TestClient) -> None: - response = api_client.post( +def test_list_data_version(version: int, count: int, py_api: TestClient) -> None: + response = py_api.post( f"/datasets/list?api_key={ApiKey.ADMIN}", json={"status": "all", "data_version": version}, ) @@ -142,8 +142,8 @@ def test_list_data_version(version: int, count: int, api_client: TestClient) -> assert {dataset["version"] for dataset in datasets} == {version} -def test_list_data_version_no_result(api_client: TestClient) -> None: - response = api_client.post( +def test_list_data_version_no_result(py_api: TestClient) -> None: + response = py_api.post( f"/datasets/list?api_key={ApiKey.ADMIN}", json={"status": "all", "data_version": 4}, ) @@ -158,8 +158,8 @@ def test_list_data_version_no_result(api_client: TestClient) -> None: ("user_id", "count"), [(1, 59), (2, 34), (16, 1)], ) -def test_list_uploader(user_id: int, count: int, key: str, api_client: TestClient) -> None: - response = api_client.post( +def test_list_uploader(user_id: int, count: int, key: str, py_api: TestClient) -> None: + response = py_api.post( f"/datasets/list?api_key={key}", json={"status": "all", "uploader": user_id}, ) @@ -177,8 +177,8 @@ def test_list_uploader(user_id: int, count: int, key: str, api_client: TestClien "data_id", [[1], [1, 2, 3], [1, 2, 3, 3000], [1, 2, 3, 130]], ) -def test_list_data_id(data_id: list[int], api_client: TestClient) -> None: - response = api_client.post( +def test_list_data_id(data_id: list[int], py_api: TestClient) -> None: + response = py_api.post( "/datasets/list", json={"status": "all", "data_id": data_id}, ) @@ -193,8 +193,8 @@ def test_list_data_id(data_id: list[int], api_client: TestClient) -> None: ("tag", "count"), [("study_14", 100), ("study_15", 1)], ) -def test_list_data_tag(tag: str, count: int, api_client: TestClient) -> None: - response = api_client.post( +def test_list_data_tag(tag: str, count: int, py_api: TestClient) -> None: + response = py_api.post( "/datasets/list", # study_14 has 100 datasets, we overwrite the default `limit` because otherwise # we don't know if the results are limited by filtering on the tag. @@ -205,8 +205,8 @@ def test_list_data_tag(tag: str, count: int, api_client: TestClient) -> None: assert len(datasets) == count -def test_list_data_tag_empty(api_client: TestClient) -> None: - response = api_client.post( +def test_list_data_tag_empty(py_api: TestClient) -> None: + response = py_api.post( "/datasets/list", json={"status": "all", "tag": "not-a-tag"}, ) @@ -226,8 +226,8 @@ def test_list_data_tag_empty(api_client: TestClient) -> None: ("number_missing_values", "2..100000", 22), ], ) -def test_list_data_quality(quality: str, range_: str, count: int, api_client: TestClient) -> None: - response = api_client.post( +def test_list_data_quality(quality: str, range_: str, count: int, py_api: TestClient) -> None: + response = py_api.post( "/datasets/list", json={"status": "all", quality: range_}, ) @@ -258,7 +258,8 @@ def test_list_data_quality(quality: str, range_: str, count: int, api_client: Te api_key=st.sampled_from([None, ApiKey.REGULAR_USER, ApiKey.OWNER_USER]), ) # type: ignore[misc] # https://github.com/openml/server-api/issues/108 def test_list_data_identical( - api_client: TestClient, + py_api: TestClient, + php_api: httpx.Client, **kwargs: dict[str, Any], ) -> Any: limit, offset = kwargs["limit"], kwargs["offset"] @@ -275,7 +276,7 @@ def test_list_data_identical( if offset is not None: new_style["pagination"]["offset"] = offset - response = api_client.post( + response = py_api.post( f"/datasets/list{api_key_query}", json=new_style, ) @@ -286,11 +287,11 @@ def test_list_data_identical( for filter_, value in kwargs.items() if value is not None ] - uri = "http://server-api-php-api-1:80/api/v1/json/data/list" + uri = "/data/list" if query: uri += f"/{'/'.join([str(v) for q in query for v in q])}" uri += api_key_query - original = httpx.get(uri) + original = php_api.get(uri) assert original.status_code == response.status_code, response.json() if original.status_code == http.client.PRECONDITION_FAILED: diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py index 1aff3dc2..98a11941 100644 --- a/tests/routers/openml/datasets_test.py +++ b/tests/routers/openml/datasets_test.py @@ -17,9 +17,9 @@ def test_error_unknown_dataset( dataset_id: int, response_code: int, - api_client: TestClient, + py_api: TestClient, ) -> None: - response = cast(httpx.Response, api_client.get(f"/datasets/{dataset_id}")) + response = cast(httpx.Response, py_api.get(f"/datasets/{dataset_id}")) assert response.status_code == response_code assert {"code": "111", "message": "Unknown dataset"} == response.json()["detail"] @@ -33,12 +33,12 @@ def test_error_unknown_dataset( ], ) def test_private_dataset_no_user_no_access( - api_client: TestClient, + py_api: TestClient, api_key: str | None, response_code: int, ) -> None: query = f"?api_key={api_key}" if api_key else "" - response = cast(httpx.Response, api_client.get(f"/datasets/130{query}")) + response = cast(httpx.Response, py_api.get(f"/datasets/130{query}")) assert response.status_code == response_code assert {"code": "112", "message": "No access granted"} == response.json()["detail"] @@ -46,15 +46,15 @@ def test_private_dataset_no_user_no_access( @pytest.mark.skip("Not sure how to include apikey in test yet.") def test_private_dataset_owner_access( - api_client: TestClient, + py_api: TestClient, dataset_130: dict[str, Any], ) -> None: - response = cast(httpx.Response, api_client.get("/v2/datasets/130?api_key=...")) + response = cast(httpx.Response, py_api.get("/v2/datasets/130?api_key=...")) assert response.status_code == http.client.OK assert dataset_130 == response.json() @pytest.mark.skip("Not sure how to include apikey in test yet.") -def test_private_dataset_admin_access(api_client: TestClient) -> None: - cast(httpx.Response, api_client.get("/v2/datasets/130?api_key=...")) +def test_private_dataset_admin_access(py_api: TestClient) -> None: + cast(httpx.Response, py_api.get("/v2/datasets/130?api_key=...")) # test against cached response diff --git a/tests/routers/openml/migration/datasets_migration_test.py b/tests/routers/openml/migration/datasets_migration_test.py index 0266e5be..6d93827c 100644 --- a/tests/routers/openml/migration/datasets_migration_test.py +++ b/tests/routers/openml/migration/datasets_migration_test.py @@ -14,9 +14,13 @@ "dataset_id", range(1, 132), ) -def test_dataset_response_is_identical(dataset_id: int, api_client: TestClient) -> None: - original = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/{dataset_id}") - new = api_client.get(f"/datasets/{dataset_id}") +def test_dataset_response_is_identical( + dataset_id: int, + py_api: TestClient, + php_api: httpx.Client, +) -> None: + original = php_api.get(f"/data/{dataset_id}") + new = py_api.get(f"/datasets/{dataset_id}") if new.status_code == http.client.FORBIDDEN: assert original.status_code == http.client.PRECONDITION_FAILED @@ -92,9 +96,9 @@ def test_dataset_response_is_identical(dataset_id: int, api_client: TestClient) ) def test_error_unknown_dataset( dataset_id: int, - api_client: TestClient, + py_api: TestClient, ) -> None: - response = cast(httpx.Response, api_client.get(f"/datasets/{dataset_id}")) + response = cast(httpx.Response, py_api.get(f"/datasets/{dataset_id}")) # The new API has "404 Not Found" instead of "412 PRECONDITION_FAILED" assert response.status_code == http.client.NOT_FOUND @@ -106,11 +110,11 @@ def test_error_unknown_dataset( [None, "a" * 32], ) def test_private_dataset_no_user_no_access( - api_client: TestClient, + py_api: TestClient, api_key: str | None, ) -> None: query = f"?api_key={api_key}" if api_key else "" - response = cast(httpx.Response, api_client.get(f"/datasets/130{query}")) + response = cast(httpx.Response, py_api.get(f"/datasets/130{query}")) # New response is 403: Forbidden instead of 412: PRECONDITION FAILED assert response.status_code == http.client.FORBIDDEN @@ -119,17 +123,17 @@ def test_private_dataset_no_user_no_access( @pytest.mark.skip("Not sure how to include apikey in test yet.") def test_private_dataset_owner_access( - api_client: TestClient, + py_api: TestClient, dataset_130: dict[str, Any], ) -> None: - response = cast(httpx.Response, api_client.get("/datasets/130?api_key=...")) + response = cast(httpx.Response, py_api.get("/datasets/130?api_key=...")) assert response.status_code == http.client.OK assert dataset_130 == response.json() @pytest.mark.skip("Not sure how to include apikey in test yet.") -def test_private_dataset_admin_access(api_client: TestClient) -> None: - cast(httpx.Response, api_client.get("/datasets/130?api_key=...")) +def test_private_dataset_admin_access(py_api: TestClient) -> None: + cast(httpx.Response, py_api.get("/datasets/130?api_key=...")) # test against cached response @@ -152,10 +156,11 @@ def test_dataset_tag_response_is_identical( dataset_id: int, tag: str, api_key: str, - api_client: TestClient, + py_api: TestClient, + php_api: httpx.Client, ) -> None: - original = httpx.post( - "http://server-api-php-api-1:80/api/v1/json/data/tag", + original = php_api.post( + "/data/tag", data={"api_key": api_key, "tag": tag, "data_id": dataset_id}, ) if ( @@ -165,13 +170,13 @@ def test_dataset_tag_response_is_identical( pytest.skip("Encountered Elastic Search error.") if original.status_code == http.client.OK: # undo the tag, because we don't want to persist this change to the database - httpx.post( - "http://server-api-php-api-1:80/api/v1/json/data/untag", + php_api.post( + "/data/untag", data={"api_key": api_key, "tag": tag, "data_id": dataset_id}, ) new = cast( httpx.Response, - api_client.post( + py_api.post( f"/datasets/tag?api_key={api_key}", json={"data_id": dataset_id, "tag": tag}, ), diff --git a/tests/routers/openml/qualities_test.py b/tests/routers/openml/qualities_test.py index eb30b442..851f7791 100644 --- a/tests/routers/openml/qualities_test.py +++ b/tests/routers/openml/qualities_test.py @@ -28,16 +28,16 @@ def _remove_quality_from_database(quality_name: str, expdb_test: Connection) -> @pytest.mark.php() -def test_list_qualities_identical(api_client: TestClient) -> None: - original = httpx.get("http://server-api-php-api-1:80/api/v1/json/data/qualities/list") - new = api_client.get("/datasets/qualities/list") +def test_list_qualities_identical(py_api: TestClient, php_api: httpx.Client) -> None: + original = php_api.get("/data/qualities/list") + new = py_api.get("/datasets/qualities/list") assert original.status_code == new.status_code assert original.json() == new.json() # To keep the test idempotent, we cannot test if reaction to database changes is identical -def test_list_qualities(api_client: TestClient, expdb_test: Connection) -> None: - response = api_client.get("/datasets/qualities/list") +def test_list_qualities(py_api: TestClient, expdb_test: Connection) -> None: + response = py_api.get("/datasets/qualities/list") assert response.status_code == http.client.OK expected = { "data_qualities_list": { @@ -157,13 +157,13 @@ def test_list_qualities(api_client: TestClient, expdb_test: Connection) -> None: deleted = expected["data_qualities_list"]["quality"].pop() _remove_quality_from_database(quality_name=deleted, expdb_test=expdb_test) - response = api_client.get("/datasets/qualities/list") + response = py_api.get("/datasets/qualities/list") assert response.status_code == http.client.OK assert expected == response.json() -def test_get_quality(api_client: TestClient) -> None: - response = api_client.get("/datasets/qualities/1") +def test_get_quality(py_api: TestClient) -> None: + response = py_api.get("/datasets/qualities/1") assert response.status_code == http.client.OK expected = [ {"name": "AutoCorrelation", "value": 0.6064659977703456}, @@ -282,9 +282,9 @@ def test_get_quality(api_client: TestClient) -> None: "data_id", list(set(range(1, 132)) - {55, 56, 59, 116, 130}), ) -def test_get_quality_identical(data_id: int, api_client: TestClient) -> None: - php_response = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/qualities/{data_id}") - python_response = api_client.get(f"/datasets/qualities/{data_id}") +def test_get_quality_identical(data_id: int, py_api: TestClient, php_api: httpx.Client) -> None: + php_response = php_api.get(f"/data/qualities/{data_id}") + python_response = py_api.get(f"/datasets/qualities/{data_id}") assert python_response.status_code == php_response.status_code expected = [ @@ -302,13 +302,17 @@ def test_get_quality_identical(data_id: int, api_client: TestClient) -> None: "data_id", [55, 56, 59, 116, 130, 132], ) -def test_get_quality_identical_error(data_id: int, api_client: TestClient) -> None: +def test_get_quality_identical_error( + data_id: int, + py_api: TestClient, + php_api: httpx.Client, +) -> None: if data_id in [55, 56, 59]: pytest.skip("Detailed error for code 364 (failed processing) not yet supported.") if data_id in [116]: pytest.skip("Detailed error for code 362 (no qualities) not yet supported.") - php_response = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/qualities/{data_id}") - python_response = api_client.get(f"/datasets/qualities/{data_id}") + php_response = php_api.get(f"/data/qualities/{data_id}") + python_response = py_api.get(f"/datasets/qualities/{data_id}") assert python_response.status_code == php_response.status_code # The "dataset unknown" error currently has a separate code in PHP depending on # where it occurs (e.g., get dataset->113 get quality->361) diff --git a/tests/routers/openml/task_type_test.py b/tests/routers/openml/task_type_test.py index 2d91cab8..c5182716 100644 --- a/tests/routers/openml/task_type_test.py +++ b/tests/routers/openml/task_type_test.py @@ -7,9 +7,9 @@ @pytest.mark.php() -def test_list_task_type(api_client: TestClient) -> None: - response = api_client.get("/tasktype/list") - original = httpx.get("http://server-api-php-api-1:80/api/v1/json/tasktype/list") +def test_list_task_type(py_api: TestClient, php_api: httpx.Client) -> None: + response = py_api.get("/tasktype/list") + original = php_api.get("/tasktype/list") assert response.status_code == original.status_code assert response.json() == original.json() @@ -19,9 +19,9 @@ def test_list_task_type(api_client: TestClient) -> None: "ttype_id", list(range(1, 12)), ) -def test_get_task_type(ttype_id: int, api_client: TestClient) -> None: - response = api_client.get(f"/tasktype/{ttype_id}") - original = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/tasktype/{ttype_id}") +def test_get_task_type(ttype_id: int, py_api: TestClient, php_api: httpx.Client) -> None: + response = py_api.get(f"/tasktype/{ttype_id}") + original = php_api.get(f"/tasktype/{ttype_id}") assert response.status_code == original.status_code py_json = response.json() @@ -36,7 +36,7 @@ def test_get_task_type(ttype_id: int, api_client: TestClient) -> None: assert not differences -def test_get_task_type_unknown(api_client: TestClient) -> None: - response = api_client.get("/tasktype/1000") +def test_get_task_type_unknown(py_api: TestClient) -> None: + response = py_api.get("/tasktype/1000") assert response.status_code == http.client.PRECONDITION_FAILED assert response.json() == {"detail": {"code": "241", "message": "Unknown task type."}} From 714eaf25d90c8de1435123db7605867f31c319b0 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Tue, 28 Nov 2023 16:21:36 +0100 Subject: [PATCH 3/3] Update for removal of distinct V1 and V2 endpoints --- docs/migration.md | 90 ++++++++++++++++------------------------------- 1 file changed, 30 insertions(+), 60 deletions(-) diff --git a/docs/migration.md b/docs/migration.md index e3e9c38d..dc49e33c 100644 --- a/docs/migration.md +++ b/docs/migration.md @@ -1,31 +1,22 @@ # Migration -This implementation currently maintains two separate endpoints. -There are "old" endpoints (Python-V1), which mimic responses of the PHP REST API (PHP-V1) -as closely as possible, and new endpoints (V2) which have some additional changes and -will be updated going forward. - -The advised way to upgrade connector packages that currently interact with the old -JSON API is to first migrate from the old PHP API to our re-implemented V1 API. -See "[V1: PHP to Python](#v1--php-to-python)" for the differences between the PHP and -Python API. After that migration, continue with the "[V1 to V2](#v1-to-v2)" guide. - -Connectors currently using the XML API are recommended to upgrade to V2 directly, -in which case using the generated REST API documentation is recommended. - -## V1: PHP to Python - -The first iteration of the new server has nearly identical responses to the old JSON -endpoints, but there are exceptions. Most exceptions either bug fixes, or arise from -technical limitations. This list covers the most important changes, but there may -be some undocumented changes for edge cases. The PHP API was underspecified, and we -decided that reverse engineering the specifications which mostly arise from -implementation details was not worth the effort. If there is a behavioral change which -was not documented but affects you, please [open a bug report](https://github.com/openml/server-api/issues/new?assignees=&labels=bug%2C+triage&projects=&template=bug-report.md&title=). - -### All Endpoints +The Python reimplementation provides the same endpoints as the old API, which are +largely functioning the same way. However, there are a few major deviations: + + * Use of typed JSON: e.g., when a value represents an integer, it is returned as integer. + * Lists when multiple values are possible: if a field can have none, one, or multiple entries (e.g., authors), we always return a list. + * Restriction or expansion of input types as appropriate. + * Standardizing authentication and access messages, and consistently execute those checks + before fetching data or providing error messages about the data. + +The list above is not exhaustive. Minor changes include, for example, bug fixes and the removal of unnecessary nesting. +There may be undocumented changes, especially in edge cases which may not have occurred in the test environment. +As the PHP API was underspecified, the re-implementation is based on a mix of reading old code and probing the API. +If there is a behavioral change which was not documented but affects you, please [open a bug report](https://github.com/openml/server-api/issues/new?assignees=&labels=bug%2C+triage&projects=&template=bug-report.md&title=). + +## All Endpoints The following changes affect all endpoints. -#### Error on Invalid Input +### Error on Invalid Input When providing input of invalid types (e.g., a non-integer dataset id) the HTTP header and JSON content will be different. @@ -45,7 +36,7 @@ and JSON content will be different. These endpoints now do enforce stricter input constraints. Constraints for each endpoint parameter are documented in the API docs. -#### Other Errors +### Other Errors For any other error messages, the response is identical except that outer field will be `"detail"` instead of `"error"`: ```diff title="JSON Content" @@ -53,9 +44,7 @@ For any other error messages, the response is identical except that outer field + {"detail":{"code":"112","message":"No access granted"}} ``` -In some cases the JSON endpoints previously returned XML ([example](https://github.com/openml/OpenML/issues/1200)). -Python-V1 will always return JSON. - +In some cases the JSON endpoints previously returned XML ([example](https://github.com/openml/OpenML/issues/1200)), the new API always returns JSON. ```diff title="XML replaced by JSON" - @@ -65,20 +54,26 @@ Python-V1 will always return JSON. + {"detail": {"code":"103", "message": "Authentication failed" } } ``` -### Datasets +## Datasets -#### `GET /{dataset_id}` +### `GET /{dataset_id}` - Dataset format names are normalized to be all lower-case (`"Sparse_ARFF"` -> `"sparse_arff"`). - - Non-`arff` datasets will not incorrectly have a `"parquet_url"`: - https://github.com/openml/OpenML/issues/1189 + - Non-`arff` datasets will not incorrectly have a `"parquet_url"` ([openml#1189](https://github.com/openml/OpenML/issues/1189)). - If `"creator"` contains multiple comma-separated creators it is always returned as a list, instead of it depending on the quotation used by the original uploader. - For (some?) datasets that have multiple values in `"ignore_attribute"`, this field is correctly populated instead of omitted. + - Processing date is formatted with a `T` in the middle: + ```diff title="processing_date" + - "2019-07-09 15:22:03" + + "2019-07-09T15:22:03" + ``` + - Fields which may contain lists of values (e.g., `creator`, `contributor`) now always + returns a list (which may also be empty or contain a single element). + - Fields without a set value are no longer automatically removed from the response. - -#### `GET /data/list/{filters}` +### `GET /data/list/{filters}` The endpoint now accepts the filters in the body of the request, instead of as query parameters. ```diff @@ -95,28 +90,3 @@ includes datasets which are private. The `limit` and `offset` parameters can now be used independently, you no longer need to provide both if you wish to set only one. - -## V1 to V2 -Most of the changes are focused on standardizing responses, working on: - - * using JSON types. - * removing levels of nesting for endpoints which return single-field JSON. - * always returning lists for fields which may contain multiple values even if it - contains only one element or no element. - * restricting or expanding input types as appropriate. - * standardizing authentication and access messages, and consistently execute those checks - before fetching data or providing error messages about the data. - - -### Datasets - -#### `GET /{dataset_id}` - - - Processing date is formatted with a `T` in the middle: - ```diff title="processing_date" - - "2019-07-09 15:22:03" - + "2019-07-09T15:22:03" - ``` - - Fields which may contain lists of values (e.g., `creator`, `contributor`) now always - returns a list (which may also be empty or contain a single element). - - Fields without a set value are no longer automatically removed from the response.