From 3d1ccc0c7584c8ae69d6bebf9bc3bd45c332e3e0 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 28 Nov 2023 12:12:36 +0100
Subject: [PATCH 1/3] Remove V1/V2 distinction, separate migration tests
---
src/main.py | 8 +-
src/routers/mldcat_ap/dataset.py | 2 +-
src/routers/{v1 => openml}/__init__.py | 0
src/routers/{v1 => openml}/datasets.py | 219 +++++++++++++-----
src/routers/{v1 => openml}/qualities.py | 4 +-
src/routers/{v1 => openml}/tasktype.py | 2 +-
src/routers/v2/datasets.py | 186 ---------------
.../{v1 => openml}/dataset_tag_test.py | 64 +----
.../datasets_list_datasets_test.py | 28 +--
tests/routers/{v2 => openml}/datasets_test.py | 4 +-
.../migration/datasets_migration_test.py | 187 +++++++++++++++
.../routers/{v1 => openml}/qualities_test.py | 12 +-
.../routers/{v1 => openml}/task_type_test.py | 6 +-
tests/routers/v1/datasets_old_test.py | 98 --------
14 files changed, 389 insertions(+), 431 deletions(-)
rename src/routers/{v1 => openml}/__init__.py (100%)
rename src/routers/{v1 => openml}/datasets.py (58%)
rename src/routers/{v1 => openml}/qualities.py (95%)
rename src/routers/{v1 => openml}/tasktype.py (98%)
delete mode 100644 src/routers/v2/datasets.py
rename tests/routers/{v1 => openml}/dataset_tag_test.py (60%)
rename tests/routers/{v1 => openml}/datasets_list_datasets_test.py (94%)
rename tests/routers/{v2 => openml}/datasets_test.py (91%)
create mode 100644 tests/routers/openml/migration/datasets_migration_test.py
rename tests/routers/{v1 => openml}/qualities_test.py (97%)
rename tests/routers/{v1 => openml}/task_type_test.py (89%)
delete mode 100644 tests/routers/v1/datasets_old_test.py
diff --git a/src/main.py b/src/main.py
index ffd2f7c6..db548f63 100644
--- a/src/main.py
+++ b/src/main.py
@@ -3,10 +3,9 @@
import uvicorn
from fastapi import FastAPI
from routers.mldcat_ap.dataset import router as mldcat_ap_router
-from routers.v1.datasets import router as datasets_router_v1_format
-from routers.v1.qualities import router as qualities_router
-from routers.v1.tasktype import router as ttype_router
-from routers.v2.datasets import router as datasets_router
+from routers.openml.datasets import router as datasets_router
+from routers.openml.qualities import router as qualities_router
+from routers.openml.tasktype import router as ttype_router
def _parse_args() -> argparse.Namespace:
@@ -39,7 +38,6 @@ def create_api() -> FastAPI:
app = FastAPI()
app.include_router(datasets_router)
- app.include_router(datasets_router_v1_format)
app.include_router(qualities_router)
app.include_router(mldcat_ap_router)
app.include_router(ttype_router)
diff --git a/src/routers/mldcat_ap/dataset.py b/src/routers/mldcat_ap/dataset.py
index 8d9994b7..771edb83 100644
--- a/src/routers/mldcat_ap/dataset.py
+++ b/src/routers/mldcat_ap/dataset.py
@@ -5,7 +5,7 @@
from sqlalchemy import Connection
from routers.dependencies import expdb_connection, userdb_connection
-from routers.v2.datasets import get_dataset
+from routers.openml.datasets import get_dataset
router = APIRouter(prefix="/mldcat_ap/datasets", tags=["datasets"])
diff --git a/src/routers/v1/__init__.py b/src/routers/openml/__init__.py
similarity index 100%
rename from src/routers/v1/__init__.py
rename to src/routers/openml/__init__.py
diff --git a/src/routers/v1/datasets.py b/src/routers/openml/datasets.py
similarity index 58%
rename from src/routers/v1/datasets.py
rename to src/routers/openml/datasets.py
index 172890c7..65c18a61 100644
--- a/src/routers/v1/datasets.py
+++ b/src/routers/openml/datasets.py
@@ -2,22 +2,30 @@
We add separate endpoints for old-style JSON responses, so they don't clutter the schema of the
new API, and are easily removed later.
"""
+import html
import http.client
-from enum import StrEnum
+from collections import namedtuple
+from enum import IntEnum, StrEnum
from typing import Annotated, Any, Literal
-from database.datasets import get_tags
+from database.datasets import get_dataset as db_get_dataset
+from database.datasets import (
+ get_file,
+ get_latest_dataset_description,
+ get_latest_processing_update,
+ get_latest_status_update,
+ get_tags,
+)
from database.datasets import tag_dataset as db_tag_dataset
-from database.users import APIKey, User, UserGroup
+from database.users import APIKey, User, UserGroup, get_user_groups_for, get_user_id_for
from fastapi import APIRouter, Body, Depends, HTTPException
-from schemas.datasets.openml import DatasetStatus
+from schemas.datasets.openml import DatasetFileFormat, DatasetMetadata, DatasetStatus, Visibility
from sqlalchemy import Connection
from routers.dependencies import Pagination, expdb_connection, fetch_user, userdb_connection
from routers.types import CasualString128, IntegerRange, SystemString64, integer_range_regex
-from routers.v2.datasets import get_dataset
-router = APIRouter(prefix="/v1/datasets", tags=["datasets"])
+router = APIRouter(prefix="/datasets", tags=["datasets"])
@router.post(
@@ -238,58 +246,161 @@ def quality_clause(quality: str, range_: str | None) -> str:
return {"data": {"dataset": list(datasets.values())}}
+class DatasetError(IntEnum):
+ NOT_FOUND = 111
+ NO_ACCESS = 112
+ NO_DATA_FILE = 113
+
+
+processing_info = namedtuple("processing_info", ["date", "warning", "error"])
+
+
+def _get_processing_information(dataset_id: int, connection: Connection) -> processing_info:
+ """Return processing information, if any. Otherwise, all fields `None`."""
+ if not (data_processed := get_latest_processing_update(dataset_id, connection)):
+ return processing_info(date=None, warning=None, error=None)
+
+ date_processed = data_processed["processing_date"]
+ warning = data_processed["warning"].strip() if data_processed["warning"] else None
+ error = data_processed["error"].strip() if data_processed["error"] else None
+ return processing_info(date=date_processed, warning=warning, error=error)
+
+
+def _format_error(*, code: DatasetError, message: str) -> dict[str, str]:
+ """Formatter for JSON bodies of OpenML error codes."""
+ return {"code": str(code), "message": message}
+
+
+def _user_has_access(
+ dataset: dict[str, Any],
+ connection: Connection,
+ api_key: APIKey | None = None,
+) -> bool:
+ """Determine if user of `api_key` has the right to view `dataset`."""
+ if dataset["visibility"] == Visibility.PUBLIC:
+ return True
+ if not api_key:
+ return False
+
+ if not (user_id := get_user_id_for(api_key=api_key, connection=connection)):
+ return False
+
+ if user_id == dataset["uploader"]:
+ return True
+
+ user_groups = get_user_groups_for(user_id=user_id, connection=connection)
+ ADMIN_GROUP = 1
+ return ADMIN_GROUP in user_groups
+
+
+def _format_parquet_url(dataset: dict[str, Any]) -> str | None:
+ if dataset["format"].lower() != DatasetFileFormat.ARFF:
+ return None
+
+ minio_base_url = "https://openml1.win.tue.nl"
+ return f"{minio_base_url}/dataset{dataset['did']}/dataset_{dataset['did']}.pq"
+
+
+def _format_dataset_url(dataset: dict[str, Any]) -> str:
+ base_url = "https://test.openml.org"
+ filename = f"{html.escape(dataset['name'])}.{dataset['format'].lower()}"
+ return f"{base_url}/data/v1/download/{dataset['file_id']}/{filename}"
+
+
+def _safe_unquote(text: str | None) -> str | None:
+ """Remove any open and closing quotes and return the remainder if non-empty."""
+ if not text:
+ return None
+ return text.strip("'\"") or None
+
+
+def _csv_as_list(text: str | None, *, unquote_items: bool = True) -> list[str]:
+ """Return comma-separated values in `text` as list, optionally remove quotes."""
+ if not text:
+ return []
+ chars_to_strip = "'\"\t " if unquote_items else "\t "
+ return [item.strip(chars_to_strip) for item in text.split(",")]
+
+
@router.get(
path="/{dataset_id}",
- description="Get old-style wrapped meta-data for dataset with ID `dataset_id`.",
+ description="Get meta-data for dataset with ID `dataset_id`.",
)
-def get_dataset_wrapped(
+def get_dataset(
dataset_id: int,
api_key: APIKey | None = None,
user_db: Annotated[Connection, Depends(userdb_connection)] = None,
expdb_db: Annotated[Connection, Depends(expdb_connection)] = None,
-) -> dict[str, dict[str, Any]]:
- try:
- dataset = get_dataset(
- user_db=user_db,
- expdb_db=expdb_db,
- dataset_id=dataset_id,
- api_key=api_key,
- ).model_dump(by_alias=True)
- except HTTPException as e:
- raise HTTPException(
- status_code=http.client.PRECONDITION_FAILED,
- detail=e.detail,
- ) from None
- if processing_data := dataset.get("processing_date"):
- dataset["processing_date"] = str(processing_data).replace("T", " ")
- if parquet_url := dataset.get("parquet_url"):
- dataset["parquet_url"] = str(parquet_url).replace("https", "http")
- if minio_url := dataset.get("minio_url"):
- dataset["minio_url"] = str(minio_url).replace("https", "http")
-
- manual = []
- # ref test.openml.org/d/33 (contributor) and d/34 (creator)
- # contributor/creator in database is '""'
- # json content is []
- for field in ["contributor", "creator"]:
- if dataset[field] == [""]:
- dataset[field] = []
- manual.append(field)
-
- if isinstance(dataset["original_data_url"], list):
- dataset["original_data_url"] = ", ".join(str(url) for url in dataset["original_data_url"])
-
- for field, value in list(dataset.items()):
- if field in manual:
- continue
- if isinstance(value, int):
- dataset[field] = str(value)
- elif isinstance(value, list) and len(value) == 1:
- dataset[field] = str(value[0])
- if not dataset[field]:
- del dataset[field]
-
- if "description" not in dataset:
- dataset["description"] = []
-
- return {"data_set_description": dataset}
+) -> DatasetMetadata:
+ if not (dataset := db_get_dataset(dataset_id, expdb_db)):
+ error = _format_error(code=DatasetError.NOT_FOUND, message="Unknown dataset")
+ raise HTTPException(status_code=http.client.NOT_FOUND, detail=error)
+
+ if not _user_has_access(dataset=dataset, connection=user_db, api_key=api_key):
+ error = _format_error(code=DatasetError.NO_ACCESS, message="No access granted")
+ raise HTTPException(status_code=http.client.FORBIDDEN, detail=error)
+
+ if not (dataset_file := get_file(dataset["file_id"], user_db)):
+ error = _format_error(
+ code=DatasetError.NO_DATA_FILE,
+ message="No data file found",
+ )
+ raise HTTPException(status_code=http.client.PRECONDITION_FAILED, detail=error)
+
+ tags = get_tags(dataset_id, expdb_db)
+ description = get_latest_dataset_description(dataset_id, expdb_db)
+ processing_result = _get_processing_information(dataset_id, expdb_db)
+ status = get_latest_status_update(dataset_id, expdb_db)
+
+ status_ = DatasetStatus(status["status"]) if status else DatasetStatus.IN_PREPARATION
+
+ description_ = ""
+ if description:
+ description_ = description["description"].replace("\r", "").strip()
+
+ dataset_url = _format_dataset_url(dataset)
+ parquet_url = _format_parquet_url(dataset)
+
+ contributors = _csv_as_list(dataset["contributor"], unquote_items=True)
+ creators = _csv_as_list(dataset["creator"], unquote_items=True)
+ ignore_attribute = _csv_as_list(dataset["ignore_attribute"], unquote_items=True)
+ row_id_attribute = _csv_as_list(dataset["row_id_attribute"], unquote_items=True)
+ original_data_url = _csv_as_list(dataset["original_data_url"], unquote_items=True)
+
+ # Not sure which properties are set by this bit:
+ # foreach( $this->xml_fields_dataset['csv'] as $field ) {
+ # $dataset->{$field} = getcsv( $dataset->{$field} );
+ # }
+
+ return DatasetMetadata(
+ id=dataset["did"],
+ visibility=dataset["visibility"],
+ status=status_,
+ name=dataset["name"],
+ licence=dataset["licence"],
+ version=dataset["version"],
+ version_label=dataset["version_label"] or "",
+ language=dataset["language"] or "",
+ creator=creators,
+ contributor=contributors,
+ citation=dataset["citation"] or "",
+ upload_date=dataset["upload_date"],
+ processing_date=processing_result.date,
+ warning=processing_result.warning,
+ error=processing_result.error,
+ description=description_,
+ description_version=description["version"] if description else 0,
+ tag=tags,
+ default_target_attribute=_safe_unquote(dataset["default_target_attribute"]),
+ ignore_attribute=ignore_attribute,
+ row_id_attribute=row_id_attribute,
+ url=dataset_url,
+ parquet_url=parquet_url,
+ minio_url=parquet_url,
+ file_id=dataset["file_id"],
+ format=dataset["format"].lower(),
+ paper_url=dataset["paper_url"] or None,
+ original_data_url=original_data_url,
+ collection_date=dataset["collection_date"],
+ md5_checksum=dataset_file["md5_hash"],
+ )
diff --git a/src/routers/v1/qualities.py b/src/routers/openml/qualities.py
similarity index 95%
rename from src/routers/v1/qualities.py
rename to src/routers/openml/qualities.py
index 366e208d..569f5d03 100644
--- a/src/routers/v1/qualities.py
+++ b/src/routers/openml/qualities.py
@@ -8,9 +8,9 @@
from sqlalchemy import Connection, text
from routers.dependencies import expdb_connection, fetch_user
-from routers.v2.datasets import DatasetError
+from routers.openml.datasets import DatasetError
-router = APIRouter(prefix="/v1/datasets", tags=["datasets"])
+router = APIRouter(prefix="/datasets", tags=["datasets"])
@router.get("/qualities/list")
diff --git a/src/routers/v1/tasktype.py b/src/routers/openml/tasktype.py
similarity index 98%
rename from src/routers/v1/tasktype.py
rename to src/routers/openml/tasktype.py
index 49930843..909cea5f 100644
--- a/src/routers/v1/tasktype.py
+++ b/src/routers/openml/tasktype.py
@@ -9,7 +9,7 @@
from routers.dependencies import expdb_connection
-router = APIRouter(prefix="/v1/tasktype", tags=["tasks"])
+router = APIRouter(prefix="/tasktype", tags=["tasks"])
def _normalize_task_type(task_type: dict[str, str | int]) -> dict[str, str | list[Any]]:
diff --git a/src/routers/v2/datasets.py b/src/routers/v2/datasets.py
deleted file mode 100644
index 864a9eef..00000000
--- a/src/routers/v2/datasets.py
+++ /dev/null
@@ -1,186 +0,0 @@
-import html
-import http.client
-from collections import namedtuple
-from enum import IntEnum
-from typing import Annotated, Any
-
-from database.datasets import get_dataset as db_get_dataset
-from database.datasets import (
- get_file,
- get_latest_dataset_description,
- get_latest_processing_update,
- get_latest_status_update,
- get_tags,
-)
-from database.users import APIKey, get_user_groups_for, get_user_id_for
-from fastapi import APIRouter, Depends, HTTPException
-from routers.dependencies import expdb_connection, userdb_connection
-from schemas.datasets.openml import (
- DatasetFileFormat,
- DatasetMetadata,
- DatasetStatus,
- Visibility,
-)
-from sqlalchemy import Connection
-
-router = APIRouter(prefix="/v2/datasets", tags=["datasets"])
-
-
-class DatasetError(IntEnum):
- NOT_FOUND = 111
- NO_ACCESS = 112
- NO_DATA_FILE = 113
-
-
-processing_info = namedtuple("processing_info", ["date", "warning", "error"])
-
-
-def _get_processing_information(dataset_id: int, connection: Connection) -> processing_info:
- """Return processing information, if any. Otherwise, all fields `None`."""
- if not (data_processed := get_latest_processing_update(dataset_id, connection)):
- return processing_info(date=None, warning=None, error=None)
-
- date_processed = data_processed["processing_date"]
- warning = data_processed["warning"].strip() if data_processed["warning"] else None
- error = data_processed["error"].strip() if data_processed["error"] else None
- return processing_info(date=date_processed, warning=warning, error=error)
-
-
-def _format_error(*, code: DatasetError, message: str) -> dict[str, str]:
- """Formatter for JSON bodies of OpenML error codes."""
- return {"code": str(code), "message": message}
-
-
-def _user_has_access(
- dataset: dict[str, Any],
- connection: Connection,
- api_key: APIKey | None = None,
-) -> bool:
- """Determine if user of `api_key` has the right to view `dataset`."""
- if dataset["visibility"] == Visibility.PUBLIC:
- return True
- if not api_key:
- return False
-
- if not (user_id := get_user_id_for(api_key=api_key, connection=connection)):
- return False
-
- if user_id == dataset["uploader"]:
- return True
-
- user_groups = get_user_groups_for(user_id=user_id, connection=connection)
- ADMIN_GROUP = 1
- return ADMIN_GROUP in user_groups
-
-
-def _format_parquet_url(dataset: dict[str, Any]) -> str | None:
- if dataset["format"].lower() != DatasetFileFormat.ARFF:
- return None
-
- minio_base_url = "https://openml1.win.tue.nl"
- return f"{minio_base_url}/dataset{dataset['did']}/dataset_{dataset['did']}.pq"
-
-
-def _format_dataset_url(dataset: dict[str, Any]) -> str:
- base_url = "https://test.openml.org"
- filename = f"{html.escape(dataset['name'])}.{dataset['format'].lower()}"
- return f"{base_url}/data/v1/download/{dataset['file_id']}/{filename}"
-
-
-def _safe_unquote(text: str | None) -> str | None:
- """Remove any open and closing quotes and return the remainder if non-empty."""
- if not text:
- return None
- return text.strip("'\"") or None
-
-
-def _csv_as_list(text: str | None, *, unquote_items: bool = True) -> list[str]:
- """Return comma-separated values in `text` as list, optionally remove quotes."""
- if not text:
- return []
- chars_to_strip = "'\"\t " if unquote_items else "\t "
- return [item.strip(chars_to_strip) for item in text.split(",")]
-
-
-@router.get(
- path="/{dataset_id}",
- description="Get meta-data for dataset with ID `dataset_id`.",
-)
-def get_dataset(
- dataset_id: int,
- api_key: APIKey | None = None,
- user_db: Annotated[Connection, Depends(userdb_connection)] = None,
- expdb_db: Annotated[Connection, Depends(expdb_connection)] = None,
-) -> DatasetMetadata:
- if not (dataset := db_get_dataset(dataset_id, expdb_db)):
- error = _format_error(code=DatasetError.NOT_FOUND, message="Unknown dataset")
- raise HTTPException(status_code=http.client.NOT_FOUND, detail=error)
-
- if not _user_has_access(dataset=dataset, connection=user_db, api_key=api_key):
- error = _format_error(code=DatasetError.NO_ACCESS, message="No access granted")
- raise HTTPException(status_code=http.client.FORBIDDEN, detail=error)
-
- if not (dataset_file := get_file(dataset["file_id"], user_db)):
- error = _format_error(
- code=DatasetError.NO_DATA_FILE,
- message="No data file found",
- )
- raise HTTPException(status_code=http.client.PRECONDITION_FAILED, detail=error)
-
- tags = get_tags(dataset_id, expdb_db)
- description = get_latest_dataset_description(dataset_id, expdb_db)
- processing_result = _get_processing_information(dataset_id, expdb_db)
- status = get_latest_status_update(dataset_id, expdb_db)
-
- status_ = DatasetStatus(status["status"]) if status else DatasetStatus.IN_PREPARATION
-
- description_ = ""
- if description:
- description_ = description["description"].replace("\r", "").strip()
-
- dataset_url = _format_dataset_url(dataset)
- parquet_url = _format_parquet_url(dataset)
-
- contributors = _csv_as_list(dataset["contributor"], unquote_items=True)
- creators = _csv_as_list(dataset["creator"], unquote_items=True)
- ignore_attribute = _csv_as_list(dataset["ignore_attribute"], unquote_items=True)
- row_id_attribute = _csv_as_list(dataset["row_id_attribute"], unquote_items=True)
- original_data_url = _csv_as_list(dataset["original_data_url"], unquote_items=True)
-
- # Not sure which properties are set by this bit:
- # foreach( $this->xml_fields_dataset['csv'] as $field ) {
- # $dataset->{$field} = getcsv( $dataset->{$field} );
- # }
-
- return DatasetMetadata(
- id=dataset["did"],
- visibility=dataset["visibility"],
- status=status_,
- name=dataset["name"],
- licence=dataset["licence"],
- version=dataset["version"],
- version_label=dataset["version_label"] or "",
- language=dataset["language"] or "",
- creator=creators,
- contributor=contributors,
- citation=dataset["citation"] or "",
- upload_date=dataset["upload_date"],
- processing_date=processing_result.date,
- warning=processing_result.warning,
- error=processing_result.error,
- description=description_,
- description_version=description["version"] if description else 0,
- tag=tags,
- default_target_attribute=_safe_unquote(dataset["default_target_attribute"]),
- ignore_attribute=ignore_attribute,
- row_id_attribute=row_id_attribute,
- url=dataset_url,
- parquet_url=parquet_url,
- minio_url=parquet_url,
- file_id=dataset["file_id"],
- format=dataset["format"].lower(),
- paper_url=dataset["paper_url"] or None,
- original_data_url=original_data_url,
- collection_date=dataset["collection_date"],
- md5_checksum=dataset_file["md5_hash"],
- )
diff --git a/tests/routers/v1/dataset_tag_test.py b/tests/routers/openml/dataset_tag_test.py
similarity index 60%
rename from tests/routers/v1/dataset_tag_test.py
rename to tests/routers/openml/dataset_tag_test.py
index b3d0e250..f868f2cf 100644
--- a/tests/routers/v1/dataset_tag_test.py
+++ b/tests/routers/openml/dataset_tag_test.py
@@ -21,7 +21,7 @@ def test_dataset_tag_rejects_unauthorized(key: ApiKey, api_client: TestClient) -
response = cast(
httpx.Response,
api_client.post(
- f"/v1/datasets/tag{apikey}",
+ f"/datasets/tag{apikey}",
json={"data_id": constants.PRIVATE_DATASET_ID, "tag": "test"},
),
)
@@ -39,7 +39,7 @@ def test_dataset_tag(key: ApiKey, expdb_test: Connection, api_client: TestClient
response = cast(
httpx.Response,
api_client.post(
- f"/v1/datasets/tag?api_key={key}",
+ f"/datasets/tag?api_key={key}",
json={"data_id": dataset_id, "tag": tag},
),
)
@@ -55,7 +55,7 @@ def test_dataset_tag_returns_existing_tags(api_client: TestClient) -> None:
response = cast(
httpx.Response,
api_client.post(
- f"/v1/datasets/tag?api_key={ApiKey.ADMIN}",
+ f"/datasets/tag?api_key={ApiKey.ADMIN}",
json={"data_id": dataset_id, "tag": tag},
),
)
@@ -68,7 +68,7 @@ def test_dataset_tag_fails_if_tag_exists(api_client: TestClient) -> None:
response = cast(
httpx.Response,
api_client.post(
- f"/v1/datasets/tag?api_key={ApiKey.ADMIN}",
+ f"/datasets/tag?api_key={ApiKey.ADMIN}",
json={"data_id": dataset_id, "tag": tag},
),
)
@@ -95,64 +95,10 @@ def test_dataset_tag_invalid_tag_is_rejected(
new = cast(
httpx.Response,
api_client.post(
- f"/v1/datasets/tag?api_key{ApiKey.ADMIN}",
+ f"/datasets/tag?api_key{ApiKey.ADMIN}",
json={"data_id": 1, "tag": tag},
),
)
assert new.status_code == http.client.UNPROCESSABLE_ENTITY
assert ["body", "tag"] == new.json()["detail"][0]["loc"]
-
-
-@pytest.mark.php()
-@pytest.mark.parametrize(
- "dataset_id",
- list(range(1, 10)) + [101],
-)
-@pytest.mark.parametrize(
- "api_key",
- [ApiKey.ADMIN, ApiKey.REGULAR_USER, ApiKey.OWNER_USER],
- ids=["Administrator", "regular user", "possible owner"],
-)
-@pytest.mark.parametrize(
- "tag",
- ["study_14", "totally_new_tag_for_migration_testing"],
- ids=["typically existing tag", "new tag"],
-)
-def test_dataset_tag_response_is_identical(
- dataset_id: int,
- tag: str,
- api_key: str,
- api_client: TestClient,
-) -> None:
- original = httpx.post(
- "http://server-api-php-api-1:80/api/v1/json/data/tag",
- data={"api_key": api_key, "tag": tag, "data_id": dataset_id},
- )
- if (
- original.status_code == http.client.PRECONDITION_FAILED
- and original.json()["error"]["message"] == "An Elastic Search Exception occured."
- ):
- pytest.skip("Encountered Elastic Search error.")
- if original.status_code == http.client.OK:
- # undo the tag, because we don't want to persist this change to the database
- httpx.post(
- "http://server-api-php-api-1:80/api/v1/json/data/untag",
- data={"api_key": api_key, "tag": tag, "data_id": dataset_id},
- )
- new = cast(
- httpx.Response,
- api_client.post(
- f"/v1/datasets/tag?api_key={api_key}",
- json={"data_id": dataset_id, "tag": tag},
- ),
- )
-
- assert original.status_code == new.status_code, original.json()
- if new.status_code != http.client.OK:
- assert original.json()["error"] == new.json()["detail"]
- return
-
- original = original.json()
- new = new.json()
- assert original == new
diff --git a/tests/routers/v1/datasets_list_datasets_test.py b/tests/routers/openml/datasets_list_datasets_test.py
similarity index 94%
rename from tests/routers/v1/datasets_list_datasets_test.py
rename to tests/routers/openml/datasets_list_datasets_test.py
index 0de8e83b..e341735d 100644
--- a/tests/routers/v1/datasets_list_datasets_test.py
+++ b/tests/routers/openml/datasets_list_datasets_test.py
@@ -20,7 +20,7 @@ def _assert_empty_result(
def test_list(api_client: TestClient) -> None:
- response = api_client.get("/v1/datasets/list/")
+ response = api_client.get("/datasets/list/")
assert response.status_code == http.client.OK
assert "data" in response.json()
assert "dataset" in response.json()["data"]
@@ -40,7 +40,7 @@ def test_list(api_client: TestClient) -> None:
)
def test_list_filter_active(status: str, amount: int, api_client: TestClient) -> None:
response = api_client.post(
- "/v1/datasets/list",
+ "/datasets/list",
json={"status": status, "pagination": {"limit": constants.NUMBER_OF_DATASETS}},
)
assert response.status_code == http.client.OK, response.json()
@@ -60,7 +60,7 @@ def test_list_filter_active(status: str, amount: int, api_client: TestClient) ->
def test_list_accounts_privacy(api_key: ApiKey | None, amount: int, api_client: TestClient) -> None:
key = f"?api_key={api_key}" if api_key else ""
response = api_client.post(
- f"/v1/datasets/list{key}",
+ f"/datasets/list{key}",
json={"status": "all", "pagination": {"limit": 1000}},
)
assert response.status_code == http.client.OK, response.json()
@@ -75,7 +75,7 @@ def test_list_accounts_privacy(api_key: ApiKey | None, amount: int, api_client:
def test_list_data_name_present(name: str, count: int, api_client: TestClient) -> None:
# The second iris dataset is private, so we need to authenticate.
response = api_client.post(
- f"/v1/datasets/list?api_key={ApiKey.ADMIN}",
+ f"/datasets/list?api_key={ApiKey.ADMIN}",
json={"status": "all", "data_name": name},
)
assert response.status_code == http.client.OK
@@ -90,7 +90,7 @@ def test_list_data_name_present(name: str, count: int, api_client: TestClient) -
)
def test_list_data_name_absent(name: str, api_client: TestClient) -> None:
response = api_client.post(
- f"/v1/datasets/list?api_key={ApiKey.ADMIN}",
+ f"/datasets/list?api_key={ApiKey.ADMIN}",
json={"status": "all", "data_name": name},
)
_assert_empty_result(response)
@@ -116,7 +116,7 @@ def test_list_pagination(limit: int | None, offset: int | None, api_client: Test
offset_body = {} if offset is None else {"offset": offset}
limit_body = {} if limit is None else {"limit": limit}
filters = {"status": "all", "pagination": offset_body | limit_body}
- response = api_client.post("/v1/datasets/list", json=filters)
+ response = api_client.post("/datasets/list", json=filters)
if offset in [130, 200]:
_assert_empty_result(response)
@@ -133,7 +133,7 @@ def test_list_pagination(limit: int | None, offset: int | None, api_client: Test
)
def test_list_data_version(version: int, count: int, api_client: TestClient) -> None:
response = api_client.post(
- f"/v1/datasets/list?api_key={ApiKey.ADMIN}",
+ f"/datasets/list?api_key={ApiKey.ADMIN}",
json={"status": "all", "data_version": version},
)
assert response.status_code == http.client.OK
@@ -144,7 +144,7 @@ def test_list_data_version(version: int, count: int, api_client: TestClient) ->
def test_list_data_version_no_result(api_client: TestClient) -> None:
response = api_client.post(
- f"/v1/datasets/list?api_key={ApiKey.ADMIN}",
+ f"/datasets/list?api_key={ApiKey.ADMIN}",
json={"status": "all", "data_version": 4},
)
_assert_empty_result(response)
@@ -160,7 +160,7 @@ def test_list_data_version_no_result(api_client: TestClient) -> None:
)
def test_list_uploader(user_id: int, count: int, key: str, api_client: TestClient) -> None:
response = api_client.post(
- f"/v1/datasets/list?api_key={key}",
+ f"/datasets/list?api_key={key}",
json={"status": "all", "uploader": user_id},
)
# The dataset of user 16 is private, so can not be retrieved by other users.
@@ -179,7 +179,7 @@ def test_list_uploader(user_id: int, count: int, key: str, api_client: TestClien
)
def test_list_data_id(data_id: list[int], api_client: TestClient) -> None:
response = api_client.post(
- "/v1/datasets/list",
+ "/datasets/list",
json={"status": "all", "data_id": data_id},
)
@@ -195,7 +195,7 @@ def test_list_data_id(data_id: list[int], api_client: TestClient) -> None:
)
def test_list_data_tag(tag: str, count: int, api_client: TestClient) -> None:
response = api_client.post(
- "/v1/datasets/list",
+ "/datasets/list",
# study_14 has 100 datasets, we overwrite the default `limit` because otherwise
# we don't know if the results are limited by filtering on the tag.
json={"status": "all", "tag": tag, "pagination": {"limit": 101}},
@@ -207,7 +207,7 @@ def test_list_data_tag(tag: str, count: int, api_client: TestClient) -> None:
def test_list_data_tag_empty(api_client: TestClient) -> None:
response = api_client.post(
- "/v1/datasets/list",
+ "/datasets/list",
json={"status": "all", "tag": "not-a-tag"},
)
_assert_empty_result(response)
@@ -228,7 +228,7 @@ def test_list_data_tag_empty(api_client: TestClient) -> None:
)
def test_list_data_quality(quality: str, range_: str, count: int, api_client: TestClient) -> None:
response = api_client.post(
- "/v1/datasets/list",
+ "/datasets/list",
json={"status": "all", quality: range_},
)
assert response.status_code == http.client.OK, response.json()
@@ -276,7 +276,7 @@ def test_list_data_identical(
new_style["pagination"]["offset"] = offset
response = api_client.post(
- f"/v1/datasets/list{api_key_query}",
+ f"/datasets/list{api_key_query}",
json=new_style,
)
diff --git a/tests/routers/v2/datasets_test.py b/tests/routers/openml/datasets_test.py
similarity index 91%
rename from tests/routers/v2/datasets_test.py
rename to tests/routers/openml/datasets_test.py
index 3b538290..1aff3dc2 100644
--- a/tests/routers/v2/datasets_test.py
+++ b/tests/routers/openml/datasets_test.py
@@ -19,7 +19,7 @@ def test_error_unknown_dataset(
response_code: int,
api_client: TestClient,
) -> None:
- response = cast(httpx.Response, api_client.get(f"v2/datasets/{dataset_id}"))
+ response = cast(httpx.Response, api_client.get(f"/datasets/{dataset_id}"))
assert response.status_code == response_code
assert {"code": "111", "message": "Unknown dataset"} == response.json()["detail"]
@@ -38,7 +38,7 @@ def test_private_dataset_no_user_no_access(
response_code: int,
) -> None:
query = f"?api_key={api_key}" if api_key else ""
- response = cast(httpx.Response, api_client.get(f"v2/datasets/130{query}"))
+ response = cast(httpx.Response, api_client.get(f"/datasets/130{query}"))
assert response.status_code == response_code
assert {"code": "112", "message": "No access granted"} == response.json()["detail"]
diff --git a/tests/routers/openml/migration/datasets_migration_test.py b/tests/routers/openml/migration/datasets_migration_test.py
new file mode 100644
index 00000000..0266e5be
--- /dev/null
+++ b/tests/routers/openml/migration/datasets_migration_test.py
@@ -0,0 +1,187 @@
+import http.client
+import json
+from typing import Any, cast
+
+import httpx
+import pytest
+from starlette.testclient import TestClient
+
+from tests.conftest import ApiKey
+
+
+@pytest.mark.php()
+@pytest.mark.parametrize(
+ "dataset_id",
+ range(1, 132),
+)
+def test_dataset_response_is_identical(dataset_id: int, api_client: TestClient) -> None:
+ original = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/{dataset_id}")
+ new = api_client.get(f"/datasets/{dataset_id}")
+
+ if new.status_code == http.client.FORBIDDEN:
+ assert original.status_code == http.client.PRECONDITION_FAILED
+ else:
+ assert original.status_code == new.status_code
+
+ if new.status_code != http.client.OK:
+ assert original.json()["error"] == new.json()["detail"]
+ return
+
+ try:
+ original = original.json()["data_set_description"]
+ except json.decoder.JSONDecodeError:
+ pytest.skip("A PHP error occurred on the test server.")
+
+ if "div" in original:
+ pytest.skip("A PHP error occurred on the test server.")
+
+ # There are a few changes between the old API and the new API, so we convert here:
+ # The new API has normalized `format` field:
+ original["format"] = original["format"].lower()
+
+ # There is odd behavior in the live server that I don't want to recreate:
+ # when the creator is a list of csv names, it can either be a str or a list
+ # depending on whether the names are quoted. E.g.:
+ # '"Alice", "Bob"' -> ["Alice", "Bob"]
+ # 'Alice, Bob' -> 'Alice, Bob'
+ if (
+ "creator" in original
+ and isinstance(original["creator"], str)
+ and len(original["creator"].split(",")) > 1
+ ):
+ original["creator"] = [name.strip() for name in original["creator"].split(",")]
+
+ new_body = new.json()
+ if processing_date := new_body.get("processing_date"):
+ new_body["processing_date"] = str(processing_date).replace("T", " ")
+ if parquet_url := new_body.get("parquet_url"):
+ new_body["parquet_url"] = str(parquet_url).replace("https", "http")
+ if minio_url := new_body.get("minio_url"):
+ new_body["minio_url"] = str(minio_url).replace("https", "http")
+
+ manual = []
+ # ref test.openml.org/d/33 (contributor) and d/34 (creator)
+ # contributor/creator in database is '""'
+ # json content is []
+ for field in ["contributor", "creator"]:
+ if new_body[field] == [""]:
+ new_body[field] = []
+ manual.append(field)
+
+ if isinstance(new_body["original_data_url"], list):
+ new_body["original_data_url"] = ", ".join(str(url) for url in new_body["original_data_url"])
+
+ for field, value in list(new_body.items()):
+ if field in manual:
+ continue
+ if isinstance(value, int):
+ new_body[field] = str(value)
+ elif isinstance(value, list) and len(value) == 1:
+ new_body[field] = str(value[0])
+ if not new_body[field]:
+ del new_body[field]
+
+ if "description" not in new_body:
+ new_body["description"] = []
+ assert original == new_body
+
+
+@pytest.mark.parametrize(
+ "dataset_id",
+ [-1, 138, 100_000],
+)
+def test_error_unknown_dataset(
+ dataset_id: int,
+ api_client: TestClient,
+) -> None:
+ response = cast(httpx.Response, api_client.get(f"/datasets/{dataset_id}"))
+
+ # The new API has "404 Not Found" instead of "412 PRECONDITION_FAILED"
+ assert response.status_code == http.client.NOT_FOUND
+ assert {"code": "111", "message": "Unknown dataset"} == response.json()["detail"]
+
+
+@pytest.mark.parametrize(
+ "api_key",
+ [None, "a" * 32],
+)
+def test_private_dataset_no_user_no_access(
+ api_client: TestClient,
+ api_key: str | None,
+) -> None:
+ query = f"?api_key={api_key}" if api_key else ""
+ response = cast(httpx.Response, api_client.get(f"/datasets/130{query}"))
+
+ # New response is 403: Forbidden instead of 412: PRECONDITION FAILED
+ assert response.status_code == http.client.FORBIDDEN
+ assert {"code": "112", "message": "No access granted"} == response.json()["detail"]
+
+
+@pytest.mark.skip("Not sure how to include apikey in test yet.")
+def test_private_dataset_owner_access(
+ api_client: TestClient,
+ dataset_130: dict[str, Any],
+) -> None:
+ response = cast(httpx.Response, api_client.get("/datasets/130?api_key=..."))
+ assert response.status_code == http.client.OK
+ assert dataset_130 == response.json()
+
+
+@pytest.mark.skip("Not sure how to include apikey in test yet.")
+def test_private_dataset_admin_access(api_client: TestClient) -> None:
+ cast(httpx.Response, api_client.get("/datasets/130?api_key=..."))
+ # test against cached response
+
+
+@pytest.mark.php()
+@pytest.mark.parametrize(
+ "dataset_id",
+ list(range(1, 10)) + [101],
+)
+@pytest.mark.parametrize(
+ "api_key",
+ [ApiKey.ADMIN, ApiKey.REGULAR_USER, ApiKey.OWNER_USER],
+ ids=["Administrator", "regular user", "possible owner"],
+)
+@pytest.mark.parametrize(
+ "tag",
+ ["study_14", "totally_new_tag_for_migration_testing"],
+ ids=["typically existing tag", "new tag"],
+)
+def test_dataset_tag_response_is_identical(
+ dataset_id: int,
+ tag: str,
+ api_key: str,
+ api_client: TestClient,
+) -> None:
+ original = httpx.post(
+ "http://server-api-php-api-1:80/api/v1/json/data/tag",
+ data={"api_key": api_key, "tag": tag, "data_id": dataset_id},
+ )
+ if (
+ original.status_code == http.client.PRECONDITION_FAILED
+ and original.json()["error"]["message"] == "An Elastic Search Exception occured."
+ ):
+ pytest.skip("Encountered Elastic Search error.")
+ if original.status_code == http.client.OK:
+ # undo the tag, because we don't want to persist this change to the database
+ httpx.post(
+ "http://server-api-php-api-1:80/api/v1/json/data/untag",
+ data={"api_key": api_key, "tag": tag, "data_id": dataset_id},
+ )
+ new = cast(
+ httpx.Response,
+ api_client.post(
+ f"/datasets/tag?api_key={api_key}",
+ json={"data_id": dataset_id, "tag": tag},
+ ),
+ )
+
+ assert original.status_code == new.status_code, original.json()
+ if new.status_code != http.client.OK:
+ assert original.json()["error"] == new.json()["detail"]
+ return
+
+ original = original.json()
+ new = new.json()
+ assert original == new
diff --git a/tests/routers/v1/qualities_test.py b/tests/routers/openml/qualities_test.py
similarity index 97%
rename from tests/routers/v1/qualities_test.py
rename to tests/routers/openml/qualities_test.py
index d2de98b6..eb30b442 100644
--- a/tests/routers/v1/qualities_test.py
+++ b/tests/routers/openml/qualities_test.py
@@ -30,14 +30,14 @@ def _remove_quality_from_database(quality_name: str, expdb_test: Connection) ->
@pytest.mark.php()
def test_list_qualities_identical(api_client: TestClient) -> None:
original = httpx.get("http://server-api-php-api-1:80/api/v1/json/data/qualities/list")
- new = api_client.get("/v1/datasets/qualities/list")
+ new = api_client.get("/datasets/qualities/list")
assert original.status_code == new.status_code
assert original.json() == new.json()
# To keep the test idempotent, we cannot test if reaction to database changes is identical
def test_list_qualities(api_client: TestClient, expdb_test: Connection) -> None:
- response = api_client.get("/v1/datasets/qualities/list")
+ response = api_client.get("/datasets/qualities/list")
assert response.status_code == http.client.OK
expected = {
"data_qualities_list": {
@@ -157,13 +157,13 @@ def test_list_qualities(api_client: TestClient, expdb_test: Connection) -> None:
deleted = expected["data_qualities_list"]["quality"].pop()
_remove_quality_from_database(quality_name=deleted, expdb_test=expdb_test)
- response = api_client.get("/v1/datasets/qualities/list")
+ response = api_client.get("/datasets/qualities/list")
assert response.status_code == http.client.OK
assert expected == response.json()
def test_get_quality(api_client: TestClient) -> None:
- response = api_client.get("/v1/datasets/qualities/1")
+ response = api_client.get("/datasets/qualities/1")
assert response.status_code == http.client.OK
expected = [
{"name": "AutoCorrelation", "value": 0.6064659977703456},
@@ -284,7 +284,7 @@ def test_get_quality(api_client: TestClient) -> None:
)
def test_get_quality_identical(data_id: int, api_client: TestClient) -> None:
php_response = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/qualities/{data_id}")
- python_response = api_client.get(f"/v1/datasets/qualities/{data_id}")
+ python_response = api_client.get(f"/datasets/qualities/{data_id}")
assert python_response.status_code == php_response.status_code
expected = [
@@ -308,7 +308,7 @@ def test_get_quality_identical_error(data_id: int, api_client: TestClient) -> No
if data_id in [116]:
pytest.skip("Detailed error for code 362 (no qualities) not yet supported.")
php_response = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/qualities/{data_id}")
- python_response = api_client.get(f"/v1/datasets/qualities/{data_id}")
+ python_response = api_client.get(f"/datasets/qualities/{data_id}")
assert python_response.status_code == php_response.status_code
# The "dataset unknown" error currently has a separate code in PHP depending on
# where it occurs (e.g., get dataset->113 get quality->361)
diff --git a/tests/routers/v1/task_type_test.py b/tests/routers/openml/task_type_test.py
similarity index 89%
rename from tests/routers/v1/task_type_test.py
rename to tests/routers/openml/task_type_test.py
index bfc2381b..2d91cab8 100644
--- a/tests/routers/v1/task_type_test.py
+++ b/tests/routers/openml/task_type_test.py
@@ -8,7 +8,7 @@
@pytest.mark.php()
def test_list_task_type(api_client: TestClient) -> None:
- response = api_client.get("/v1/tasktype/list")
+ response = api_client.get("/tasktype/list")
original = httpx.get("http://server-api-php-api-1:80/api/v1/json/tasktype/list")
assert response.status_code == original.status_code
assert response.json() == original.json()
@@ -20,7 +20,7 @@ def test_list_task_type(api_client: TestClient) -> None:
list(range(1, 12)),
)
def test_get_task_type(ttype_id: int, api_client: TestClient) -> None:
- response = api_client.get(f"/v1/tasktype/{ttype_id}")
+ response = api_client.get(f"/tasktype/{ttype_id}")
original = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/tasktype/{ttype_id}")
assert response.status_code == original.status_code
@@ -37,6 +37,6 @@ def test_get_task_type(ttype_id: int, api_client: TestClient) -> None:
def test_get_task_type_unknown(api_client: TestClient) -> None:
- response = api_client.get("/v1/tasktype/1000")
+ response = api_client.get("/tasktype/1000")
assert response.status_code == http.client.PRECONDITION_FAILED
assert response.json() == {"detail": {"code": "241", "message": "Unknown task type."}}
diff --git a/tests/routers/v1/datasets_old_test.py b/tests/routers/v1/datasets_old_test.py
deleted file mode 100644
index dff98368..00000000
--- a/tests/routers/v1/datasets_old_test.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import http.client
-import json.decoder
-from typing import Any, cast
-
-import httpx
-import pytest
-from starlette.testclient import TestClient
-
-
-@pytest.mark.php()
-@pytest.mark.parametrize(
- "dataset_id",
- range(1, 132),
-)
-def test_dataset_response_is_identical(dataset_id: int, api_client: TestClient) -> None:
- original = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/{dataset_id}")
- new = cast(httpx.Response, api_client.get(f"/v1/datasets/{dataset_id}"))
- assert original.status_code == new.status_code
- assert new.json()
-
- if new.status_code != http.client.OK:
- assert original.json()["error"] == new.json()["detail"]
- return
-
- assert "data_set_description" in new.json()
-
- try:
- original = original.json()["data_set_description"]
- except json.decoder.JSONDecodeError:
- pytest.skip("A PHP error occurred on the test server.")
-
- new = new.json()["data_set_description"]
-
- if "div" in original:
- pytest.skip("A PHP error occurred on the test server.")
-
- # The new API has normalized `format` field:
- original["format"] = original["format"].lower()
-
- # There is odd behavior in the live server that I don't want to recreate:
- # when the creator is a list of csv names, it can either be a str or a list
- # depending on whether the names are quoted. E.g.:
- # '"Alice", "Bob"' -> ["Alice", "Bob"]
- # 'Alice, Bob' -> 'Alice, Bob'
- if (
- "creator" in original
- and isinstance(original["creator"], str)
- and len(original["creator"].split(",")) > 1
- ):
- original["creator"] = [name.strip() for name in original["creator"].split(",")]
-
- # The remainder of the fields should be identical:
- assert original == new
-
-
-@pytest.mark.parametrize(
- "dataset_id",
- [-1, 138, 100_000],
-)
-def test_error_unknown_dataset(
- dataset_id: int,
- api_client: TestClient,
-) -> None:
- response = cast(httpx.Response, api_client.get(f"v1/datasets/{dataset_id}"))
-
- assert response.status_code == http.client.PRECONDITION_FAILED
- assert {"code": "111", "message": "Unknown dataset"} == response.json()["detail"]
-
-
-@pytest.mark.parametrize(
- "api_key",
- [None, "a" * 32],
-)
-def test_private_dataset_no_user_no_access(
- api_client: TestClient,
- api_key: str | None,
-) -> None:
- query = f"?api_key={api_key}" if api_key else ""
- response = cast(httpx.Response, api_client.get(f"v1/datasets/130{query}"))
-
- assert response.status_code == http.client.PRECONDITION_FAILED
- assert {"code": "112", "message": "No access granted"} == response.json()["detail"]
-
-
-@pytest.mark.skip("Not sure how to include apikey in test yet.")
-def test_private_dataset_owner_access(
- api_client: TestClient,
- dataset_130: dict[str, Any],
-) -> None:
- response = cast(httpx.Response, api_client.get("/v1/datasets/130?api_key=..."))
- assert response.status_code == http.client.OK
- assert dataset_130 == response.json()
-
-
-@pytest.mark.skip("Not sure how to include apikey in test yet.")
-def test_private_dataset_admin_access(api_client: TestClient) -> None:
- cast(httpx.Response, api_client.get("/v1/datasets/130?api_key=..."))
- # test against cached response
From 33b6e42354845b1490815e9d4043da8d67ffec2c Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 28 Nov 2023 15:56:35 +0100
Subject: [PATCH 2/3] Use a httpx.client interface for calling the php server
This puts the configuration in one place (the fixture) and makes
it easier to add new calls / understand where the calls go.
---
tests/conftest.py | 9 ++-
tests/routers/openml/dataset_tag_test.py | 20 +++---
.../openml/datasets_list_datasets_test.py | 61 ++++++++++---------
tests/routers/openml/datasets_test.py | 16 ++---
.../migration/datasets_migration_test.py | 39 ++++++------
tests/routers/openml/qualities_test.py | 32 +++++-----
tests/routers/openml/task_type_test.py | 16 ++---
7 files changed, 105 insertions(+), 88 deletions(-)
diff --git a/tests/conftest.py b/tests/conftest.py
index 287b947a..6ccad271 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,6 +4,7 @@
from pathlib import Path
from typing import Any, Generator
+import httpx
import pytest
from database.setup import expdb_database, user_database
from fastapi.testclient import TestClient
@@ -40,7 +41,13 @@ def user_test() -> Connection:
@pytest.fixture()
-def api_client(expdb_test: Connection, user_test: Connection) -> TestClient:
+def php_api() -> Generator[httpx.Client, None, None]:
+ with httpx.Client(base_url="http://server-api-php-api-1:80/api/v1/json") as client:
+ yield client
+
+
+@pytest.fixture()
+def py_api(expdb_test: Connection, user_test: Connection) -> TestClient:
app = create_api()
# We use the lambda definitions because fixtures may not be called directly.
app.dependency_overrides[expdb_connection] = lambda: expdb_test
diff --git a/tests/routers/openml/dataset_tag_test.py b/tests/routers/openml/dataset_tag_test.py
index f868f2cf..f315ffbe 100644
--- a/tests/routers/openml/dataset_tag_test.py
+++ b/tests/routers/openml/dataset_tag_test.py
@@ -16,11 +16,11 @@
[None, ApiKey.INVALID],
ids=["no authentication", "invalid key"],
)
-def test_dataset_tag_rejects_unauthorized(key: ApiKey, api_client: TestClient) -> None:
+def test_dataset_tag_rejects_unauthorized(key: ApiKey, py_api: TestClient) -> None:
apikey = "" if key is None else f"?api_key={key}"
response = cast(
httpx.Response,
- api_client.post(
+ py_api.post(
f"/datasets/tag{apikey}",
json={"data_id": constants.PRIVATE_DATASET_ID, "tag": "test"},
),
@@ -34,11 +34,11 @@ def test_dataset_tag_rejects_unauthorized(key: ApiKey, api_client: TestClient) -
[ApiKey.ADMIN, ApiKey.REGULAR_USER, ApiKey.OWNER_USER],
ids=["administrator", "non-owner", "owner"],
)
-def test_dataset_tag(key: ApiKey, expdb_test: Connection, api_client: TestClient) -> None:
+def test_dataset_tag(key: ApiKey, expdb_test: Connection, py_api: TestClient) -> None:
dataset_id, tag = constants.PRIVATE_DATASET_ID, "test"
response = cast(
httpx.Response,
- api_client.post(
+ py_api.post(
f"/datasets/tag?api_key={key}",
json={"data_id": dataset_id, "tag": tag},
),
@@ -50,11 +50,11 @@ def test_dataset_tag(key: ApiKey, expdb_test: Connection, api_client: TestClient
assert tag in tags
-def test_dataset_tag_returns_existing_tags(api_client: TestClient) -> None:
+def test_dataset_tag_returns_existing_tags(py_api: TestClient) -> None:
dataset_id, tag = 1, "test"
response = cast(
httpx.Response,
- api_client.post(
+ py_api.post(
f"/datasets/tag?api_key={ApiKey.ADMIN}",
json={"data_id": dataset_id, "tag": tag},
),
@@ -63,11 +63,11 @@ def test_dataset_tag_returns_existing_tags(api_client: TestClient) -> None:
assert {"data_tag": {"id": str(dataset_id), "tag": ["study_14", tag]}} == response.json()
-def test_dataset_tag_fails_if_tag_exists(api_client: TestClient) -> None:
+def test_dataset_tag_fails_if_tag_exists(py_api: TestClient) -> None:
dataset_id, tag = 1, "study_14" # Dataset 1 already is tagged with 'study_14'
response = cast(
httpx.Response,
- api_client.post(
+ py_api.post(
f"/datasets/tag?api_key={ApiKey.ADMIN}",
json={"data_id": dataset_id, "tag": tag},
),
@@ -90,11 +90,11 @@ def test_dataset_tag_fails_if_tag_exists(api_client: TestClient) -> None:
)
def test_dataset_tag_invalid_tag_is_rejected(
tag: str,
- api_client: TestClient,
+ py_api: TestClient,
) -> None:
new = cast(
httpx.Response,
- api_client.post(
+ py_api.post(
f"/datasets/tag?api_key{ApiKey.ADMIN}",
json={"data_id": 1, "tag": tag},
),
diff --git a/tests/routers/openml/datasets_list_datasets_test.py b/tests/routers/openml/datasets_list_datasets_test.py
index e341735d..92f70557 100644
--- a/tests/routers/openml/datasets_list_datasets_test.py
+++ b/tests/routers/openml/datasets_list_datasets_test.py
@@ -19,8 +19,8 @@ def _assert_empty_result(
assert response.json()["detail"] == {"code": "372", "message": "No results"}
-def test_list(api_client: TestClient) -> None:
- response = api_client.get("/datasets/list/")
+def test_list(py_api: TestClient) -> None:
+ response = py_api.get("/datasets/list/")
assert response.status_code == http.client.OK
assert "data" in response.json()
assert "dataset" in response.json()["data"]
@@ -38,8 +38,8 @@ def test_list(api_client: TestClient) -> None:
("all", constants.NUMBER_OF_DATASETS - constants.NUMBER_OF_PRIVATE_DATASETS),
],
)
-def test_list_filter_active(status: str, amount: int, api_client: TestClient) -> None:
- response = api_client.post(
+def test_list_filter_active(status: str, amount: int, py_api: TestClient) -> None:
+ response = py_api.post(
"/datasets/list",
json={"status": status, "pagination": {"limit": constants.NUMBER_OF_DATASETS}},
)
@@ -57,9 +57,9 @@ def test_list_filter_active(status: str, amount: int, api_client: TestClient) ->
(None, constants.NUMBER_OF_DATASETS - constants.NUMBER_OF_PRIVATE_DATASETS),
],
)
-def test_list_accounts_privacy(api_key: ApiKey | None, amount: int, api_client: TestClient) -> None:
+def test_list_accounts_privacy(api_key: ApiKey | None, amount: int, py_api: TestClient) -> None:
key = f"?api_key={api_key}" if api_key else ""
- response = api_client.post(
+ response = py_api.post(
f"/datasets/list{key}",
json={"status": "all", "pagination": {"limit": 1000}},
)
@@ -72,9 +72,9 @@ def test_list_accounts_privacy(api_key: ApiKey | None, amount: int, api_client:
("name", "count"),
[("abalone", 1), ("iris", 2)],
)
-def test_list_data_name_present(name: str, count: int, api_client: TestClient) -> None:
+def test_list_data_name_present(name: str, count: int, py_api: TestClient) -> None:
# The second iris dataset is private, so we need to authenticate.
- response = api_client.post(
+ response = py_api.post(
f"/datasets/list?api_key={ApiKey.ADMIN}",
json={"status": "all", "data_name": name},
)
@@ -88,8 +88,8 @@ def test_list_data_name_present(name: str, count: int, api_client: TestClient) -
"name",
["ir", "long_name_without_overlap"],
)
-def test_list_data_name_absent(name: str, api_client: TestClient) -> None:
- response = api_client.post(
+def test_list_data_name_absent(name: str, py_api: TestClient) -> None:
+ response = py_api.post(
f"/datasets/list?api_key={ApiKey.ADMIN}",
json={"status": "all", "data_name": name},
)
@@ -102,7 +102,7 @@ def test_list_quality_filers() -> None:
@pytest.mark.parametrize("limit", [None, 5, 10, 200])
@pytest.mark.parametrize("offset", [None, 0, 5, 129, 130, 200])
-def test_list_pagination(limit: int | None, offset: int | None, api_client: TestClient) -> None:
+def test_list_pagination(limit: int | None, offset: int | None, py_api: TestClient) -> None:
all_ids = [
did
for did in range(1, 1 + constants.NUMBER_OF_DATASETS)
@@ -116,7 +116,7 @@ def test_list_pagination(limit: int | None, offset: int | None, api_client: Test
offset_body = {} if offset is None else {"offset": offset}
limit_body = {} if limit is None else {"limit": limit}
filters = {"status": "all", "pagination": offset_body | limit_body}
- response = api_client.post("/datasets/list", json=filters)
+ response = py_api.post("/datasets/list", json=filters)
if offset in [130, 200]:
_assert_empty_result(response)
@@ -131,8 +131,8 @@ def test_list_pagination(limit: int | None, offset: int | None, api_client: Test
("version", "count"),
[(1, 100), (2, 6), (5, 1)],
)
-def test_list_data_version(version: int, count: int, api_client: TestClient) -> None:
- response = api_client.post(
+def test_list_data_version(version: int, count: int, py_api: TestClient) -> None:
+ response = py_api.post(
f"/datasets/list?api_key={ApiKey.ADMIN}",
json={"status": "all", "data_version": version},
)
@@ -142,8 +142,8 @@ def test_list_data_version(version: int, count: int, api_client: TestClient) ->
assert {dataset["version"] for dataset in datasets} == {version}
-def test_list_data_version_no_result(api_client: TestClient) -> None:
- response = api_client.post(
+def test_list_data_version_no_result(py_api: TestClient) -> None:
+ response = py_api.post(
f"/datasets/list?api_key={ApiKey.ADMIN}",
json={"status": "all", "data_version": 4},
)
@@ -158,8 +158,8 @@ def test_list_data_version_no_result(api_client: TestClient) -> None:
("user_id", "count"),
[(1, 59), (2, 34), (16, 1)],
)
-def test_list_uploader(user_id: int, count: int, key: str, api_client: TestClient) -> None:
- response = api_client.post(
+def test_list_uploader(user_id: int, count: int, key: str, py_api: TestClient) -> None:
+ response = py_api.post(
f"/datasets/list?api_key={key}",
json={"status": "all", "uploader": user_id},
)
@@ -177,8 +177,8 @@ def test_list_uploader(user_id: int, count: int, key: str, api_client: TestClien
"data_id",
[[1], [1, 2, 3], [1, 2, 3, 3000], [1, 2, 3, 130]],
)
-def test_list_data_id(data_id: list[int], api_client: TestClient) -> None:
- response = api_client.post(
+def test_list_data_id(data_id: list[int], py_api: TestClient) -> None:
+ response = py_api.post(
"/datasets/list",
json={"status": "all", "data_id": data_id},
)
@@ -193,8 +193,8 @@ def test_list_data_id(data_id: list[int], api_client: TestClient) -> None:
("tag", "count"),
[("study_14", 100), ("study_15", 1)],
)
-def test_list_data_tag(tag: str, count: int, api_client: TestClient) -> None:
- response = api_client.post(
+def test_list_data_tag(tag: str, count: int, py_api: TestClient) -> None:
+ response = py_api.post(
"/datasets/list",
# study_14 has 100 datasets, we overwrite the default `limit` because otherwise
# we don't know if the results are limited by filtering on the tag.
@@ -205,8 +205,8 @@ def test_list_data_tag(tag: str, count: int, api_client: TestClient) -> None:
assert len(datasets) == count
-def test_list_data_tag_empty(api_client: TestClient) -> None:
- response = api_client.post(
+def test_list_data_tag_empty(py_api: TestClient) -> None:
+ response = py_api.post(
"/datasets/list",
json={"status": "all", "tag": "not-a-tag"},
)
@@ -226,8 +226,8 @@ def test_list_data_tag_empty(api_client: TestClient) -> None:
("number_missing_values", "2..100000", 22),
],
)
-def test_list_data_quality(quality: str, range_: str, count: int, api_client: TestClient) -> None:
- response = api_client.post(
+def test_list_data_quality(quality: str, range_: str, count: int, py_api: TestClient) -> None:
+ response = py_api.post(
"/datasets/list",
json={"status": "all", quality: range_},
)
@@ -258,7 +258,8 @@ def test_list_data_quality(quality: str, range_: str, count: int, api_client: Te
api_key=st.sampled_from([None, ApiKey.REGULAR_USER, ApiKey.OWNER_USER]),
) # type: ignore[misc] # https://github.com/openml/server-api/issues/108
def test_list_data_identical(
- api_client: TestClient,
+ py_api: TestClient,
+ php_api: httpx.Client,
**kwargs: dict[str, Any],
) -> Any:
limit, offset = kwargs["limit"], kwargs["offset"]
@@ -275,7 +276,7 @@ def test_list_data_identical(
if offset is not None:
new_style["pagination"]["offset"] = offset
- response = api_client.post(
+ response = py_api.post(
f"/datasets/list{api_key_query}",
json=new_style,
)
@@ -286,11 +287,11 @@ def test_list_data_identical(
for filter_, value in kwargs.items()
if value is not None
]
- uri = "http://server-api-php-api-1:80/api/v1/json/data/list"
+ uri = "/data/list"
if query:
uri += f"/{'/'.join([str(v) for q in query for v in q])}"
uri += api_key_query
- original = httpx.get(uri)
+ original = php_api.get(uri)
assert original.status_code == response.status_code, response.json()
if original.status_code == http.client.PRECONDITION_FAILED:
diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py
index 1aff3dc2..98a11941 100644
--- a/tests/routers/openml/datasets_test.py
+++ b/tests/routers/openml/datasets_test.py
@@ -17,9 +17,9 @@
def test_error_unknown_dataset(
dataset_id: int,
response_code: int,
- api_client: TestClient,
+ py_api: TestClient,
) -> None:
- response = cast(httpx.Response, api_client.get(f"/datasets/{dataset_id}"))
+ response = cast(httpx.Response, py_api.get(f"/datasets/{dataset_id}"))
assert response.status_code == response_code
assert {"code": "111", "message": "Unknown dataset"} == response.json()["detail"]
@@ -33,12 +33,12 @@ def test_error_unknown_dataset(
],
)
def test_private_dataset_no_user_no_access(
- api_client: TestClient,
+ py_api: TestClient,
api_key: str | None,
response_code: int,
) -> None:
query = f"?api_key={api_key}" if api_key else ""
- response = cast(httpx.Response, api_client.get(f"/datasets/130{query}"))
+ response = cast(httpx.Response, py_api.get(f"/datasets/130{query}"))
assert response.status_code == response_code
assert {"code": "112", "message": "No access granted"} == response.json()["detail"]
@@ -46,15 +46,15 @@ def test_private_dataset_no_user_no_access(
@pytest.mark.skip("Not sure how to include apikey in test yet.")
def test_private_dataset_owner_access(
- api_client: TestClient,
+ py_api: TestClient,
dataset_130: dict[str, Any],
) -> None:
- response = cast(httpx.Response, api_client.get("/v2/datasets/130?api_key=..."))
+ response = cast(httpx.Response, py_api.get("/datasets/130?api_key=..."))
assert response.status_code == http.client.OK
assert dataset_130 == response.json()
@pytest.mark.skip("Not sure how to include apikey in test yet.")
-def test_private_dataset_admin_access(api_client: TestClient) -> None:
- cast(httpx.Response, api_client.get("/v2/datasets/130?api_key=..."))
+def test_private_dataset_admin_access(py_api: TestClient) -> None:
+ cast(httpx.Response, py_api.get("/datasets/130?api_key=..."))
# test against cached response
diff --git a/tests/routers/openml/migration/datasets_migration_test.py b/tests/routers/openml/migration/datasets_migration_test.py
index 0266e5be..6d93827c 100644
--- a/tests/routers/openml/migration/datasets_migration_test.py
+++ b/tests/routers/openml/migration/datasets_migration_test.py
@@ -14,9 +14,13 @@
"dataset_id",
range(1, 132),
)
-def test_dataset_response_is_identical(dataset_id: int, api_client: TestClient) -> None:
- original = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/{dataset_id}")
- new = api_client.get(f"/datasets/{dataset_id}")
+def test_dataset_response_is_identical(
+ dataset_id: int,
+ py_api: TestClient,
+ php_api: httpx.Client,
+) -> None:
+ original = php_api.get(f"/data/{dataset_id}")
+ new = py_api.get(f"/datasets/{dataset_id}")
if new.status_code == http.client.FORBIDDEN:
assert original.status_code == http.client.PRECONDITION_FAILED
@@ -92,9 +96,9 @@ def test_dataset_response_is_identical(dataset_id: int, api_client: TestClient)
)
def test_error_unknown_dataset(
dataset_id: int,
- api_client: TestClient,
+ py_api: TestClient,
) -> None:
- response = cast(httpx.Response, api_client.get(f"/datasets/{dataset_id}"))
+ response = cast(httpx.Response, py_api.get(f"/datasets/{dataset_id}"))
# The new API has "404 Not Found" instead of "412 PRECONDITION_FAILED"
assert response.status_code == http.client.NOT_FOUND
@@ -106,11 +110,11 @@ def test_error_unknown_dataset(
[None, "a" * 32],
)
def test_private_dataset_no_user_no_access(
- api_client: TestClient,
+ py_api: TestClient,
api_key: str | None,
) -> None:
query = f"?api_key={api_key}" if api_key else ""
- response = cast(httpx.Response, api_client.get(f"/datasets/130{query}"))
+ response = cast(httpx.Response, py_api.get(f"/datasets/130{query}"))
# New response is 403: Forbidden instead of 412: PRECONDITION FAILED
assert response.status_code == http.client.FORBIDDEN
@@ -119,17 +123,17 @@ def test_private_dataset_no_user_no_access(
@pytest.mark.skip("Not sure how to include apikey in test yet.")
def test_private_dataset_owner_access(
- api_client: TestClient,
+ py_api: TestClient,
dataset_130: dict[str, Any],
) -> None:
- response = cast(httpx.Response, api_client.get("/datasets/130?api_key=..."))
+ response = cast(httpx.Response, py_api.get("/datasets/130?api_key=..."))
assert response.status_code == http.client.OK
assert dataset_130 == response.json()
@pytest.mark.skip("Not sure how to include apikey in test yet.")
-def test_private_dataset_admin_access(api_client: TestClient) -> None:
- cast(httpx.Response, api_client.get("/datasets/130?api_key=..."))
+def test_private_dataset_admin_access(py_api: TestClient) -> None:
+ cast(httpx.Response, py_api.get("/datasets/130?api_key=..."))
# test against cached response
@@ -152,10 +156,11 @@ def test_dataset_tag_response_is_identical(
dataset_id: int,
tag: str,
api_key: str,
- api_client: TestClient,
+ py_api: TestClient,
+ php_api: httpx.Client,
) -> None:
- original = httpx.post(
- "http://server-api-php-api-1:80/api/v1/json/data/tag",
+ original = php_api.post(
+ "/data/tag",
data={"api_key": api_key, "tag": tag, "data_id": dataset_id},
)
if (
@@ -165,13 +170,13 @@ def test_dataset_tag_response_is_identical(
pytest.skip("Encountered Elastic Search error.")
if original.status_code == http.client.OK:
# undo the tag, because we don't want to persist this change to the database
- httpx.post(
- "http://server-api-php-api-1:80/api/v1/json/data/untag",
+ php_api.post(
+ "/data/untag",
data={"api_key": api_key, "tag": tag, "data_id": dataset_id},
)
new = cast(
httpx.Response,
- api_client.post(
+ py_api.post(
f"/datasets/tag?api_key={api_key}",
json={"data_id": dataset_id, "tag": tag},
),
diff --git a/tests/routers/openml/qualities_test.py b/tests/routers/openml/qualities_test.py
index eb30b442..851f7791 100644
--- a/tests/routers/openml/qualities_test.py
+++ b/tests/routers/openml/qualities_test.py
@@ -28,16 +28,16 @@ def _remove_quality_from_database(quality_name: str, expdb_test: Connection) ->
@pytest.mark.php()
-def test_list_qualities_identical(api_client: TestClient) -> None:
- original = httpx.get("http://server-api-php-api-1:80/api/v1/json/data/qualities/list")
- new = api_client.get("/datasets/qualities/list")
+def test_list_qualities_identical(py_api: TestClient, php_api: httpx.Client) -> None:
+ original = php_api.get("/data/qualities/list")
+ new = py_api.get("/datasets/qualities/list")
assert original.status_code == new.status_code
assert original.json() == new.json()
# To keep the test idempotent, we cannot test if reaction to database changes is identical
-def test_list_qualities(api_client: TestClient, expdb_test: Connection) -> None:
- response = api_client.get("/datasets/qualities/list")
+def test_list_qualities(py_api: TestClient, expdb_test: Connection) -> None:
+ response = py_api.get("/datasets/qualities/list")
assert response.status_code == http.client.OK
expected = {
"data_qualities_list": {
@@ -157,13 +157,13 @@ def test_list_qualities(api_client: TestClient, expdb_test: Connection) -> None:
deleted = expected["data_qualities_list"]["quality"].pop()
_remove_quality_from_database(quality_name=deleted, expdb_test=expdb_test)
- response = api_client.get("/datasets/qualities/list")
+ response = py_api.get("/datasets/qualities/list")
assert response.status_code == http.client.OK
assert expected == response.json()
-def test_get_quality(api_client: TestClient) -> None:
- response = api_client.get("/datasets/qualities/1")
+def test_get_quality(py_api: TestClient) -> None:
+ response = py_api.get("/datasets/qualities/1")
assert response.status_code == http.client.OK
expected = [
{"name": "AutoCorrelation", "value": 0.6064659977703456},
@@ -282,9 +282,9 @@ def test_get_quality(api_client: TestClient) -> None:
"data_id",
list(set(range(1, 132)) - {55, 56, 59, 116, 130}),
)
-def test_get_quality_identical(data_id: int, api_client: TestClient) -> None:
- php_response = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/qualities/{data_id}")
- python_response = api_client.get(f"/datasets/qualities/{data_id}")
+def test_get_quality_identical(data_id: int, py_api: TestClient, php_api: httpx.Client) -> None:
+ php_response = php_api.get(f"/data/qualities/{data_id}")
+ python_response = py_api.get(f"/datasets/qualities/{data_id}")
assert python_response.status_code == php_response.status_code
expected = [
@@ -302,13 +302,17 @@ def test_get_quality_identical(data_id: int, api_client: TestClient) -> None:
"data_id",
[55, 56, 59, 116, 130, 132],
)
-def test_get_quality_identical_error(data_id: int, api_client: TestClient) -> None:
+def test_get_quality_identical_error(
+ data_id: int,
+ py_api: TestClient,
+ php_api: httpx.Client,
+) -> None:
if data_id in [55, 56, 59]:
pytest.skip("Detailed error for code 364 (failed processing) not yet supported.")
if data_id in [116]:
pytest.skip("Detailed error for code 362 (no qualities) not yet supported.")
- php_response = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/qualities/{data_id}")
- python_response = api_client.get(f"/datasets/qualities/{data_id}")
+ php_response = php_api.get(f"/data/qualities/{data_id}")
+ python_response = py_api.get(f"/datasets/qualities/{data_id}")
assert python_response.status_code == php_response.status_code
# The "dataset unknown" error currently has a separate code in PHP depending on
# where it occurs (e.g., get dataset->113 get quality->361)
diff --git a/tests/routers/openml/task_type_test.py b/tests/routers/openml/task_type_test.py
index 2d91cab8..c5182716 100644
--- a/tests/routers/openml/task_type_test.py
+++ b/tests/routers/openml/task_type_test.py
@@ -7,9 +7,9 @@
@pytest.mark.php()
-def test_list_task_type(api_client: TestClient) -> None:
- response = api_client.get("/tasktype/list")
- original = httpx.get("http://server-api-php-api-1:80/api/v1/json/tasktype/list")
+def test_list_task_type(py_api: TestClient, php_api: httpx.Client) -> None:
+ response = py_api.get("/tasktype/list")
+ original = php_api.get("/tasktype/list")
assert response.status_code == original.status_code
assert response.json() == original.json()
@@ -19,9 +19,9 @@ def test_list_task_type(api_client: TestClient) -> None:
"ttype_id",
list(range(1, 12)),
)
-def test_get_task_type(ttype_id: int, api_client: TestClient) -> None:
- response = api_client.get(f"/tasktype/{ttype_id}")
- original = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/tasktype/{ttype_id}")
+def test_get_task_type(ttype_id: int, py_api: TestClient, php_api: httpx.Client) -> None:
+ response = py_api.get(f"/tasktype/{ttype_id}")
+ original = php_api.get(f"/tasktype/{ttype_id}")
assert response.status_code == original.status_code
py_json = response.json()
@@ -36,7 +36,7 @@ def test_get_task_type(ttype_id: int, api_client: TestClient) -> None:
assert not differences
-def test_get_task_type_unknown(api_client: TestClient) -> None:
- response = api_client.get("/tasktype/1000")
+def test_get_task_type_unknown(py_api: TestClient) -> None:
+ response = py_api.get("/tasktype/1000")
assert response.status_code == http.client.PRECONDITION_FAILED
assert response.json() == {"detail": {"code": "241", "message": "Unknown task type."}}
From 714eaf25d90c8de1435123db7605867f31c319b0 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 28 Nov 2023 16:21:36 +0100
Subject: [PATCH 3/3] Update for removal of distinct V1 and V2 endpoints
---
docs/migration.md | 90 ++++++++++++++++-------------------------------
1 file changed, 30 insertions(+), 60 deletions(-)
diff --git a/docs/migration.md b/docs/migration.md
index e3e9c38d..dc49e33c 100644
--- a/docs/migration.md
+++ b/docs/migration.md
@@ -1,31 +1,22 @@
# Migration
-This implementation currently maintains two separate endpoints.
-There are "old" endpoints (Python-V1), which mimic responses of the PHP REST API (PHP-V1)
-as closely as possible, and new endpoints (V2) which have some additional changes and
-will be updated going forward.
-
-The advised way to upgrade connector packages that currently interact with the old
-JSON API is to first migrate from the old PHP API to our re-implemented V1 API.
-See "[V1: PHP to Python](#v1--php-to-python)" for the differences between the PHP and
-Python API. After that migration, continue with the "[V1 to V2](#v1-to-v2)" guide.
-
-Connectors currently using the XML API are recommended to upgrade to V2 directly,
-in which case using the generated REST API documentation is recommended.
-
-## V1: PHP to Python
-
-The first iteration of the new server has nearly identical responses to the old JSON
-endpoints, but there are exceptions. Most exceptions either bug fixes, or arise from
-technical limitations. This list covers the most important changes, but there may
-be some undocumented changes for edge cases. The PHP API was underspecified, and we
-decided that reverse engineering the specifications which mostly arise from
-implementation details was not worth the effort. If there is a behavioral change which
-was not documented but affects you, please [open a bug report](https://github.com/openml/server-api/issues/new?assignees=&labels=bug%2C+triage&projects=&template=bug-report.md&title=).
-
-### All Endpoints
+The Python reimplementation provides the same endpoints as the old API, which
+largely function the same way. However, there are a few major deviations:
+
+ * Use of typed JSON: e.g., when a value represents an integer, it is returned as an integer.
+ * Lists when multiple values are possible: if a field can have none, one, or multiple entries (e.g., authors), we always return a list.
+ * Restriction or expansion of input types as appropriate.
+ * Standardizing authentication and access messages, and consistently executing those checks
+   before fetching data or providing error messages about the data.
+
+The list above is not exhaustive. Minor changes include, for example, bug fixes and the removal of unnecessary nesting.
+There may be undocumented changes, especially in edge cases which may not have occurred in the test environment.
+As the PHP API was underspecified, the re-implementation is based on a mix of reading old code and probing the API.
+If there is a behavioral change which was not documented but affects you, please [open a bug report](https://github.com/openml/server-api/issues/new?assignees=&labels=bug%2C+triage&projects=&template=bug-report.md&title=).
+
+## All Endpoints
The following changes affect all endpoints.
-#### Error on Invalid Input
+### Error on Invalid Input
When providing input of invalid types (e.g., a non-integer dataset id) the HTTP header
and JSON content will be different.
@@ -45,7 +36,7 @@ and JSON content will be different.
These endpoints now do enforce stricter input constraints.
Constraints for each endpoint parameter are documented in the API docs.
-#### Other Errors
+### Other Errors
For any other error messages, the response is identical except that outer field will be `"detail"` instead of `"error"`:
```diff title="JSON Content"
@@ -53,9 +44,7 @@ For any other error messages, the response is identical except that outer field
+ {"detail":{"code":"112","message":"No access granted"}}
```
-In some cases the JSON endpoints previously returned XML ([example](https://github.com/openml/OpenML/issues/1200)).
-Python-V1 will always return JSON.
-
+In some cases the JSON endpoints previously returned XML ([example](https://github.com/openml/OpenML/issues/1200)); the new API always returns JSON.
```diff title="XML replaced by JSON"
-
@@ -65,20 +54,26 @@ Python-V1 will always return JSON.
+ {"detail": {"code":"103", "message": "Authentication failed" } }
```
-### Datasets
+## Datasets
-#### `GET /{dataset_id}`
+### `GET /{dataset_id}`
- Dataset format names are normalized to be all lower-case
(`"Sparse_ARFF"` -> `"sparse_arff"`).
- - Non-`arff` datasets will not incorrectly have a `"parquet_url"`:
- https://github.com/openml/OpenML/issues/1189
+ - Non-`arff` datasets will not incorrectly have a `"parquet_url"` ([openml#1189](https://github.com/openml/OpenML/issues/1189)).
- If `"creator"` contains multiple comma-separated creators it is always returned
as a list, instead of it depending on the quotation used by the original uploader.
- For (some?) datasets that have multiple values in `"ignore_attribute"`, this field
is correctly populated instead of omitted.
+ - Processing date is formatted with a `T` in the middle:
+ ```diff title="processing_date"
+ - "2019-07-09 15:22:03"
+ + "2019-07-09T15:22:03"
+ ```
+ - Fields which may contain lists of values (e.g., `creator`, `contributor`) now always
+   return a list (which may also be empty or contain a single element).
+ - Fields without a set value are no longer automatically removed from the response.
-
-#### `GET /data/list/{filters}`
+### `GET /data/list/{filters}`
The endpoint now accepts the filters in the body of the request, instead of as query parameters.
```diff
@@ -95,28 +90,3 @@ includes datasets which are private.
The `limit` and `offset` parameters can now be used independently, you no longer need
to provide both if you wish to set only one.
-
-## V1 to V2
-Most of the changes are focused on standardizing responses, working on:
-
- * using JSON types.
- * removing levels of nesting for endpoints which return single-field JSON.
- * always returning lists for fields which may contain multiple values even if it
- contains only one element or no element.
- * restricting or expanding input types as appropriate.
- * standardizing authentication and access messages, and consistently execute those checks
- before fetching data or providing error messages about the data.
-
-
-### Datasets
-
-#### `GET /{dataset_id}`
-
- - Processing date is formatted with a `T` in the middle:
- ```diff title="processing_date"
- - "2019-07-09 15:22:03"
- + "2019-07-09T15:22:03"
- ```
- - Fields which may contain lists of values (e.g., `creator`, `contributor`) now always
- returns a list (which may also be empty or contain a single element).
- - Fields without a set value are no longer automatically removed from the response.