From 3d1ccc0c7584c8ae69d6bebf9bc3bd45c332e3e0 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 28 Nov 2023 12:12:36 +0100
Subject: [PATCH 1/3] Remove V1/V2 distinction, separate migration tests
---
src/main.py | 8 +-
src/routers/mldcat_ap/dataset.py | 2 +-
src/routers/{v1 => openml}/__init__.py | 0
src/routers/{v1 => openml}/datasets.py | 219 +++++++++++++-----
src/routers/{v1 => openml}/qualities.py | 4 +-
src/routers/{v1 => openml}/tasktype.py | 2 +-
src/routers/v2/datasets.py | 186 ---------------
.../{v1 => openml}/dataset_tag_test.py | 64 +----
.../datasets_list_datasets_test.py | 28 +--
tests/routers/{v2 => openml}/datasets_test.py | 4 +-
.../migration/datasets_migration_test.py | 187 +++++++++++++++
.../routers/{v1 => openml}/qualities_test.py | 12 +-
.../routers/{v1 => openml}/task_type_test.py | 6 +-
tests/routers/v1/datasets_old_test.py | 98 --------
14 files changed, 389 insertions(+), 431 deletions(-)
rename src/routers/{v1 => openml}/__init__.py (100%)
rename src/routers/{v1 => openml}/datasets.py (58%)
rename src/routers/{v1 => openml}/qualities.py (95%)
rename src/routers/{v1 => openml}/tasktype.py (98%)
delete mode 100644 src/routers/v2/datasets.py
rename tests/routers/{v1 => openml}/dataset_tag_test.py (60%)
rename tests/routers/{v1 => openml}/datasets_list_datasets_test.py (94%)
rename tests/routers/{v2 => openml}/datasets_test.py (91%)
create mode 100644 tests/routers/openml/migration/datasets_migration_test.py
rename tests/routers/{v1 => openml}/qualities_test.py (97%)
rename tests/routers/{v1 => openml}/task_type_test.py (89%)
delete mode 100644 tests/routers/v1/datasets_old_test.py
diff --git a/src/main.py b/src/main.py
index ffd2f7c6..db548f63 100644
--- a/src/main.py
+++ b/src/main.py
@@ -3,10 +3,9 @@
import uvicorn
from fastapi import FastAPI
from routers.mldcat_ap.dataset import router as mldcat_ap_router
-from routers.v1.datasets import router as datasets_router_v1_format
-from routers.v1.qualities import router as qualities_router
-from routers.v1.tasktype import router as ttype_router
-from routers.v2.datasets import router as datasets_router
+from routers.openml.datasets import router as datasets_router
+from routers.openml.qualities import router as qualities_router
+from routers.openml.tasktype import router as ttype_router
def _parse_args() -> argparse.Namespace:
@@ -39,7 +38,6 @@ def create_api() -> FastAPI:
app = FastAPI()
app.include_router(datasets_router)
- app.include_router(datasets_router_v1_format)
app.include_router(qualities_router)
app.include_router(mldcat_ap_router)
app.include_router(ttype_router)
diff --git a/src/routers/mldcat_ap/dataset.py b/src/routers/mldcat_ap/dataset.py
index 8d9994b7..771edb83 100644
--- a/src/routers/mldcat_ap/dataset.py
+++ b/src/routers/mldcat_ap/dataset.py
@@ -5,7 +5,7 @@
from sqlalchemy import Connection
from routers.dependencies import expdb_connection, userdb_connection
-from routers.v2.datasets import get_dataset
+from routers.openml.datasets import get_dataset
router = APIRouter(prefix="/mldcat_ap/datasets", tags=["datasets"])
diff --git a/src/routers/v1/__init__.py b/src/routers/openml/__init__.py
similarity index 100%
rename from src/routers/v1/__init__.py
rename to src/routers/openml/__init__.py
diff --git a/src/routers/v1/datasets.py b/src/routers/openml/datasets.py
similarity index 58%
rename from src/routers/v1/datasets.py
rename to src/routers/openml/datasets.py
index 172890c7..65c18a61 100644
--- a/src/routers/v1/datasets.py
+++ b/src/routers/openml/datasets.py
@@ -2,22 +2,30 @@
We add separate endpoints for old-style JSON responses, so they don't clutter the schema of the
new API, and are easily removed later.
"""
+import html
import http.client
-from enum import StrEnum
+from collections import namedtuple
+from enum import IntEnum, StrEnum
from typing import Annotated, Any, Literal
-from database.datasets import get_tags
+from database.datasets import get_dataset as db_get_dataset
+from database.datasets import (
+ get_file,
+ get_latest_dataset_description,
+ get_latest_processing_update,
+ get_latest_status_update,
+ get_tags,
+)
from database.datasets import tag_dataset as db_tag_dataset
-from database.users import APIKey, User, UserGroup
+from database.users import APIKey, User, UserGroup, get_user_groups_for, get_user_id_for
from fastapi import APIRouter, Body, Depends, HTTPException
-from schemas.datasets.openml import DatasetStatus
+from schemas.datasets.openml import DatasetFileFormat, DatasetMetadata, DatasetStatus, Visibility
from sqlalchemy import Connection
from routers.dependencies import Pagination, expdb_connection, fetch_user, userdb_connection
from routers.types import CasualString128, IntegerRange, SystemString64, integer_range_regex
-from routers.v2.datasets import get_dataset
-router = APIRouter(prefix="/v1/datasets", tags=["datasets"])
+router = APIRouter(prefix="/datasets", tags=["datasets"])
@router.post(
@@ -238,58 +246,161 @@ def quality_clause(quality: str, range_: str | None) -> str:
return {"data": {"dataset": list(datasets.values())}}
+class DatasetError(IntEnum):
+ NOT_FOUND = 111
+ NO_ACCESS = 112
+ NO_DATA_FILE = 113
+
+
+processing_info = namedtuple("processing_info", ["date", "warning", "error"])
+
+
+def _get_processing_information(dataset_id: int, connection: Connection) -> processing_info:
+ """Return processing information, if any. Otherwise, all fields `None`."""
+ if not (data_processed := get_latest_processing_update(dataset_id, connection)):
+ return processing_info(date=None, warning=None, error=None)
+
+ date_processed = data_processed["processing_date"]
+ warning = data_processed["warning"].strip() if data_processed["warning"] else None
+ error = data_processed["error"].strip() if data_processed["error"] else None
+ return processing_info(date=date_processed, warning=warning, error=error)
+
+
+def _format_error(*, code: DatasetError, message: str) -> dict[str, str]:
+ """Formatter for JSON bodies of OpenML error codes."""
+ return {"code": str(code), "message": message}
+
+
+def _user_has_access(
+ dataset: dict[str, Any],
+ connection: Connection,
+ api_key: APIKey | None = None,
+) -> bool:
+ """Determine if user of `api_key` has the right to view `dataset`."""
+ if dataset["visibility"] == Visibility.PUBLIC:
+ return True
+ if not api_key:
+ return False
+
+ if not (user_id := get_user_id_for(api_key=api_key, connection=connection)):
+ return False
+
+ if user_id == dataset["uploader"]:
+ return True
+
+ user_groups = get_user_groups_for(user_id=user_id, connection=connection)
+ ADMIN_GROUP = 1
+ return ADMIN_GROUP in user_groups
+
+
+def _format_parquet_url(dataset: dict[str, Any]) -> str | None:
+ if dataset["format"].lower() != DatasetFileFormat.ARFF:
+ return None
+
+ minio_base_url = "https://openml1.win.tue.nl"
+ return f"{minio_base_url}/dataset{dataset['did']}/dataset_{dataset['did']}.pq"
+
+
+def _format_dataset_url(dataset: dict[str, Any]) -> str:
+ base_url = "https://test.openml.org"
+ filename = f"{html.escape(dataset['name'])}.{dataset['format'].lower()}"
+ return f"{base_url}/data/v1/download/{dataset['file_id']}/{filename}"
+
+
+def _safe_unquote(text: str | None) -> str | None:
+ """Remove any open and closing quotes and return the remainder if non-empty."""
+ if not text:
+ return None
+ return text.strip("'\"") or None
+
+
+def _csv_as_list(text: str | None, *, unquote_items: bool = True) -> list[str]:
+ """Return comma-separated values in `text` as list, optionally remove quotes."""
+ if not text:
+ return []
+ chars_to_strip = "'\"\t " if unquote_items else "\t "
+ return [item.strip(chars_to_strip) for item in text.split(",")]
+
+
@router.get(
path="/{dataset_id}",
- description="Get old-style wrapped meta-data for dataset with ID `dataset_id`.",
+ description="Get meta-data for dataset with ID `dataset_id`.",
)
-def get_dataset_wrapped(
+def get_dataset(
dataset_id: int,
api_key: APIKey | None = None,
user_db: Annotated[Connection, Depends(userdb_connection)] = None,
expdb_db: Annotated[Connection, Depends(expdb_connection)] = None,
-) -> dict[str, dict[str, Any]]:
- try:
- dataset = get_dataset(
- user_db=user_db,
- expdb_db=expdb_db,
- dataset_id=dataset_id,
- api_key=api_key,
- ).model_dump(by_alias=True)
- except HTTPException as e:
- raise HTTPException(
- status_code=http.client.PRECONDITION_FAILED,
- detail=e.detail,
- ) from None
- if processing_data := dataset.get("processing_date"):
- dataset["processing_date"] = str(processing_data).replace("T", " ")
- if parquet_url := dataset.get("parquet_url"):
- dataset["parquet_url"] = str(parquet_url).replace("https", "http")
- if minio_url := dataset.get("minio_url"):
- dataset["minio_url"] = str(minio_url).replace("https", "http")
-
- manual = []
- # ref test.openml.org/d/33 (contributor) and d/34 (creator)
- # contributor/creator in database is '""'
- # json content is []
- for field in ["contributor", "creator"]:
- if dataset[field] == [""]:
- dataset[field] = []
- manual.append(field)
-
- if isinstance(dataset["original_data_url"], list):
- dataset["original_data_url"] = ", ".join(str(url) for url in dataset["original_data_url"])
-
- for field, value in list(dataset.items()):
- if field in manual:
- continue
- if isinstance(value, int):
- dataset[field] = str(value)
- elif isinstance(value, list) and len(value) == 1:
- dataset[field] = str(value[0])
- if not dataset[field]:
- del dataset[field]
-
- if "description" not in dataset:
- dataset["description"] = []
-
- return {"data_set_description": dataset}
+) -> DatasetMetadata:
+ if not (dataset := db_get_dataset(dataset_id, expdb_db)):
+ error = _format_error(code=DatasetError.NOT_FOUND, message="Unknown dataset")
+ raise HTTPException(status_code=http.client.NOT_FOUND, detail=error)
+
+ if not _user_has_access(dataset=dataset, connection=user_db, api_key=api_key):
+ error = _format_error(code=DatasetError.NO_ACCESS, message="No access granted")
+ raise HTTPException(status_code=http.client.FORBIDDEN, detail=error)
+
+ if not (dataset_file := get_file(dataset["file_id"], user_db)):
+ error = _format_error(
+ code=DatasetError.NO_DATA_FILE,
+ message="No data file found",
+ )
+ raise HTTPException(status_code=http.client.PRECONDITION_FAILED, detail=error)
+
+ tags = get_tags(dataset_id, expdb_db)
+ description = get_latest_dataset_description(dataset_id, expdb_db)
+ processing_result = _get_processing_information(dataset_id, expdb_db)
+ status = get_latest_status_update(dataset_id, expdb_db)
+
+ status_ = DatasetStatus(status["status"]) if status else DatasetStatus.IN_PREPARATION
+
+ description_ = ""
+ if description:
+ description_ = description["description"].replace("\r", "").strip()
+
+ dataset_url = _format_dataset_url(dataset)
+ parquet_url = _format_parquet_url(dataset)
+
+ contributors = _csv_as_list(dataset["contributor"], unquote_items=True)
+ creators = _csv_as_list(dataset["creator"], unquote_items=True)
+ ignore_attribute = _csv_as_list(dataset["ignore_attribute"], unquote_items=True)
+ row_id_attribute = _csv_as_list(dataset["row_id_attribute"], unquote_items=True)
+ original_data_url = _csv_as_list(dataset["original_data_url"], unquote_items=True)
+
+ # Not sure which properties are set by this bit:
+ # foreach( $this->xml_fields_dataset['csv'] as $field ) {
+ # $dataset->{$field} = getcsv( $dataset->{$field} );
+ # }
+
+ return DatasetMetadata(
+ id=dataset["did"],
+ visibility=dataset["visibility"],
+ status=status_,
+ name=dataset["name"],
+ licence=dataset["licence"],
+ version=dataset["version"],
+ version_label=dataset["version_label"] or "",
+ language=dataset["language"] or "",
+ creator=creators,
+ contributor=contributors,
+ citation=dataset["citation"] or "",
+ upload_date=dataset["upload_date"],
+ processing_date=processing_result.date,
+ warning=processing_result.warning,
+ error=processing_result.error,
+ description=description_,
+ description_version=description["version"] if description else 0,
+ tag=tags,
+ default_target_attribute=_safe_unquote(dataset["default_target_attribute"]),
+ ignore_attribute=ignore_attribute,
+ row_id_attribute=row_id_attribute,
+ url=dataset_url,
+ parquet_url=parquet_url,
+ minio_url=parquet_url,
+ file_id=dataset["file_id"],
+ format=dataset["format"].lower(),
+ paper_url=dataset["paper_url"] or None,
+ original_data_url=original_data_url,
+ collection_date=dataset["collection_date"],
+ md5_checksum=dataset_file["md5_hash"],
+ )
diff --git a/src/routers/v1/qualities.py b/src/routers/openml/qualities.py
similarity index 95%
rename from src/routers/v1/qualities.py
rename to src/routers/openml/qualities.py
index 366e208d..569f5d03 100644
--- a/src/routers/v1/qualities.py
+++ b/src/routers/openml/qualities.py
@@ -8,9 +8,9 @@
from sqlalchemy import Connection, text
from routers.dependencies import expdb_connection, fetch_user
-from routers.v2.datasets import DatasetError
+from routers.openml.datasets import DatasetError
-router = APIRouter(prefix="/v1/datasets", tags=["datasets"])
+router = APIRouter(prefix="/datasets", tags=["datasets"])
@router.get("/qualities/list")
diff --git a/src/routers/v1/tasktype.py b/src/routers/openml/tasktype.py
similarity index 98%
rename from src/routers/v1/tasktype.py
rename to src/routers/openml/tasktype.py
index 49930843..909cea5f 100644
--- a/src/routers/v1/tasktype.py
+++ b/src/routers/openml/tasktype.py
@@ -9,7 +9,7 @@
from routers.dependencies import expdb_connection
-router = APIRouter(prefix="/v1/tasktype", tags=["tasks"])
+router = APIRouter(prefix="/tasktype", tags=["tasks"])
def _normalize_task_type(task_type: dict[str, str | int]) -> dict[str, str | list[Any]]:
diff --git a/src/routers/v2/datasets.py b/src/routers/v2/datasets.py
deleted file mode 100644
index 864a9eef..00000000
--- a/src/routers/v2/datasets.py
+++ /dev/null
@@ -1,186 +0,0 @@
-import html
-import http.client
-from collections import namedtuple
-from enum import IntEnum
-from typing import Annotated, Any
-
-from database.datasets import get_dataset as db_get_dataset
-from database.datasets import (
- get_file,
- get_latest_dataset_description,
- get_latest_processing_update,
- get_latest_status_update,
- get_tags,
-)
-from database.users import APIKey, get_user_groups_for, get_user_id_for
-from fastapi import APIRouter, Depends, HTTPException
-from routers.dependencies import expdb_connection, userdb_connection
-from schemas.datasets.openml import (
- DatasetFileFormat,
- DatasetMetadata,
- DatasetStatus,
- Visibility,
-)
-from sqlalchemy import Connection
-
-router = APIRouter(prefix="/v2/datasets", tags=["datasets"])
-
-
-class DatasetError(IntEnum):
- NOT_FOUND = 111
- NO_ACCESS = 112
- NO_DATA_FILE = 113
-
-
-processing_info = namedtuple("processing_info", ["date", "warning", "error"])
-
-
-def _get_processing_information(dataset_id: int, connection: Connection) -> processing_info:
- """Return processing information, if any. Otherwise, all fields `None`."""
- if not (data_processed := get_latest_processing_update(dataset_id, connection)):
- return processing_info(date=None, warning=None, error=None)
-
- date_processed = data_processed["processing_date"]
- warning = data_processed["warning"].strip() if data_processed["warning"] else None
- error = data_processed["error"].strip() if data_processed["error"] else None
- return processing_info(date=date_processed, warning=warning, error=error)
-
-
-def _format_error(*, code: DatasetError, message: str) -> dict[str, str]:
- """Formatter for JSON bodies of OpenML error codes."""
- return {"code": str(code), "message": message}
-
-
-def _user_has_access(
- dataset: dict[str, Any],
- connection: Connection,
- api_key: APIKey | None = None,
-) -> bool:
- """Determine if user of `api_key` has the right to view `dataset`."""
- if dataset["visibility"] == Visibility.PUBLIC:
- return True
- if not api_key:
- return False
-
- if not (user_id := get_user_id_for(api_key=api_key, connection=connection)):
- return False
-
- if user_id == dataset["uploader"]:
- return True
-
- user_groups = get_user_groups_for(user_id=user_id, connection=connection)
- ADMIN_GROUP = 1
- return ADMIN_GROUP in user_groups
-
-
-def _format_parquet_url(dataset: dict[str, Any]) -> str | None:
- if dataset["format"].lower() != DatasetFileFormat.ARFF:
- return None
-
- minio_base_url = "https://openml1.win.tue.nl"
- return f"{minio_base_url}/dataset{dataset['did']}/dataset_{dataset['did']}.pq"
-
-
-def _format_dataset_url(dataset: dict[str, Any]) -> str:
- base_url = "https://test.openml.org"
- filename = f"{html.escape(dataset['name'])}.{dataset['format'].lower()}"
- return f"{base_url}/data/v1/download/{dataset['file_id']}/{filename}"
-
-
-def _safe_unquote(text: str | None) -> str | None:
- """Remove any open and closing quotes and return the remainder if non-empty."""
- if not text:
- return None
- return text.strip("'\"") or None
-
-
-def _csv_as_list(text: str | None, *, unquote_items: bool = True) -> list[str]:
- """Return comma-separated values in `text` as list, optionally remove quotes."""
- if not text:
- return []
- chars_to_strip = "'\"\t " if unquote_items else "\t "
- return [item.strip(chars_to_strip) for item in text.split(",")]
-
-
-@router.get(
- path="/{dataset_id}",
- description="Get meta-data for dataset with ID `dataset_id`.",
-)
-def get_dataset(
- dataset_id: int,
- api_key: APIKey | None = None,
- user_db: Annotated[Connection, Depends(userdb_connection)] = None,
- expdb_db: Annotated[Connection, Depends(expdb_connection)] = None,
-) -> DatasetMetadata:
- if not (dataset := db_get_dataset(dataset_id, expdb_db)):
- error = _format_error(code=DatasetError.NOT_FOUND, message="Unknown dataset")
- raise HTTPException(status_code=http.client.NOT_FOUND, detail=error)
-
- if not _user_has_access(dataset=dataset, connection=user_db, api_key=api_key):
- error = _format_error(code=DatasetError.NO_ACCESS, message="No access granted")
- raise HTTPException(status_code=http.client.FORBIDDEN, detail=error)
-
- if not (dataset_file := get_file(dataset["file_id"], user_db)):
- error = _format_error(
- code=DatasetError.NO_DATA_FILE,
- message="No data file found",
- )
- raise HTTPException(status_code=http.client.PRECONDITION_FAILED, detail=error)
-
- tags = get_tags(dataset_id, expdb_db)
- description = get_latest_dataset_description(dataset_id, expdb_db)
- processing_result = _get_processing_information(dataset_id, expdb_db)
- status = get_latest_status_update(dataset_id, expdb_db)
-
- status_ = DatasetStatus(status["status"]) if status else DatasetStatus.IN_PREPARATION
-
- description_ = ""
- if description:
- description_ = description["description"].replace("\r", "").strip()
-
- dataset_url = _format_dataset_url(dataset)
- parquet_url = _format_parquet_url(dataset)
-
- contributors = _csv_as_list(dataset["contributor"], unquote_items=True)
- creators = _csv_as_list(dataset["creator"], unquote_items=True)
- ignore_attribute = _csv_as_list(dataset["ignore_attribute"], unquote_items=True)
- row_id_attribute = _csv_as_list(dataset["row_id_attribute"], unquote_items=True)
- original_data_url = _csv_as_list(dataset["original_data_url"], unquote_items=True)
-
- # Not sure which properties are set by this bit:
- # foreach( $this->xml_fields_dataset['csv'] as $field ) {
- # $dataset->{$field} = getcsv( $dataset->{$field} );
- # }
-
- return DatasetMetadata(
- id=dataset["did"],
- visibility=dataset["visibility"],
- status=status_,
- name=dataset["name"],
- licence=dataset["licence"],
- version=dataset["version"],
- version_label=dataset["version_label"] or "",
- language=dataset["language"] or "",
- creator=creators,
- contributor=contributors,
- citation=dataset["citation"] or "",
- upload_date=dataset["upload_date"],
- processing_date=processing_result.date,
- warning=processing_result.warning,
- error=processing_result.error,
- description=description_,
- description_version=description["version"] if description else 0,
- tag=tags,
- default_target_attribute=_safe_unquote(dataset["default_target_attribute"]),
- ignore_attribute=ignore_attribute,
- row_id_attribute=row_id_attribute,
- url=dataset_url,
- parquet_url=parquet_url,
- minio_url=parquet_url,
- file_id=dataset["file_id"],
- format=dataset["format"].lower(),
- paper_url=dataset["paper_url"] or None,
- original_data_url=original_data_url,
- collection_date=dataset["collection_date"],
- md5_checksum=dataset_file["md5_hash"],
- )
diff --git a/tests/routers/v1/dataset_tag_test.py b/tests/routers/openml/dataset_tag_test.py
similarity index 60%
rename from tests/routers/v1/dataset_tag_test.py
rename to tests/routers/openml/dataset_tag_test.py
index b3d0e250..f868f2cf 100644
--- a/tests/routers/v1/dataset_tag_test.py
+++ b/tests/routers/openml/dataset_tag_test.py
@@ -21,7 +21,7 @@ def test_dataset_tag_rejects_unauthorized(key: ApiKey, api_client: TestClient) -
response = cast(
httpx.Response,
api_client.post(
- f"/v1/datasets/tag{apikey}",
+ f"/datasets/tag{apikey}",
json={"data_id": constants.PRIVATE_DATASET_ID, "tag": "test"},
),
)
@@ -39,7 +39,7 @@ def test_dataset_tag(key: ApiKey, expdb_test: Connection, api_client: TestClient
response = cast(
httpx.Response,
api_client.post(
- f"/v1/datasets/tag?api_key={key}",
+ f"/datasets/tag?api_key={key}",
json={"data_id": dataset_id, "tag": tag},
),
)
@@ -55,7 +55,7 @@ def test_dataset_tag_returns_existing_tags(api_client: TestClient) -> None:
response = cast(
httpx.Response,
api_client.post(
- f"/v1/datasets/tag?api_key={ApiKey.ADMIN}",
+ f"/datasets/tag?api_key={ApiKey.ADMIN}",
json={"data_id": dataset_id, "tag": tag},
),
)
@@ -68,7 +68,7 @@ def test_dataset_tag_fails_if_tag_exists(api_client: TestClient) -> None:
response = cast(
httpx.Response,
api_client.post(
- f"/v1/datasets/tag?api_key={ApiKey.ADMIN}",
+ f"/datasets/tag?api_key={ApiKey.ADMIN}",
json={"data_id": dataset_id, "tag": tag},
),
)
@@ -95,64 +95,10 @@ def test_dataset_tag_invalid_tag_is_rejected(
new = cast(
httpx.Response,
api_client.post(
- f"/v1/datasets/tag?api_key{ApiKey.ADMIN}",
+ f"/datasets/tag?api_key{ApiKey.ADMIN}",
json={"data_id": 1, "tag": tag},
),
)
assert new.status_code == http.client.UNPROCESSABLE_ENTITY
assert ["body", "tag"] == new.json()["detail"][0]["loc"]
-
-
-@pytest.mark.php()
-@pytest.mark.parametrize(
- "dataset_id",
- list(range(1, 10)) + [101],
-)
-@pytest.mark.parametrize(
- "api_key",
- [ApiKey.ADMIN, ApiKey.REGULAR_USER, ApiKey.OWNER_USER],
- ids=["Administrator", "regular user", "possible owner"],
-)
-@pytest.mark.parametrize(
- "tag",
- ["study_14", "totally_new_tag_for_migration_testing"],
- ids=["typically existing tag", "new tag"],
-)
-def test_dataset_tag_response_is_identical(
- dataset_id: int,
- tag: str,
- api_key: str,
- api_client: TestClient,
-) -> None:
- original = httpx.post(
- "http://server-api-php-api-1:80/api/v1/json/data/tag",
- data={"api_key": api_key, "tag": tag, "data_id": dataset_id},
- )
- if (
- original.status_code == http.client.PRECONDITION_FAILED
- and original.json()["error"]["message"] == "An Elastic Search Exception occured."
- ):
- pytest.skip("Encountered Elastic Search error.")
- if original.status_code == http.client.OK:
- # undo the tag, because we don't want to persist this change to the database
- httpx.post(
- "http://server-api-php-api-1:80/api/v1/json/data/untag",
- data={"api_key": api_key, "tag": tag, "data_id": dataset_id},
- )
- new = cast(
- httpx.Response,
- api_client.post(
- f"/v1/datasets/tag?api_key={api_key}",
- json={"data_id": dataset_id, "tag": tag},
- ),
- )
-
- assert original.status_code == new.status_code, original.json()
- if new.status_code != http.client.OK:
- assert original.json()["error"] == new.json()["detail"]
- return
-
- original = original.json()
- new = new.json()
- assert original == new
diff --git a/tests/routers/v1/datasets_list_datasets_test.py b/tests/routers/openml/datasets_list_datasets_test.py
similarity index 94%
rename from tests/routers/v1/datasets_list_datasets_test.py
rename to tests/routers/openml/datasets_list_datasets_test.py
index 0de8e83b..e341735d 100644
--- a/tests/routers/v1/datasets_list_datasets_test.py
+++ b/tests/routers/openml/datasets_list_datasets_test.py
@@ -20,7 +20,7 @@ def _assert_empty_result(
def test_list(api_client: TestClient) -> None:
- response = api_client.get("/v1/datasets/list/")
+ response = api_client.get("/datasets/list/")
assert response.status_code == http.client.OK
assert "data" in response.json()
assert "dataset" in response.json()["data"]
@@ -40,7 +40,7 @@ def test_list(api_client: TestClient) -> None:
)
def test_list_filter_active(status: str, amount: int, api_client: TestClient) -> None:
response = api_client.post(
- "/v1/datasets/list",
+ "/datasets/list",
json={"status": status, "pagination": {"limit": constants.NUMBER_OF_DATASETS}},
)
assert response.status_code == http.client.OK, response.json()
@@ -60,7 +60,7 @@ def test_list_filter_active(status: str, amount: int, api_client: TestClient) ->
def test_list_accounts_privacy(api_key: ApiKey | None, amount: int, api_client: TestClient) -> None:
key = f"?api_key={api_key}" if api_key else ""
response = api_client.post(
- f"/v1/datasets/list{key}",
+ f"/datasets/list{key}",
json={"status": "all", "pagination": {"limit": 1000}},
)
assert response.status_code == http.client.OK, response.json()
@@ -75,7 +75,7 @@ def test_list_accounts_privacy(api_key: ApiKey | None, amount: int, api_client:
def test_list_data_name_present(name: str, count: int, api_client: TestClient) -> None:
# The second iris dataset is private, so we need to authenticate.
response = api_client.post(
- f"/v1/datasets/list?api_key={ApiKey.ADMIN}",
+ f"/datasets/list?api_key={ApiKey.ADMIN}",
json={"status": "all", "data_name": name},
)
assert response.status_code == http.client.OK
@@ -90,7 +90,7 @@ def test_list_data_name_present(name: str, count: int, api_client: TestClient) -
)
def test_list_data_name_absent(name: str, api_client: TestClient) -> None:
response = api_client.post(
- f"/v1/datasets/list?api_key={ApiKey.ADMIN}",
+ f"/datasets/list?api_key={ApiKey.ADMIN}",
json={"status": "all", "data_name": name},
)
_assert_empty_result(response)
@@ -116,7 +116,7 @@ def test_list_pagination(limit: int | None, offset: int | None, api_client: Test
offset_body = {} if offset is None else {"offset": offset}
limit_body = {} if limit is None else {"limit": limit}
filters = {"status": "all", "pagination": offset_body | limit_body}
- response = api_client.post("/v1/datasets/list", json=filters)
+ response = api_client.post("/datasets/list", json=filters)
if offset in [130, 200]:
_assert_empty_result(response)
@@ -133,7 +133,7 @@ def test_list_pagination(limit: int | None, offset: int | None, api_client: Test
)
def test_list_data_version(version: int, count: int, api_client: TestClient) -> None:
response = api_client.post(
- f"/v1/datasets/list?api_key={ApiKey.ADMIN}",
+ f"/datasets/list?api_key={ApiKey.ADMIN}",
json={"status": "all", "data_version": version},
)
assert response.status_code == http.client.OK
@@ -144,7 +144,7 @@ def test_list_data_version(version: int, count: int, api_client: TestClient) ->
def test_list_data_version_no_result(api_client: TestClient) -> None:
response = api_client.post(
- f"/v1/datasets/list?api_key={ApiKey.ADMIN}",
+ f"/datasets/list?api_key={ApiKey.ADMIN}",
json={"status": "all", "data_version": 4},
)
_assert_empty_result(response)
@@ -160,7 +160,7 @@ def test_list_data_version_no_result(api_client: TestClient) -> None:
)
def test_list_uploader(user_id: int, count: int, key: str, api_client: TestClient) -> None:
response = api_client.post(
- f"/v1/datasets/list?api_key={key}",
+ f"/datasets/list?api_key={key}",
json={"status": "all", "uploader": user_id},
)
# The dataset of user 16 is private, so can not be retrieved by other users.
@@ -179,7 +179,7 @@ def test_list_uploader(user_id: int, count: int, key: str, api_client: TestClien
)
def test_list_data_id(data_id: list[int], api_client: TestClient) -> None:
response = api_client.post(
- "/v1/datasets/list",
+ "/datasets/list",
json={"status": "all", "data_id": data_id},
)
@@ -195,7 +195,7 @@ def test_list_data_id(data_id: list[int], api_client: TestClient) -> None:
)
def test_list_data_tag(tag: str, count: int, api_client: TestClient) -> None:
response = api_client.post(
- "/v1/datasets/list",
+ "/datasets/list",
# study_14 has 100 datasets, we overwrite the default `limit` because otherwise
# we don't know if the results are limited by filtering on the tag.
json={"status": "all", "tag": tag, "pagination": {"limit": 101}},
@@ -207,7 +207,7 @@ def test_list_data_tag(tag: str, count: int, api_client: TestClient) -> None:
def test_list_data_tag_empty(api_client: TestClient) -> None:
response = api_client.post(
- "/v1/datasets/list",
+ "/datasets/list",
json={"status": "all", "tag": "not-a-tag"},
)
_assert_empty_result(response)
@@ -228,7 +228,7 @@ def test_list_data_tag_empty(api_client: TestClient) -> None:
)
def test_list_data_quality(quality: str, range_: str, count: int, api_client: TestClient) -> None:
response = api_client.post(
- "/v1/datasets/list",
+ "/datasets/list",
json={"status": "all", quality: range_},
)
assert response.status_code == http.client.OK, response.json()
@@ -276,7 +276,7 @@ def test_list_data_identical(
new_style["pagination"]["offset"] = offset
response = api_client.post(
- f"/v1/datasets/list{api_key_query}",
+ f"/datasets/list{api_key_query}",
json=new_style,
)
diff --git a/tests/routers/v2/datasets_test.py b/tests/routers/openml/datasets_test.py
similarity index 91%
rename from tests/routers/v2/datasets_test.py
rename to tests/routers/openml/datasets_test.py
index 3b538290..1aff3dc2 100644
--- a/tests/routers/v2/datasets_test.py
+++ b/tests/routers/openml/datasets_test.py
@@ -19,7 +19,7 @@ def test_error_unknown_dataset(
response_code: int,
api_client: TestClient,
) -> None:
- response = cast(httpx.Response, api_client.get(f"v2/datasets/{dataset_id}"))
+ response = cast(httpx.Response, api_client.get(f"/datasets/{dataset_id}"))
assert response.status_code == response_code
assert {"code": "111", "message": "Unknown dataset"} == response.json()["detail"]
@@ -38,7 +38,7 @@ def test_private_dataset_no_user_no_access(
response_code: int,
) -> None:
query = f"?api_key={api_key}" if api_key else ""
- response = cast(httpx.Response, api_client.get(f"v2/datasets/130{query}"))
+ response = cast(httpx.Response, api_client.get(f"/datasets/130{query}"))
assert response.status_code == response_code
assert {"code": "112", "message": "No access granted"} == response.json()["detail"]
diff --git a/tests/routers/openml/migration/datasets_migration_test.py b/tests/routers/openml/migration/datasets_migration_test.py
new file mode 100644
index 00000000..0266e5be
--- /dev/null
+++ b/tests/routers/openml/migration/datasets_migration_test.py
@@ -0,0 +1,187 @@
+import http.client
+import json
+from typing import Any, cast
+
+import httpx
+import pytest
+from starlette.testclient import TestClient
+
+from tests.conftest import ApiKey
+
+
+@pytest.mark.php()
+@pytest.mark.parametrize(
+ "dataset_id",
+ range(1, 132),
+)
+def test_dataset_response_is_identical(dataset_id: int, api_client: TestClient) -> None:
+ original = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/{dataset_id}")
+ new = api_client.get(f"/datasets/{dataset_id}")
+
+ if new.status_code == http.client.FORBIDDEN:
+ assert original.status_code == http.client.PRECONDITION_FAILED
+ else:
+ assert original.status_code == new.status_code
+
+ if new.status_code != http.client.OK:
+ assert original.json()["error"] == new.json()["detail"]
+ return
+
+ try:
+ original = original.json()["data_set_description"]
+ except json.decoder.JSONDecodeError:
+ pytest.skip("A PHP error occurred on the test server.")
+
+ if "div" in original:
+ pytest.skip("A PHP error occurred on the test server.")
+
+ # There are a few changes between the old API and the new API, so we convert here:
+ # The new API has normalized `format` field:
+ original["format"] = original["format"].lower()
+
+ # There is odd behavior in the live server that I don't want to recreate:
+ # when the creator is a list of csv names, it can either be a str or a list
+ # depending on whether the names are quoted. E.g.:
+ # '"Alice", "Bob"' -> ["Alice", "Bob"]
+ # 'Alice, Bob' -> 'Alice, Bob'
+ if (
+ "creator" in original
+ and isinstance(original["creator"], str)
+ and len(original["creator"].split(",")) > 1
+ ):
+ original["creator"] = [name.strip() for name in original["creator"].split(",")]
+
+ new_body = new.json()
+ if processing_date := new_body.get("processing_date"):
+ new_body["processing_date"] = str(processing_date).replace("T", " ")
+ if parquet_url := new_body.get("parquet_url"):
+ new_body["parquet_url"] = str(parquet_url).replace("https", "http")
+ if minio_url := new_body.get("minio_url"):
+ new_body["minio_url"] = str(minio_url).replace("https", "http")
+
+ manual = []
+ # ref test.openml.org/d/33 (contributor) and d/34 (creator)
+ # contributor/creator in database is '""'
+ # json content is []
+ for field in ["contributor", "creator"]:
+ if new_body[field] == [""]:
+ new_body[field] = []
+ manual.append(field)
+
+ if isinstance(new_body["original_data_url"], list):
+ new_body["original_data_url"] = ", ".join(str(url) for url in new_body["original_data_url"])
+
+ for field, value in list(new_body.items()):
+ if field in manual:
+ continue
+ if isinstance(value, int):
+ new_body[field] = str(value)
+ elif isinstance(value, list) and len(value) == 1:
+ new_body[field] = str(value[0])
+ if not new_body[field]:
+ del new_body[field]
+
+ if "description" not in new_body:
+ new_body["description"] = []
+ assert original == new_body
+
+
+@pytest.mark.parametrize(
+ "dataset_id",
+ [-1, 138, 100_000],
+)
+def test_error_unknown_dataset(
+ dataset_id: int,
+ api_client: TestClient,
+) -> None:
+ response = cast(httpx.Response, api_client.get(f"/datasets/{dataset_id}"))
+
+ # The new API has "404 Not Found" instead of "412 PRECONDITION_FAILED"
+ assert response.status_code == http.client.NOT_FOUND
+ assert {"code": "111", "message": "Unknown dataset"} == response.json()["detail"]
+
+
+@pytest.mark.parametrize(
+ "api_key",
+ [None, "a" * 32],
+)
+def test_private_dataset_no_user_no_access(
+ api_client: TestClient,
+ api_key: str | None,
+) -> None:
+ query = f"?api_key={api_key}" if api_key else ""
+ response = cast(httpx.Response, api_client.get(f"/datasets/130{query}"))
+
+ # New response is 403: Forbidden instead of 412: PRECONDITION FAILED
+ assert response.status_code == http.client.FORBIDDEN
+ assert {"code": "112", "message": "No access granted"} == response.json()["detail"]
+
+
+@pytest.mark.skip("Not sure how to include apikey in test yet.")
+def test_private_dataset_owner_access(
+ api_client: TestClient,
+ dataset_130: dict[str, Any],
+) -> None:
+ response = cast(httpx.Response, api_client.get("/datasets/130?api_key=..."))
+ assert response.status_code == http.client.OK
+ assert dataset_130 == response.json()
+
+
+@pytest.mark.skip("Not sure how to include apikey in test yet.")
+def test_private_dataset_admin_access(api_client: TestClient) -> None:
+ cast(httpx.Response, api_client.get("/datasets/130?api_key=..."))
+ # test against cached response
+
+
+@pytest.mark.php()
+@pytest.mark.parametrize(
+ "dataset_id",
+ list(range(1, 10)) + [101],
+)
+@pytest.mark.parametrize(
+ "api_key",
+ [ApiKey.ADMIN, ApiKey.REGULAR_USER, ApiKey.OWNER_USER],
+ ids=["Administrator", "regular user", "possible owner"],
+)
+@pytest.mark.parametrize(
+ "tag",
+ ["study_14", "totally_new_tag_for_migration_testing"],
+ ids=["typically existing tag", "new tag"],
+)
+def test_dataset_tag_response_is_identical(
+ dataset_id: int,
+ tag: str,
+ api_key: str,
+ api_client: TestClient,
+) -> None:
+ original = httpx.post(
+ "http://server-api-php-api-1:80/api/v1/json/data/tag",
+ data={"api_key": api_key, "tag": tag, "data_id": dataset_id},
+ )
+ if (
+ original.status_code == http.client.PRECONDITION_FAILED
+ and original.json()["error"]["message"] == "An Elastic Search Exception occured."
+ ):
+ pytest.skip("Encountered Elastic Search error.")
+ if original.status_code == http.client.OK:
+ # undo the tag, because we don't want to persist this change to the database
+ httpx.post(
+ "http://server-api-php-api-1:80/api/v1/json/data/untag",
+ data={"api_key": api_key, "tag": tag, "data_id": dataset_id},
+ )
+ new = cast(
+ httpx.Response,
+ api_client.post(
+ f"/datasets/tag?api_key={api_key}",
+ json={"data_id": dataset_id, "tag": tag},
+ ),
+ )
+
+ assert original.status_code == new.status_code, original.json()
+ if new.status_code != http.client.OK:
+ assert original.json()["error"] == new.json()["detail"]
+ return
+
+ original = original.json()
+ new = new.json()
+ assert original == new
diff --git a/tests/routers/v1/qualities_test.py b/tests/routers/openml/qualities_test.py
similarity index 97%
rename from tests/routers/v1/qualities_test.py
rename to tests/routers/openml/qualities_test.py
index d2de98b6..eb30b442 100644
--- a/tests/routers/v1/qualities_test.py
+++ b/tests/routers/openml/qualities_test.py
@@ -30,14 +30,14 @@ def _remove_quality_from_database(quality_name: str, expdb_test: Connection) ->
@pytest.mark.php()
def test_list_qualities_identical(api_client: TestClient) -> None:
original = httpx.get("http://server-api-php-api-1:80/api/v1/json/data/qualities/list")
- new = api_client.get("/v1/datasets/qualities/list")
+ new = api_client.get("/datasets/qualities/list")
assert original.status_code == new.status_code
assert original.json() == new.json()
# To keep the test idempotent, we cannot test if reaction to database changes is identical
def test_list_qualities(api_client: TestClient, expdb_test: Connection) -> None:
- response = api_client.get("/v1/datasets/qualities/list")
+ response = api_client.get("/datasets/qualities/list")
assert response.status_code == http.client.OK
expected = {
"data_qualities_list": {
@@ -157,13 +157,13 @@ def test_list_qualities(api_client: TestClient, expdb_test: Connection) -> None:
deleted = expected["data_qualities_list"]["quality"].pop()
_remove_quality_from_database(quality_name=deleted, expdb_test=expdb_test)
- response = api_client.get("/v1/datasets/qualities/list")
+ response = api_client.get("/datasets/qualities/list")
assert response.status_code == http.client.OK
assert expected == response.json()
def test_get_quality(api_client: TestClient) -> None:
- response = api_client.get("/v1/datasets/qualities/1")
+ response = api_client.get("/datasets/qualities/1")
assert response.status_code == http.client.OK
expected = [
{"name": "AutoCorrelation", "value": 0.6064659977703456},
@@ -284,7 +284,7 @@ def test_get_quality(api_client: TestClient) -> None:
)
def test_get_quality_identical(data_id: int, api_client: TestClient) -> None:
php_response = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/qualities/{data_id}")
- python_response = api_client.get(f"/v1/datasets/qualities/{data_id}")
+ python_response = api_client.get(f"/datasets/qualities/{data_id}")
assert python_response.status_code == php_response.status_code
expected = [
@@ -308,7 +308,7 @@ def test_get_quality_identical_error(data_id: int, api_client: TestClient) -> No
if data_id in [116]:
pytest.skip("Detailed error for code 362 (no qualities) not yet supported.")
php_response = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/qualities/{data_id}")
- python_response = api_client.get(f"/v1/datasets/qualities/{data_id}")
+ python_response = api_client.get(f"/datasets/qualities/{data_id}")
assert python_response.status_code == php_response.status_code
# The "dataset unknown" error currently has a separate code in PHP depending on
# where it occurs (e.g., get dataset->113 get quality->361)
diff --git a/tests/routers/v1/task_type_test.py b/tests/routers/openml/task_type_test.py
similarity index 89%
rename from tests/routers/v1/task_type_test.py
rename to tests/routers/openml/task_type_test.py
index bfc2381b..2d91cab8 100644
--- a/tests/routers/v1/task_type_test.py
+++ b/tests/routers/openml/task_type_test.py
@@ -8,7 +8,7 @@
@pytest.mark.php()
def test_list_task_type(api_client: TestClient) -> None:
- response = api_client.get("/v1/tasktype/list")
+ response = api_client.get("/tasktype/list")
original = httpx.get("http://server-api-php-api-1:80/api/v1/json/tasktype/list")
assert response.status_code == original.status_code
assert response.json() == original.json()
@@ -20,7 +20,7 @@ def test_list_task_type(api_client: TestClient) -> None:
list(range(1, 12)),
)
def test_get_task_type(ttype_id: int, api_client: TestClient) -> None:
- response = api_client.get(f"/v1/tasktype/{ttype_id}")
+ response = api_client.get(f"/tasktype/{ttype_id}")
original = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/tasktype/{ttype_id}")
assert response.status_code == original.status_code
@@ -37,6 +37,6 @@ def test_get_task_type(ttype_id: int, api_client: TestClient) -> None:
def test_get_task_type_unknown(api_client: TestClient) -> None:
- response = api_client.get("/v1/tasktype/1000")
+ response = api_client.get("/tasktype/1000")
assert response.status_code == http.client.PRECONDITION_FAILED
assert response.json() == {"detail": {"code": "241", "message": "Unknown task type."}}
diff --git a/tests/routers/v1/datasets_old_test.py b/tests/routers/v1/datasets_old_test.py
deleted file mode 100644
index dff98368..00000000
--- a/tests/routers/v1/datasets_old_test.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import http.client
-import json.decoder
-from typing import Any, cast
-
-import httpx
-import pytest
-from starlette.testclient import TestClient
-
-
-@pytest.mark.php()
-@pytest.mark.parametrize(
- "dataset_id",
- range(1, 132),
-)
-def test_dataset_response_is_identical(dataset_id: int, api_client: TestClient) -> None:
- original = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/{dataset_id}")
- new = cast(httpx.Response, api_client.get(f"/v1/datasets/{dataset_id}"))
- assert original.status_code == new.status_code
- assert new.json()
-
- if new.status_code != http.client.OK:
- assert original.json()["error"] == new.json()["detail"]
- return
-
- assert "data_set_description" in new.json()
-
- try:
- original = original.json()["data_set_description"]
- except json.decoder.JSONDecodeError:
- pytest.skip("A PHP error occurred on the test server.")
-
- new = new.json()["data_set_description"]
-
- if "div" in original:
- pytest.skip("A PHP error occurred on the test server.")
-
- # The new API has normalized `format` field:
- original["format"] = original["format"].lower()
-
- # There is odd behavior in the live server that I don't want to recreate:
- # when the creator is a list of csv names, it can either be a str or a list
- # depending on whether the names are quoted. E.g.:
- # '"Alice", "Bob"' -> ["Alice", "Bob"]
- # 'Alice, Bob' -> 'Alice, Bob'
- if (
- "creator" in original
- and isinstance(original["creator"], str)
- and len(original["creator"].split(",")) > 1
- ):
- original["creator"] = [name.strip() for name in original["creator"].split(",")]
-
- # The remainder of the fields should be identical:
- assert original == new
-
-
-@pytest.mark.parametrize(
- "dataset_id",
- [-1, 138, 100_000],
-)
-def test_error_unknown_dataset(
- dataset_id: int,
- api_client: TestClient,
-) -> None:
- response = cast(httpx.Response, api_client.get(f"v1/datasets/{dataset_id}"))
-
- assert response.status_code == http.client.PRECONDITION_FAILED
- assert {"code": "111", "message": "Unknown dataset"} == response.json()["detail"]
-
-
-@pytest.mark.parametrize(
- "api_key",
- [None, "a" * 32],
-)
-def test_private_dataset_no_user_no_access(
- api_client: TestClient,
- api_key: str | None,
-) -> None:
- query = f"?api_key={api_key}" if api_key else ""
- response = cast(httpx.Response, api_client.get(f"v1/datasets/130{query}"))
-
- assert response.status_code == http.client.PRECONDITION_FAILED
- assert {"code": "112", "message": "No access granted"} == response.json()["detail"]
-
-
-@pytest.mark.skip("Not sure how to include apikey in test yet.")
-def test_private_dataset_owner_access(
- api_client: TestClient,
- dataset_130: dict[str, Any],
-) -> None:
- response = cast(httpx.Response, api_client.get("/v1/datasets/130?api_key=..."))
- assert response.status_code == http.client.OK
- assert dataset_130 == response.json()
-
-
-@pytest.mark.skip("Not sure how to include apikey in test yet.")
-def test_private_dataset_admin_access(api_client: TestClient) -> None:
- cast(httpx.Response, api_client.get("/v1/datasets/130?api_key=..."))
- # test against cached response
From 33b6e42354845b1490815e9d4043da8d67ffec2c Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 28 Nov 2023 15:56:35 +0100
Subject: [PATCH 2/3] Use a httpx.client interface for calling the php server
This puts the configuration in one place (the fixture) and makes
it easier to add new calls / understand where the calls go.
---
tests/conftest.py | 9 ++-
tests/routers/openml/dataset_tag_test.py | 20 +++---
.../openml/datasets_list_datasets_test.py | 61 ++++++++++---------
tests/routers/openml/datasets_test.py | 16 ++---
.../migration/datasets_migration_test.py | 39 ++++++------
tests/routers/openml/qualities_test.py | 32 +++++-----
tests/routers/openml/task_type_test.py | 16 ++---
7 files changed, 105 insertions(+), 88 deletions(-)
diff --git a/tests/conftest.py b/tests/conftest.py
index 287b947a..6ccad271 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,6 +4,7 @@
from pathlib import Path
from typing import Any, Generator
+import httpx
import pytest
from database.setup import expdb_database, user_database
from fastapi.testclient import TestClient
@@ -40,7 +41,13 @@ def user_test() -> Connection:
@pytest.fixture()
-def api_client(expdb_test: Connection, user_test: Connection) -> TestClient:
+def php_api() -> Generator[httpx.Client, None, None]:
+ with httpx.Client(base_url="http://server-api-php-api-1:80/api/v1/json") as client:
+ yield client
+
+
+@pytest.fixture()
+def py_api(expdb_test: Connection, user_test: Connection) -> TestClient:
app = create_api()
# We use the lambda definitions because fixtures may not be called directly.
app.dependency_overrides[expdb_connection] = lambda: expdb_test
diff --git a/tests/routers/openml/dataset_tag_test.py b/tests/routers/openml/dataset_tag_test.py
index f868f2cf..f315ffbe 100644
--- a/tests/routers/openml/dataset_tag_test.py
+++ b/tests/routers/openml/dataset_tag_test.py
@@ -16,11 +16,11 @@
[None, ApiKey.INVALID],
ids=["no authentication", "invalid key"],
)
-def test_dataset_tag_rejects_unauthorized(key: ApiKey, api_client: TestClient) -> None:
+def test_dataset_tag_rejects_unauthorized(key: ApiKey, py_api: TestClient) -> None:
apikey = "" if key is None else f"?api_key={key}"
response = cast(
httpx.Response,
- api_client.post(
+ py_api.post(
f"/datasets/tag{apikey}",
json={"data_id": constants.PRIVATE_DATASET_ID, "tag": "test"},
),
@@ -34,11 +34,11 @@ def test_dataset_tag_rejects_unauthorized(key: ApiKey, api_client: TestClient) -
[ApiKey.ADMIN, ApiKey.REGULAR_USER, ApiKey.OWNER_USER],
ids=["administrator", "non-owner", "owner"],
)
-def test_dataset_tag(key: ApiKey, expdb_test: Connection, api_client: TestClient) -> None:
+def test_dataset_tag(key: ApiKey, expdb_test: Connection, py_api: TestClient) -> None:
dataset_id, tag = constants.PRIVATE_DATASET_ID, "test"
response = cast(
httpx.Response,
- api_client.post(
+ py_api.post(
f"/datasets/tag?api_key={key}",
json={"data_id": dataset_id, "tag": tag},
),
@@ -50,11 +50,11 @@ def test_dataset_tag(key: ApiKey, expdb_test: Connection, api_client: TestClient
assert tag in tags
-def test_dataset_tag_returns_existing_tags(api_client: TestClient) -> None:
+def test_dataset_tag_returns_existing_tags(py_api: TestClient) -> None:
dataset_id, tag = 1, "test"
response = cast(
httpx.Response,
- api_client.post(
+ py_api.post(
f"/datasets/tag?api_key={ApiKey.ADMIN}",
json={"data_id": dataset_id, "tag": tag},
),
@@ -63,11 +63,11 @@ def test_dataset_tag_returns_existing_tags(api_client: TestClient) -> None:
assert {"data_tag": {"id": str(dataset_id), "tag": ["study_14", tag]}} == response.json()
-def test_dataset_tag_fails_if_tag_exists(api_client: TestClient) -> None:
+def test_dataset_tag_fails_if_tag_exists(py_api: TestClient) -> None:
dataset_id, tag = 1, "study_14" # Dataset 1 already is tagged with 'study_14'
response = cast(
httpx.Response,
- api_client.post(
+ py_api.post(
f"/datasets/tag?api_key={ApiKey.ADMIN}",
json={"data_id": dataset_id, "tag": tag},
),
@@ -90,11 +90,11 @@ def test_dataset_tag_fails_if_tag_exists(api_client: TestClient) -> None:
)
def test_dataset_tag_invalid_tag_is_rejected(
tag: str,
- api_client: TestClient,
+ py_api: TestClient,
) -> None:
new = cast(
httpx.Response,
- api_client.post(
+ py_api.post(
f"/datasets/tag?api_key{ApiKey.ADMIN}",
json={"data_id": 1, "tag": tag},
),
diff --git a/tests/routers/openml/datasets_list_datasets_test.py b/tests/routers/openml/datasets_list_datasets_test.py
index e341735d..92f70557 100644
--- a/tests/routers/openml/datasets_list_datasets_test.py
+++ b/tests/routers/openml/datasets_list_datasets_test.py
@@ -19,8 +19,8 @@ def _assert_empty_result(
assert response.json()["detail"] == {"code": "372", "message": "No results"}
-def test_list(api_client: TestClient) -> None:
- response = api_client.get("/datasets/list/")
+def test_list(py_api: TestClient) -> None:
+ response = py_api.get("/datasets/list/")
assert response.status_code == http.client.OK
assert "data" in response.json()
assert "dataset" in response.json()["data"]
@@ -38,8 +38,8 @@ def test_list(api_client: TestClient) -> None:
("all", constants.NUMBER_OF_DATASETS - constants.NUMBER_OF_PRIVATE_DATASETS),
],
)
-def test_list_filter_active(status: str, amount: int, api_client: TestClient) -> None:
- response = api_client.post(
+def test_list_filter_active(status: str, amount: int, py_api: TestClient) -> None:
+ response = py_api.post(
"/datasets/list",
json={"status": status, "pagination": {"limit": constants.NUMBER_OF_DATASETS}},
)
@@ -57,9 +57,9 @@ def test_list_filter_active(status: str, amount: int, api_client: TestClient) ->
(None, constants.NUMBER_OF_DATASETS - constants.NUMBER_OF_PRIVATE_DATASETS),
],
)
-def test_list_accounts_privacy(api_key: ApiKey | None, amount: int, api_client: TestClient) -> None:
+def test_list_accounts_privacy(api_key: ApiKey | None, amount: int, py_api: TestClient) -> None:
key = f"?api_key={api_key}" if api_key else ""
- response = api_client.post(
+ response = py_api.post(
f"/datasets/list{key}",
json={"status": "all", "pagination": {"limit": 1000}},
)
@@ -72,9 +72,9 @@ def test_list_accounts_privacy(api_key: ApiKey | None, amount: int, api_client:
("name", "count"),
[("abalone", 1), ("iris", 2)],
)
-def test_list_data_name_present(name: str, count: int, api_client: TestClient) -> None:
+def test_list_data_name_present(name: str, count: int, py_api: TestClient) -> None:
# The second iris dataset is private, so we need to authenticate.
- response = api_client.post(
+ response = py_api.post(
f"/datasets/list?api_key={ApiKey.ADMIN}",
json={"status": "all", "data_name": name},
)
@@ -88,8 +88,8 @@ def test_list_data_name_present(name: str, count: int, api_client: TestClient) -
"name",
["ir", "long_name_without_overlap"],
)
-def test_list_data_name_absent(name: str, api_client: TestClient) -> None:
- response = api_client.post(
+def test_list_data_name_absent(name: str, py_api: TestClient) -> None:
+ response = py_api.post(
f"/datasets/list?api_key={ApiKey.ADMIN}",
json={"status": "all", "data_name": name},
)
@@ -102,7 +102,7 @@ def test_list_quality_filers() -> None:
@pytest.mark.parametrize("limit", [None, 5, 10, 200])
@pytest.mark.parametrize("offset", [None, 0, 5, 129, 130, 200])
-def test_list_pagination(limit: int | None, offset: int | None, api_client: TestClient) -> None:
+def test_list_pagination(limit: int | None, offset: int | None, py_api: TestClient) -> None:
all_ids = [
did
for did in range(1, 1 + constants.NUMBER_OF_DATASETS)
@@ -116,7 +116,7 @@ def test_list_pagination(limit: int | None, offset: int | None, api_client: Test
offset_body = {} if offset is None else {"offset": offset}
limit_body = {} if limit is None else {"limit": limit}
filters = {"status": "all", "pagination": offset_body | limit_body}
- response = api_client.post("/datasets/list", json=filters)
+ response = py_api.post("/datasets/list", json=filters)
if offset in [130, 200]:
_assert_empty_result(response)
@@ -131,8 +131,8 @@ def test_list_pagination(limit: int | None, offset: int | None, api_client: Test
("version", "count"),
[(1, 100), (2, 6), (5, 1)],
)
-def test_list_data_version(version: int, count: int, api_client: TestClient) -> None:
- response = api_client.post(
+def test_list_data_version(version: int, count: int, py_api: TestClient) -> None:
+ response = py_api.post(
f"/datasets/list?api_key={ApiKey.ADMIN}",
json={"status": "all", "data_version": version},
)
@@ -142,8 +142,8 @@ def test_list_data_version(version: int, count: int, api_client: TestClient) ->
assert {dataset["version"] for dataset in datasets} == {version}
-def test_list_data_version_no_result(api_client: TestClient) -> None:
- response = api_client.post(
+def test_list_data_version_no_result(py_api: TestClient) -> None:
+ response = py_api.post(
f"/datasets/list?api_key={ApiKey.ADMIN}",
json={"status": "all", "data_version": 4},
)
@@ -158,8 +158,8 @@ def test_list_data_version_no_result(api_client: TestClient) -> None:
("user_id", "count"),
[(1, 59), (2, 34), (16, 1)],
)
-def test_list_uploader(user_id: int, count: int, key: str, api_client: TestClient) -> None:
- response = api_client.post(
+def test_list_uploader(user_id: int, count: int, key: str, py_api: TestClient) -> None:
+ response = py_api.post(
f"/datasets/list?api_key={key}",
json={"status": "all", "uploader": user_id},
)
@@ -177,8 +177,8 @@ def test_list_uploader(user_id: int, count: int, key: str, api_client: TestClien
"data_id",
[[1], [1, 2, 3], [1, 2, 3, 3000], [1, 2, 3, 130]],
)
-def test_list_data_id(data_id: list[int], api_client: TestClient) -> None:
- response = api_client.post(
+def test_list_data_id(data_id: list[int], py_api: TestClient) -> None:
+ response = py_api.post(
"/datasets/list",
json={"status": "all", "data_id": data_id},
)
@@ -193,8 +193,8 @@ def test_list_data_id(data_id: list[int], api_client: TestClient) -> None:
("tag", "count"),
[("study_14", 100), ("study_15", 1)],
)
-def test_list_data_tag(tag: str, count: int, api_client: TestClient) -> None:
- response = api_client.post(
+def test_list_data_tag(tag: str, count: int, py_api: TestClient) -> None:
+ response = py_api.post(
"/datasets/list",
# study_14 has 100 datasets, we overwrite the default `limit` because otherwise
# we don't know if the results are limited by filtering on the tag.
@@ -205,8 +205,8 @@ def test_list_data_tag(tag: str, count: int, api_client: TestClient) -> None:
assert len(datasets) == count
-def test_list_data_tag_empty(api_client: TestClient) -> None:
- response = api_client.post(
+def test_list_data_tag_empty(py_api: TestClient) -> None:
+ response = py_api.post(
"/datasets/list",
json={"status": "all", "tag": "not-a-tag"},
)
@@ -226,8 +226,8 @@ def test_list_data_tag_empty(api_client: TestClient) -> None:
("number_missing_values", "2..100000", 22),
],
)
-def test_list_data_quality(quality: str, range_: str, count: int, api_client: TestClient) -> None:
- response = api_client.post(
+def test_list_data_quality(quality: str, range_: str, count: int, py_api: TestClient) -> None:
+ response = py_api.post(
"/datasets/list",
json={"status": "all", quality: range_},
)
@@ -258,7 +258,8 @@ def test_list_data_quality(quality: str, range_: str, count: int, api_client: Te
api_key=st.sampled_from([None, ApiKey.REGULAR_USER, ApiKey.OWNER_USER]),
) # type: ignore[misc] # https://github.com/openml/server-api/issues/108
def test_list_data_identical(
- api_client: TestClient,
+ py_api: TestClient,
+ php_api: httpx.Client,
**kwargs: dict[str, Any],
) -> Any:
limit, offset = kwargs["limit"], kwargs["offset"]
@@ -275,7 +276,7 @@ def test_list_data_identical(
if offset is not None:
new_style["pagination"]["offset"] = offset
- response = api_client.post(
+ response = py_api.post(
f"/datasets/list{api_key_query}",
json=new_style,
)
@@ -286,11 +287,11 @@ def test_list_data_identical(
for filter_, value in kwargs.items()
if value is not None
]
- uri = "http://server-api-php-api-1:80/api/v1/json/data/list"
+ uri = "/data/list"
if query:
uri += f"/{'/'.join([str(v) for q in query for v in q])}"
uri += api_key_query
- original = httpx.get(uri)
+ original = php_api.get(uri)
assert original.status_code == response.status_code, response.json()
if original.status_code == http.client.PRECONDITION_FAILED:
diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py
index 1aff3dc2..98a11941 100644
--- a/tests/routers/openml/datasets_test.py
+++ b/tests/routers/openml/datasets_test.py
@@ -17,9 +17,9 @@
def test_error_unknown_dataset(
dataset_id: int,
response_code: int,
- api_client: TestClient,
+ py_api: TestClient,
) -> None:
- response = cast(httpx.Response, api_client.get(f"/datasets/{dataset_id}"))
+ response = cast(httpx.Response, py_api.get(f"/datasets/{dataset_id}"))
assert response.status_code == response_code
assert {"code": "111", "message": "Unknown dataset"} == response.json()["detail"]
@@ -33,12 +33,12 @@ def test_error_unknown_dataset(
],
)
def test_private_dataset_no_user_no_access(
- api_client: TestClient,
+ py_api: TestClient,
api_key: str | None,
response_code: int,
) -> None:
query = f"?api_key={api_key}" if api_key else ""
- response = cast(httpx.Response, api_client.get(f"/datasets/130{query}"))
+ response = cast(httpx.Response, py_api.get(f"/datasets/130{query}"))
assert response.status_code == response_code
assert {"code": "112", "message": "No access granted"} == response.json()["detail"]
@@ -46,15 +46,15 @@ def test_private_dataset_no_user_no_access(
@pytest.mark.skip("Not sure how to include apikey in test yet.")
def test_private_dataset_owner_access(
- api_client: TestClient,
+ py_api: TestClient,
dataset_130: dict[str, Any],
) -> None:
- response = cast(httpx.Response, api_client.get("/v2/datasets/130?api_key=..."))
+ response = cast(httpx.Response, py_api.get("/datasets/130?api_key=..."))
assert response.status_code == http.client.OK
assert dataset_130 == response.json()
@pytest.mark.skip("Not sure how to include apikey in test yet.")
-def test_private_dataset_admin_access(api_client: TestClient) -> None:
- cast(httpx.Response, api_client.get("/v2/datasets/130?api_key=..."))
+def test_private_dataset_admin_access(py_api: TestClient) -> None:
+ cast(httpx.Response, py_api.get("/datasets/130?api_key=..."))
# test against cached response
diff --git a/tests/routers/openml/migration/datasets_migration_test.py b/tests/routers/openml/migration/datasets_migration_test.py
index 0266e5be..6d93827c 100644
--- a/tests/routers/openml/migration/datasets_migration_test.py
+++ b/tests/routers/openml/migration/datasets_migration_test.py
@@ -14,9 +14,13 @@
"dataset_id",
range(1, 132),
)
-def test_dataset_response_is_identical(dataset_id: int, api_client: TestClient) -> None:
- original = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/{dataset_id}")
- new = api_client.get(f"/datasets/{dataset_id}")
+def test_dataset_response_is_identical(
+ dataset_id: int,
+ py_api: TestClient,
+ php_api: httpx.Client,
+) -> None:
+ original = php_api.get(f"/data/{dataset_id}")
+ new = py_api.get(f"/datasets/{dataset_id}")
if new.status_code == http.client.FORBIDDEN:
assert original.status_code == http.client.PRECONDITION_FAILED
@@ -92,9 +96,9 @@ def test_dataset_response_is_identical(dataset_id: int, api_client: TestClient)
)
def test_error_unknown_dataset(
dataset_id: int,
- api_client: TestClient,
+ py_api: TestClient,
) -> None:
- response = cast(httpx.Response, api_client.get(f"/datasets/{dataset_id}"))
+ response = cast(httpx.Response, py_api.get(f"/datasets/{dataset_id}"))
# The new API has "404 Not Found" instead of "412 PRECONDITION_FAILED"
assert response.status_code == http.client.NOT_FOUND
@@ -106,11 +110,11 @@ def test_error_unknown_dataset(
[None, "a" * 32],
)
def test_private_dataset_no_user_no_access(
- api_client: TestClient,
+ py_api: TestClient,
api_key: str | None,
) -> None:
query = f"?api_key={api_key}" if api_key else ""
- response = cast(httpx.Response, api_client.get(f"/datasets/130{query}"))
+ response = cast(httpx.Response, py_api.get(f"/datasets/130{query}"))
# New response is 403: Forbidden instead of 412: PRECONDITION FAILED
assert response.status_code == http.client.FORBIDDEN
@@ -119,17 +123,17 @@ def test_private_dataset_no_user_no_access(
@pytest.mark.skip("Not sure how to include apikey in test yet.")
def test_private_dataset_owner_access(
- api_client: TestClient,
+ py_api: TestClient,
dataset_130: dict[str, Any],
) -> None:
- response = cast(httpx.Response, api_client.get("/datasets/130?api_key=..."))
+ response = cast(httpx.Response, py_api.get("/datasets/130?api_key=..."))
assert response.status_code == http.client.OK
assert dataset_130 == response.json()
@pytest.mark.skip("Not sure how to include apikey in test yet.")
-def test_private_dataset_admin_access(api_client: TestClient) -> None:
- cast(httpx.Response, api_client.get("/datasets/130?api_key=..."))
+def test_private_dataset_admin_access(py_api: TestClient) -> None:
+ cast(httpx.Response, py_api.get("/datasets/130?api_key=..."))
# test against cached response
@@ -152,10 +156,11 @@ def test_dataset_tag_response_is_identical(
dataset_id: int,
tag: str,
api_key: str,
- api_client: TestClient,
+ py_api: TestClient,
+ php_api: httpx.Client,
) -> None:
- original = httpx.post(
- "http://server-api-php-api-1:80/api/v1/json/data/tag",
+ original = php_api.post(
+ "/data/tag",
data={"api_key": api_key, "tag": tag, "data_id": dataset_id},
)
if (
@@ -165,13 +170,13 @@ def test_dataset_tag_response_is_identical(
pytest.skip("Encountered Elastic Search error.")
if original.status_code == http.client.OK:
# undo the tag, because we don't want to persist this change to the database
- httpx.post(
- "http://server-api-php-api-1:80/api/v1/json/data/untag",
+ php_api.post(
+ "/data/untag",
data={"api_key": api_key, "tag": tag, "data_id": dataset_id},
)
new = cast(
httpx.Response,
- api_client.post(
+ py_api.post(
f"/datasets/tag?api_key={api_key}",
json={"data_id": dataset_id, "tag": tag},
),
diff --git a/tests/routers/openml/qualities_test.py b/tests/routers/openml/qualities_test.py
index eb30b442..851f7791 100644
--- a/tests/routers/openml/qualities_test.py
+++ b/tests/routers/openml/qualities_test.py
@@ -28,16 +28,16 @@ def _remove_quality_from_database(quality_name: str, expdb_test: Connection) ->
@pytest.mark.php()
-def test_list_qualities_identical(api_client: TestClient) -> None:
- original = httpx.get("http://server-api-php-api-1:80/api/v1/json/data/qualities/list")
- new = api_client.get("/datasets/qualities/list")
+def test_list_qualities_identical(py_api: TestClient, php_api: httpx.Client) -> None:
+ original = php_api.get("/data/qualities/list")
+ new = py_api.get("/datasets/qualities/list")
assert original.status_code == new.status_code
assert original.json() == new.json()
# To keep the test idempotent, we cannot test if reaction to database changes is identical
-def test_list_qualities(api_client: TestClient, expdb_test: Connection) -> None:
- response = api_client.get("/datasets/qualities/list")
+def test_list_qualities(py_api: TestClient, expdb_test: Connection) -> None:
+ response = py_api.get("/datasets/qualities/list")
assert response.status_code == http.client.OK
expected = {
"data_qualities_list": {
@@ -157,13 +157,13 @@ def test_list_qualities(api_client: TestClient, expdb_test: Connection) -> None:
deleted = expected["data_qualities_list"]["quality"].pop()
_remove_quality_from_database(quality_name=deleted, expdb_test=expdb_test)
- response = api_client.get("/datasets/qualities/list")
+ response = py_api.get("/datasets/qualities/list")
assert response.status_code == http.client.OK
assert expected == response.json()
-def test_get_quality(api_client: TestClient) -> None:
- response = api_client.get("/datasets/qualities/1")
+def test_get_quality(py_api: TestClient) -> None:
+ response = py_api.get("/datasets/qualities/1")
assert response.status_code == http.client.OK
expected = [
{"name": "AutoCorrelation", "value": 0.6064659977703456},
@@ -282,9 +282,9 @@ def test_get_quality(api_client: TestClient) -> None:
"data_id",
list(set(range(1, 132)) - {55, 56, 59, 116, 130}),
)
-def test_get_quality_identical(data_id: int, api_client: TestClient) -> None:
- php_response = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/qualities/{data_id}")
- python_response = api_client.get(f"/datasets/qualities/{data_id}")
+def test_get_quality_identical(data_id: int, py_api: TestClient, php_api: httpx.Client) -> None:
+ php_response = php_api.get(f"/data/qualities/{data_id}")
+ python_response = py_api.get(f"/datasets/qualities/{data_id}")
assert python_response.status_code == php_response.status_code
expected = [
@@ -302,13 +302,17 @@ def test_get_quality_identical(data_id: int, api_client: TestClient) -> None:
"data_id",
[55, 56, 59, 116, 130, 132],
)
-def test_get_quality_identical_error(data_id: int, api_client: TestClient) -> None:
+def test_get_quality_identical_error(
+ data_id: int,
+ py_api: TestClient,
+ php_api: httpx.Client,
+) -> None:
if data_id in [55, 56, 59]:
pytest.skip("Detailed error for code 364 (failed processing) not yet supported.")
if data_id in [116]:
pytest.skip("Detailed error for code 362 (no qualities) not yet supported.")
- php_response = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/qualities/{data_id}")
- python_response = api_client.get(f"/datasets/qualities/{data_id}")
+ php_response = php_api.get(f"/data/qualities/{data_id}")
+ python_response = py_api.get(f"/datasets/qualities/{data_id}")
assert python_response.status_code == php_response.status_code
# The "dataset unknown" error currently has a separate code in PHP depending on
# where it occurs (e.g., get dataset->113 get quality->361)
diff --git a/tests/routers/openml/task_type_test.py b/tests/routers/openml/task_type_test.py
index 2d91cab8..c5182716 100644
--- a/tests/routers/openml/task_type_test.py
+++ b/tests/routers/openml/task_type_test.py
@@ -7,9 +7,9 @@
@pytest.mark.php()
-def test_list_task_type(api_client: TestClient) -> None:
- response = api_client.get("/tasktype/list")
- original = httpx.get("http://server-api-php-api-1:80/api/v1/json/tasktype/list")
+def test_list_task_type(py_api: TestClient, php_api: httpx.Client) -> None:
+ response = py_api.get("/tasktype/list")
+ original = php_api.get("/tasktype/list")
assert response.status_code == original.status_code
assert response.json() == original.json()
@@ -19,9 +19,9 @@ def test_list_task_type(api_client: TestClient) -> None:
"ttype_id",
list(range(1, 12)),
)
-def test_get_task_type(ttype_id: int, api_client: TestClient) -> None:
- response = api_client.get(f"/tasktype/{ttype_id}")
- original = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/tasktype/{ttype_id}")
+def test_get_task_type(ttype_id: int, py_api: TestClient, php_api: httpx.Client) -> None:
+ response = py_api.get(f"/tasktype/{ttype_id}")
+ original = php_api.get(f"/tasktype/{ttype_id}")
assert response.status_code == original.status_code
py_json = response.json()
@@ -36,7 +36,7 @@ def test_get_task_type(ttype_id: int, api_client: TestClient) -> None:
assert not differences
-def test_get_task_type_unknown(api_client: TestClient) -> None:
- response = api_client.get("/tasktype/1000")
+def test_get_task_type_unknown(py_api: TestClient) -> None:
+ response = py_api.get("/tasktype/1000")
assert response.status_code == http.client.PRECONDITION_FAILED
assert response.json() == {"detail": {"code": "241", "message": "Unknown task type."}}
From 714eaf25d90c8de1435123db7605867f31c319b0 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 28 Nov 2023 16:21:36 +0100
Subject: [PATCH 3/3] Update for removal of distinct V1 and V2 endpoints
---
docs/migration.md | 90 ++++++++++++++++-------------------------------
1 file changed, 30 insertions(+), 60 deletions(-)
diff --git a/docs/migration.md b/docs/migration.md
index e3e9c38d..dc49e33c 100644
--- a/docs/migration.md
+++ b/docs/migration.md
@@ -1,31 +1,22 @@
# Migration
-This implementation currently maintains two separate endpoints.
-There are "old" endpoints (Python-V1), which mimic responses of the PHP REST API (PHP-V1)
-as closely as possible, and new endpoints (V2) which have some additional changes and
-will be updated going forward.
-
-The advised way to upgrade connector packages that currently interact with the old
-JSON API is to first migrate from the old PHP API to our re-implemented V1 API.
-See "[V1: PHP to Python](#v1--php-to-python)" for the differences between the PHP and
-Python API. After that migration, continue with the "[V1 to V2](#v1-to-v2)" guide.
-
-Connectors currently using the XML API are recommended to upgrade to V2 directly,
-in which case using the generated REST API documentation is recommended.
-
-## V1: PHP to Python
-
-The first iteration of the new server has nearly identical responses to the old JSON
-endpoints, but there are exceptions. Most exceptions either bug fixes, or arise from
-technical limitations. This list covers the most important changes, but there may
-be some undocumented changes for edge cases. The PHP API was underspecified, and we
-decided that reverse engineering the specifications which mostly arise from
-implementation details was not worth the effort. If there is a behavioral change which
-was not documented but affects you, please [open a bug report](https://github.com/openml/server-api/issues/new?assignees=&labels=bug%2C+triage&projects=&template=bug-report.md&title=).
-
-### All Endpoints
+The Python reimplementation provides the same endpoints as the old API, which
+largely function the same way. However, there are a few major deviations:
+
+ * Use of typed JSON: e.g., when a value represents an integer, it is returned as an integer.
+ * Lists when multiple values are possible: if a field can have none, one, or multiple entries (e.g., authors), we always return a list.
+ * Restriction or expansion of input types as appropriate.
+ * Standardizing authentication and access messages, and consistently executing those checks
+   before fetching data or providing error messages about the data.
+
+The list above is not exhaustive. Minor changes include, for example, bug fixes and the removal of unnecessary nesting.
+There may be undocumented changes, especially in edge cases which may not have occurred in the test environment.
+As the PHP API was underspecified, the re-implementation is based on a mix of reading old code and probing the API.
+If there is a behavioral change which was not documented but affects you, please [open a bug report](https://github.com/openml/server-api/issues/new?assignees=&labels=bug%2C+triage&projects=&template=bug-report.md&title=).
+
+## All Endpoints
The following changes affect all endpoints.
-#### Error on Invalid Input
+### Error on Invalid Input
When providing input of invalid types (e.g., a non-integer dataset id) the HTTP header
and JSON content will be different.
@@ -45,7 +36,7 @@ and JSON content will be different.
These endpoints now do enforce stricter input constraints.
Constraints for each endpoint parameter are documented in the API docs.
-#### Other Errors
+### Other Errors
For any other error messages, the response is identical except that outer field will be `"detail"` instead of `"error"`:
```diff title="JSON Content"
@@ -53,9 +44,7 @@ For any other error messages, the response is identical except that outer field
+ {"detail":{"code":"112","message":"No access granted"}}
```
-In some cases the JSON endpoints previously returned XML ([example](https://github.com/openml/OpenML/issues/1200)).
-Python-V1 will always return JSON.
-
+In some cases the JSON endpoints previously returned XML ([example](https://github.com/openml/OpenML/issues/1200)); the new API always returns JSON.
```diff title="XML replaced by JSON"
-
@@ -65,20 +54,26 @@ Python-V1 will always return JSON.
+ {"detail": {"code":"103", "message": "Authentication failed" } }
```
-### Datasets
+## Datasets
-#### `GET /{dataset_id}`
+### `GET /{dataset_id}`
- Dataset format names are normalized to be all lower-case
(`"Sparse_ARFF"` -> `"sparse_arff"`).
- - Non-`arff` datasets will not incorrectly have a `"parquet_url"`:
- https://github.com/openml/OpenML/issues/1189
+ - Non-`arff` datasets will not incorrectly have a `"parquet_url"` ([openml#1189](https://github.com/openml/OpenML/issues/1189)).
- If `"creator"` contains multiple comma-separated creators it is always returned
as a list, instead of it depending on the quotation used by the original uploader.
- For (some?) datasets that have multiple values in `"ignore_attribute"`, this field
is correctly populated instead of omitted.
+ - Processing date is formatted with a `T` in the middle:
+ ```diff title="processing_date"
+ - "2019-07-09 15:22:03"
+ + "2019-07-09T15:22:03"
+ ```
+ - Fields which may contain lists of values (e.g., `creator`, `contributor`) now always
+   return a list (which may also be empty or contain a single element).
+ - Fields without a set value are no longer automatically removed from the response.
-
-#### `GET /data/list/{filters}`
+### `GET /data/list/{filters}`
The endpoint now accepts the filters in the body of the request, instead of as query parameters.
```diff
@@ -95,28 +90,3 @@ includes datasets which are private.
The `limit` and `offset` parameters can now be used independently, you no longer need
to provide both if you wish to set only one.
-
-## V1 to V2
-Most of the changes are focused on standardizing responses, working on:
-
- * using JSON types.
- * removing levels of nesting for endpoints which return single-field JSON.
- * always returning lists for fields which may contain multiple values even if it
- contains only one element or no element.
- * restricting or expanding input types as appropriate.
- * standardizing authentication and access messages, and consistently execute those checks
- before fetching data or providing error messages about the data.
-
-
-### Datasets
-
-#### `GET /{dataset_id}`
-
- - Processing date is formatted with a `T` in the middle:
- ```diff title="processing_date"
- - "2019-07-09 15:22:03"
- + "2019-07-09T15:22:03"
- ```
- - Fields which may contain lists of values (e.g., `creator`, `contributor`) now always
- returns a list (which may also be empty or contain a single element).
- - Fields without a set value are no longer automatically removed from the response.