From 75755573da398cab5034ef864afd871b9c878678 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Mon, 27 Nov 2023 15:04:20 +0100
Subject: [PATCH 1/3] Add data/quality/{id} endpoint
---
src/routers/v1/qualities.py | 21 +++++-
src/schemas/datasets/openml.py | 5 ++
tests/routers/v1/qualities_test.py | 115 +++++++++++++++++++++++++++++
3 files changed, 140 insertions(+), 1 deletion(-)
diff --git a/src/routers/v1/qualities.py b/src/routers/v1/qualities.py
index a00372d8..cc254e02 100644
--- a/src/routers/v1/qualities.py
+++ b/src/routers/v1/qualities.py
@@ -2,7 +2,8 @@
from database.datasets import list_all_qualities
from fastapi import APIRouter, Depends
-from sqlalchemy import Connection
+from schemas.datasets.openml import Quality
+from sqlalchemy import Connection, text
from routers.dependencies import expdb_connection
@@ -19,3 +20,21 @@ def list_qualities(
"quality": qualities,
},
}
+
+
+@router.get("/qualities/{dataset_id}")
+def get_qualities(
+ dataset_id: int,
+ expdb: Annotated[Connection, Depends(expdb_connection)],
+) -> list[Quality]:
+ rows = expdb.execute(
+ text(
+ """
+ SELECT `quality`,`value`
+ FROM data_quality
+ WHERE `data`=:dataset_id
+ """,
+ ),
+ parameters={"dataset_id": dataset_id},
+ )
+ return [Quality(name=row.quality, value=row.value) for row in rows]
diff --git a/src/schemas/datasets/openml.py b/src/schemas/datasets/openml.py
index 389d663f..6ff4d49b 100644
--- a/src/schemas/datasets/openml.py
+++ b/src/schemas/datasets/openml.py
@@ -24,6 +24,11 @@ class DatasetStatus(StrEnum):
IN_PREPARATION = "in_preparation"
+class Quality(BaseModel):
+ name: str
+ value: float
+
+
class DatasetMetadata(BaseModel):
id_: int = Field(json_schema_extra={"example": 1}, alias="id")
visibility: Visibility = Field(json_schema_extra={"example": Visibility.PUBLIC})
diff --git a/tests/routers/v1/qualities_test.py b/tests/routers/v1/qualities_test.py
index f1a7a605..368cf679 100644
--- a/tests/routers/v1/qualities_test.py
+++ b/tests/routers/v1/qualities_test.py
@@ -160,3 +160,118 @@ def test_list_qualities(api_client: TestClient, expdb_test: Connection) -> None:
response = api_client.get("/v1/datasets/qualities/list")
assert response.status_code == http.client.OK
assert expected == response.json()
+
+
+def test_get_quality(api_client: TestClient) -> None:
+ response = api_client.get("/v1/datasets/qualities/1")
+ assert response.status_code == http.client.OK
+ expected = [
+ {"name": "AutoCorrelation", "value": 0.6064659977703456},
+ {"name": "CfsSubsetEval_DecisionStumpAUC", "value": 0.9067742570970945},
+ {"name": "CfsSubsetEval_DecisionStumpErrRate", "value": 0.13251670378619154},
+ {"name": "CfsSubsetEval_DecisionStumpKappa", "value": 0.6191022730108037},
+ {"name": "CfsSubsetEval_NaiveBayesAUC", "value": 0.9067742570970945},
+ {"name": "CfsSubsetEval_NaiveBayesErrRate", "value": 0.13251670378619154},
+ {"name": "CfsSubsetEval_NaiveBayesKappa", "value": 0.6191022730108037},
+ {"name": "CfsSubsetEval_kNN1NAUC", "value": 0.9067742570970945},
+ {"name": "CfsSubsetEval_kNN1NErrRate", "value": 0.13251670378619154},
+ {"name": "CfsSubsetEval_kNN1NKappa", "value": 0.6191022730108037},
+ {"name": "ClassEntropy", "value": 1.189833856204398},
+ {"name": "DecisionStumpAUC", "value": 0.8652735384332186},
+ {"name": "DecisionStumpErrRate", "value": 0.22828507795100222},
+ {"name": "DecisionStumpKappa", "value": 0.4503332218612649},
+ {"name": "Dimensionality", "value": 0.043429844097995544},
+ {"name": "EquivalentNumberOfAtts", "value": 26.839183802676523},
+ {"name": "J48.00001.AUC", "value": 0.9391585368767195},
+ {"name": "J48.00001.ErrRate", "value": 0.10356347438752785},
+ {"name": "J48.00001.Kappa", "value": 0.7043302166347443},
+ {"name": "J48.0001.AUC", "value": 0.9391585368767195},
+ {"name": "J48.0001.ErrRate", "value": 0.10356347438752785},
+ {"name": "J48.0001.Kappa", "value": 0.7043302166347443},
+ {"name": "J48.001.AUC", "value": 0.9391585368767195},
+ {"name": "J48.001.ErrRate", "value": 0.10356347438752785},
+ {"name": "J48.001.Kappa", "value": 0.7043302166347443},
+ {"name": "MajorityClassPercentage", "value": 76.16926503340757},
+ {"name": "MajorityClassSize", "value": 684.0},
+ {"name": "MaxAttributeEntropy", "value": 1.8215224482924186},
+ {"name": "MaxKurtosisOfNumericAtts", "value": 13.215477213878724},
+ {"name": "MaxMeansOfNumericAtts", "value": 1263.0946547884187},
+ {"name": "MaxMutualInformation", "value": 0.40908953764451},
+ {"name": "MaxNominalAttDistinctValues", "value": 7.0},
+ {"name": "MaxSkewnessOfNumericAtts", "value": 3.7616019689156888},
+ {"name": "MaxStdDevOfNumericAtts", "value": 1871.3991072665933},
+ {"name": "MeanAttributeEntropy", "value": 0.2515351603742048},
+ {"name": "MeanKurtosisOfNumericAtts", "value": 4.6480244352098286},
+ {"name": "MeanMeansOfNumericAtts", "value": 348.50426818856715},
+ {"name": "MeanMutualInformation", "value": 0.044331968697414056},
+ {"name": "MeanNoiseToSignalRatio", "value": 4.673900071775454},
+ {"name": "MeanNominalAttDistinctValues", "value": 1.6363636363636362},
+ {"name": "MeanSkewnessOfNumericAtts", "value": 2.0269825910719437},
+ {"name": "MeanStdDevOfNumericAtts", "value": 405.17326983791025},
+ {"name": "MinAttributeEntropy", "value": -0.0},
+ {"name": "MinKurtosisOfNumericAtts", "value": -0.9723842038435437},
+ {"name": "MinMeansOfNumericAtts", "value": 1.1985489977728285},
+ {"name": "MinMutualInformation", "value": 0.0},
+ {"name": "MinNominalAttDistinctValues", "value": 0.0},
+ {"name": "MinSkewnessOfNumericAtts", "value": 0.07299048442083138},
+ {"name": "MinStdDevOfNumericAtts", "value": 0.871208280971892},
+ {"name": "MinorityClassPercentage", "value": 0.8908685968819599},
+ {"name": "MinorityClassSize", "value": 8.0},
+ {"name": "NaiveBayesAUC", "value": 0.9315907109421729},
+ {"name": "NaiveBayesErrRate", "value": 0.24610244988864144},
+ {"name": "NaiveBayesKappa", "value": 0.5569590016631507},
+ {"name": "NumberOfBinaryFeatures", "value": 4.0},
+ {"name": "NumberOfClasses", "value": 5.0},
+ {"name": "NumberOfFeatures", "value": 39.0},
+ {"name": "NumberOfInstances", "value": 898.0},
+ {"name": "NumberOfInstancesWithMissingValues", "value": 898.0},
+ {"name": "NumberOfMissingValues", "value": 22175.0},
+ {"name": "NumberOfNumericFeatures", "value": 6.0},
+ {"name": "NumberOfSymbolicFeatures", "value": 33.0},
+ {"name": "PercentageOfBinaryFeatures", "value": 10.256410256410255},
+ {"name": "PercentageOfInstancesWithMissingValues", "value": 100.0},
+ {"name": "PercentageOfMissingValues", "value": 63.317343384158534},
+ {"name": "PercentageOfNumericFeatures", "value": 15.384615384615385},
+ {"name": "PercentageOfSymbolicFeatures", "value": 84.61538461538461},
+ {"name": "Quartile1AttributeEntropy", "value": 0.0},
+ {"name": "Quartile1KurtosisOfNumericAtts", "value": -0.40305022089010156},
+ {"name": "Quartile1MeansOfNumericAtts", "value": 3.025695155902005},
+ {"name": "Quartile1MutualInformation", "value": 0.0},
+ {"name": "Quartile1SkewnessOfNumericAtts", "value": 0.967384603629726},
+ {"name": "Quartile1StdDevOfNumericAtts", "value": 10.505435772171138},
+ {"name": "Quartile2AttributeEntropy", "value": 0.0},
+ {"name": "Quartile2KurtosisOfNumericAtts", "value": 1.6372437439142264},
+ {"name": "Quartile2MeansOfNumericAtts", "value": 21.222160356347437},
+ {"name": "Quartile2MutualInformation", "value": 0.0},
+ {"name": "Quartile2SkewnessOfNumericAtts", "value": 1.6547313364025702},
+ {"name": "Quartile2StdDevOfNumericAtts", "value": 69.85338529046133},
+ {"name": "Quartile3AttributeEntropy", "value": 0.2385631077559124},
+ {"name": "Quartile3KurtosisOfNumericAtts", "value": 12.741748058445403},
+ {"name": "Quartile3MeansOfNumericAtts", "value": 901.2636692650334},
+ {"name": "Quartile3MutualInformation", "value": 0.0206465881071925},
+ {"name": "Quartile3SkewnessOfNumericAtts", "value": 3.7546438249219056},
+ {"name": "Quartile3StdDevOfNumericAtts", "value": 771.8590427889504},
+ {"name": "REPTreeDepth1AUC", "value": 0.962680369298288},
+ {"name": "REPTreeDepth1ErrRate", "value": 0.08463251670378619},
+ {"name": "REPTreeDepth1Kappa", "value": 0.768583383630482},
+ {"name": "REPTreeDepth2AUC", "value": 0.962680369298288},
+ {"name": "REPTreeDepth2ErrRate", "value": 0.08463251670378619},
+ {"name": "REPTreeDepth2Kappa", "value": 0.768583383630482},
+ {"name": "REPTreeDepth3AUC", "value": 0.962680369298288},
+ {"name": "REPTreeDepth3ErrRate", "value": 0.08463251670378619},
+ {"name": "REPTreeDepth3Kappa", "value": 0.768583383630482},
+ {"name": "RandomTreeDepth1AUC", "value": 0.9296999989655875},
+ {"name": "RandomTreeDepth1ErrRate", "value": 0.0801781737193764},
+ {"name": "RandomTreeDepth1Kappa", "value": 0.7953250436852635},
+ {"name": "RandomTreeDepth2AUC", "value": 0.9296999989655875},
+ {"name": "RandomTreeDepth2ErrRate", "value": 0.0801781737193764},
+ {"name": "RandomTreeDepth2Kappa", "value": 0.7953250436852635},
+ {"name": "RandomTreeDepth3AUC", "value": 0.9296999989655875},
+ {"name": "RandomTreeDepth3ErrRate", "value": 0.0801781737193764},
+ {"name": "RandomTreeDepth3Kappa", "value": 0.7953250436852635},
+ {"name": "StdvNominalAttDistinctValues", "value": 1.5576059718800395},
+ {"name": "kNN1NAUC", "value": 0.8721948540771287},
+ {"name": "kNN1NErrRate", "value": 0.06347438752783964},
+ {"name": "kNN1NKappa", "value": 0.8261102938928316},
+ ]
+ assert response.json() == expected
From 5e737b73540b71d9e400bfebd96c937015a6a2bb Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 28 Nov 2023 10:42:20 +0100
Subject: [PATCH 2/3] Add authentication check, document unsupported error
messaging
---
src/routers/v1/qualities.py | 33 ++++++++++++++++++++++++++----
src/schemas/datasets/openml.py | 2 +-
tests/routers/v1/qualities_test.py | 25 ++++++++++++++++++++++
3 files changed, 55 insertions(+), 5 deletions(-)
diff --git a/src/routers/v1/qualities.py b/src/routers/v1/qualities.py
index cc254e02..366e208d 100644
--- a/src/routers/v1/qualities.py
+++ b/src/routers/v1/qualities.py
@@ -1,11 +1,14 @@
-from typing import Annotated, Literal
+import http.client
+from typing import Annotated, Any, Literal
-from database.datasets import list_all_qualities
-from fastapi import APIRouter, Depends
+from database.datasets import get_dataset, list_all_qualities
+from database.users import User, UserGroup
+from fastapi import APIRouter, Depends, HTTPException
from schemas.datasets.openml import Quality
from sqlalchemy import Connection, text
-from routers.dependencies import expdb_connection
+from routers.dependencies import expdb_connection, fetch_user
+from routers.v2.datasets import DatasetError
router = APIRouter(prefix="/v1/datasets", tags=["datasets"])
@@ -22,11 +25,26 @@ def list_qualities(
}
+def _user_can_see_dataset(dataset: dict[str, Any], user: User | None) -> bool:
+ if dataset["visibility"] == "public":  # public datasets need no user at all
+ return True
+ return user is not None and (  # `user` may be None, presumably for anonymous requests
+ dataset["uploader"] == user.user_id or UserGroup.ADMIN in user.groups
+ )
+
+
@router.get("/qualities/{dataset_id}")
def get_qualities(
dataset_id: int,
+ user: Annotated[User | None, Depends(fetch_user)],
expdb: Annotated[Connection, Depends(expdb_connection)],
) -> list[Quality]:
+ dataset = get_dataset(dataset_id, expdb)
+ if not dataset or not _user_can_see_dataset(dataset, user):  # same error for both cases
+ raise HTTPException(
+ status_code=http.client.PRECONDITION_FAILED,
+ detail={"code": DatasetError.NO_DATA_FILE, "message": "Unknown dataset"},
+ ) from None
rows = expdb.execute(
text(
"""
@@ -38,3 +56,10 @@ def get_qualities(
parameters={"dataset_id": dataset_id},
)
return [Quality(name=row.quality, value=row.value) for row in rows]
+ # The PHP API (sometimes) provided helpful error messages, not yet reproduced here:
+ # if not qualities:
+ # check if dataset exists: error 360
+ # check if user has access: error 361
+ # check if there is a data processed entry and forward the error: 364
+ # if nothing in process table: 363
+ # otherwise: error 362
diff --git a/src/schemas/datasets/openml.py b/src/schemas/datasets/openml.py
index 6ff4d49b..e1360006 100644
--- a/src/schemas/datasets/openml.py
+++ b/src/schemas/datasets/openml.py
@@ -26,7 +26,7 @@ class DatasetStatus(StrEnum):
class Quality(BaseModel):
name: str
- value: float
+ value: float | None
class DatasetMetadata(BaseModel):
diff --git a/tests/routers/v1/qualities_test.py b/tests/routers/v1/qualities_test.py
index 368cf679..729c6395 100644
--- a/tests/routers/v1/qualities_test.py
+++ b/tests/routers/v1/qualities_test.py
@@ -275,3 +275,28 @@ def test_get_quality(api_client: TestClient) -> None:
{"name": "kNN1NKappa", "value": 0.8261102938928316},
]
assert response.json() == expected
+
+
+@pytest.mark.php()
+@pytest.mark.parametrize(
+ "data_id",
+ list(range(1, 130)),
+)
+def test_get_quality_identical(data_id: int, api_client: TestClient) -> None:
+ php_response = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/qualities/{data_id}")
+ if php_response.status_code == http.client.PRECONDITION_FAILED and php_response.json()["error"][
+ "code"
+ ] in ["362", "364"]:
+ pytest.skip("Detailed error reporting not yet re-implemented.")
+
+ python_response = api_client.get(f"/v1/datasets/qualities/{data_id}")
+ assert python_response.status_code == php_response.status_code
+
+ expected = [
+ {
+ "name": quality["name"],
+ "value": None if quality["value"] == [] else float(quality["value"]),
+ }
+ for quality in php_response.json()["data_qualities"]["quality"]
+ ]
+ assert python_response.json() == expected
From dcb93cd316a4c79fab764634d0144f5f66d16731 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 28 Nov 2023 10:55:30 +0100
Subject: [PATCH 3/3] Test failure cases and document them in separate test
---
tests/routers/v1/qualities_test.py | 25 +++++++++++++++++++------
1 file changed, 19 insertions(+), 6 deletions(-)
diff --git a/tests/routers/v1/qualities_test.py b/tests/routers/v1/qualities_test.py
index 729c6395..d2de98b6 100644
--- a/tests/routers/v1/qualities_test.py
+++ b/tests/routers/v1/qualities_test.py
@@ -280,15 +280,10 @@ def test_get_quality(api_client: TestClient) -> None:
@pytest.mark.php()
@pytest.mark.parametrize(
"data_id",
- list(range(1, 130)),
+ sorted(set(range(1, 132)) - {55, 56, 59, 116, 130}),  # failing ids are covered separately
)
def test_get_quality_identical(data_id: int, api_client: TestClient) -> None:
php_response = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/qualities/{data_id}")
- if php_response.status_code == http.client.PRECONDITION_FAILED and php_response.json()["error"][
- "code"
- ] in ["362", "364"]:
- pytest.skip("Detailed error reporting not yet re-implemented.")
-
python_response = api_client.get(f"/v1/datasets/qualities/{data_id}")
assert python_response.status_code == php_response.status_code
@@ -300,3 +295,21 @@ def test_get_quality_identical(data_id: int, api_client: TestClient) -> None:
for quality in php_response.json()["data_qualities"]["quality"]
]
assert python_response.json() == expected
+
+
+@pytest.mark.php()
+@pytest.mark.parametrize(
+ "data_id",
+ [55, 56, 59, 116, 130, 132],
+)
+def test_get_quality_identical_error(data_id: int, api_client: TestClient) -> None:
+ if data_id in [55, 56, 59]:
+ pytest.skip("Detailed error for code 364 (failed processing) not yet supported.")
+ if data_id in [116]:
+ pytest.skip("Detailed error for code 362 (no qualities) not yet supported.")
+ php_response = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/qualities/{data_id}")
+ python_response = api_client.get(f"/v1/datasets/qualities/{data_id}")
+ assert python_response.status_code == php_response.status_code
+ # PHP uses a different code for the "dataset unknown" error depending on where it occurs
+ # (e.g., get dataset -> 113, get quality -> 361), so only the message is compared here.
+ assert python_response.json()["detail"]["message"] == php_response.json()["error"]["message"]